# Merging Ordered and Time Series Data

---

## Loading Datasets

In [1]:
# import library
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pickle

In [3]:
# Load the first dataset: World Bank GDP figures.
# NOTE(review): this is an absolute, machine-specific path; the cell will not
# run on another machine without editing it.
gdp = pd.read_csv(
    filepath_or_buffer="D:/git_repositories/Datacamp-Joining_data_with_pandas/Datasets/WorldBank_GDP.csv"
)

In [4]:
# Preview the first five rows of the GDP data
gdp.head(n=5)

Unnamed: 0,Country Name,Country Code,Indicator Name,Year,GDP
0,China,CHN,GDP (current US$),2010,6087160000000.0
1,Germany,DEU,GDP (current US$),2010,3417090000000.0
2,Japan,JPN,GDP (current US$),2010,5700100000000.0
3,United States,USA,GDP (current US$),2010,14992100000000.0
4,China,CHN,GDP (current US$),2011,7551500000000.0


In [5]:
# Load the second dataset: S&P 500 returns.
# NOTE(review): this is an absolute, machine-specific path; the cell will not
# run on another machine without editing it.
sp500 = pd.read_csv(
    filepath_or_buffer="D:/git_repositories/Datacamp-Joining_data_with_pandas/Datasets/S&P500.csv"
)

In [6]:
# Preview the first five rows of the S&P 500 returns
sp500.head(n=5)

Unnamed: 0,Date,Returns
0,2008,-38.49
1,2009,23.45
2,2010,12.78
3,2011,0.0
4,2012,13.41


---

### Correlation Between GDP and SP500

In [9]:
# Left-join the S&P 500 returns onto the GDP rows with merge_ordered(),
# matching gdp's 'Year' column against sp500's 'Date' column.
gdp_sp500 = pd.merge_ordered(
    left=gdp,
    right=sp500,
    how="left",
    left_on="Year",
    right_on="Date",
)

# Preview the merged frame
gdp_sp500.head(n=5)

Unnamed: 0,Country Name,Country Code,Indicator Name,Year,GDP,Date,Returns
0,China,CHN,GDP (current US$),2010,6087160000000.0,2010.0,12.78
1,Germany,DEU,GDP (current US$),2010,3417090000000.0,2010.0,12.78
2,Japan,JPN,GDP (current US$),2010,5700100000000.0,2010.0,12.78
3,United States,USA,GDP (current US$),2010,14992100000000.0,2010.0,12.78
4,China,CHN,GDP (current US$),2011,7551500000000.0,2011.0,0.0


### Fill missing values with the previous value (forward fill)

In [10]:
# Same left join as before, but with fill_method='ffill' so that missing
# values in the merged result are forward-filled from the previous row.
gdp_sp500 = pd.merge_ordered(
    left=gdp,
    right=sp500,
    how="left",
    left_on="Year",
    right_on="Date",
    fill_method="ffill",
)

# Preview the merged frame
gdp_sp500.head(n=5)

Unnamed: 0,Country Name,Country Code,Indicator Name,Year,GDP,Date,Returns
0,China,CHN,GDP (current US$),2010,6087160000000.0,2010,12.78
1,Germany,DEU,GDP (current US$),2010,3417090000000.0,2010,12.78
2,Japan,JPN,GDP (current US$),2010,5700100000000.0,2010,12.78
3,United States,USA,GDP (current US$),2010,14992100000000.0,2010,12.78
4,China,CHN,GDP (current US$),2011,7551500000000.0,2011,0.0


**Load another dataset**

In [11]:
# Load the World Bank population dataset.
# NOTE(review): this is an absolute, machine-specific path; the cell will not
# run on another machine without editing it.
country = pd.read_csv(
    filepath_or_buffer="D:/git_repositories/Datacamp-Joining_data_with_pandas/Datasets/WorldBank_POP.csv"
)

In [12]:
# Preview the first five rows of the population data
country.head(n=5)

Unnamed: 0,Country Name,Country Code,Indicator Name,Year,Pop
0,Aruba,ABW,"Population, total",2010,101669.0
1,Afghanistan,AFG,"Population, total",2010,29185507.0
2,Angola,AGO,"Population, total",2010,23356246.0
3,Albania,ALB,"Population, total",2010,2913021.0
4,Andorra,AND,"Population, total",2010,84449.0


In [15]:
# Inner-join population (country) with GDP on country name and year,
# forward-filling any missing values in the merged result.
date_country = pd.merge_ordered(
    left=country,
    right=gdp,
    how="inner",
    on=["Country Name", "Year"],
    fill_method="ffill",
)

In [16]:
# Preview the first five rows of the population/GDP merge
date_country.head(n=5)

Unnamed: 0,Country Name,Country Code_x,Indicator Name_x,Year,Pop,Country Code_y,Indicator Name_y,GDP
0,China,CHN,"Population, total",2010,1337705000.0,CHN,GDP (current US$),6087160000000.0
1,China,CHN,"Population, total",2011,1344130000.0,CHN,GDP (current US$),7551500000000.0
2,China,CHN,"Population, total",2012,1350695000.0,CHN,GDP (current US$),8532230000000.0
3,China,CHN,"Population, total",2012,1350695000.0,CHN,GDP (current US$),8532230000000.0
4,China,CHN,"Population, total",2012,1350695000.0,CHN,GDP (current US$),8532230000000.0


In [None]:
# NOTE(review): jpm, wells and bac are not defined in any cell visible in this
# notebook -- confirm they are loaded before this cell is run.

# As-of merge of jpm with wells on 'date_time', pairing each jpm row with the
# nearest wells row; overlapping wells columns get the '_wells' suffix.
jpm_wells = pd.merge_asof(
    left=jpm,
    right=wells,
    on="date_time",
    direction="nearest",
    suffixes=("", "_wells"),
)

# As-of merge of the result with bac, again on the nearest 'date_time';
# remaining overlapping columns get '_jpm' / '_bac' suffixes.
jpm_wells_bac = pd.merge_asof(
    left=jpm_wells,
    right=bac,
    on="date_time",
    direction="nearest",
    suffixes=("_jpm", "_bac"),
)

# Row-to-row differences of every column
price_diffs = jpm_wells_bac.diff()

# Plot only the close-price differences of jpm, wells and bac
close_columns = ["close_jpm", "close_wells", "close_bac"]
price_diffs.plot(y=close_columns)
plt.show()

In [None]:
# NOTE(review): `recession` is not defined in any cell visible in this notebook,
# and the `gdp` frame loaded earlier shows columns 'Year'/'GDP' rather than the
# 'date'/'gdp' columns used here -- presumably this cell expects a different
# gdp dataset; confirm before running.

# As-of merge of gdp with recession on the 'date' column
gdp_recession = pd.merge_asof(left=gdp, right=recession, on="date")

# One bar colour per row: 'r' where econ_status is 'recession', otherwise 'g'
is_recession = [
    "g" if status != "recession" else "r"
    for status in gdp_recession["econ_status"]
]

# Bar chart of gdp by date, coloured by recession status
gdp_recession.plot(kind="bar", x="date", y="gdp", color=is_recession, rot=90)
plt.show()

### Using the `.melt()` method to unpivot wide data

In [None]:
# NOTE(review): ur_wide is not defined in any cell visible in this notebook --
# confirm it is loaded before this cell is run.

# Unpivot every column except 'year': the former column names go into 'month'
# and their values into 'unempl_rate'.
ur_tall = ur_wide.melt(id_vars='year', var_name='month', value_name='unempl_rate')

# Build a date column from the year and month columns of ur_tall.
# Both parts are cast to str first: adding '-' to a numeric 'year' column
# (the usual dtype after reading a CSV) raises a TypeError, while the cast
# changes nothing when the values are already strings.
ur_tall['date'] = pd.to_datetime(
    ur_tall['year'].astype(str) + '-' + ur_tall['month'].astype(str)
)

# Sort ur_tall by date in ascending order
ur_sorted = ur_tall.sort_values('date', ascending=True)

# Plot the unempl_rate by date
ur_sorted.plot(x='date', y='unempl_rate')
plt.show()

In [None]:
# NOTE(review): ten_yr and dji are not defined in any cell visible in this
# notebook -- confirm they are loaded before this cell is run.

# Unpivot ten_yr, keeping 'metric' as the identifier: the former column names
# go into 'date' and their values into 'close'.
bond_perc = ten_yr.melt(id_vars='metric', var_name='date', value_name='close')

# Keep only the rows whose metric is "close"
bond_perc_close = bond_perc.query('metric == "close"')

# Ordered inner merge of dji with the bond closes on 'date'; overlapping
# columns are told apart by the '_dow' / '_bond' suffixes.
dow_bond = pd.merge_ordered(
    left=dji,
    right=bond_perc_close,
    how='inner',
    on='date',
    suffixes=('_dow', '_bond'),
)

# Plot only the two close columns against date, with rotated tick labels
dow_bond.plot(x='date', y=['close_dow', 'close_bond'], rot=90)
plt.show()