# Merging DataFrames with pandas


In [1]:
from glob import glob
import pandas as pd

## Concatenating vertically to get MultiIndexed rows

When stacking a sequence of DataFrames vertically, it is sometimes desirable to construct a MultiIndex to indicate the DataFrame from which each row originated. This can be done by specifying the `keys` parameter in the call to `pd.concat()`, which generates a hierarchical index with the labels from `keys` as the outermost index level. As a result, you don't have to rename the columns of each DataFrame as you load it; only the index column needs to be specified.


In [2]:
medal_types = ['Bronze', 'Silver', 'Gold']

# Collect the per-medal frames under their own name so that `medals` only
# ever refers to the final concatenated DataFrame.
medal_frames = []

for medal in medal_types:

    file_name = "data/summer-olympic-medals/%s.csv" % medal

    # Read file_name into a DataFrame indexed by country: medal_df
    medal_df = pd.read_csv(file_name, index_col="Country")

    medal_frames.append(medal_df)

# Derive the outer index labels from medal_types so the labels can never
# drift out of step with the files that were actually loaded.
medal_keys = [medal.lower() for medal in medal_types]

# Stack the frames vertically; `keys` becomes the outermost index level: medals
medals = pd.concat(medal_frames, keys=medal_keys)

# Print medals (pandas abbreviates long frames in the printed output)
print(medals)

                             NOC   Total
       Country                          
bronze United States         USA  1052.0
       Soviet Union          URS   584.0
       United Kingdom        GBR   505.0
       France                FRA   475.0
       Germany               GER   454.0
...                          ...     ...
gold   Senegal               SEN     NaN
       Sudan                 SUD     NaN
       Tonga                 TGA     NaN
       Burundi               BDI     1.0
       United Arab Emirates  UAE     1.0

[414 rows x 2 columns]


## Slicing MultiIndexed DataFrames

* Create an alias for `pd.IndexSlice` called `idx`. A slicer `pd.IndexSlice` is required when slicing on the inner level of a MultiIndex.
* Slice all the data on medals won by the United Kingdom in the DataFrame `medals_sorted`. To do this, use the `.loc[]` accessor with `idx[:,'United Kingdom'], :`.

In [3]:
# Alias used to build slicers that address inner MultiIndex levels: idx
idx = pd.IndexSlice

# Sort the rows of medals on index level 0: medals_sorted
medals_sorted = medals.sort_index(level=0)

# Row holding Germany's bronze entry (NOC and Total)
germany_bronze = medals_sorted.loc[('bronze', 'Germany')]

# Every row filed under the 'silver' outer label
silver_rows = medals_sorted.loc['silver']

# Rows for the United Kingdom across all outer labels
uk_rows = medals_sorted.loc[idx[:, 'United Kingdom'], :]

# Print the three selections in the order described above
for selection in (germany_bronze, silver_rows, uk_rows):
    print(selection)

NOC      GER
Total    454
Name: (bronze, Germany), dtype: object
                 NOC  Total
Country                    
0                RU1    7.0
Afghanistan      AFG    NaN
Algeria          ALG    2.0
Argentina        ARG   83.0
Armenia          ARM    1.0
...              ...    ...
Virgin Islands*  ISV    1.0
West Germany     FRG  167.0
Yugoslavia       YUG  174.0
Zambia           ZAM    1.0
Zimbabwe         ZIM    4.0

[138 rows x 2 columns]
                       NOC  Total
       Country                   
bronze United Kingdom  GBR  505.0
gold   United Kingdom  GBR  498.0
silver United Kingdom  GBR  591.0


## Concatenating horizontally to get MultiIndexed columns

In [4]:
# Paths of every February sales file.
# NOTE(review): glob() returns paths in an unspecified order, while the
# concat cell below labels these frames by position -- confirm the order.
files = glob('data/sales/feb-sales-*.csv')

# Read each file with 'Date' as a parsed datetime index: dataframes
dataframes = []
for path in files:
    dataframes.append(pd.read_csv(path, index_col='Date', parse_dates=True))

In [5]:
# Concatenate dataframes side by side: february
# Each key becomes the outer level of the resulting column MultiIndex.
# NOTE(review): keys are matched to `dataframes` by position -- confirm the
# list order corresponds to Hardware, Software, Service.
february = pd.concat(dataframes, axis=1, keys=['Hardware', 'Software', 'Service'])

# Show the summary of february. DataFrame.info() writes its report straight
# to stdout and returns None, so wrapping it in print() would add a stray
# "None" line after the report.
february.info()

# Assign pd.IndexSlice: idx
idx = pd.IndexSlice

# Rows from Feb 2 to Feb 8, 2015, 'Company' column of every outer key: slice_2_8
slice_2_8 = february.loc['Feb 2, 2015':'Feb 8, 2015', idx[:, 'Company']]

# Print slice_2_8
print(slice_2_8)

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 20 entries, 2015-02-02 08:33:01 to 2015-02-26 08:58:51
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   (Hardware, Company)  5 non-null      object 
 1   (Hardware, Product)  5 non-null      object 
 2   (Hardware, Units)    5 non-null      float64
 3   (Software, Company)  9 non-null      object 
 4   (Software, Product)  9 non-null      object 
 5   (Software, Units)    9 non-null      float64
 6   (Service, Company)   6 non-null      object 
 7   (Service, Product)   6 non-null      object 
 8   (Service, Units)     6 non-null      float64
dtypes: float64(3), object(6)
memory usage: 1.6+ KB
None
                            Hardware         Software Service
                             Company          Company Company
Date                                                         
2015-02-02 08:33:01              NaN            Hooli     NaN
2015-

## Concatenating DataFrames from a dict


In [6]:
# Monthly sales files, in calendar order.
monthly_paths = (
    'data/sales/sales-jan-2015.csv',
    'data/sales/sales-feb-2015.csv',
    'data/sales/sales-mar-2015.csv',
)

# Load each month with 'Date' as a parsed datetime index: jan, feb, mar
jan, feb, mar = (
    pd.read_csv(path, index_col='Date', parse_dates=True)
    for path in monthly_paths
)

In [7]:
# Make the list of (label, DataFrame) tuples: month_list
month_list = [('january', jan), ('february', feb), ('march', mar)]

# Per-company totals for each month: month_dict
# numeric_only=True is stated explicitly because the default changed in
# pandas 2.0: without it, any text column would be string-concatenated into
# the result instead of being left out of the sum.
month_dict = {
    month_name: month_data.groupby('Company').sum(numeric_only=True)
    for month_name, month_data in month_list
}

# Concatenate data in month_dict; the dict keys become the outer index level: sales
sales = pd.concat(month_dict)

# Print sales
print(sales)

# Print all sales by Mediacore (any outer label, inner label 'Mediacore')
idx = pd.IndexSlice
print(sales.loc[idx[:, 'Mediacore'], :])

                          Units
         Company               
january  Acme Coporation     76
         Hooli               70
         Initech             37
         Mediacore           15
         Streeplex           50
february Acme Coporation     34
         Hooli               30
         Initech             30
         Mediacore           45
         Streeplex           36
march    Acme Coporation      5
         Hooli               37
         Initech             68
         Mediacore           68
         Streeplex           40
                    Units
         Company         
january  Mediacore     15
february Mediacore     45
march    Mediacore     68


## Concatenating DataFrames rows with inner join


In [8]:
# Medal count files, one per medal type.
medal_paths = (
    'data/summer-olympic-medals/Bronze.csv',
    'data/summer-olympic-medals/Silver.csv',
    'data/summer-olympic-medals/Gold.csv',
)

# Load each file indexed by country and discard the 'NOC' column: bronze, silver, gold
bronze, silver, gold = (
    pd.read_csv(path, index_col='Country').drop(columns='NOC')
    for path in medal_paths
)

In [9]:
# Frames to combine and the column labels they receive, in matching order
medal_list = [bronze, silver, gold]
medal_labels = ['bronze', 'silver', 'gold']

# Join the frames side by side, keeping only index labels shared by all of
# them; each label becomes the outer level of the column MultiIndex: medals
medals = pd.concat(medal_list, axis=1, join='inner', keys=medal_labels)

# Print medals
print(medals)


                      bronze  silver    gold
                       Total   Total   Total
Country                                     
United States         1052.0  1195.0  2088.0
Soviet Union           584.0   627.0   838.0
United Kingdom         505.0   591.0   498.0
France                 475.0   461.0   378.0
Germany                454.0   350.0   407.0
...                      ...     ...     ...
Senegal                  NaN     1.0     NaN
Sudan                    NaN     1.0     NaN
Tonga                    NaN     1.0     NaN
Burundi                  NaN     NaN     1.0
United Arab Emirates     NaN     NaN     1.0

[138 rows x 3 columns]


## Resampling & concatenating DataFrames with inner join

```python
DataFrame.resample(self, rule, axis=0, closed: Union[str, NoneType] = None, label: Union[str, NoneType] = None, convention: str = 'start', kind: Union[str, NoneType] = None, loffset=None, base: int = 0, on=None, level=None)
```
* Resample time-series data.
* Convenience method for frequency conversion and resampling of time series. Object must have a datetime-like index (DatetimeIndex, PeriodIndex, or TimedeltaIndex), or pass datetime-like values to the on or level keyword.


### Exercise

* In this exercise, you'll compare the historical 10-year GDP (Gross Domestic Product) growth in the US and in China. The data for the US starts in 1947 and is recorded quarterly; by contrast, the data for China starts in 1961 and is recorded annually.
* Make a new DataFrame `china_annual` by resampling the DataFrame `china` with `.resample('A').last()` (i.e., with annual frequency) and chaining two method calls:
  * Chain `.pct_change(10)` to compute the percentage change with an offset of ten years.
  * Chain `.dropna()` to eliminate rows containing null values.
* Make a new DataFrame `us_annual` by resampling the DataFrame `us` in exactly the same way.
* Concatenate `china_annual` and `us_annual` horizontally (`axis=1`) with an inner join (`join='inner'`) to construct a DataFrame called `gdp`.
* Print the result of resampling `gdp` every decade (i.e., using `.resample('10A')`) and aggregating with the method `.last()`.

In [10]:
# GDP series for China, indexed by the parsed 'Year' column: china
china = pd.read_csv(
    'data/GDP/gdp_china.csv',
    index_col='Year',
    parse_dates=True,
)

# GDP series for the US, indexed by the parsed 'DATE' column: us
us = pd.read_csv(
    'data/GDP/gdp_usa.csv',
    index_col='DATE',
    parse_dates=True,
)

In [11]:
def _ten_period_growth(frame):
    """Return the annual resample of `frame` as a 10-period percentage change.

    The last observation of each 'A' (annual) bin is kept, the percentage
    change against the value ten rows earlier is computed, and rows
    containing NaN are dropped.
    """
    year_end = frame.resample('A').last()
    return year_end.pct_change(10).dropna()


# Resample and tidy china: china_annual
china_annual = _ten_period_growth(china)

# Resample and tidy us: us_annual
us_annual = _ten_period_growth(us)

# Place the two series side by side, keeping only dates present in both: gdp
gdp = pd.concat([china_annual, us_annual], axis=1, join='inner')

# Keep the last row of each '10A' bin and print
print(gdp.resample('10A').last())

                 GDP     VALUE
Year                          
1970-12-31  0.546128  1.017187
1980-12-31  1.072537  1.742556
1990-12-31  0.892820  1.012126
2000-12-31  2.357522  0.738632
2010-12-31  4.011081  0.454332
2020-12-31  3.789936  0.361780


In [12]:
# Preview the first five rows of the China growth series
china_annual.head(n=5)

Unnamed: 0_level_0,GDP
Year,Unnamed: 1_level_1
1970-12-31,0.546128
1971-12-31,0.98886
1972-12-31,1.402472
1973-12-31,1.730085
1974-12-31,1.408556


In [13]:
# Preview the first five rows of the US growth series
us_annual.head(n=5)

Unnamed: 0_level_0,VALUE
DATE,Unnamed: 1_level_1
1957-12-31,0.827507
1958-12-31,0.782686
1959-12-31,0.953137
1960-12-31,0.689354
1961-12-31,0.630959


## References

1. [PyData - resample](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.resample.html)