# Merging DataFrames with pandas

### Appending pandas Series

In [1]:
import pandas as pd

In [2]:
# Read the three monthly sales files; the 'Date' column becomes the index and is parsed as dates.
jan, feb, mar = [
  pd.read_csv(csv_path, parse_dates=True, index_col='Date')
  for csv_path in (
    'sales/sales-jan-2015.csv',
    'sales/sales-feb-2015.csv',
    'sales/sales-mar-2015.csv',
  )
]

In [3]:
# Pull the 'Units' column out of each monthly frame as its own Series.
jan_units, feb_units, mar_units = (frame['Units'] for frame in (jan, feb, mar))

In [4]:
# Stack the three monthly Series end to end; every row keeps its original Date label.
# pd.concat replaces the chained Series.append calls: Series.append was deprecated
# in pandas 1.4 and removed in 2.0, while concat yields the same Series on every
# pandas version and builds it in a single pass.
quarter1 = pd.concat([jan_units, feb_units, mar_units])

In [5]:
# Label-based date slice across the January/February boundary of the combined Series.
# NOTE(review): the combined index is not sorted by time; newer pandas releases are
# believed to raise KeyError for string-date slices on a non-monotonic DatetimeIndex
# when the bounds are not exact index entries — confirm against the installed version.
quarter1.loc['jan 27, 2015':'feb 2, 2015']

Date
2015-01-27 07:11:55    18
2015-02-02 08:30:00     3
2015-02-02 21:00:00     9
Name: Units, dtype: int64

In [6]:
# Label-based date slice across the February/March boundary of the combined Series.
# NOTE(review): rows come back in stored (not chronological) order; newer pandas
# releases are believed to reject this kind of slice on a non-monotonic
# DatetimeIndex — confirm against the installed version.
quarter1.loc['feb 26, 2015':'mar 7, 2015']

Date
2015-02-26 09:00:00     4
2015-03-06 10:11:45    17
2015-03-06 02:03:56    17
Name: Units, dtype: int64

In [7]:
# Total of the 'Units' values across all three months.
quarter1.sum()

641

### Concatenating pandas Series along row axis

In [8]:
# Gather the 'Units' Series of each month, then stack them along the row axis.
units = [month['Units'] for month in (jan, feb, mar)]
quarter1 = pd.concat(units, axis='rows')

# Date slice across the January/February boundary of the concatenated Series.
january_to_february = slice('jan 27, 2015', 'feb 2, 2015')
quarter1.loc[january_to_february]

Date
2015-01-27 07:11:55    18
2015-02-02 08:30:00     3
2015-02-02 21:00:00     9
Name: Units, dtype: int64

In [9]:
# Date slice across the February/March boundary of the concatenated Series.
# NOTE(review): newer pandas releases are believed to raise KeyError for
# string-date slices on a non-monotonic DatetimeIndex — confirm against the
# installed version.
quarter1.loc['feb 26, 2015':'mar 7, 2015']

Date
2015-02-26 09:00:00     4
2015-03-06 10:11:45    17
2015-03-06 02:03:56    17
Name: Units, dtype: int64

### Appending DataFrames with ignore_index

In [10]:
# header=None: the files carry no header row, so pandas assigns integer column labels.
names_1881, names_1981 = (
  pd.read_csv(csv_path, header=None)
  for csv_path in ('baby_names/names1881.csv', 'baby_names/names1981.csv')
)

In [11]:
col_names = ['name', 'gender', 'count', 'year']

# Tag every row with its source year, then give both frames the same column labels.
for frame, source_year in ((names_1881, 1881), (names_1981, 1981)):
  frame['year'] = source_year
  frame.columns = col_names

names_1881.head()

Unnamed: 0,name,gender,count,year
0,Mary,F,6919,1881
1,Anna,F,2698,1881
2,Emma,F,2034,1881
3,Elizabeth,F,1852,1881
4,Margaret,F,1658,1881


In [12]:
# Stack the 1881 rows on top of the 1981 rows. ignore_index=True discards both
# original row labels and renumbers the result 0..n-1.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in 2.0; the resulting frame is the same.
combined_names = pd.concat([names_1881, names_1981], ignore_index=True)

In [13]:
# Row/column counts: the combined frame should have the two row counts added together.
for frame in (names_1981, names_1881, combined_names):
  print(frame.shape)

(19455, 4)
(1935, 4)
(21390, 4)


In [14]:
# Boolean mask selecting the rows whose name is exactly 'Morgan', across both years.
is_morgan = combined_names['name'] == 'Morgan'
print(combined_names.loc[is_morgan])

         name gender  count  year
1283   Morgan      M     23  1881
2096   Morgan      F   1769  1981
14390  Morgan      M    766  1981


### Concatenating pandas DataFrames along column axis

In [15]:
# Maximum temperature for four months only.
weather_4 = {
  'Month': ['Jan', 'Apr', 'Jul', 'Oct'],
  'Max_TemperatureF': [68, 89, 91, 84],
}
# Build the frame and move 'Month' into the index in one chained expression.
weather_max = pd.DataFrame(weather_4).set_index('Month')
weather_max.head()

Unnamed: 0_level_0,Max_TemperatureF
Month,Unnamed: 1_level_1
Jan,68
Apr,89
Jul,91
Oct,84


In [16]:
# Mean temperature for all twelve months, listed alphabetically by month abbreviation.
months = [
  'Apr', 'Aug', 'Dec', 'Feb', 'Jan', 'Jul',
  'Jun', 'Mar', 'May', 'Nov', 'Oct', 'Sep',
]
mean = [
  53.100000, 70.000000, 34.935484, 28.714286, 32.354839, 72.870968,
  70.133333, 35.000000, 62.612903, 39.800000, 55.451613, 63.766667,
]
# Build the frame and move 'Month' into the index in one chained expression.
weather_mean = pd.DataFrame({'Month': months, 'Mean_TemperatureF': mean}).set_index('Month')
weather_mean.head()

Unnamed: 0_level_0,Mean_TemperatureF
Month,Unnamed: 1_level_1
Apr,53.1
Aug,70.0
Dec,34.935484
Feb,28.714286
Jan,32.354839


In [18]:
# Place the two frames side by side, aligned on the Month index. Months missing
# from weather_max get NaN in its column; sort=False keeps the labels unsorted.
weather = pd.concat([weather_max, weather_mean], axis='columns', sort=False)
weather.head(n=7)

Unnamed: 0,Max_TemperatureF,Mean_TemperatureF
Jan,68.0,32.354839
Apr,89.0,53.1
Jul,91.0,72.870968
Oct,84.0,55.451613
Aug,,70.0
Dec,,34.935484
Feb,,28.714286


### Reading multiple files to build a DataFrame

In [22]:
# Medal categories; each one is used as a file-name prefix and as a column label when loading.
medal_types = ['bronze', 'silver', 'gold']
# Empty list bound to `medals`.
medals = list()

In [23]:
# Load each "<medal>_top5.csv" file as a one-column frame indexed by Country,
# then align the frames side by side.
#
# The per-file frames are gathered in a cell-local list instead of being
# appended to `medals`: `medals` is rebound to the concatenated DataFrame below,
# so appending to it made this cell fail when re-run and used one name for two
# different kinds of object.
medal_frames = []
for medal in medal_types:
  file_name = "%s_top5.csv" % medal
  columns = ['Country', medal]
  # header=0 marks the file's first row as a header to be replaced by `names`,
  # which relabels the value column with the medal type.
  medal_df = pd.read_csv('medals/'+file_name, header=0, index_col='Country', names=columns)
  medal_frames.append(medal_df)

# Countries missing from one of the files get NaN in that medal's column.
medals = pd.concat(medal_frames, axis='columns', sort=False)
medals

Unnamed: 0,bronze,silver,gold
United States,1052.0,1195.0,2088.0
Soviet Union,584.0,627.0,838.0
United Kingdom,505.0,591.0,498.0
France,475.0,461.0,
Germany,454.0,,407.0
Italy,,394.0,460.0


### Concatenating vertically to get MultiIndexed rows

In [24]:
# Medal categories; each one is used as a file-name prefix when loading.
medal_types = ['bronze', 'silver', 'gold']
# Empty list bound to `medals`.
medals = list()

In [25]:
# Load each "<medal>_top5.csv" file indexed by Country and stack the frames
# vertically; `keys` adds the medal type as an outer index level, producing a
# (medal type, Country) MultiIndex on the rows.
#
# The per-file frames are gathered in a cell-local list instead of being
# appended to `medals`: `medals` is rebound to the concatenated DataFrame below,
# so appending to it made this cell fail when re-run and used one name for two
# different kinds of object.
medal_frames = []
for medal in medal_types:
  file_name = "%s_top5.csv" % medal
  medal_df = pd.read_csv('medals/'+file_name, index_col='Country')
  medal_frames.append(medal_df)

medals = pd.concat(medal_frames, keys=medal_types, sort=False)
medals

Unnamed: 0_level_0,Unnamed: 1_level_0,Total
Unnamed: 0_level_1,Country,Unnamed: 2_level_1
bronze,United States,1052.0
bronze,Soviet Union,584.0
bronze,United Kingdom,505.0
bronze,France,475.0
bronze,Germany,454.0
silver,United States,1195.0
silver,Soviet Union,627.0
silver,United Kingdom,591.0
silver,France,461.0
silver,Italy,394.0


### Slicing MultiIndexed DataFrames

In [26]:
# Sort the row MultiIndex starting from its outer level (medal type).
medals_sorted = medals.sort_index(axis='index', level=0)
medals_sorted.head(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Total
Unnamed: 0_level_1,Country,Unnamed: 2_level_1
bronze,France,475.0
bronze,Germany,454.0
bronze,Soviet Union,584.0
bronze,United Kingdom,505.0
bronze,United States,1052.0


In [27]:
# Look up a single row by its full (medal type, country) key.
medals_sorted.loc[('bronze','Germany')]

Total    454.0
Name: (bronze, Germany), dtype: float64

In [28]:
# Select every row under the outer-level key 'silver'; the result is indexed by Country alone.
medals_sorted.loc['silver']

Unnamed: 0_level_0,Total
Country,Unnamed: 1_level_1
France,461.0
Italy,394.0
Soviet Union,627.0
United Kingdom,591.0
United States,1195.0


In [29]:
idx = pd.IndexSlice
# Any medal type on the outer level, 'United Kingdom' on the inner level.
uk_rows = idx[:, 'United Kingdom']
medals_sorted.loc[uk_rows, :]

Unnamed: 0_level_0,Unnamed: 1_level_0,Total
Unnamed: 0_level_1,Country,Unnamed: 2_level_1
bronze,United Kingdom,505.0
gold,United Kingdom,498.0
silver,United Kingdom,591.0


### Concatenating horizontally to get MultiIndexed columns

In [30]:
files = ['feb-sales-Hardware.csv', 'feb-sales-Software.csv', 'feb-sales-Service.csv']
# One frame per product-line file, each indexed by its parsed 'Date' column.
dataframes = [
  pd.read_csv('sales/'+file, parse_dates=True, index_col='Date')
  for file in files
]
dataframes

[                             Company   Product  Units
 Date                                                 
 2015-02-04 21:52:45  Acme Coporation  Hardware     14
 2015-02-07 22:58:10  Acme Coporation  Hardware      1
 2015-02-19 10:59:33        Mediacore  Hardware     16
 2015-02-02 20:54:49        Mediacore  Hardware      9
 2015-02-21 20:41:47            Hooli  Hardware      3,
                              Company   Product  Units
 Date                                                 
 2015-02-16 12:09:19            Hooli  Software     10
 2015-02-03 14:14:18          Initech  Software     13
 2015-02-02 08:33:01            Hooli  Software      3
 2015-02-05 01:53:06  Acme Coporation  Software     19
 2015-02-11 20:03:08          Initech  Software      7
 2015-02-09 13:09:55        Mediacore  Software      7
 2015-02-11 22:50:44            Hooli  Software      4
 2015-02-04 15:36:29        Streeplex  Software     13
 2015-02-21 05:01:26        Mediacore  Software      3,
        

In [31]:
# Place the three frames side by side; `keys` adds the product line as an outer
# column level, giving (product line, original column) MultiIndexed columns.
product_lines = ['Hardware', 'Software', 'Service']
february = pd.concat(dataframes, axis='columns', keys=product_lines)
february.head()

Unnamed: 0_level_0,Hardware,Hardware,Hardware,Software,Software,Software,Service,Service,Service
Unnamed: 0_level_1,Company,Product,Units,Company,Product,Units,Company,Product,Units
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
2015-02-02 08:33:01,,,,Hooli,Software,3.0,,,
2015-02-02 20:54:49,Mediacore,Hardware,9.0,,,,,,
2015-02-03 14:14:18,,,,Initech,Software,13.0,,,
2015-02-04 15:36:29,,,,Streeplex,Software,13.0,,,
2015-02-04 21:52:45,Acme Coporation,Hardware,14.0,,,,,,


In [32]:
# Summarise the index range, column dtypes and non-null counts of the combined frame.
february.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 20 entries, 2015-02-02 08:33:01 to 2015-02-26 08:58:51
Data columns (total 9 columns):
(Hardware, Company)    5 non-null object
(Hardware, Product)    5 non-null object
(Hardware, Units)      5 non-null float64
(Software, Company)    9 non-null object
(Software, Product)    9 non-null object
(Software, Units)      9 non-null float64
(Service, Company)     6 non-null object
(Service, Product)     6 non-null object
(Service, Units)       6 non-null float64
dtypes: float64(3), object(6)
memory usage: 1.6+ KB


In [33]:
idx = pd.IndexSlice
# Rows: 2 Feb through 8 Feb 2015. Columns: the 'Company' column of every product line.
first_week = slice('2015-2-2', '2015-2-8')
company_columns = idx[:, 'Company']
slice_2_8 = february.loc[first_week, company_columns]
slice_2_8

Unnamed: 0_level_0,Hardware,Software,Service
Unnamed: 0_level_1,Company,Company,Company
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
2015-02-02 08:33:01,,Hooli,
2015-02-02 20:54:49,Mediacore,,
2015-02-03 14:14:18,,Initech,
2015-02-04 15:36:29,,Streeplex,
2015-02-04 21:52:45,Acme Coporation,,
2015-02-05 01:53:06,,Acme Coporation,
2015-02-05 22:05:03,,,Hooli
2015-02-07 22:58:10,Acme Coporation,,


### Concatenating DataFrames from a dict

In [34]:
# (label, frame) pairs for the three monthly sales frames.
month_list = list(zip(('january', 'february', 'march'), (jan, feb, mar)))
# Filled later with one aggregated frame per month label.
month_dict = dict()

In [35]:
# Total each month's numeric columns per company, keyed by month label.
for month_name, month_data in month_list:
  # numeric_only=True keeps the aggregate restricted to numeric columns. Older
  # pandas dropped non-numeric columns from groupby sums silently; pandas 2.0
  # stopped doing so, which would string-concatenate any text column instead.
  # (The monthly frames presumably carry a text 'Product' column like the
  # feb-sales files loaded elsewhere in this notebook — TODO confirm.)
  month_dict[month_name] = month_data.groupby('Company').sum(numeric_only=True)

# Concatenating a dict uses its keys as the outer index level: (month, Company).
sales = pd.concat(month_dict)

In [36]:
# Display the per-company totals, indexed by (month, Company).
sales

Unnamed: 0_level_0,Unnamed: 1_level_0,Units
Unnamed: 0_level_1,Company,Unnamed: 2_level_1
february,Acme Coporation,34
february,Hooli,30
february,Initech,30
february,Mediacore,45
february,Streeplex,36
january,Acme Coporation,76
january,Hooli,70
january,Initech,37
january,Mediacore,15
january,Streeplex,50


In [37]:
idx = pd.IndexSlice
# Any month on the outer level, 'Mediacore' on the inner level.
mediacore_rows = idx[:, 'Mediacore']
print(sales.loc[mediacore_rows, :])

                    Units
         Company         
february Mediacore     45
january  Mediacore     15
march    Mediacore     68


### Concatenating DataFrames with inner join

In [41]:
# One frame per medal type, each indexed by Country.
bronze, silver, gold = (
  pd.read_csv(csv_path, index_col='Country')
  for csv_path in (
    'medals/bronze_top5.csv',
    'medals/silver_top5.csv',
    'medals/gold_top5.csv',
  )
)

In [42]:
# Display the bronze frame: a single 'Total' column indexed by Country.
bronze

Unnamed: 0_level_0,Total
Country,Unnamed: 1_level_1
United States,1052.0
Soviet Union,584.0
United Kingdom,505.0
France,475.0
Germany,454.0


In [43]:
medal_list = [bronze, silver, gold]
# Default (outer) join: every country found in any frame is kept, and a country
# missing from one frame gets NaN in that frame's column.
medal_keys = ['bronze', 'silver', 'gold']
medals = pd.concat(medal_list, axis='columns', keys=medal_keys, sort=False)
medals

Unnamed: 0_level_0,bronze,silver,gold
Unnamed: 0_level_1,Total,Total,Total
United States,1052.0,1195.0,2088.0
Soviet Union,584.0,627.0,838.0
United Kingdom,505.0,591.0,498.0
France,475.0,461.0,
Germany,454.0,,407.0
Italy,,394.0,460.0


In [44]:
# Inner join: only countries present in all three frames are kept, so no NaN is introduced.
inner_keys = ['bronze', 'silver', 'gold']
medals = pd.concat(medal_list, axis='columns', join='inner', keys=inner_keys, sort=False)
medals

Unnamed: 0_level_0,bronze,silver,gold
Unnamed: 0_level_1,Total,Total,Total
Country,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
United States,1052.0,1195.0,2088.0
Soviet Union,584.0,627.0,838.0
United Kingdom,505.0,591.0,498.0


### Resampling & concatenating DataFrames with inner join

In [45]:
# China: indexed by the parsed 'Year' column, with its columns relabelled to 'China'.
china = pd.read_csv('GDP/gdp_china.csv', parse_dates=True, index_col='Year')
china.columns = ['China']

# US: indexed by the parsed 'DATE' column, with its columns relabelled to 'US'.
# The index is renamed to 'Year' so both frames use the same index name.
usa = pd.read_csv('GDP/gdp_usa.csv', parse_dates=True, index_col='DATE')
usa.columns = ['US']
usa.index.name = 'Year'

In [46]:
def _ten_period_growth(gdp_frame):
  """Return the fractional change over 10 annual periods of yearly-mean values.

  The frame is resampled to annual frequency ('A') using the mean, compared
  with the value 10 periods earlier, and rows with no such earlier value
  (NaN) are dropped.
  """
  yearly_mean = gdp_frame.resample('A').mean()
  return yearly_mean.pct_change(10).dropna()


china_annual = _ten_period_growth(china)
us_annual = _ten_period_growth(usa)

# Keep only the year-ends present in both series.
gdp = pd.concat([china_annual, us_annual], axis=1, join='inner')
# One row per 10-year bin, taking the last available value in each bin.
gdp.resample('10A').last()

Unnamed: 0_level_0,China,US
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
1970-12-31,0.546128,0.980397
1980-12-31,1.072537,1.66054
1990-12-31,0.89282,1.088953
2000-12-31,2.357522,0.71998
2010-12-31,4.011081,0.455009
2020-12-31,3.789936,0.377506
