# Merging Data in pandas

In [1]:
import pandas as pd

### Merging on a specific column

In [2]:
# Per-branch revenue table: one row per branch, keyed by branch_id and city.
# Built from a single dict literal so that `revenue` is only ever bound to the
# DataFrame (the earlier version first bound it to the list of figures and then
# rebound it, and left throw-away lists behind in the notebook namespace).
revenue = pd.DataFrame({
    'branch_id': [10, 20, 30, 47],
    'city': ['Austin', 'Denver', 'Springfield', 'Mendocino'],
    'revenue': [100, 83, 4, 200],
})

In [3]:
# Branch managers table. Rows are listed in a different order than the revenue
# table, and Springfield carries branch_id 31 here, so joining on city and
# joining on branch_id give different results.
branch_id, city, manager = (
    [10, 20, 47, 31],
    ['Austin', 'Denver', 'Mendocino', 'Springfield'],
    ['Charles', 'Joel', 'Brett', 'Sally'],
)
managers = pd.DataFrame(dict(branch_id=branch_id, city=city, manager=manager))

In [4]:
# Join revenue to managers on the city name (merge defaults to an inner join).
# branch_id exists in both frames but is not a join key here, so both copies
# are kept and disambiguated with the _x / _y suffixes.
merge_by_city = revenue.merge(managers, on='city')
merge_by_city

Unnamed: 0,branch_id_x,city,revenue,branch_id_y,manager
0,10,Austin,100,10,Charles
1,20,Denver,83,20,Joel
2,30,Springfield,4,31,Sally
3,47,Mendocino,200,47,Brett


In [5]:
# Join on branch_id instead of city. Only ids present in both frames survive
# the (default) inner join, and the non-key `city` column is now the one that
# gets the _x / _y suffixes.
merge_by_id = revenue.merge(managers, on='branch_id')
merge_by_id

Unnamed: 0,branch_id,city_x,revenue,city_y,manager
0,10,Austin,100,Austin,Charles
1,20,Denver,83,Denver,Joel
2,47,Mendocino,200,Mendocino,Brett


### Merging on columns with non-matching labels

In [6]:
# Relabel the managers key column so the two frames no longer share the 'city'
# label; the next merge has to name the key on each side separately.
managers = managers.rename(columns=dict(city='branch'))

In [7]:
# The key is called 'city' on the left and 'branch' on the right, so each side
# is given its own key label. Both key columns are kept in the result.
combined = revenue.merge(
    managers,
    left_on='city',
    right_on='branch',
)
combined

Unnamed: 0,branch_id_x,city,revenue,branch_id_y,branch,manager
0,10,Austin,100,10,Austin,Charles
1,20,Denver,83,20,Denver,Joel
2,30,Springfield,4,31,Springfield,Sally
3,47,Mendocino,200,47,Mendocino,Brett


### Merging on multiple columns

In [8]:
# Give both frames a `state` column and restore the managers key label to
# 'city', so the frames share branch_id, city and state as candidate keys.
# The state lists follow each frame's own row order; Springfield is 'MO' in
# managers but 'IL' in revenue.
managers = (
    managers
    .assign(state=['TX', 'CO', 'CA', 'MO'])
    .rename(columns={'branch': 'city'})
)
revenue = revenue.assign(state=['TX', 'CO', 'IL', 'CA'])

In [9]:
# Composite-key join: a row is kept only when branch_id, city AND state all
# agree between the two frames.
combined = revenue.merge(
    managers,
    on=['branch_id', 'city', 'state'],
)
combined

Unnamed: 0,branch_id,city,revenue,state,manager
0,10,Austin,100,TX,Charles
1,20,Denver,83,CO,Joel
2,47,Mendocino,200,CA,Brett


### Left & right merging on multiple columns

In [10]:
# Unit sales per (city, state). Springfield appears twice (MO and IL), so city
# alone does not identify a row in this table.
city = ['Mendocino', 'Denver', 'Austin', 'Springfield', 'Springfield']
state = ['CA', 'CO', 'TX', 'MO', 'IL']
units = [1, 4, 2, 5, 1]
# Column label corrected from the misspelled 'uints' to 'units'. No cell shown
# in this notebook refers to the column by name, so the merges below behave the
# same; re-run the notebook to refresh the rendered tables.
sales = pd.DataFrame({'city': city, 'state': state, 'units': units})
# Relabel the managers key column to 'branch' again so the left/right merge
# below must use left_on / right_on.
managers = managers.rename(columns={'city': 'branch'})

In [11]:
# Right join on (city, state): every row of `sales` is kept, and the revenue
# columns are NaN where revenue has no matching (city, state) pair.
revenue_and_sales = revenue.merge(
    sales,
    how='right',
    on=['city', 'state'],
)
revenue_and_sales

Unnamed: 0,branch_id,city,revenue,state,uints
0,10.0,Austin,100.0,TX,2
1,20.0,Denver,83.0,CO,4
2,30.0,Springfield,4.0,IL,1
3,47.0,Mendocino,200.0,CA,1
4,,Springfield,,MO,5


In [12]:
# Left join: every row of `sales` is kept. The city key is labelled 'city' on
# the left and 'branch' on the right, while 'state' has the same label on both
# sides, hence the paired left_on / right_on lists.
sales_and_managers = sales.merge(
    managers,
    how='left',
    left_on=['city', 'state'],
    right_on=['branch', 'state'],
)
sales_and_managers

Unnamed: 0,city,state,uints,branch_id,branch,manager
0,Mendocino,CA,1,47.0,Mendocino,Brett
1,Denver,CO,4,20.0,Denver,Joel
2,Austin,TX,2,10.0,Austin,Charles
3,Springfield,MO,5,31.0,Springfield,Sally
4,Springfield,IL,1,,,


### Merging DataFrames with outer join

In [13]:
# With no `on` argument, merge joins on every column label the two frames have
# in common, and the default join type is inner (spelled out here). Rows whose
# shared columns do not all match are therefore dropped.
merge_default = sales_and_managers.merge(revenue_and_sales, how='inner')
merge_default

Unnamed: 0,city,state,uints,branch_id,branch,manager,revenue
0,Mendocino,CA,1,47.0,Mendocino,Brett,200.0
1,Denver,CO,4,20.0,Denver,Joel,83.0
2,Austin,TX,2,10.0,Austin,Charles,100.0


In [14]:
# Same implicit keys as the default merge (all shared column labels), but an
# outer join keeps the unmatched rows from both sides and fills the gaps with
# NaN.
merge_outer = sales_and_managers.merge(revenue_and_sales, how='outer')
merge_outer

Unnamed: 0,city,state,uints,branch_id,branch,manager,revenue
0,Mendocino,CA,1,47.0,Mendocino,Brett,200.0
1,Denver,CO,4,20.0,Denver,Joel,83.0
2,Austin,TX,2,10.0,Austin,Charles,100.0
3,Springfield,MO,5,31.0,Springfield,Sally,
4,Springfield,IL,1,,,,
5,Springfield,IL,1,30.0,,,4.0
6,Springfield,MO,5,,,,


In [15]:
# Outer join restricted to the (city, state) key. The other columns that both
# frames share are no longer keys, so they appear twice with _x / _y suffixes.
merge_outer_on = sales_and_managers.merge(
    revenue_and_sales,
    how='outer',
    on=['city', 'state'],
)
merge_outer_on

Unnamed: 0,city,state,uints_x,branch_id_x,branch,manager,branch_id_y,revenue,uints_y
0,Mendocino,CA,1,47.0,Mendocino,Brett,47.0,200.0,1
1,Denver,CO,4,20.0,Denver,Joel,20.0,83.0,4
2,Austin,TX,2,10.0,Austin,Charles,10.0,100.0,2
3,Springfield,MO,5,31.0,Springfield,Sally,,,5
4,Springfield,IL,1,,,,30.0,4.0,1


### Using merge_ordered()

In [16]:
# Austin observations, one (date, ratings) record per row. Dates are ISO-format
# strings and are not listed in chronological order.
austin = pd.DataFrame(
    [
        ('2016-01-01', 'Cloudy'),
        ('2016-02-08', 'Cloudy'),
        ('2016-01-17', 'Sunny'),
    ],
    columns=['date', 'ratings'],
)

In [17]:
# Houston observations, one (date, ratings) record per row. Only 2016-01-01
# also appears in the Austin frame.
houston = pd.DataFrame(
    [
        ('2016-01-04', 'Rainy'),
        ('2016-01-01', 'Cloudy'),
        ('2016-03-01', 'Sunny'),
    ],
    columns=['date', 'ratings'],
)

In [18]:
# Ordered merge with no explicit key: both shared columns (date, ratings) act
# as the key, so an identical observation in both cities collapses to one row
# and the result comes back sorted by the key.
tx_weather = pd.merge_ordered(left=austin, right=houston)
tx_weather

Unnamed: 0,date,ratings
0,2016-01-01,Cloudy
1,2016-01-04,Rainy
2,2016-01-17,Sunny
3,2016-02-08,Cloudy
4,2016-03-01,Sunny


In [19]:
# Ordered merge keyed on date only. Each city keeps its own ratings column,
# told apart by the suffixes, and a date missing for one city shows as NaN.
tx_weather_suff = pd.merge_ordered(
    left=austin,
    right=houston,
    on='date',
    suffixes=('_aus', '_hus'),
)
tx_weather_suff

Unnamed: 0,date,ratings_aus,ratings_hus
0,2016-01-01,Cloudy,Cloudy
1,2016-01-04,,Rainy
2,2016-01-17,Sunny,
3,2016-02-08,Cloudy,
4,2016-03-01,,Sunny


In [20]:
# Same date-keyed ordered merge, but gaps are forward-filled: a missing rating
# takes the value from the previous date in the sorted result.
tx_weather_ffill = pd.merge_ordered(
    left=austin,
    right=houston,
    on='date',
    suffixes=('_aus', '_hus'),
    fill_method='ffill',
)
tx_weather_ffill

Unnamed: 0,date,ratings_aus,ratings_hus
0,2016-01-01,Cloudy,Cloudy
1,2016-01-04,Cloudy,Rainy
2,2016-01-17,Sunny,Rainy
3,2016-02-08,Cloudy,Rainy
4,2016-03-01,Cloudy,Sunny


### Using merge_asof()

In [21]:
# Load the two datasets used for the as-of merge. Both paths are relative, so
# the CSV files must sit in the notebook's working directory.
auto = pd.read_csv(filepath_or_buffer='automobiles.csv')
oil = pd.read_csv(filepath_or_buffer='oil_price.csv')

In [22]:
# Preview the first five automobile rows.
auto.head(n=5)

Unnamed: 0,mpg,cyl,displ,hp,weight,accel,yr,origin,name
0,18.0,8,307.0,130,3504,12.0,1970-01-01,US,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,1970-01-01,US,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,1970-01-01,US,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,1970-01-01,US,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,1970-01-01,US,ford torino


In [23]:
# Preview the first five oil-price rows.
oil.head(n=5)

Unnamed: 0,Date,Price
0,1970-01-01,3.35
1,1970-02-01,3.35
2,1970-03-01,3.35
3,1970-04-01,3.35
4,1970-05-01,3.35


In [24]:
# Convert the oil dates to datetime64[ns] so they can serve as an as-of merge
# key, then inspect `auto`: its `yr` column is still object dtype at this point.
oil['Date'] = oil['Date'].astype('datetime64[ns]')
auto.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 392 entries, 0 to 391
Data columns (total 9 columns):
mpg       392 non-null float64
cyl       392 non-null int64
displ     392 non-null float64
hp        392 non-null int64
weight    392 non-null int64
accel     392 non-null float64
yr        392 non-null object
origin    392 non-null object
name      392 non-null object
dtypes: float64(3), int64(3), object(3)
memory usage: 27.6+ KB


In [25]:
# Convert the model-year column to datetime64[ns] to match oil['Date'], and
# confirm the dtype change with info().
auto['yr'] = auto['yr'].astype('datetime64[ns]')
auto.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 392 entries, 0 to 391
Data columns (total 9 columns):
mpg       392 non-null float64
cyl       392 non-null int64
displ     392 non-null float64
hp        392 non-null int64
weight    392 non-null int64
accel     392 non-null float64
yr        392 non-null datetime64[ns]
origin    392 non-null object
name      392 non-null object
dtypes: datetime64[ns](1), float64(3), int64(3), object(2)
memory usage: 27.6+ KB


In [26]:
# As-of merge: each car row is paired with the nearest oil-price row by date
# (yr on the left, Date on the right) rather than requiring an exact match.
# NOTE(review): merge_asof expects both frames to be sorted by their key;
# confirm the CSVs are stored in date order.
merged = pd.merge_asof(
    left=auto,
    right=oil,
    left_on='yr',
    right_on='Date',
)

In [27]:
# Check the last five merged rows to see which oil Date/Price was attached.
merged.tail(n=5)

Unnamed: 0,mpg,cyl,displ,hp,weight,accel,yr,origin,name,Date,Price
387,27.0,4,140.0,86,2790,15.6,1982-01-01,US,ford mustang gl,1982-01-01,33.85
388,44.0,4,97.0,52,2130,24.6,1982-01-01,Europe,vw pickup,1982-01-01,33.85
389,32.0,4,135.0,84,2295,11.6,1982-01-01,US,dodge rampage,1982-01-01,33.85
390,28.0,4,120.0,79,2625,18.6,1982-01-01,US,ford ranger,1982-01-01,33.85
391,31.0,4,119.0,82,2720,19.4,1982-01-01,US,chevy s-10,1982-01-01,33.85


In [28]:
# Average mpg and oil price per calendar year, bucketing rows by the `Date`
# column with the 'A' (annual, year-end) frequency alias.
# NOTE(review): recent pandas releases deprecate 'A' in favour of 'YE';
# confirm against the pandas version this notebook runs on before changing it.
yearly = (
    merged
    .resample('A', on='Date')[['mpg', 'Price']]
    .mean()
)

In [29]:
# Display the per-year averages of mpg and oil price computed in the previous cell.
yearly

Unnamed: 0_level_0,mpg,Price
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
1970-12-31,17.689655,3.35
1971-12-31,21.111111,3.56
1972-12-31,18.714286,3.56
1973-12-31,17.1,3.56
1974-12-31,22.769231,10.11
1975-12-31,20.266667,11.16
1976-12-31,21.573529,11.16
1977-12-31,23.375,13.9
1978-12-31,24.061111,14.85
1979-12-31,25.093103,14.85


In [30]:
# Pairwise Pearson correlation (the default method, spelled out here) between
# the yearly mean mpg and the yearly mean oil price.
yearly.corr(method='pearson')

Unnamed: 0,mpg,Price
mpg,1.0,0.948677
Price,0.948677,1.0
