# [How to Join Pandas DataFrames using Merge?](https://www.geeksforgeeks.org/how-to-join-pandas-dataframes-using-merge/)

In [3]:
import numpy as np
import pandas as pd

In [4]:
left = pd.DataFrame({'Sr.no': ['1', '2', '3', '4', '5'],
                    'Name': ['Rashmi', 'Arun', 'John',
                             'Kshitu', 'Bresha'],
                    'Roll No': ['1', '2', '3', '4', '5']})
left

Unnamed: 0,Sr.no,Name,Roll No
0,1,Rashmi,1
1,2,Arun,2
2,3,John,3
3,4,Kshitu,4
4,5,Bresha,5


In [5]:
right = pd.DataFrame({'Sr.no': ['2', '4', '6', '7', '8'],
                    'Gender': ['F', 'M', 'M', 'F', 'F'],
                    'Interest': ['Writing', 'Cricket', 'Dancing',
                                 'Chess', 'Sleeping']})
right

Unnamed: 0,Sr.no,Gender,Interest
0,2,F,Writing
1,4,M,Cricket
2,6,M,Dancing
3,7,F,Chess
4,8,F,Sleeping


Merging the dataframes
------------                  

![ss](https://i.stack.imgur.com/3qpXx.gif)

In [11]:
inner_join = pd.merge(left, right, how ='inner', on ='Sr.no')
inner_join # intersection of `on = 'Sr.no,`

Unnamed: 0,Sr.no,Name,Roll No,Gender,Interest
0,2,Arun,2,F,Writing
1,4,Kshitu,4,M,Cricket


![ss](https://i.stack.imgur.com/dG8mw.gif)

In [10]:
outer_join = pd.merge(left, right, how ='outer', on ='Sr.no')
outer_join # union of `on = 'Sr.no,`

Unnamed: 0,Sr.no,Name,Roll No,Gender,Interest
0,1,Rashmi,1.0,,
1,2,Arun,2.0,F,Writing
2,3,John,3.0,,
3,4,Kshitu,4.0,M,Cricket
4,5,Bresha,5.0,,
5,6,,,M,Dancing
6,7,,,F,Chess
7,8,,,F,Sleeping


![ss](https://i.stack.imgur.com/s5hgJ.gif)

In [9]:
left_join = pd.merge(left, right, how ='left', on ='Sr.no')
left_join # only left df of `on = 'Sr.no,`

Unnamed: 0,Sr.no,Name,Roll No,Gender,Interest
0,1,Rashmi,1,,
1,2,Arun,2,F,Writing
2,3,John,3,,
3,4,Kshitu,4,M,Cricket
4,5,Bresha,5,,


![ss](https://i.stack.imgur.com/JpPRH.gif)

In [12]:
right_join = pd.merge(left, right, how ='right', on ='Sr.no')
right_join # only right df of `on = 'Sr.no,`

Unnamed: 0,Sr.no,Name,Roll No,Gender,Interest
0,2,Arun,2.0,F,Writing
1,4,Kshitu,4.0,M,Cricket
2,6,,,M,Dancing
3,7,,,F,Chess
4,8,,,F,Sleeping


#### Question

The air quality data for this segment has been divided into three different csv files. 

`info.csv` has the data hour by hour data about the concentration of polutants in the air and the status of the intruments. 
`item_info` has the data for items and levels of concetration. 
`station_info` has the data for measuring stations. 

Read in all the three datasets and then print the first five rows.

You can download the dataset from kaggle website: https://www.kaggle.com/bappekim/air-pollution-in-seoul

In [13]:
import pandas as pd

In [None]:
data = pd.read_csv("Measurement_info.csv", header = 0)
data

Unnamed: 0,Measurement date,Station code,Item code,Average value,Instrument status
0,2017-01-01 00:00,101,1,0.004,0
1,2017-01-01 00:00,101,3,0.059,0
2,2017-01-01 00:00,101,5,1.200,0
3,2017-01-01 00:00,101,6,0.002,0
4,2017-01-01 00:00,101,8,73.000,0
...,...,...,...,...,...
3885061,2019-12-31 23:00,123,9,13.000,0
3885062,2019-12-31 23:00,118,9,24.000,0
3885063,2019-12-31 23:00,105,8,19.000,0
3885064,2019-12-31 23:00,125,3,0.037,0


In [16]:
item = pd.read_csv("Measurement_item_info.csv")
item

Unnamed: 0,Item code,Item name,Unit of measurement,Good(Blue),Normal(Green),Bad(Yellow),Very bad(Red)
0,1,SO2,ppm,0.02,0.05,0.15,1.0
1,3,NO2,ppm,0.03,0.06,0.2,2.0
2,5,CO,ppm,2.0,9.0,15.0,50.0
3,6,O3,ppm,0.03,0.09,0.15,0.5
4,8,PM10,Mircrogram/m3,30.0,80.0,150.0,600.0
5,9,PM2.5,Mircrogram/m3,15.0,35.0,75.0,500.0


In [17]:
station = pd.read_csv("Measurement_station_info.csv")
station

Unnamed: 0,Station code,Station name(district),Address,Latitude,Longitude
0,101,Jongno-gu,"19, Jong-ro 35ga-gil, Jongno-gu, Seoul, Republ...",37.572016,127.005008
1,102,Jung-gu,"15, Deoksugung-gil, Jung-gu, Seoul, Republic o...",37.564263,126.974676
2,103,Yongsan-gu,"136, Hannam-daero, Yongsan-gu, Seoul, Republic...",37.540033,127.00485
3,104,Eunpyeong-gu,"215, Jinheung-ro, Eunpyeong-gu, Seoul, Republi...",37.609823,126.934848
4,105,Seodaemun-gu,"32, Segeomjeong-ro 4-gil, Seodaemun-gu, Seoul,...",37.593742,126.949679
5,106,Mapo-gu,"10, Poeun-ro 6-gil, Mapo-gu, Seoul, Republic o...",37.55558,126.905597
6,107,Seongdong-gu,"18, Ttukseom-ro 3-gil, Seongdong-gu, Seoul, Re...",37.541864,127.049659
7,108,Gwangjin-gu,"571, Gwangnaru-ro, Gwangjin-gu, Seoul, Republi...",37.54718,127.092493
8,109,Dongdaemun-gu,"43, Cheonho-daero 13-gil, Dongdaemun-gu, Seoul...",37.575743,127.028885
9,110,Jungnang-gu,"369, Yongmasan-ro, Jungnang-gu, Seoul, Republi...",37.584848,127.094023


#### Question

Create a new Dataframe whcih has information about the item code and item name

In [18]:
sub_item = item[['Item code', 'Item name']]
sub_item

Unnamed: 0,Item code,Item name
0,1,SO2
1,3,NO2
2,5,CO
3,6,O3
4,8,PM10
5,9,PM2.5


#### Question

Create a new Dataframe whcih has information about the station code and station name

In [19]:
sub_station = station[['Station code', 'Station name(district)']]
sub_station

Unnamed: 0,Station code,Station name(district)
0,101,Jongno-gu
1,102,Jung-gu
2,103,Yongsan-gu
3,104,Eunpyeong-gu
4,105,Seodaemun-gu
5,106,Mapo-gu
6,107,Seongdong-gu
7,108,Gwangjin-gu
8,109,Dongdaemun-gu
9,110,Jungnang-gu


### Question 

In the `data` DataFrame add in a column displaying the names of the items. 

In [None]:
data_i = data.merge(sub_item, on = "Item code", how = "left")
data_i.head()

Unnamed: 0,Measurement date,Station code,Item code,Average value,Instrument status,Item name
0,2017-01-01 00:00,101,1,0.004,0,SO2
1,2017-01-01 00:00,101,3,0.059,0,NO2
2,2017-01-01 00:00,101,5,1.2,0,CO
3,2017-01-01 00:00,101,6,0.002,0,O3
4,2017-01-01 00:00,101,8,73.0,0,PM10


### Question 

In the `data_i` DataFrame add in a column displaying the names of the stations. 

In [None]:
data_s = data_i.merge(sub_station, on = "Station code", how = "left")
data_s.head()

Unnamed: 0,Measurement date,Station code,Item code,Average value,Instrument status,Item name,Station name(district)
0,2017-01-01 00:00,101,1,0.004,0,SO2,Jongno-gu
1,2017-01-01 00:00,101,3,0.059,0,NO2,Jongno-gu
2,2017-01-01 00:00,101,5,1.2,0,CO,Jongno-gu
3,2017-01-01 00:00,101,6,0.002,0,O3,Jongno-gu
4,2017-01-01 00:00,101,8,73.0,0,PM10,Jongno-gu


### Question 

In the `data_s` DataFrame drop the columns `Station code` and `Item code`. As these columns have not become redundant. You can find the relevant function in the pandas library [here](https://pandas.pydata.org/docs/reference/index.html). 

In [None]:
data = data_s.drop(['Station code', 'Item code'], axis = 1)
data.head()

Unnamed: 0,Measurement date,Average value,Instrument status,Item name,Station name(district)
0,2017-01-01 00:00,0.004,0,SO2,Jongno-gu
1,2017-01-01 00:00,0.059,0,NO2,Jongno-gu
2,2017-01-01 00:00,1.2,0,CO,Jongno-gu
3,2017-01-01 00:00,0.002,0,O3,Jongno-gu
4,2017-01-01 00:00,73.0,0,PM10,Jongno-gu


### Question 

Given below are the meanings of the values in the `Instrument status`. 

- 0: Normal 
- 1: Need for calibration 
- 2: Abnormal
- 4: Power cut off 
- 8: Under repair
- 9: Abnormal data

Using the information given above, add a column in the `data` DataFrame to give the status of the intsruments. Then drop the `Instrument status` column.  

Hint: First create a dictionary from the data, then use the same dictionary to create a DataFrame and then merge the DataFrame. with `data`

In [20]:
status_dict = {
    'Instrument status' : [0, 1, 2, 4, 8, 9],
    'Status' : ['Normal', 'need for calibration', 'Abnormal', 'Power cut off', 'Under repair', 'Abnormal data']
}
status_dict            

{'Instrument status': [0, 1, 2, 4, 8, 9],
 'Status': ['Normal',
  'need for calibration',
  'Abnormal',
  'Power cut off',
  'Under repair',
  'Abnormal data']}

In [21]:
dictdf = pd.DataFrame(status_dict)
dictdf

Unnamed: 0,Instrument status,Status
0,0,Normal
1,1,need for calibration
2,2,Abnormal
3,4,Power cut off
4,8,Under repair
5,9,Abnormal data


In [None]:
data = data.merge(dictdf, on = 'Instrument status', how = 'left')

In [None]:
data.head()

Unnamed: 0,Measurement date,Average value,Instrument status,Item name,Station name(district),Status
0,2017-01-01 00:00,0.004,0,SO2,Jongno-gu,Normal
1,2017-01-01 00:00,0.059,0,NO2,Jongno-gu,Normal
2,2017-01-01 00:00,1.2,0,CO,Jongno-gu,Normal
3,2017-01-01 00:00,0.002,0,O3,Jongno-gu,Normal
4,2017-01-01 00:00,73.0,0,PM10,Jongno-gu,Normal


In [None]:
data = data.drop(['Instrument status'], axis = 1)

In [None]:
data.head()

Unnamed: 0,Measurement date,Average value,Item name,Station name(district),Status
0,2017-01-01 00:00,0.004,SO2,Jongno-gu,Normal
1,2017-01-01 00:00,0.059,NO2,Jongno-gu,Normal
2,2017-01-01 00:00,1.2,CO,Jongno-gu,Normal
3,2017-01-01 00:00,0.002,O3,Jongno-gu,Normal
4,2017-01-01 00:00,73.0,PM10,Jongno-gu,Normal


### Question 

Extract the time series data, that is year, month, date and hour form the `Measurement date` column. Once all the data is extrcted drop the `Measurement date` column. 

This operation might take some time as the dataset we are working with is very large. 

In [None]:
data['Year']  = pd.DatetimeIndex(data['Measurement date']).year
data['Month'] = pd.DateTimeIndex(data['Measurement date']).month
data['Date']  = pd.DateTimeIndex(data['Measurement date']).day
data['Hour']  = pd.DateTimeIndex(data['Measurement date']).hour
data.head()

Unnamed: 0,Measurement date,Average value,Item name,Station name(district),Status,Year,Month,Date,Hour
0,2017-01-01 00:00,0.004,SO2,Jongno-gu,Normal,2017,1,1,0
1,2017-01-01 00:00,0.059,NO2,Jongno-gu,Normal,2017,1,1,0
2,2017-01-01 00:00,1.2,CO,Jongno-gu,Normal,2017,1,1,0
3,2017-01-01 00:00,0.002,O3,Jongno-gu,Normal,2017,1,1,0
4,2017-01-01 00:00,73.0,PM10,Jongno-gu,Normal,2017,1,1,0


In [None]:
data = data.drop(['Measurement date'], axis = 1)

In [None]:
data.head()

Unnamed: 0,Average value,Item name,Station name(district),Status,Year,Month,Date,Hour
0,0.004,SO2,Jongno-gu,Normal,2017,1,1,0
1,0.059,NO2,Jongno-gu,Normal,2017,1,1,0
2,1.2,CO,Jongno-gu,Normal,2017,1,1,0
3,0.002,O3,Jongno-gu,Normal,2017,1,1,0
4,73.0,PM10,Jongno-gu,Normal,2017,1,1,0
