<a href="https://colab.research.google.com/github/jongoh-Jeong/MachineLearning-for-AI/blob/main/0929.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/drive so the notebook can read the course files (Colab only).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
cd /content/drive/MyDrive/Hands-On-Data-Analysis-with-Pandas-2nd-edition/ch_03

/content/drive/MyDrive/Hands-On-Data-Analysis-with-Pandas-2nd-edition/ch_03


# Task 1
- Open ch3,4-reshaping_data.ipynb
- Practice pivoting and unstack
- Practice melting and stack

In [3]:
import pandas as pd

# Load the long-format temperature data, keeping only the three columns used
# below, then rename the reading to temp_C and derive a Fahrenheit column.
long_df = (
    pd.read_csv('data/long_data.csv', usecols=['date', 'datatype', 'value'])
    .rename(columns={'value': 'temp_C'})
    .assign(
        date=lambda frame: pd.to_datetime(frame.date),
        temp_F=lambda frame: (frame.temp_C * 9 / 5) + 32,
    )
)

long_df.head()

Unnamed: 0,datatype,date,temp_C,temp_F
0,TMAX,2018-10-01,21.1,69.98
1,TMIN,2018-10-01,8.9,48.02
2,TOBS,2018-10-01,13.9,57.02
3,TMAX,2018-10-02,23.9,75.02
4,TMIN,2018-10-02,13.9,57.02


### Pivoting

In [4]:
# Long -> wide: one row per date and one Celsius column per datatype.
pivot_df = long_df.pivot(index='date', columns='datatype', values='temp_C')

pivot_df.head()

datatype,TMAX,TMIN,TOBS
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-10-01,21.1,8.9,13.9
2018-10-02,23.9,13.9,17.2
2018-10-03,25.0,15.6,16.1
2018-10-04,22.8,11.7,11.7
2018-10-05,23.3,11.7,18.9


In [5]:
# Passing a list to `values` produces two-level columns: (measure, datatype).
measure_cols = ['temp_F', 'temp_C']
pivot_df_2 = long_df.pivot(index='date', columns='datatype', values=measure_cols)
pivot_df_2.head()

Unnamed: 0_level_0,temp_F,temp_F,temp_F,temp_C,temp_C,temp_C
datatype,TMAX,TMIN,TOBS,TMAX,TMIN,TOBS
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2018-10-01,69.98,48.02,57.02,21.1,8.9,13.9
2018-10-02,75.02,57.02,62.96,23.9,13.9,17.2
2018-10-03,77.0,60.08,60.98,25.0,15.6,16.1
2018-10-04,73.04,53.06,53.06,22.8,11.7,11.7
2018-10-05,73.94,53.06,66.02,23.3,11.7,18.9


### Unstacking

To unstack, we first need a DataFrame with a multi-level index (MultiIndex).

In [6]:
# Build a two-level (date, datatype) index; unstack() below works on index levels.
multi_index_df = long_df.set_index(['date','datatype'])
multi_index_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,temp_C,temp_F
date,datatype,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-10-01,TMAX,21.1,69.98
2018-10-01,TMIN,8.9,48.02
2018-10-01,TOBS,13.9,57.02
2018-10-02,TMAX,23.9,75.02
2018-10-02,TMIN,13.9,57.02


In [7]:
# With no arguments, unstack() moves the innermost index level (datatype)
# into the columns, leaving date as the only index level.
unstacked_df = multi_index_df.unstack()
unstacked_df.head()

Unnamed: 0_level_0,temp_C,temp_C,temp_C,temp_F,temp_F,temp_F
datatype,TMAX,TMIN,TOBS,TMAX,TMIN,TOBS
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2018-10-01,21.1,8.9,13.9,69.98,48.02,57.02
2018-10-02,23.9,13.9,17.2,75.02,57.02,62.96
2018-10-03,25.0,15.6,16.1,77.0,60.08,60.98
2018-10-04,22.8,11.7,11.7,73.04,53.06,53.06
2018-10-05,23.3,11.7,18.9,73.94,53.06,66.02


### Melting

In [8]:
# Load the wide-format data: one row per date with TMAX/TMIN/TOBS as columns.
wide_df = pd.read_csv('data/wide_data.csv')
wide_df.head()

Unnamed: 0,date,TMAX,TMIN,TOBS
0,2018-10-01,21.1,8.9,13.9
1,2018-10-02,23.9,13.9,17.2
2,2018-10-03,25.0,15.6,16.1
3,2018-10-04,22.8,11.7,11.7
4,2018-10-05,23.3,11.7,18.9


In [9]:
# Wide -> long: keep `date` as the identifier and fold the three temperature
# columns into (measurement, temp_C) pairs.
measurement_cols = ['TMAX', 'TMIN', 'TOBS']
melted_df = wide_df.melt(
    id_vars='date',
    value_vars=measurement_cols,
    var_name='measurement',
    value_name='temp_C',
)
melted_df.head()

Unnamed: 0,date,measurement,temp_C
0,2018-10-01,TMAX,21.1
1,2018-10-02,TMAX,23.9
2,2018-10-03,TMAX,25.0
3,2018-10-04,TMAX,22.8
4,2018-10-05,TMAX,23.3


### Stack

In [10]:
# Promote `date` to the index so stack() only folds the temperature columns.
# Reassign instead of using inplace=True, and guard on the column so that
# re-running this cell does not raise a KeyError once `date` is the index.
if 'date' in wide_df.columns:
    wide_df = wide_df.set_index('date')
wide_df.head()

Unnamed: 0_level_0,TMAX,TMIN,TOBS
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-10-01,21.1,8.9,13.9
2018-10-02,23.9,13.9,17.2
2018-10-03,25.0,15.6,16.1
2018-10-04,22.8,11.7,11.7
2018-10-05,23.3,11.7,18.9


In [11]:
# stack() moves the column labels into a new innermost index level; with a
# single level of columns the result is a Series indexed by (date, column).
stacked_series = wide_df.stack()
stacked_series.head()

date            
2018-10-01  TMAX    21.1
            TMIN     8.9
            TOBS    13.9
2018-10-02  TMAX    23.9
            TMIN    13.9
dtype: float64

# Task 2
- Open ch3,5-handling_data_issues.ipynb
- Practice the "finding problematic data" part
- Practice the "mitigating issues" part

In [12]:
# Load the deliberately messy weather data used for the data-issues exercises.
df = pd.read_csv('data/dirty_data.csv')

### Problematic data part 
- Find Missing values

In [13]:
# Preview the first rows to spot placeholder values (e.g. '?', -inf) and NaNs.
df.head()

Unnamed: 0,date,station,PRCP,SNOW,SNWD,TMAX,TMIN,TOBS,WESF,inclement_weather
0,2018-01-01T00:00:00,?,0.0,0.0,-inf,5505.0,-40.0,,,
1,2018-01-01T00:00:00,?,0.0,0.0,-inf,5505.0,-40.0,,,
2,2018-01-01T00:00:00,?,0.0,0.0,-inf,5505.0,-40.0,,,
3,2018-01-02T00:00:00,GHCND:USC00280907,0.0,0.0,-inf,-8.3,-16.1,-12.2,,False
4,2018-01-03T00:00:00,GHCND:USC00280907,0.0,0.0,-inf,-4.4,-13.9,-13.3,,False


In [14]:
# Summary statistics for the numeric columns. SNWD contains -inf/inf (see its
# min/max), so its mean/std come out as NaN and NumPy may emit a RuntimeWarning.
df.describe()

  diff_b_a = subtract(b, a)


Unnamed: 0,PRCP,SNOW,SNWD,TMAX,TMIN,TOBS,WESF
count,765.0,577.0,577.0,765.0,765.0,398.0,11.0
mean,5.360392,4.202773,,2649.175294,-15.914379,8.632161,16.290909
std,10.002138,25.086077,,2744.156281,24.242849,9.815054,9.489832
min,0.0,0.0,-inf,-11.7,-40.0,-16.1,1.8
25%,0.0,0.0,,13.3,-40.0,0.15,8.6
50%,0.0,0.0,,32.8,-11.1,8.3,19.3
75%,5.8,0.0,,5505.0,6.7,18.3,24.9
max,61.7,229.0,inf,5505.0,23.9,26.1,28.7


In [15]:
# Dtypes and non-null counts per column; `date` is still an object (string) column here.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 765 entries, 0 to 764
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   date               765 non-null    object 
 1   station            765 non-null    object 
 2   PRCP               765 non-null    float64
 3   SNOW               577 non-null    float64
 4   SNWD               577 non-null    float64
 5   TMAX               765 non-null    float64
 6   TMIN               765 non-null    float64
 7   TOBS               398 non-null    float64
 8   WESF               11 non-null     float64
 9   inclement_weather  408 non-null    object 
dtypes: float64(7), object(3)
memory usage: 59.9+ KB


In [16]:
# (rows, columns) of the raw frame, for comparison with the null counts below.
df.shape

(765, 10)

In [17]:
# Rows that are missing a value in at least one of the columns that can be null.
# isna().any(axis=1) replaces the hand-written chain of `|` conditions, and the
# former `contain_nulls.head()` line was dropped: it was not the last expression
# in the cell, so its result was never displayed.
nullable_cols = ['SNOW', 'SNWD', 'TOBS', 'WESF', 'inclement_weather']
contain_nulls = df[df[nullable_cols].isna().any(axis=1)]
contain_nulls.shape[0]

765

In [18]:
# Number of missing values in each column.
df.isna().sum()

date                   0
station                0
PRCP                   0
SNOW                 188
SNWD                 188
TMAX                   0
TMIN                   0
TOBS                 367
WESF                 754
inclement_weather    357
dtype: int64

- We can also see "-inf" and "inf" values in the SNWD column.  
=> So we need to take a closer look at these values.

In [19]:
import numpy as np

In [20]:
def get_inf_count(df):
    """Count the inf/-inf entries in every column of ``df``.

    Returns a dict mapping each column label to the number of rows whose value
    in that column is positive or negative infinity.
    """
    infinite_values = [np.inf, -np.inf]
    counts = {}
    for col in df.columns:
        is_infinite = df[col].isin(infinite_values)
        counts[col] = df[is_infinite].shape[0]
    return counts

# Count infinite values per column of the dirty data.
get_inf_count(df)

{'date': 0,
 'station': 0,
 'PRCP': 0,
 'SNOW': 0,
 'SNWD': 577,
 'TMAX': 0,
 'TMIN': 0,
 'TOBS': 0,
 'WESF': 0,
 'inclement_weather': 0}

In [21]:
# Compare the SNOW distribution on days where SNWD is +inf versus -inf to see
# what each infinite snow-depth value goes along with.
depth_labels = {
    'np.inf Snow Depth': np.inf,
    '-np.inf Snow Depth': -np.inf,
}
pd.DataFrame({
    label: df.loc[df.SNWD == depth, 'SNOW'].describe()
    for label, depth in depth_labels.items()
}).T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
np.inf Snow Depth,24.0,101.041667,74.498018,13.0,25.0,120.5,152.0,229.0
-np.inf Snow Depth,553.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


- Find the unique value of observation

In [22]:
df.describe(include='object')  # top: the most frequent value / freq: how many times that value occurs

Unnamed: 0,date,station,inclement_weather
count,765,765,408
unique,324,2,2
top,2018-07-05T00:00:00,GHCND:USC00280907,False
freq,8,398,384


- Find Duplicated values(중복값)

In [23]:
# Number of rows that repeat an earlier row exactly (the first occurrence is not counted).
df[df.duplicated()].shape[0]

284

In [24]:
df[df.duplicated(keep=False)].shape[0]    # keep=False: flag every member of a duplicate group as True
                                          # keep='first' (default): leave the first occurrence False and flag the later ones as True

482

In [25]:
# Inspect some of the repeated rows; the first occurrence of each is excluded.
df[df.duplicated()].head()    # keep='first' is the default

Unnamed: 0,date,station,PRCP,SNOW,SNWD,TMAX,TMIN,TOBS,WESF,inclement_weather
1,2018-01-01T00:00:00,?,0.0,0.0,-inf,5505.0,-40.0,,,
2,2018-01-01T00:00:00,?,0.0,0.0,-inf,5505.0,-40.0,,,
5,2018-01-03T00:00:00,GHCND:USC00280907,0.0,0.0,-inf,-4.4,-13.9,-13.3,,False
6,2018-01-03T00:00:00,GHCND:USC00280907,0.0,0.0,-inf,-4.4,-13.9,-13.3,,False
8,2018-01-04T00:00:00,?,20.6,229.0,inf,5505.0,-40.0,,19.3,True


In [26]:
# Judge duplicates on the (date, station) pair only instead of on the whole row.
df[df.duplicated(['date', 'station'])].shape[0]

284

### Mitigating Issues

In [27]:
# Deduplicate the dirty data to one row per date, preferring the real station's
# readings and falling back to station '?' only for WESF.
# Note: steps 1 and 3 modify `df` itself, so this cell changes what earlier
# cells show if they are re-run afterwards.

# 1. make the date a datetime
df.date = pd.to_datetime(df.date) # convert the 'date' strings to datetime

# 2. save this information for later
# (WESF readings recorded under station '?', one per date, indexed by date)
station_qm_wesf = df[df.station == '?'].drop_duplicates('date').set_index('date').WESF

# 3. sort ? to the bottom
# (descending order on station puts the 'GHCND:...' id ahead of '?')
df.sort_values('station', ascending=False, inplace=True)

# 4. drop duplicates based on the date column keeping the first occurrence 
# which will be the valid station if it has data
df_deduped = df.drop_duplicates('date')

# 5. remove the station column because we are done with it
# (then index by date and restore chronological order)
df_deduped = df_deduped.drop(columns='station').set_index('date').sort_index()

# 6. take valid station's WESF and fall back on station ? if it is null
# (combine_first aligns the two Series on the date index)
df_deduped = df_deduped.assign(
    WESF=lambda x: x.WESF.combine_first(station_qm_wesf)
)

df_deduped.shape

(324, 8)

In [28]:
# Shape after dropping every row that has any null: shows how little data a blanket dropna() would keep.
df_deduped.dropna().shape

(4, 8)

In [29]:
# Reindex onto a complete 2018 daily calendar so that missing dates show up
# as rows, then interpolate each column on its own and preview ten days.
full_year = pd.date_range('2018-01-01', '2018-12-31', freq='D')
(
    df_deduped
    .reindex(full_year)
    .apply(lambda col: col.interpolate())
    .head(10)
)

Unnamed: 0,PRCP,SNOW,SNWD,TMAX,TMIN,TOBS,WESF,inclement_weather
2018-01-01,0.0,0.0,-inf,5505.0,-40.0,,,
2018-01-02,0.0,0.0,-inf,-8.3,-16.1,-12.2,,False
2018-01-03,0.0,0.0,-inf,-4.4,-13.9,-13.3,,False
2018-01-04,20.6,229.0,inf,5505.0,-40.0,-13.6,19.3,True
2018-01-05,14.2,127.0,inf,-4.4,-13.9,-13.9,18.626923,True
2018-01-06,0.0,0.0,-inf,-10.0,-15.6,-15.0,17.953846,False
2018-01-07,0.0,0.0,-inf,-11.7,-17.2,-16.1,17.280769,False
2018-01-08,0.0,0.0,-inf,-7.8,-16.7,-8.3,16.607692,False
2018-01-09,0.0,0.0,-inf,-1.4,-12.25,-8.05,15.934615,
2018-01-10,0.0,0.0,-inf,5.0,-7.8,-7.8,15.261538,False


# Task 3
- Open ch4,1-querying_and_merging.ipynb
- Practice the filtering methods
- Find all entries where the datatype is "PRCP" and the station ID contains "S0007"
- How many unique values are there in the "attributes" variable?
- What is the most frequent one?

In [30]:
cd /content/drive/MyDrive/Hands-On-Data-Analysis-with-Pandas-2nd-edition/ch_04

/content/drive/MyDrive/Hands-On-Data-Analysis-with-Pandas-2nd-edition/ch_04


In [31]:
# Load the 2018 NYC weather observations (long format: one row per date/datatype/station).
weather = pd.read_csv('data/nyc_weather_2018.csv')
weather.head()

Unnamed: 0,date,datatype,station,attributes,value
0,2018-01-01T00:00:00,PRCP,GHCND:US1CTFR0039,",,N,",0.0
1,2018-01-01T00:00:00,PRCP,GHCND:US1NJBG0015,",,N,",0.0
2,2018-01-01T00:00:00,SNOW,GHCND:US1NJBG0015,",,N,",0.0
3,2018-01-01T00:00:00,PRCP,GHCND:US1NJBG0017,",,N,",0.0
4,2018-01-01T00:00:00,SNOW,GHCND:US1NJBG0017,",,N,",0.0


In [32]:
# Boolean-mask filtering: snowfall readings above zero at stations whose id
# contains 'US1NY'.
is_snow = weather.datatype == 'SNOW'
has_snowfall = weather.value > 0
is_ny_station = weather.station.str.contains('US1NY')

snow_data_1 = weather[is_snow & has_snowfall & is_ny_station]
snow_data_1.head()

Unnamed: 0,date,datatype,station,attributes,value
114,2018-01-01T00:00:00,SNOW,GHCND:US1NYWC0019,",,N,",25.0
789,2018-01-04T00:00:00,SNOW,GHCND:US1NYNS0007,",,N,",41.0
794,2018-01-04T00:00:00,SNOW,GHCND:US1NYNS0018,",,N,",10.0
798,2018-01-04T00:00:00,SNOW,GHCND:US1NYNS0024,",,N,",89.0
800,2018-01-04T00:00:00,SNOW,GHCND:US1NYNS0030,",,N,",102.0


In [33]:
# Same filter as the boolean-mask version, written as a query string.
# engine="python" is passed so the expression can call the .str accessor
# (presumably unsupported by the default engine here; verify against the pandas docs).
snow_data_2 = weather.query('datatype == "SNOW" and value > 0 and station.str.contains("US1NY")',engine = "python")
snow_data_2.head()

Unnamed: 0,date,datatype,station,attributes,value
114,2018-01-01T00:00:00,SNOW,GHCND:US1NYWC0019,",,N,",25.0
789,2018-01-04T00:00:00,SNOW,GHCND:US1NYNS0007,",,N,",41.0
794,2018-01-04T00:00:00,SNOW,GHCND:US1NYNS0018,",,N,",10.0
798,2018-01-04T00:00:00,SNOW,GHCND:US1NYNS0024,",,N,",89.0
800,2018-01-04T00:00:00,SNOW,GHCND:US1NYNS0030,",,N,",102.0


In [34]:
import sqlite3
from contextlib import closing

# `with sqlite3.connect(...)` only manages a transaction; it does not close the
# connection. closing() makes sure the database handle is released.
with closing(sqlite3.connect('data/weather.db')) as connection:
    # Bound parameters replace the double-quoted literals: in SQL, double quotes
    # denote identifiers, and SQLite only treats them as strings as a fallback.
    prcp_data = pd.read_sql(
        'SELECT * FROM weather WHERE datatype = ? AND station LIKE ?',
        connection,
        params=('PRCP', '%S0007%'),
    )

prcp_data.head()

Unnamed: 0,date,datatype,station,attributes,value
0,2018-01-01T00:00:00,PRCP,GHCND:US1NYNS0007,",,N,",0.0
1,2018-01-02T00:00:00,PRCP,GHCND:US1NYNS0007,",,N,",0.0
2,2018-01-03T00:00:00,PRCP,GHCND:US1NYNS0007,",,N,",0.0
3,2018-01-04T00:00:00,PRCP,GHCND:US1NYNS0007,",,N,",4.1
4,2018-01-05T00:00:00,PRCP,GHCND:US1NYNS0007,",,N,",18.0


In [35]:
from contextlib import closing

# One row per distinct value of `attributes`; the row count answers
# "how many unique values?". The result gets its own name so it no longer
# overwrites the PRCP rows held in `prcp_data`.
# closing() is used because `with sqlite3.connect(...)` alone does not close
# the connection.
with closing(sqlite3.connect('data/weather.db')) as connection:
    distinct_attributes = pd.read_sql(
        'SELECT DISTINCT attributes FROM weather',
        connection
    )

distinct_attributes.shape[0]

26

## Merging

In [36]:
# Load station metadata (id, name, coordinates, elevation) to merge with the observations.
station_info = pd.read_csv('data/weather_stations.csv')
station_info.head()

Unnamed: 0,id,name,latitude,longitude,elevation
0,GHCND:US1CTFR0022,"STAMFORD 2.6 SSW, CT US",41.0641,-73.577,36.6
1,GHCND:US1CTFR0039,"STAMFORD 4.2 S, CT US",41.037788,-73.568176,6.4
2,GHCND:US1NJBG0001,"BERGENFIELD 0.3 SW, NJ US",40.921298,-74.001983,20.1
3,GHCND:US1NJBG0002,"SADDLE BROOK TWP 0.6 E, NJ US",40.902694,-74.083358,16.8
4,GHCND:US1NJBG0003,"TENAFLY 1.3 W, NJ US",40.91467,-73.9775,21.6


In [37]:
# Reminder of the observation columns; `station` is the key that matches station_info.id.
weather.head()

Unnamed: 0,date,datatype,station,attributes,value
0,2018-01-01T00:00:00,PRCP,GHCND:US1CTFR0039,",,N,",0.0
1,2018-01-01T00:00:00,PRCP,GHCND:US1NJBG0015,",,N,",0.0
2,2018-01-01T00:00:00,SNOW,GHCND:US1NJBG0015,",,N,",0.0
3,2018-01-01T00:00:00,PRCP,GHCND:US1NJBG0017,",,N,",0.0
4,2018-01-01T00:00:00,SNOW,GHCND:US1NJBG0017,",,N,",0.0


In [38]:
# Check the join key: count equal to unique means every station id appears exactly once.
station_info.id.describe()

count                   279
unique                  279
top       GHCND:US1CTFR0022
freq                      1
Name: id, dtype: object

In [39]:
def get_row_count(*dfs):
    """Return the number of rows of each given frame, in argument order.

    Every argument only needs a ``shape`` attribute whose first entry is the
    row count; calling with no arguments returns an empty list.
    """
    row_counts = []
    for frame in dfs:
        row_counts.append(frame.shape[0])
    return row_counts
# Row counts of the two inputs, as a baseline for the join results below.
get_row_count(station_info, weather)

[279, 78780]

- Inner Join

In [40]:
# Inner join (the default for merge): keep only the observations whose
# station has a matching id in station_info.
inner_join = pd.merge(
    weather,
    station_info,
    how='inner',
    left_on='station',
    right_on='id',
)
inner_join.head()

Unnamed: 0,date,datatype,station,attributes,value,id,name,latitude,longitude,elevation
0,2018-01-01T00:00:00,PRCP,GHCND:US1CTFR0039,",,N,",0.0,GHCND:US1CTFR0039,"STAMFORD 4.2 S, CT US",41.037788,-73.568176,6.4
1,2018-01-02T00:00:00,PRCP,GHCND:US1CTFR0039,",,N,",0.0,GHCND:US1CTFR0039,"STAMFORD 4.2 S, CT US",41.037788,-73.568176,6.4
2,2018-01-03T00:00:00,PRCP,GHCND:US1CTFR0039,",,N,",0.0,GHCND:US1CTFR0039,"STAMFORD 4.2 S, CT US",41.037788,-73.568176,6.4
3,2018-01-05T00:00:00,DAPR,GHCND:US1CTFR0039,",,N,",2.0,GHCND:US1CTFR0039,"STAMFORD 4.2 S, CT US",41.037788,-73.568176,6.4
4,2018-01-05T00:00:00,MDPR,GHCND:US1CTFR0039,",,N,",15.5,GHCND:US1CTFR0039,"STAMFORD 4.2 S, CT US",41.037788,-73.568176,6.4


In [41]:
# Rename id -> station first so both frames share the key name; merging with
# `on=` then keeps a single key column instead of both station and id.
stations_keyed = station_info.rename(columns={'id': 'station'})
weather.merge(stations_keyed, on='station').head()

Unnamed: 0,date,datatype,station,attributes,value,name,latitude,longitude,elevation
0,2018-01-01T00:00:00,PRCP,GHCND:US1CTFR0039,",,N,",0.0,"STAMFORD 4.2 S, CT US",41.037788,-73.568176,6.4
1,2018-01-02T00:00:00,PRCP,GHCND:US1CTFR0039,",,N,",0.0,"STAMFORD 4.2 S, CT US",41.037788,-73.568176,6.4
2,2018-01-03T00:00:00,PRCP,GHCND:US1CTFR0039,",,N,",0.0,"STAMFORD 4.2 S, CT US",41.037788,-73.568176,6.4
3,2018-01-05T00:00:00,DAPR,GHCND:US1CTFR0039,",,N,",2.0,"STAMFORD 4.2 S, CT US",41.037788,-73.568176,6.4
4,2018-01-05T00:00:00,MDPR,GHCND:US1CTFR0039,",,N,",15.5,"STAMFORD 4.2 S, CT US",41.037788,-73.568176,6.4


- Left Join

In [42]:
#left join
left_join = station_info.merge(weather, left_on='id', right_on='station', how='left')
left_join.head()

Unnamed: 0,id,name,latitude,longitude,elevation,date,datatype,station,attributes,value
0,GHCND:US1CTFR0022,"STAMFORD 2.6 SSW, CT US",41.0641,-73.577,36.6,,,,,
1,GHCND:US1CTFR0039,"STAMFORD 4.2 S, CT US",41.037788,-73.568176,6.4,2018-01-01T00:00:00,PRCP,GHCND:US1CTFR0039,",,N,",0.0
2,GHCND:US1CTFR0039,"STAMFORD 4.2 S, CT US",41.037788,-73.568176,6.4,2018-01-02T00:00:00,PRCP,GHCND:US1CTFR0039,",,N,",0.0
3,GHCND:US1CTFR0039,"STAMFORD 4.2 S, CT US",41.037788,-73.568176,6.4,2018-01-03T00:00:00,PRCP,GHCND:US1CTFR0039,",,N,",0.0
4,GHCND:US1CTFR0039,"STAMFORD 4.2 S, CT US",41.037788,-73.568176,6.4,2018-01-05T00:00:00,DAPR,GHCND:US1CTFR0039,",,N,",2.0


- Right Join

In [43]:
#right join
right_join = weather.merge(station_info, left_on='station', right_on='id', how='right')
right_join.head()

Unnamed: 0,date,datatype,station,attributes,value,id,name,latitude,longitude,elevation
0,,,,,,GHCND:US1CTFR0022,"STAMFORD 2.6 SSW, CT US",41.0641,-73.577,36.6
1,2018-01-01T00:00:00,PRCP,GHCND:US1CTFR0039,",,N,",0.0,GHCND:US1CTFR0039,"STAMFORD 4.2 S, CT US",41.037788,-73.568176,6.4
2,2018-01-02T00:00:00,PRCP,GHCND:US1CTFR0039,",,N,",0.0,GHCND:US1CTFR0039,"STAMFORD 4.2 S, CT US",41.037788,-73.568176,6.4
3,2018-01-03T00:00:00,PRCP,GHCND:US1CTFR0039,",,N,",0.0,GHCND:US1CTFR0039,"STAMFORD 4.2 S, CT US",41.037788,-73.568176,6.4
4,2018-01-05T00:00:00,DAPR,GHCND:US1CTFR0039,",,N,",2.0,GHCND:US1CTFR0039,"STAMFORD 4.2 S, CT US",41.037788,-73.568176,6.4


- Outer Join

In [44]:
#outer join
outer_join = weather.merge(
    station_info[station_info.id.str.contains('US1NY')], 
    left_on='station', right_on='id', how='outer', indicator=True
)

outer_join.head()

Unnamed: 0,date,datatype,station,attributes,value,id,name,latitude,longitude,elevation,_merge
0,2018-01-01T00:00:00,PRCP,GHCND:US1CTFR0039,",,N,",0.0,,,,,,left_only
1,2018-01-02T00:00:00,PRCP,GHCND:US1CTFR0039,",,N,",0.0,,,,,,left_only
2,2018-01-03T00:00:00,PRCP,GHCND:US1CTFR0039,",,N,",0.0,,,,,,left_only
3,2018-01-05T00:00:00,DAPR,GHCND:US1CTFR0039,",,N,",2.0,,,,,,left_only
4,2018-01-05T00:00:00,MDPR,GHCND:US1CTFR0039,",,N,",15.5,,,,,,left_only


In [45]:
# Compare the row counts produced by each join type.
get_row_count(inner_join, left_join, right_join,outer_join)

[78780, 78949, 78949, 78802]