# Pandas Data Manipulation

Methods covered: `.isin()`, `.sort_values()`, `.reset_index()`, `pd.merge()`, `pd.concat()`, `.set_index()`, `pd.get_dummies()`

---

In [None]:
import pandas as pd

In [None]:
# Data source: https://www.kaggle.com/shivamb/football-deaths
# The CSV path is relative to the notebook's working directory.
df1 = pd.read_csv('./datasets/football_deaths.csv')

# Print a summary of the frame (columns, non-null counts, dtypes) as a first sanity check.
df1.info()

In [None]:
# Print the column labels, then show the first rows.
# Only the last expression of a cell is rendered automatically, hence the print().
print(df1.columns)
df1.head()

In [None]:
# Frequency of each distinct value in `player_country`
# (value_counts sorts by count, descending, and skips NaN by default).
df1['player_country'].value_counts()

In [None]:
# Frequency of each distinct value in `player_team_name`
# (value_counts sorts by count, descending, and skips NaN by default).
df1['player_team_name'].value_counts()

In [None]:
# Frequency of each distinct value in `incident_description`
# (value_counts sorts by count, descending, and skips NaN by default).
df1['incident_description'].value_counts()

In [None]:
# Work on a narrow frame holding only the three columns used by the
# filtering example further down.
df1_data = df1.loc[:, ['incident_date', 'player_country', 'player_age']]
df1_data.info()

In [None]:
# Convert `incident_date` to datetime64 so it sorts chronologically.
# Series.astype()/pd.to_datetime() return a NEW object; the result has to be
# assigned back, otherwise the column keeps its original dtype.
# .assign() builds a new frame, so this cell is safe to re-run and never writes
# into a slice of `df1`.
df1_data = df1_data.assign(incident_date=pd.to_datetime(df1_data['incident_date']))
df1_data.head()

In [None]:
# We like functions!!!

def row_filter(df, cat_var, cat_values, sort_by):
    """Return the rows of ``df`` whose ``cat_var`` value is in ``cat_values``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    cat_var : str
        Name of the column tested for membership.
    cat_values : list-like
        Values of ``cat_var`` to keep.
    sort_by : str or list of str
        Column(s) the result is sorted by, in descending order.

    Returns
    -------
    pandas.DataFrame
        The matching rows, sorted, with a fresh 0..n-1 index.
    """
    keep_mask = df[cat_var].isin(cat_values)
    ordered = df.loc[keep_mask].sort_values(by=sort_by, ascending=False)
    # drop=True discards the old row labels instead of adding them as a column.
    return ordered.reset_index(drop=True)

In [None]:
# Arguments for the row_filter() call in the next cell.
col = 'player_country'                # column tested for membership
var = ['Spain', 'France', 'Italy']    # values of that column to keep
order = 'incident_date'               # column to sort by (descending)

In [None]:
# Rows whose `player_country` is one of the selected countries,
# sorted by `incident_date` in descending order.
row_filter(df1_data, col, var, order)

---

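### pandas.merge() & pandas.concat()

Related DataFrame methods: `df.join()` and `df.append()`. Note that `DataFrame.append()` was deprecated in pandas 1.4 and removed in pandas 2.0; use `pd.concat()` instead.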

https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html

In [None]:
# Lookup table: one row per feature key with its display name.
df_m1 = pd.DataFrame(
    {
        'feature_key': ['A', 'B', 'C'],
        'name': ['Name 1', 'Name 2', 'Name 3'],
    }
)
df_m1

In [None]:
import random as r

# Fixed seed: the demo data (and every merge/concat output derived from it)
# is then identical on each Restart & Run All.
r.seed(42)

feature_options = ['A','B','C']
# 20 random integers in the closed range [1, 5].
value_random = [r.randint(1, 5) for _ in range(20)]
# 20 keys drawn uniformly from feature_options.
feature_random = [r.choice(feature_options) for _ in range(20)]

In [None]:
# Long table: one random (feature_key, value) pair per row.
df_m2 = pd.DataFrame(
    {
        'feature_key': feature_random,
        'value': value_random,
    }
)
df_m2

In [None]:
# pd.merge() https://pandas.pydata.org/docs/reference/api/pandas.merge.html?highlight=merge#pandas.merge

# Join on the only column the two frames share; `how='inner'` is the default,
# spelled out here so the join type is visible.
df_m = pd.merge(df_m1, df_m2, how='inner', on='feature_key')
df_m

In [None]:
# First frame for the concat example (built from the random demo lists).
df_c1 = pd.DataFrame(
    {
        'feature_key': feature_random,
        'value': value_random,
    }
)
df_c1.head()

In [None]:
# Second frame for the concat example; same columns and contents as df_c1.
df_c2 = pd.DataFrame(
    {
        'feature_key': feature_random,
        'value': value_random,
    }
)
df_c2.head()

In [None]:
# pd.concat() https://pandas.pydata.org/docs/reference/api/pandas.concat.html

# Stack the two frames vertically. With the default ignore_index=False each
# input keeps its own row labels, so the labels 0..19 appear twice in the result;
# pass ignore_index=True to get a fresh 0..n-1 index instead.
df_c = pd.concat([df_c1,df_c2])
df_c

---

In [None]:
# .set_index() https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.set_index.html

# set_index() returns a new frame; reassigning the result (rather than using
# inplace=True) keeps the call chainable and makes the rebinding explicit.
# Note: once 'feature_key' is the index it is no longer a column, so re-running
# this cell alone raises KeyError - re-run the pd.concat() cell first.
df_c = df_c.set_index('feature_key')
df_c

---

__JOIN Types__

![Image](./img/etl_pandas_manipulation_01.jpg)

![Image](./img/etl_pandas_manipulation_02.jpg)

In [None]:
# Left-hand frame for the join examples: animal -> breed.
df_j1 = pd.DataFrame(
    {
        'animal': ['dog', 'cat', 'horse'],
        'breed': ['pug', 'european', 'andalusian'],
    }
)
df_j1

In [None]:
# Right-hand frame for the join examples: animal -> strength.
# 'horse' is missing here and 'tardigrade' is missing on the left, so the
# four join types give visibly different results.
df_j2 = pd.DataFrame(
    {
        'animal': ['dog', 'cat', 'tardigrade'],
        'strength': [100, 200, 8000],
    }
)
df_j2

In [None]:
# Inner JOIN: keep only the animals present in BOTH frames.
# how='inner' and on='animal' are what pd.merge() infers by default for these frames.

df_ij = pd.merge(df_j1, df_j2, how='inner', on='animal')
df_ij

In [None]:
# Outer JOIN: keep the animals from EITHER frame; missing values become NaN.

df_oj = pd.merge(df_j1, df_j2, on='animal', how='outer')
df_oj

In [None]:
# Left JOIN: keep every row of df_j1; `strength` is NaN where df_j2 has no match.

df_lj = pd.merge(df_j1, df_j2, on='animal', how='left')
df_lj

In [None]:
# Right JOIN: keep every row of df_j2; `breed` is NaN where df_j1 has no match.

df_rj = pd.merge(df_j1, df_j2, on='animal', how='right')
df_rj

---

### pandas.get_dummies()

(a.k.a. One-Hot-Encoding)

https://pandas.pydata.org/docs/reference/api/pandas.get_dummies.html

In [None]:
# Rows where `heart_related` is non-zero, restricted to the player name and
# the cause-indicator columns.
df1.loc[
    df1['heart_related'].ne(0),
    ['player_name', 'heart_related', 'cardiac_related', 'collapsed', 'lightning', 'collision'],
]

In [None]:
# Players whose team country is England or Spain, keeping only the two
# columns used by the one-hot-encoding example.
df_teams = df1.loc[
    df1['team_country'].isin(['England', 'Spain']),
    ['player_name', 'team_country'],
]
df_teams

In [None]:
# One-hot encoding: replace `team_country` with one indicator column per
# distinct value (named `team_country_<value>`); the other columns are kept as-is.

df_dummies = pd.get_dummies(df_teams, columns=['team_country'])
df_dummies

---