# Feature Engineering 3: Missing Values

#### Exercise:

* Run the code
* Explain what the 4 Feature Engineering techniques below do

In [None]:
import seaborn as sns
import pandas as pd

In [None]:
# Fetch the Titanic passenger list through seaborn's example-dataset loader
df = sns.load_dataset(name='titanic')
# Preview the first five rows
df.head(n=5)

In [None]:
# Handy one-liner: count the missing values in every column and draw the
# counts as a horizontal bar chart.
df.isna().sum().plot(kind='barh')

#### Technique 1: imputation with fixed value

In [None]:
# Work on a one-column frame holding just the ages
age = df.loc[:, ['age']]
# Replace every missing age with the median of the ages that are present
df['age_median'] = age.fillna(value=age.median())
# Compare the raw and the imputed column side by side
df.loc[:, ['age', 'age_median']].head(n=7)

#### Technique 2: Preserve missing info

In [None]:
# Boolean mask: True wherever the age is missing (first 7 rows shown)
df['age'].isna().head(n=7)

In [None]:
# Keep the "was missing" information as its own 0/1 indicator column,
# so it survives even after the gaps in 'age' have been filled.
df['age_missing'] = df['age'].isna().astype(int)
df.loc[:, ['age', 'age_median', 'age_missing']].head(n=7)

#### Technique 3: Group Replacement

In [None]:
# step 1: calculate group means first
# In seaborn's Titanic dataset 'class' is a categorical column. Grouping by a
# categorical without stating `observed` relies on a default that newer pandas
# versions deprecate (FutureWarning), so it is passed explicitly.
# observed=True keeps only the classes that actually occur in the data.
groups = df.groupby('class', observed=True)['age'].mean()
groups

In [None]:
# step 2: create pseudo-column
# Look up the mean age of each passenger's class. `map` is used instead of
# `replace` because replacing the values of a categorical column with new
# values is deprecated in recent pandas; the cast gives a plain float column
# that can be used directly as fill values in the next step.
temp = df['class'].map(groups).astype(float)
temp.head()

In [None]:
# step 3: insert
# Only the missing ages take the class mean from `temp`; known ages stay as is.
df['age_groups'] = df['age'].fillna(value=temp)
df.loc[:, ['age', 'age_median', 'age_missing', 'age_groups']].head(n=7)

In [None]:
# For groupings over more than one column, first build a temporary combined
# category such as "<class>_<sex>":
df['class/sex'] = df['class'].astype(str) + ('_' + df['sex'])
df['class/sex'].head(n=5)

# then repeat steps 1-3 with 'class/sex' as the grouping column

#### Technique 4: Subsampling

In [None]:
# Draw one random age per row of df, sampling with replacement from the
# ages that are actually known.
N = len(df)
sample = df['age'].dropna().sample(n=N, replace=True)
sample.head(n=5)

In [None]:
# The drawn values still carry the row labels of the passengers they came
# from; give them df's own index so they line up row by row when filling.
sample = pd.Series(data=sample.values, index=df.index)
sample.head(n=5)

In [None]:
# Fill each missing age with the randomly drawn age at the same row position
df['age_sampled'] = df['age'].fillna(value=sample)

#### Bonus: check the histograms

In [None]:
import seaborn as sns

# Distribution of the original ages (missing values are simply absent)
sns.histplot(data=df[['age']], bins=20)

In [None]:
# Distribution after filling the gaps with the overall median
sns.histplot(data=df[['age_median']], bins=20)

In [None]:
# Distribution after filling the gaps with the per-class mean
sns.histplot(data=df[['age_groups']], bins=20)

In [None]:
# Distribution after filling the gaps with randomly sampled known ages
sns.histplot(data=df[['age_sampled']], bins=20)