# Handling Missing Values

In [1]:
import pandas as pd

In [2]:
# Load the Titanic training set from a CSV file in the working directory.
titanic_data = pd.read_csv(filepath_or_buffer="titanic-train.csv")

In [3]:
# Preview the first five rows to get a feel for the columns.
titanic_data.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
# (number of rows, number of columns) of the loaded frame.
titanic_data.shape

(891, 12)

In [5]:
# Count the missing entries in each column (isna is the same check as isnull).
missing_values = titanic_data.isna().sum()
# Display the per-column counts.
missing_values

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [8]:
# Fraction of rows with a missing value, per column.
missing_values.div(len(titanic_data))

PassengerId    0.000000
Survived       0.000000
Pclass         0.000000
Name           0.000000
Sex            0.000000
Age            0.198653
SibSp          0.000000
Parch          0.000000
Ticket         0.000000
Fare           0.000000
Cabin          0.771044
Embarked       0.002245
dtype: float64

In [11]:
# Number of missing fields in each row, with the most incomplete rows first.
(
    titanic_data
    .isna()
    .sum(axis="columns")
    .sort_values(ascending=False)
)

502    2
773    2
517    2
783    2
359    2
      ..
659    0
662    0
438    0
215    0
445    0
Length: 891, dtype: int64

Common strategies for handling missing values:

* Deletion (column / row)
* Imputation:
  * Mean/Median/Mode Imputation
* Prediction Models
* Assign a unique category

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
# Hold out 30% of the rows as a test set; the fixed random_state makes the
# split reproducible between runs.
train_data, test_data = train_test_split(
    titanic_data,
    test_size=0.3,
    random_state=42,
)

In [14]:
# Shapes of the train and test splits, in that order.
tuple(frame.shape for frame in (train_data, test_data))

((623, 12), (268, 12))

In [19]:
# Alternative (not used here): drop the sparse Cabin column instead of imputing it.
# train_data.drop('Cabin', axis='columns', inplace=True)

In [20]:
# Treat a missing cabin as its own category ("Unknown") in both splits.
#
# Fill at the frame level and rebind the result instead of calling
# fillna(..., inplace=True) on a selected column: the chained in-place form
# operates on an intermediate object, triggers chained-assignment warnings in
# recent pandas, and does not update the parent frame under Copy-on-Write.
# Passing a dict restricts the fill to the named column only.
train_data = train_data.fillna({'Cabin': 'Unknown'})
test_data = test_data.fillna({'Cabin': 'Unknown'})

In [22]:
# Display the test split to check the Cabin column after the fill.
test_data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
709,710,1,3,"Moubarek, Master. Halim Gonios (""William George"")",male,,1,1,2661,15.2458,Unknown,C
439,440,0,2,"Kvillner, Mr. Johan Henrik Johannesson",male,31.0,0,0,C.A. 18723,10.5000,Unknown,S
840,841,0,3,"Alhomaki, Mr. Ilmari Rudolf",male,20.0,0,0,SOTON/O2 3101287,7.9250,Unknown,S
720,721,1,2,"Harper, Miss. Annie Jessie ""Nina""",female,6.0,0,1,248727,33.0000,Unknown,S
39,40,1,3,"Nicola-Yarred, Miss. Jamila",female,14.0,1,0,2651,11.2417,Unknown,C
...,...,...,...,...,...,...,...,...,...,...,...,...
821,822,1,3,"Lulic, Mr. Nikola",male,27.0,0,0,315098,8.6625,Unknown,S
633,634,0,1,"Parr, Mr. William Henry Marsh",male,,0,0,112052,0.0000,Unknown,S
456,457,0,1,"Millet, Mr. Francis Davis",male,65.0,0,0,13509,26.5500,E38,S
500,501,0,3,"Calic, Mr. Petar",male,17.0,0,0,315086,8.6625,Unknown,S


In [24]:
# Missing values remaining in each column of the training split.
train_data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            124
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin            0
Embarked         1
dtype: int64

## Age

In [26]:
# Median age in the training split (missing ages are skipped by default).
train_data.loc[:, 'Age'].median()

28.0

In [27]:
# Impute missing ages with the median age of the TRAINING split. The same
# training-derived value is applied to the test split, so no statistic is
# computed from test rows.
#
# The median is computed once and reused. The fill is done at the frame level
# and rebound, because fillna(..., inplace=True) on a selected column is a
# chained in-place call that warns in recent pandas and does not update the
# parent frame under Copy-on-Write.
age_median = train_data['Age'].median()
train_data = train_data.fillna({'Age': age_median})
test_data = test_data.fillna({'Age': age_median})

In [30]:
# Missing values remaining in each column of the test split.
test_data.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       1
dtype: int64

In [33]:
# Impute missing embarkation ports with the most frequent port in the
# TRAINING split, and apply that same value to the test split.
#
# mode() returns the modal value(s) in sorted order; take the first one.
most_frequent_embarked = train_data['Embarked'].mode().iloc[0]

# Fill at the frame level and rebind rather than using the chained
# train_data['Embarked'].fillna(..., inplace=True) form, which warns in recent
# pandas and does not update the parent frame under Copy-on-Write.
train_data = train_data.fillna({'Embarked': most_frequent_embarked})
test_data = test_data.fillna({'Embarked': most_frequent_embarked})

In [34]:
# Final check: per-column count of missing values left in the test split.
test_data.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64