# Handling Missing Values

In [2]:
import pandas as pd

In [3]:
titanic_data = pd.read_csv("titanic-train.csv")

In [4]:
titanic_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
titanic_data.shape

(891, 12)

In [6]:
missing_values = titanic_data.isnull().sum()
missing_values

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [7]:
missing_values / titanic_data.shape[0]

PassengerId    0.000000
Survived       0.000000
Pclass         0.000000
Name           0.000000
Sex            0.000000
Age            0.198653
SibSp          0.000000
Parch          0.000000
Ticket         0.000000
Fare           0.000000
Cabin          0.771044
Embarked       0.002245
dtype: float64

In [8]:
titanic_data.isnull().sum(axis=1).sort_values(ascending=False)

502    2
773    2
517    2
783    2
359    2
      ..
659    0
662    0
438    0
215    0
445    0
Length: 891, dtype: int64

* Deletion (column / row)
* Imputation:
  * Mean/Median/Mode Imputation
* Prediction Models
* Assign a unique category

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
train_data, test_data = train_test_split(titanic_data, test_size=0.3, random_state=42)

In [11]:
train_data.shape, test_data.shape

((623, 12), (268, 12))

In [12]:
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np

In [13]:
strat_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [15]:
X = train_data.drop('Survived', axis='columns')
y = train_data['Survived']

In [17]:
scores = []

for train_index, val_index in strat_kfold.split(X, y):
  X_train_fold, X_val_fold = X.iloc[train_index], X.iloc[val_index]
  y_train_fold, y_val_fold = y.iloc[train_index], y.iloc[val_index]
  
  X_train_fold['Age'].fillna(X_train_fold['Age'].median(), inplace=True)
  X_val_fold['Age'].fillna(X_train_fold['Age'].median(), inplace=True)
  
  X_train_fold['Cabin'].fillna('Unknown', inplace=True)
  X_val_fold['Cabin'].fillna('Unknown', inplace=True)
  
  most_frequent_embarked_fold = X_train_fold['Embarked'].mode()[0]
  X_train_fold['Embarked'].fillna(most_frequent_embarked_fold, inplace=True)
  X_val_fold['Embarked'].fillna(most_frequent_embarked_fold, inplace=True)
  
  X_train_fold = X_train_fold[['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']]
  X_val_fold = X_val_fold[['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']]
  
  model = LogisticRegression(max_iter=1000)
  model.fit(X_train_fold, y_train_fold)
  
  y_pred = model.predict(X_val_fold)
  
  accuracy = accuracy_score(y_val_fold, y_pred)
  
  scores.append(accuracy)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold['Age'].fillna(X_train_fold['Age'].median(), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_val_fold['Age'].fillna(X_train_fold['Age'].median(), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_fold['Cabin'].fillna('Unknown', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returni

In [18]:
scores

[0.752, 0.696, 0.672, 0.6935483870967742, 0.7016129032258065]

In [19]:
average_accuracy = np.mean(scores)
average_accuracy

0.7030322580645161

In [20]:
X = train_data.drop('Survived', axis='columns')
y = train_data['Survived']

In [23]:
X_test = test_data.drop('Survived', axis='columns')
y_test = test_data['Survived']

In [24]:
X['Age'].fillna(X['Age'].median(), inplace=True)
X_test['Age'].fillna(X['Age'].median(), inplace=True)

X['Cabin'].fillna('Unknown', inplace=True)
X_test['Cabin'].fillna('Unknown', inplace=True)

most_frequent_embarked_fold = X['Embarked'].mode()[0]
X['Embarked'].fillna(most_frequent_embarked_fold, inplace=True)
X_test['Embarked'].fillna(most_frequent_embarked_fold, inplace=True)

X = X[['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']]
X_test = X_test[['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']]

In [25]:
model = LogisticRegression(max_iter=1000)
model.fit(X, y)

LogisticRegression(max_iter=1000)

In [26]:
y_pred = model.predict(X_test)
  
accuracy = accuracy_score(y_test, y_pred)

In [27]:
accuracy

0.7276119402985075