In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Titanic Dataset

#### dataset - https://www.kaggle.com/competitions/titanic

### Demonstrating simple imputation strategies (mean, median, and mode) on the dataset; the mean and the mode are applied below

In [2]:
# Read the Titanic training split into a DataFrame and preview its first rows.
titanic_csv_path = "./titanic dataset/train.csv"
df1 = pd.read_csv(titanic_csv_path)
df1.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### Checking what columns have missing values

In [3]:
# Per-column count of missing entries (`isna` is the same check as `isnull`).
df1.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

### The 'Cabin' column is missing most of its values (687 nulls), so we drop it outright.

In [4]:
# Drop 'Cabin' by rebinding instead of mutating in place. With
# errors='ignore' the cell can be re-run safely: once 'Cabin' is gone a
# second execution is a no-op instead of raising KeyError.
df1 = df1.drop(columns=['Cabin'], errors='ignore')

# Confirm which columns still contain missing values.
df1.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Embarked         2
dtype: int64

### For 'Age', we substitute the missing values with the mean of the present values; for 'Embarked', we substitute the two missing values with the mode (most frequent value) of the present values.

In [5]:
# 'Age': fill the gaps with the mean of the observed ages.
imp_mean = SimpleImputer(strategy='mean')
age_filled = imp_mean.fit_transform(df1[['Age']])
# fit_transform on a one-column frame yields an (n, 1) array; flatten it
# so a plain 1-D column is assigned.
df1['Age'] = age_filled.ravel()

# 'Embarked': fill the gaps with the most frequent observed value.
imp_mf = SimpleImputer(strategy='most_frequent')
embarked_filled = imp_mf.fit_transform(df1[['Embarked']])
df1['Embarked'] = embarked_filled.ravel()

# Re-count the missing values per column after imputation.
df1.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

# Travel Times Dataset

#### dataset - https://openmv.net/info/travel-times

### Demonstrating imputation that estimates a missing feature from the other available features, using scikit-learn's IterativeImputer

In [6]:
# Read the travel-times data into a DataFrame and preview its first rows.
travel_times_csv_path = './traveltime openmv/travel-times.csv'
df2 = pd.read_csv(travel_times_csv_path)
df2.head(5)

Unnamed: 0,Date,StartTime,DayOfWeek,GoingTo,Distance,MaxSpeed,AvgSpeed,AvgMovingSpeed,FuelEconomy,TotalTime,MovingTime,Take407All,Comments
0,1/6/2012,16:37,Friday,Home,51.29,127.4,78.3,84.8,,39.3,36.3,No,
1,1/6/2012,8:20,Friday,GSK,51.63,130.3,81.8,88.9,,37.9,34.9,No,
2,1/4/2012,16:17,Wednesday,Home,51.27,127.4,82.0,85.8,,37.5,35.9,No,
3,1/4/2012,7:53,Wednesday,GSK,49.17,132.3,74.2,82.9,,39.8,35.6,No,
4,1/3/2012,18:57,Tuesday,Home,51.15,136.2,83.4,88.1,,36.8,34.8,No,


### Check for null values

In [7]:
# Per-column count of missing entries (`isna` is the same check as `isnull`).
df2.isna().sum()

Date                0
StartTime           0
DayOfWeek           0
GoingTo             0
Distance            0
MaxSpeed            0
AvgSpeed            0
AvgMovingSpeed      0
FuelEconomy        19
TotalTime           0
MovingTime          0
Take407All          0
Comments          181
dtype: int64

### The 'Comments' column is mostly missing (181 nulls), so we can safely drop it

In [8]:
# Drop 'Comments' by rebinding instead of mutating in place. With
# errors='ignore' the cell can be re-run safely: once 'Comments' is gone a
# second execution is a no-op instead of raising KeyError.
df2 = df2.drop(columns=['Comments'], errors='ignore')

### For the 'FuelEconomy' column, we can use 'Distance', 'MaxSpeed', 'AvgSpeed', 'AvgMovingSpeed', 'TotalTime', and 'MovingTime' to predict its missing values

In [9]:
# Columns handed to the imputer: the six predictor columns followed by the
# column whose gaps are to be filled ('FuelEconomy').
imputation_columns = [
    'Distance',
    'MaxSpeed',
    'AvgSpeed',
    'AvgMovingSpeed',
    'TotalTime',
    'MovingTime',
    'FuelEconomy',
]
df2_sub = df2.loc[:, imputation_columns]
df2_sub.head(5)

Unnamed: 0,Distance,MaxSpeed,AvgSpeed,AvgMovingSpeed,TotalTime,MovingTime,FuelEconomy
0,51.29,127.4,78.3,84.8,39.3,36.3,
1,51.63,130.3,81.8,88.9,37.9,34.9,
2,51.27,127.4,82.0,85.8,37.5,35.9,
3,49.17,132.3,74.2,82.9,39.8,35.6,
4,51.15,136.2,83.4,88.1,36.8,34.8,


### Now using iterative imputer to impute the missing values

In [10]:
# Fit the iterative imputer on the numeric subset. fit_transform returns the
# whole imputed matrix, one column per column of df2_sub, not just the
# column that had gaps.
imp_iter = IterativeImputer(random_state=0)
df2_sub_imputed = imp_iter.fit_transform(df2_sub)

# Write back only the imputed 'FuelEconomy' values. Assigning the full 2-D
# matrix to a single column either raises or stores the wrong data,
# depending on the pandas version. The column is located by name so the
# result does not depend on the column order of df2_sub.
fuel_economy_pos = df2_sub.columns.get_loc('FuelEconomy')
df2['FuelEconomy'] = df2_sub_imputed[:, fuel_economy_pos]

# Confirm that no missing values remain.
df2.isnull().sum()

Date              0
StartTime         0
DayOfWeek         0
GoingTo           0
Distance          0
MaxSpeed          0
AvgSpeed          0
AvgMovingSpeed    0
FuelEconomy       0
TotalTime         0
MovingTime        0
Take407All        0
dtype: int64