## Load data

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn

# Load the Titanic train/test splits from the local data directory.
df_train = pd.read_csv('titanic_data/train.csv')
df_test = pd.read_csv('titanic_data/test.csv')

# Label each split so later cells can print which frame they are reporting on.
# This is a plain Python attribute set on the object, not a DataFrame column.
df_train.name = 'Training Set'
df_test.name = 'Test Set'

# Combined frame used for statistics over all passengers.
# ignore_index=True gives the result a fresh 0..n-1 index; without it each
# split keeps its own index labels, so the test rows' labels would collide
# with the first training rows and label-based access (.loc) on df_all would
# match two rows at once.
df_all = pd.concat([df_train, df_test], ignore_index=True)

print("Number of train examples: {}".format(len(df_train.index)))
print("Number of test examples: {}".format(len(df_test.index)))
print("train + test examples: {}".format(len(df_all.index)))

Number of train examples: 891
Number of test examples: 418
train + test examples: 1309


## Get basic info

In [3]:
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() only appends a stray "None" line to the output.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB
None


## Missing value stats

In [8]:
def display_missing(df):
    """Print the number of null entries in every column of ``df``.

    One line is printed per column, in column order, in the form
    ``"<column> column missing values: <count>"``. After the last column a
    ``'\\n'`` is printed, which leaves two blank lines as a visual separator.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected.

    Returns
    -------
    None
        The report is written to stdout only.
    """
    line_template = '{} column missing values: {}'
    for column_name in df.columns.tolist():
        null_count = df[column_name].isnull().sum()
        print(line_template.format(column_name, null_count))
    # print() appends its own newline, so this emits two line breaks.
    print('\n')
    
# Report per-column missing-value counts for each split, headed by the label
# stored in the frame's `name` attribute.
for df in (df_train, df_test):
    print(df.name)
    display_missing(df)

Training Set
PassengerId column missing values: 0
Survived column missing values: 0
Pclass column missing values: 0
Name column missing values: 0
Sex column missing values: 0
Age column missing values: 177
SibSp column missing values: 0
Parch column missing values: 0
Ticket column missing values: 0
Fare column missing values: 0
Cabin column missing values: 687
Embarked column missing values: 2


Test Set
PassengerId column missing values: 0
Pclass column missing values: 0
Name column missing values: 0
Sex column missing values: 0
Age column missing values: 86
SibSp column missing values: 0
Parch column missing values: 0
Ticket column missing values: 0
Fare column missing values: 1
Cabin column missing values: 327
Embarked column missing values: 0




## Data imputation

In [41]:
# Median age for every (Sex, Pclass) combination across all passengers.
# "Age" is selected *before* aggregating so that only that column is reduced:
# calling .median() on the whole grouped frame also tries to aggregate the
# string columns, which raises TypeError on pandas >= 2.0 and is wasted work
# on older versions.
age_by_pclass_sex = df_all.groupby(["Sex", "Pclass"])["Age"].median()
print(age_by_pclass_sex)

# Distinct passenger classes (sorted) and sexes (in order of first appearance).
pClass_enums = sorted(df_all["Pclass"].drop_duplicates().tolist())
sex_enums = df_all["Sex"].drop_duplicates().tolist()
print(pClass_enums)

for pClass in pClass_enums:
    for sex in sex_enums:
        # The result is indexed by (Sex, Pclass); look the pair up in a single
        # .loc call rather than chaining two [] lookups.
        print("median age of passenger class {}, {}: {}".format(pClass, sex,  age_by_pclass_sex.loc[(sex, pClass)]))

Sex     Pclass
female  1         36.0
        2         28.0
        3         22.0
male    1         42.0
        2         29.5
        3         25.0
Name: Age, dtype: float64
[1, 2, 3]
median age of passenger class 1, male: 42.0
median age of passenger class 1, female: 36.0
median age of passenger class 2, male: 29.5
median age of passenger class 2, female: 28.0
median age of passenger class 3, male: 25.0
median age of passenger class 3, female: 22.0
