In [415]:
import pandas as pd
import numpy as np

# Importing the data

In [416]:
# Load the raw training set from the local dataset folder (path is relative
# to the notebook's working directory) and preview the first rows.
df_train_1 = pd.read_csv('./dataset/train.csv')
df_train_1.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# # Task 1: Cleaning

### Drop unnecessary columns and NAN rows

In [417]:
# Drop the columns that are not used further in this notebook. `columns=`
# states the intent directly instead of relying on `axis=1`.
df_train_2 = df_train_1.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])

# List the distinct Embarked values; a NaN entry here means the column has
# missing data that must be handled before encoding.
df_train_2['Embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [418]:
# Work on a copy so df_train_2 keeps its original (partly missing) ages.
df_train_3 = df_train_2.copy()

# Mean of the known ages; used as the fill value for the missing ones.
mean_age = df_train_2['Age'].mean()

# Non-null Age count before imputation (printed) and after (cell output).
print(df_train_3.count()['Age'])
df_train_3['Age'] = df_train_3['Age'].fillna(mean_age)
df_train_3.count()['Age']

714


891

In [419]:
# Show the rows whose Embarked value is missing.
df_train_3.loc[df_train_3['Embarked'].isna()]

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
61,1,1,female,38.0,0,0,80.0,
829,1,1,female,62.0,0,0,80.0,


In [420]:
# Preview df_train_2, i.e. the frame as it was before the Age imputation
# (which was applied to the df_train_3 copy, not to this frame).
df_train_2.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [421]:
# Count the null *cells* remaining in df_train_3 (isnull() is element-wise,
# so this is a cell count, not a row count).
np.count_nonzero(df_train_3.isnull())

2

#### Only 2 rows still contain NaN values, so we can safely drop them

In [422]:
# Drop the remaining rows that contain any NaN. Rebinding the name (rather
# than inplace=True) avoids mutating the object earlier cells displayed and
# keeps this cell safe to re-run.
df_train_3 = df_train_3.dropna()

# Sanity check: number of null cells left (expected to be 0).
np.count_nonzero(df_train_3.isnull())

0

### Convert Strings

#### Male and Female to 0 and 1

In [423]:
df_train_4 = df_train_3.copy()

# Encode Sex as an integer flag: male -> 0, female -> 1.
# `.map` builds the new column directly from the mapping, so the result does
# not depend on `replace` implicitly downcasting an object column to int
# (deprecated in recent pandas). Note: any label other than 'male'/'female'
# would become NaN here.
df_train_4['Sex'] = df_train_4['Sex'].map({'male': 0, 'female': 1})

df_train_4.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,0,22.0,1,0,7.25,S
1,1,1,1,38.0,1,0,71.2833,C
2,1,3,1,26.0,0,0,7.925,S
3,1,1,1,35.0,1,0,53.1,S
4,0,3,0,35.0,0,0,8.05,S


#### Pclass and Embarked are categories, not quantities, so we can't simply encode them as 0, 1 and 2: the model would treat those codes as numeric values. For truly numeric features the ordering matters: it is important for the model to know that a 70-year-old person is older than a 17-year-old, or that a 10k fare is greater than a 5k fare. But if we encoded Embarked as 0, 1 and 2, the model would assume that port 2 is greater than port 0, when these are just class labels, as with Pclass. To solve this we can use arrays: for Pclass, for instance, we can use [1, 0, 0] for class 1, [0, 1, 0] for class 2 and [0, 0, 1] for class 3

In [424]:
def pclass_to_array(Pclass):
    """One-hot encode a passenger class.

    Parameters
    ----------
    Pclass : int or numpy integer
        Passenger class; valid values are 1, 2 and 3.

    Returns
    -------
    numpy.ndarray or None
        A length-3 float array with a 1 at position ``Pclass - 1`` and 0
        elsewhere (e.g. ``[0., 1., 0.]`` for class 2), or ``None`` when
        ``Pclass`` is not an integer in the range 1..3.
    """
    # Validate explicitly instead of relying on a bare `except`: with plain
    # indexing, 0 and negative values wrap around (index -1, -2, ...) and
    # would silently be encoded as a valid class.
    if not isinstance(Pclass, (int, np.integer)) or not 1 <= Pclass <= 3:
        return None

    cls_arr = np.zeros(3)
    cls_arr[Pclass - 1] = 1
    return cls_arr

def embarked_to_array(embarked):
    """One-hot encode an embarkation port code.

    Parameters
    ----------
    embarked : str
        Port code; valid values are ``'C'``, ``'Q'`` and ``'S'``.

    Returns
    -------
    numpy.ndarray or None
        A length-3 float array: ``[1., 0., 0.]`` for ``'C'``,
        ``[0., 1., 0.]`` for ``'Q'``, ``[0., 0., 1.]`` for ``'S'``.
        ``None`` for anything else (including NaN and the empty string).
    """
    port_index = {'C': 0, 'Q': 1, 'S': 2}

    # Exact match against the known codes. A substring test such as
    # `value in 'CQS'` would also accept '', 'CQ', 'QS' and 'CQS' and return
    # an all-zero vector for them.
    if not isinstance(embarked, str) or embarked not in port_index:
        return None

    cls_arr = np.zeros(3)
    cls_arr[port_index[embarked]] = 1
    return cls_arr

In [425]:
# Start from a copy so df_train_4 keeps the scalar Pclass / Embarked values.
df_train_5 = df_train_4.copy()

# Replace each categorical value with its one-hot array; the encoder
# functions are passed to apply directly.
df_train_5['Pclass'] = df_train_5['Pclass'].apply(pclass_to_array)
df_train_5['Embarked'] = df_train_5['Embarked'].apply(embarked_to_array)

df_train_5.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,"[0.0, 0.0, 1.0]",0,22.0,1,0,7.25,"[0.0, 0.0, 1.0]"
1,1,"[1.0, 0.0, 0.0]",1,38.0,1,0,71.2833,"[1.0, 0.0, 0.0]"
2,1,"[0.0, 0.0, 1.0]",1,26.0,0,0,7.925,"[0.0, 0.0, 1.0]"
3,1,"[1.0, 0.0, 0.0]",1,35.0,1,0,53.1,"[0.0, 0.0, 1.0]"
4,0,"[0.0, 0.0, 1.0]",0,35.0,0,0,8.05,"[0.0, 0.0, 1.0]"


In [426]:
# List the columns of the cleaned/encoded frame that the next task will use.
df_train_5.columns

Index(['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
       'Embarked'],
      dtype='object')

# # Task 2: Classification
- Next step