In [39]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [40]:
df = pd.read_csv("../../../Data/titanic_Dataset.csv")
df.drop(columns=['PassengerId','Name','Ticket','Cabin'], inplace=True)
df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.2500,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.9250,S
3,1,1,female,35.0,1,0,53.1000,S
4,0,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S
887,1,1,female,19.0,0,0,30.0000,S
888,0,3,female,,1,2,23.4500,S
889,1,1,male,26.0,0,0,30.0000,C


In [41]:
df.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [42]:
df['Embarked'].value_counts()

Embarked
S    644
C    168
Q     77
Name: count, dtype: int64

In [43]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.iloc[:, 1:], df.iloc[:, 1], test_size=0.2, random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
X_train

(712, 7) (179, 7) (712,) (179,)


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
331,1,male,45.5,0,0,28.5000,S
733,2,male,23.0,0,0,13.0000,S
382,3,male,32.0,0,0,7.9250,S
704,3,male,26.0,1,0,7.8542,S
813,3,female,6.0,4,2,31.2750,S
...,...,...,...,...,...,...,...
106,3,female,21.0,0,0,7.6500,S
270,1,male,,0,0,31.0000,S
860,3,male,41.0,2,0,14.1083,S
435,1,female,14.0,1,2,120.0000,S


# 1. Age -> mean impute ; Embarked -> median impute


In [44]:
from  sklearn.impute import SimpleImputer
impute_num = SimpleImputer()
X_train_age = impute_num.fit_transform(X_train[['Age']])
X_test_age = impute_num.transform(X_test[['Age']])

impute_cat = SimpleImputer(strategy='most_frequent')
X_train_embarked = impute_cat.fit_transform(X_train[['Embarked']])
X_test_embarked = impute_cat.transform(X_test[['Embarked']])

# 2. Fare -> Standard Scaler ;  Age -> MinMax Scaler

In [45]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
stdScale = StandardScaler()
X_train_fare = stdScale.fit_transform(X_train[['Fare']])
X_test_fare = stdScale.transform(X_test[['Fare']])

minScale = MinMaxScaler()
X_train_age = minScale.fit_transform(X_train_age)
X_test_age = minScale.transform(X_test_age)

In [46]:
X_train_age

array([[0.56647399],
       [0.28373963],
       [0.39683338],
       [0.32143755],
       [0.07011812],
       [0.2963056 ],
       [0.560191  ],
       [0.35913546],
       [0.36540395],
       [0.36540395],
       [0.52249309],
       [0.44709726],
       [0.40939935],
       [0.2083438 ],
       [0.35913546],
       [0.62302086],
       [0.43453129],
       [0.4722292 ],
       [0.42196532],
       [0.2083438 ],
       [0.13294798],
       [0.76124654],
       [0.37170143],
       [0.08268409],
       [0.78637849],
       [0.24604172],
       [0.36540395],
       [0.35913546],
       [0.44709726],
       [0.36540395],
       [0.62302086],
       [0.33400352],
       [0.37170143],
       [0.40939935],
       [0.36540395],
       [0.36540395],
       [0.01985423],
       [0.30887158],
       [0.63558683],
       [0.30887158],
       [0.36540395],
       [0.36540395],
       [0.2963056 ],
       [0.22090978],
       [0.36540395],
       [0.30887158],
       [0.2963056 ],
       [0.271

# 3. Sex, Embaked -> OHE

In [47]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(drop='first',dtype=np.int32, sparse_output=False)
X_train_sex_emb = ohe.fit_transform(X_train[['Sex','Embarked']])
X_test_sex_emb = ohe.transform(X_test[['Sex','Embarked']])

In [48]:
print(X_train_age.shape, X_train_embarked.shape, X_train_fare.shape, X_train_sex_emb.shape, X_train['Pclass'].values.shape, X_train['SibSp'].values.shape)

(712, 1) (712, 1) (712, 1) (712, 4) (712,) (712,)


In [49]:
print(X_train_age.shape, X_train_embarked.shape, X_train_fare.shape, X_train_sex_emb.shape, np.expand_dims(X_train['Pclass'].values, axis=1).shape, np.expand_dims(X_train['SibSp'].values, axis=1).shape)

(712, 1) (712, 1) (712, 1) (712, 4) (712, 1) (712, 1)


# 4. Merge

In [50]:
X_train_concat = np.concatenate((X_train_age, X_train_fare, X_train_sex_emb, np.expand_dims(X_train['Pclass'].values , axis=1),np.expand_dims( X_train['SibSp'].values, axis=1), np.expand_dims(X_train['Parch'].values, axis=1)), axis=1)
X_test_concat = np.concatenate((X_test_age, X_test_fare, X_test_sex_emb, np.expand_dims(X_test['Pclass'].values , axis=1),np.expand_dims( X_test['SibSp'].values, axis=1), np.expand_dims(X_test['Parch'].values, axis=1)), axis=1)

In [51]:
X_train_concat

array([[ 0.56647399, -0.07868358,  1.        , ...,  1.        ,
         0.        ,  0.        ],
       [ 0.28373963, -0.37714494,  1.        , ...,  2.        ,
         0.        ,  0.        ],
       [ 0.39683338, -0.47486697,  1.        , ...,  3.        ,
         0.        ,  0.        ],
       ...,
       [ 0.50992712, -0.35580399,  1.        , ...,  3.        ,
         2.        ,  0.        ],
       [ 0.17064589,  1.68320121,  0.        , ...,  1.        ,
         1.        ,  2.        ],
       [ 0.25860769,  0.86074761,  1.        , ...,  1.        ,
         0.        ,  1.        ]])

# 5. Label Encoding

In [52]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_train_encod = le.fit_transform(y_train)
y_test_encod = le.transform(y_test)

# 5. Feature Selection

In [53]:
from sklearn.feature_selection import SelectKBest,f_classif
selector = SelectKBest(score_func=f_classif, k=6)

In [54]:
X_train_selected = selector.fit_transform(X_train_concat, y_train_encod)
X_test_selected = selector.transform(X_test_concat)

  f = msb / msw


In [55]:
X_train_selected

array([[ 0.56647399, -0.07868358,  1.        ,  0.        ,  1.        ,
         1.        ],
       [ 0.28373963, -0.37714494,  1.        ,  0.        ,  1.        ,
         2.        ],
       [ 0.39683338, -0.47486697,  1.        ,  0.        ,  1.        ,
         3.        ],
       ...,
       [ 0.50992712, -0.35580399,  1.        ,  0.        ,  1.        ,
         3.        ],
       [ 0.17064589,  1.68320121,  0.        ,  0.        ,  1.        ,
         1.        ],
       [ 0.25860769,  0.86074761,  1.        ,  0.        ,  1.        ,
         1.        ]])

# 6. Model train 

In [56]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
clf = DecisionTreeClassifier()
clf.fit(X_train_selected, y_train_encod)
y_pred = clf.predict(X_test_selected)
accuracy_score(y_test_encod,y_pred)

1.0