In [54]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [55]:
df = pd.read_csv("../../../Data/titanic_Dataset_modi.csv")
df.drop(columns=['PassengerId','Name','Ticket','Cabin'], inplace=True)
df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,education
0,0,3,male,22.0,1,0,7.2500,S,School
1,1,1,female,38.0,1,0,71.2833,C,UG
2,1,3,female,26.0,0,0,7.9250,S,PG
3,1,1,female,35.0,1,0,53.1000,S,PG
4,0,3,male,35.0,0,0,8.0500,S,UG
...,...,...,...,...,...,...,...,...,...
395,0,3,male,22.0,0,0,7.7958,S,PG
396,0,3,female,31.0,0,0,7.8542,S,PG
397,0,2,male,46.0,0,0,26.0000,S,PG
398,0,2,male,23.0,0,0,10.5000,S,UG


In [56]:
df.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age          78
SibSp         0
Parch         0
Fare          0
Embarked      1
education     0
dtype: int64

In [57]:
df['Embarked'].value_counts()

Embarked
S    280
C     80
Q     39
Name: count, dtype: int64

In [58]:
df['education'].value_counts()

education
PG        144
School    128
UG        128
Name: count, dtype: int64

In [59]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.iloc[:, 1:], df.iloc[:, 1], test_size=0.2, random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
X_train

(320, 8) (80, 8) (320,) (80,)


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,education
3,1,female,35.0,1,0,53.1000,S,PG
18,3,female,31.0,1,0,18.0000,S,School
202,3,male,34.0,0,0,6.4958,S,PG
250,3,male,,0,0,7.2500,S,School
274,3,female,,0,0,7.7500,Q,PG
...,...,...,...,...,...,...,...,...
71,3,female,16.0,5,2,46.9000,S,PG
106,3,female,21.0,0,0,7.6500,S,School
270,1,male,,0,0,31.0000,S,School
348,3,male,3.0,1,1,15.9000,S,UG


##### 1. Age -> mean impute ; Embarked -> median impute
##### 2. Fare -> Standard Scaler ;  Age -> MinMax Scaler
##### 3. Sex, Embaked -> OHE
##### 4. education -> Ordinal 
##### 5. Label Encoding
##### 5. Feature Selection
##### 6. Model train 

In [60]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler,StandardScaler, OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

### 1. Feature by Feature Pipeline


### Age (impute, scale)

In [61]:
age_pipe = Pipeline([
    ('imputer',SimpleImputer()),
    ('scaling',MinMaxScaler())
])

### Fare (scale)

In [62]:
fare_pipe = Pipeline([
    ('scaling',StandardScaler())
])

### Embarked (impute,OHE)

In [63]:
embk_pipe = Pipeline([
    ('impute',SimpleImputer(strategy='most_frequent')),
    ('ohe',OneHotEncoder(drop='first',sparse_output=False,dtype=np.int32))
])

### Sex (OHE)

In [64]:
sex_pipe = Pipeline([
    ('ohe',OneHotEncoder(drop='first',sparse_output=False,dtype=np.int32))
])

### Education (ordinal)

In [65]:
edu_pipe = Pipeline([
    ("ordinal",OrdinalEncoder(categories=[['School','UG','PG']]))
])

# Preprocessor

In [66]:
preprocessor = ColumnTransformer(transformers=[
    ('age_pipe',age_pipe,['Age']),
    ('fare_pipe',fare_pipe,['Fare']),
    ('embk_pipe',embk_pipe,['Embarked']),
    ('sex_pipe',sex_pipe,['Sex']),
    ('edu_pipe',edu_pipe,['education']),
],remainder='passthrough')

# Model Build

In [67]:
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline

model_pipe = Pipeline([
    ("preprocessor",preprocessor),
    ('feature_select_trf',SelectKBest(score_func=f_classif, k=6)),
    ('model_trf',DecisionTreeClassifier())
])

In [68]:
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

In [69]:
model_pipe.fit(X_train,y_train)

  f = msb / msw
The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [70]:
y_pred = model_pipe.predict(X_test)
y_pred

array([0, 2, 1, 2, 2, 1, 0, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2, 2, 1, 2, 1,
       2, 2, 2, 2, 2, 2, 0, 2, 2, 1, 2, 2, 1, 2, 2, 1, 2, 2, 2, 0, 2, 2,
       2, 2, 1, 1, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 0, 2, 0, 2, 2, 1, 2, 0,
       2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2], dtype=int64)

In [71]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

1.0