In [6]:
import numpy as np # library for numerical computing in python
import pandas as pd # A library for data manipulation and analysis (data-frames)


# Scikit-learn : opensource ML-library for python that simplifies the implementation of various ML and
# Data Modeling Task. Designed for both supervised and unsupervised learning, 
# it provides tools for classification, regression, clustering, and more.

from sklearn.model_selection import train_test_split # split a dataset into training and testing subeset
from sklearn.impute import SimpleImputer # Handles missing data  in dataset
from sklearn.preprocessing import OneHotEncoder # convert categorical data into binary(format suitable for ml)
from sklearn.preprocessing import MinMaxScaler # scales features to a specific range, typically between 0 & 1
from sklearn.tree import DecisionTreeClassifier # implements decision tree algorithm for classification task

from sklearn.compose import ColumnTransformer #allows you to apply different transformations 
#to specific columns of a dataset

from sklearn.pipeline import Pipeline, make_pipeline

# pipeline allows you to bundle preprocessing steps and modeling steps into a single object


from sklearn.feature_selection import SelectKBest, chi2

In [7]:
df = pd.read_csv("train.csv")

In [8]:
df.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S


In [9]:
# drop the columns
df.drop(columns= ["PassengerId","Name", "Ticket", "Cabin"], inplace= True)


In [13]:
#step 1 -> train/test/split
X_train, X_test, Y_train, Y_test = train_test_split(df.drop(columns = ['Survived']),
                                                   df['Survived'], test_size = 0.2,
                                                    random_state = 42
                                                   )

# X = includes all columns of dF except Survived column(Y - target-variable)
# test-size = 20% of the data is reserved for the testing remaining 80% will be used for trainin model
# random-state = It keeps the training and testing datasets the same across multiple runs.



In [74]:
X_train.sample(5)


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
116,3,male,,0,0,6.4375,C
368,1,female,45.0,0,1,59.4,C
176,2,female,20.0,0,0,36.75,S
279,2,male,22.0,0,0,10.5,S
236,1,male,64.0,1,0,75.25,C


In [18]:
X_train.isnull().sum()

Pclass       0
Sex          0
Age         72
SibSp        0
Parch        0
Fare         1
Embarked     0
dtype: int64

In [40]:
# column transformer 
trf1 = ColumnTransformer([
    ('impute_age', SimpleImputer(), [2]),  # Impute 'Age' column (index 2)
    ('impute_fare', SimpleImputer(strategy='mean'), [5])  # Impute 'Fare' column (index 5)
], remainder='passthrough')  # Keep the other columns unchanged


In [41]:
#one hot encoding

trf2 = ColumnTransformer([
    ("ohe_sex_embarked", OneHotEncoder (sparse_output = False , handle_unknown = "ignore"), [1,6])
], remainder='passthrough')

In [42]:
#scaling

trf3 = ColumnTransformer([
    ('scale', MinMaxScaler(),slice(0,10))
    
])

In [43]:
#feature selection
trf4 = SelectKBest(score_func = chi2, k = 8 )
#selecting the top 8 features.

In [44]:
#train the model

trf5 = DecisionTreeClassifier()

In [45]:
#create pipeline

pipe = Pipeline([
    ('trf1',trf1),
    ('trf2',trf2),
    ('trf3',trf3),
    ('trf4',trf4),
    ('trf5',trf5),
])

pipeline Vs make_pipeline


In [49]:
pipe.fit(X_train,Y_train)

In [48]:
#display piperline
from sklearn import set_config
set_config(display = 'diagram')

In [51]:
# explore pipline
pipe.named_steps

{'trf1': ColumnTransformer(remainder='passthrough',
                   transformers=[('impute_age', SimpleImputer(), [2]),
                                 ('impute_fare', SimpleImputer(), [5])]),
 'trf2': ColumnTransformer(remainder='passthrough',
                   transformers=[('ohe_sex_embarked',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse_output=False),
                                  [1, 6])]),
 'trf3': ColumnTransformer(transformers=[('scale', MinMaxScaler(), slice(0, 10, None))]),
 'trf4': SelectKBest(k=8, score_func=<function chi2 at 0x000002940F5E5D00>),
 'trf5': DecisionTreeClassifier()}

In [52]:
#predict
Y_pred = pipe.predict(X_test) 

In [53]:
Y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [57]:
from sklearn.metrics import accuracy_score
accuracy_score(Y_test, Y_pred)

0.5952380952380952

cross validation

In [59]:
#cross validation using pipeline

from sklearn.model_selection import cross_val_score
cross_val_score(pipe, X_train, Y_train,cv=5, scoring = "accuracy").mean()

0.6437358661239257

Grid Search using pipeline


In [60]:
# gridsearchcv
params = {
    'trf5__max_depth':[1,2,3,4,5, None]
}

In [61]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(pipe,params, cv = 5, scoring = 'accuracy')
grid.fit(X_train, Y_train)

In [62]:
grid.best_score_

0.6467209407507915

In [63]:
grid.best_params_

{'trf5__max_depth': 1}

exporting the pipeline

In [66]:
import pickle
pickle.dump(pipe,open('pipe1.pkl','wb'))