### By Using Pipeline
- Use the titanic dataset - Create a model that predicts whether the passenger has survived or not.
- Use column transformer for all the transformations below - from sklearn.compose import ColumnTransformer
- 1. First prepare the data using train_test_split - from sklearn.model_selection import train_test_split
- 2. Then Impute Missing values in Age & Embarked columns using SimpleImputer - from sklearn.impute import SimpleImputer
- 3. Then encode Categorical Nominal Variables Sex & Embarked using OneHotEncoder - from sklearn.preprocessing import OneHotEncoder
- 4. Then scale the numeric columns (Pclass, Age, SibSp, Parch, Fare) using MinMaxScaler - from sklearn.preprocessing import MinMaxScaler
- 5. Then perform Feature Selection - from sklearn.feature_selection import SelectKBest, chi2
- 6. Train the model using DecisionTreeClassifier - from sklearn.tree import DecisionTreeClassifier
- 7. Use the pipeline for all the above steps - from sklearn.pipeline import Pipeline, make_pipeline
- So pipeline will have the following steps Prepare data > Impute Missing Values > OneHotEncoder > Scale Data > Feature Selection > Train Model.

In [81]:
import pandas as pd
import numpy as np

Titanic_DF = pd.read_csv(r"C:\Users\ACER\Desktop\Kranthi\DataScience_Desktop\MachineLearningFiles\Titanic_Data.csv")
Titanic_DF.drop(columns = ['PassengerId','Name','Ticket','Cabin'],inplace=True)
Titanic_DF.head(3)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S


#### Step1 - Prepare the data using train test split

In [84]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(Titanic_DF.drop(columns=['Survived']),Titanic_DF['Survived'],test_size=0.2)
#,random_state=42)

x_train.head(2)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
261,3,male,3.0,4,2,31.3875,S
636,3,male,32.0,0,0,7.925,S


#### Use Column Transformer for
#### Step2 - Age & Embarked - Check for null values and handle them using SimpleImputer
#### Step3 - Encode Categorical Nominal Variables(Sex & Embarked) using OneHotEncoder
#### Step4 - Scale the numeric columns(Pclass, Age, SibSp, Parch, Fare) using MinMaxScaler

In [87]:
from sklearn.compose import ColumnTransformer

In [89]:
x_train.head(2)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
261,3,male,3.0,4,2,31.3875,S
636,3,male,32.0,0,0,7.925,S


In [91]:
# SimpleImputer Transformer - Age and Embarked has null values.
# x_train Pclass(0), Sex(1), Age(2), SibSp(3), Parch(4), Fare(5), Embarked(6)

from sklearn.impute import SimpleImputer

# Below we are creating two objects for SimpleImputer as we have different strategy, otherwise we could have created only one SimpleImputerObject.
ColTrans_AgeEmbarked_SI = ColumnTransformer([
    ('SimpleImputer_Age',SimpleImputer(),[2]),
    ('SimpleImputer_Embarked',SimpleImputer(strategy='most_frequent'),[6])
],remainder='passthrough')
# In above step, we have used index number instead of column name, as after each transformation, the output is np.array and since np.array
# Doesn't have column_name, array column name changes to 0,1...

In [93]:
# OneHotEncoder Transformer - Sex and Embarked are Categorical nominal values.

from sklearn.preprocessing import OneHotEncoder

# Below we are creating only one object of OneHotEncoder for both Sex and Embarked, as the same columns is updated in above step.
ColTrans_SexEmbarked_OHE = ColumnTransformer([
    ('OneHotEncoder_SexEmbarked',OneHotEncoder(sparse_output=False,handle_unknown='ignore'),[3,1])
],remainder='passthrough')
# 0 - Age, 
# 1 - Embarked, 
# 2 - 1/2/3
# 3 - Sex

#ColTrans_Embarked_OHE = ColumnTransformer([
#    ('OneHotEncoder_Embarked',OneHotEncoder(sparse_output=False,handle_unknown='ignore'),[1])
#],remainder='passthrough')

In [95]:
# MinMaxScaler Transformer - Pclass, Age, SibSp, Parch, Fare are Numeric columns, perform scaling.
# x_train Pclass(0), Sex(1), Age(2), SibSp(3), Parch(4), Fare(5), Embarked(6)
# Above column order gets changed to 
# x_train Pclass(0), Sex_1(1), Sex_2(2), Age(3), SibSp(4), Parch(5), Fare(6), Embarked_1(7), Embarked_2(8), Embarked_3(9)

from sklearn.preprocessing import MinMaxScaler

ColTrans_ScaleAll_MMS = ColumnTransformer([
#    ('MinMaxScaler_ScaleAll',MinMaxScaler(),[0,3,4,5,6])    
    ('MinMaxScaler_ScaleAll',MinMaxScaler(),slice(0,10)) # Scale all the columns
],remainder='passthrough')

#### Step5 - Perform Feature Selection

In [98]:
# Feature Selection

from sklearn.feature_selection import SelectKBest, chi2

SelectKBest_Obj = SelectKBest(score_func=chi2, k=1)
# k=5, select only top 5 columns that might be useful for the model.

#### Step6 - Apply DecisionTreeClassifier to predict the values

In [101]:
# Create DecisionTreeClassifier Object

from sklearn.tree import DecisionTreeClassifier
DecisionTreeClassifierObj = DecisionTreeClassifier()

#from sklearn.linear_model import LinearRegression
#LR = LinearRegression()

#### Step7 - Use the pipeline to join all the above steps.

In [104]:
from sklearn.pipeline import Pipeline, make_pipeline

Pipeline_Titanic = Pipeline([
    ('Trans_ColTrans_AgeEmbarked_SI',ColTrans_AgeEmbarked_SI),
    ('Trans_ColTrans_SexEmbarked_OHE',ColTrans_SexEmbarked_OHE),
#    ('Trans_ColTrans_Embarked_OHE',ColTrans_Embarked_OHE)
    ('Trans_ColTrans_ScaleAll_MMS',ColTrans_ScaleAll_MMS),
    ('Trans_SelectKBest_Obj',SelectKBest_Obj),
    ('Trans_DecisionTreeClassifierObj',DecisionTreeClassifierObj)
])

# Alternate syntax using make_pipeline
# make_pipeline_Titanic = make_pipeline(ColTrans_AgeEmbarked_SI, ColTrans_SexEmbarked_OHE, ColTrans_ScaleAll_MMS, SelectKBest_Obj, DecisionTreeClassifierObj)

#### Pipeline(name, transformerObj) class vs make_pipeline(transformerObj)
- Pipeline requires naming of the steps, make_pipeline doesn't
- Same applies to ColumnTransformer(coltransname, object, column_name) vs make_column_transformer( object, column_name)

#### Step8 - Train the model

In [108]:
from sklearn import set_config
set_config(display='diagram')

In [110]:
Pipeline_Titanic.fit(x_train, y_train)

In [112]:
# Check after Impute on Age and Embarked columns - Check Mean and Most_frequent values

# Pipeline_Titanic.named_steps
# Mean of Age
Pipeline_Titanic.named_steps['Trans_ColTrans_AgeEmbarked_SI'].transformers_[0][1].statistics_
# Age Mean = 29.49

array([29.97183099])

In [114]:
# most_frequent of Embarked
Pipeline_Titanic.named_steps['Trans_ColTrans_AgeEmbarked_SI'].transformers_[1][1].statistics_
# Embarked most_frequent = S

array(['S'], dtype=object)

In [116]:
# Check OneHotEncoder on Sex and Embarked - since we encoded on single object, both can be see below

Pipeline_Titanic.named_steps['Trans_ColTrans_SexEmbarked_OHE'].transformers_[0][1].categories_

[array(['female', 'male'], dtype=object), array(['C', 'Q', 'S'], dtype=object)]

In [118]:
# Check MinMaxScaler on all columns

Pipeline_Titanic.named_steps['Trans_ColTrans_ScaleAll_MMS'].transformers_[0][1].min_
Pipeline_Titanic.named_steps['Trans_ColTrans_ScaleAll_MMS'].transformers_[0][1].data_min_
Pipeline_Titanic.named_steps['Trans_ColTrans_ScaleAll_MMS'].transformers_[0][1].data_max_

array([  1.    ,   1.    ,   1.    ,   1.    ,   1.    ,  80.    ,
         3.    ,   8.    ,   5.    , 512.3292])

In [120]:
# Check Feature selection step attributes

Pipeline_Titanic.named_steps['Trans_SelectKBest_Obj']
Pipeline_Titanic.named_steps['Trans_SelectKBest_Obj'].scores_
Pipeline_Titanic.named_steps['Trans_SelectKBest_Obj'].pvalues_
Pipeline_Titanic.named_steps['Trans_SelectKBest_Obj'].n_features_in_

10

In [122]:
# check after decision tree model attributes

Pipeline_Titanic.named_steps['Trans_DecisionTreeClassifierObj'].classes_
Pipeline_Titanic.named_steps['Trans_DecisionTreeClassifierObj'].n_classes_

2

In [124]:
# Now predict the values

y_predict = Pipeline_Titanic.predict(x_test)
y_predict

array([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0,
       1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1,
       0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0,
       1, 0, 0], dtype=int64)

In [126]:
# Check the accuracy score

from sklearn.metrics import accuracy_score
accuracy_score(y_predict,y_test)

0.770949720670391

### Cross Validation Using Pipeline
- cv = 5 >>> 5 times different train test split and run alogirithm and then get the score and then take mean of score

In [63]:
# Cross validation using cross_val_score

from sklearn.model_selection import cross_val_score
cross_val_score(Pipeline_Titanic,x_train,y_train,cv=5,scoring='accuracy').mean()

0.7865556978233034

### Grid search Using Pipeline
- Hyper Parameter Tuning

In [66]:
#gridsearchcv

params = {
    'Trans_DecisionTreeClassifierObj__max_depth':[1,2,3,4,5,None]
}

In [68]:
from sklearn.model_selection import GridSearchCV
GridSearchCV_Obj = GridSearchCV(Pipeline_Titanic,params,cv=5,scoring='accuracy')
GridSearchCV_Obj.fit(x_train,y_train)

In [69]:
GridSearchCV_Obj.best_score_

0.7865556978233034

In [70]:
GridSearchCV_Obj.best_params_

{'Trans_DecisionTreeClassifierObj__max_depth': 1}

#### Step7 - Export the data to the website
- Accept the age, gender, etc. all the inputs and predict the outcome
- Since we used pipeline, we need not export onehotencoder, scaler, etc. separately, since all of them are within pipeline, just export pipeline.

In [72]:
import pickle

pickle.dump(Pipeline_Titanic,open('models/Pipeline_Titanic.pk1','wb'))