### Example below - Without Using Pipeline
- In the titanic dataset
- 1. First split the data into training and test sets using train_test_split - from sklearn.model_selection import train_test_split
- 2. Then Impute Missing values in Age & Embarked columns using SimpleImputer - from sklearn.impute import SimpleImputer
- 3. Then encode Categorical Nominal Variables Sex & Embarked using OneHotEncoder - from sklearn.preprocessing import OneHotEncoder
- 4. Then scale the numeric columns (Pclass, Age, SibSp, Parch, Fare) using MinMaxScaler - from sklearn.preprocessing import MinMaxScaler
- 5. Combine the non-transformed and transformed (imputed, scaled, encoded) columns into the final dataset used for training.
- 6. Train the model using DecisionTreeClassifier - from sklearn.tree import DecisionTreeClassifier

In [3]:
import pandas as pd
import numpy as np

In [5]:
# Goal: create a model that predicts whether the passenger has survived or not.
# Only the target ('Survived') and the columns used as predictors are kept.
titanic_csv_path = r"C:\Users\ACER\Desktop\Kranthi\DataScience_Desktop\MachineLearningFiles\Titanic_Data.csv"
selected_columns = ['Survived','Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']

Titanic_DF = pd.read_csv(titanic_csv_path)[selected_columns]
Titanic_DF.head(2)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C


#### Step1 - Prepare the data using train test split

In [8]:
from sklearn.model_selection import train_test_split

# Separate the predictors from the target column before splitting.
features = Titanic_DF.drop(columns=['Survived'])
target = Titanic_DF['Survived']

# Hold out 20% of the rows for testing; random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)
x_train.head(2)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
331,1,male,45.5,0,0,28.5,S
733,2,male,23.0,0,0,13.0,S


#### Step2 - Check for null values and handle them using SimpleImputer

In [10]:
# Count the missing values in every column of the full dataset.
# There are 177 nulls in Age column and 2 nulls in Embarked column
Titanic_DF.isna().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [13]:
# Fill the missing values with SimpleImputer:
#   Age      -> mean of the column
#   Embarked -> most frequent value of the column

from sklearn.impute import SimpleImputer

# Each imputer is fitted on the training split only and then reused on the
# test split, so nothing from the test data influences the fill values.
SimpleImputerObj_Age = SimpleImputer(strategy='mean').fit(x_train[['Age']])
x_train_AgeImputed = SimpleImputerObj_Age.transform(x_train[['Age']])
x_test_AgeImputed = SimpleImputerObj_Age.transform(x_test[['Age']])

SimpleImputerObj_Embarked = SimpleImputer(strategy='most_frequent').fit(x_train[['Embarked']])
x_train_EmbarkedImputed = SimpleImputerObj_Embarked.transform(x_train[['Embarked']])
x_test_EmbarkedImputed = SimpleImputerObj_Embarked.transform(x_test[['Embarked']])

In [15]:
SimpleImputerObj_Age.statistics_
# statistics_ holds the fill value learned during fit: here the mean of the training Age column

array([29.49884615])

In [17]:
SimpleImputerObj_Embarked.statistics_
# statistics_ holds the fill value learned during fit: here the most frequent value in the training Embarked column

array(['S'], dtype=object)

#### Step3 - Encode Categorical Nominal Variables using OneHotEncoder

In [20]:
# Sex and Embarked are the columns to be encoded with OneHotEncoder.
# Before encoding, check how many distinct values each of them has,
# starting with Sex.

Titanic_DF['Sex'].value_counts()

Sex
male      577
female    314
Name: count, dtype: int64

In [22]:
# Distinct values of Embarked (value_counts leaves out missing values by default).
Titanic_DF['Embarked'].value_counts()

Embarked
S    644
C    168
Q     77
Name: count, dtype: int64

In [24]:
from sklearn.preprocessing import OneHotEncoder

# drop='first' is intentionally not used because the model is a Decision Tree,
# not a linear model. Had it been used:
#   Sex      -> 2-1 = 1 column
#   Embarked -> 3-1 = 2 columns

# Sex has no missing values, so the raw column is encoded directly.
# The encoder is fitted on the training split and reused for the test split.
OneHotEncoder_SexObj = OneHotEncoder(sparse_output=False,handle_unknown='ignore').fit(x_train[['Sex']])
x_train_SexEncoded = OneHotEncoder_SexObj.transform(x_train[['Sex']])
x_test_SexEncoded = OneHotEncoder_SexObj.transform(x_test[['Sex']])

# Embarked has null values, so the imputed arrays are encoded instead of the raw column.
OneHotEncoder_EmbarkedObj = OneHotEncoder(sparse_output=False,handle_unknown='ignore').fit(x_train_EmbarkedImputed)
x_train_EmbarkedEncoded = OneHotEncoder_EmbarkedObj.transform(x_train_EmbarkedImputed)
x_test_EmbarkedEncoded = OneHotEncoder_EmbarkedObj.transform(x_test_EmbarkedImputed)
x_test_EmbarkedEncoded[0]

array([1., 0., 0.])

#### Step4 - Scale the numeric columns using MinMaxScaler

In [27]:
# Scale the numeric columns Pclass, Age, SibSp, Parch and Fare.
# Every column gets its own MinMaxScaler, fitted on the training split and
# then applied to both splits.

from sklearn.preprocessing import MinMaxScaler

MinMaxScaler_Pclass_Obj = MinMaxScaler().fit(x_train[['Pclass']])
x_train_Pclass_Scaled = MinMaxScaler_Pclass_Obj.transform(x_train[['Pclass']])
x_test_Pclass_Scaled = MinMaxScaler_Pclass_Obj.transform(x_test[['Pclass']])

# Age is scaled from x_train_AgeImputed/x_test_AgeImputed, where the nulls
# have already been replaced with the mean.
MinMaxScaler_Age_Obj = MinMaxScaler().fit(x_train_AgeImputed)
x_train_Age_Scaled = MinMaxScaler_Age_Obj.transform(x_train_AgeImputed)
x_test_Age_Scaled = MinMaxScaler_Age_Obj.transform(x_test_AgeImputed)

MinMaxScaler_SibSp_Obj = MinMaxScaler().fit(x_train[['SibSp']])
x_train_SibSp_Scaled = MinMaxScaler_SibSp_Obj.transform(x_train[['SibSp']])
x_test_SibSp_Scaled = MinMaxScaler_SibSp_Obj.transform(x_test[['SibSp']])

MinMaxScaler_Parch_Obj = MinMaxScaler().fit(x_train[['Parch']])
x_train_Parch_Scaled = MinMaxScaler_Parch_Obj.transform(x_train[['Parch']])
x_test_Parch_Scaled = MinMaxScaler_Parch_Obj.transform(x_test[['Parch']])

MinMaxScaler_Fare_Obj = MinMaxScaler().fit(x_train[['Fare']])
x_train_Fare_Scaled = MinMaxScaler_Fare_Obj.transform(x_train[['Fare']])
x_test_Fare_Scaled = MinMaxScaler_Fare_Obj.transform(x_test[['Fare']])

#### Step5 - Combine the non-transformed and transformed (imputed, scaled, encoded) columns into the final dataset used for training.

In [30]:
# Show the first two rows of the original data again as a reminder of the column layout.
Titanic_DF.head(2)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C


In [32]:
# Assemble the final feature matrices by placing the transformed pieces
# side by side. Column order in both matrices:
#   Pclass   (scaled)
#   SibSp    (scaled)
#   Parch    (scaled)
#   Fare     (scaled)
#   Age      (imputed, then scaled)
#   Sex      (one-hot encoded)
#   Embarked (imputed, then one-hot encoded)
train_parts = (
    x_train_Pclass_Scaled,
    x_train_SibSp_Scaled,
    x_train_Parch_Scaled,
    x_train_Fare_Scaled,
    x_train_Age_Scaled,
    x_train_SexEncoded,
    x_train_EmbarkedEncoded,
)
test_parts = (
    x_test_Pclass_Scaled,
    x_test_SibSp_Scaled,
    x_test_Parch_Scaled,
    x_test_Fare_Scaled,
    x_test_Age_Scaled,
    x_test_SexEncoded,
    x_test_EmbarkedEncoded,
)

# The pieces are 2-D arrays with one row per passenger, so stacking them
# horizontally joins them column-wise.
x_train_transformed = np.hstack(train_parts)
x_test_transformed = np.hstack(test_parts)
x_test_transformed[0]

array([1.        , 0.125     , 0.16666667, 0.02975782, 0.36540395,
       0.        , 1.        , 1.        , 0.        , 0.        ])

#### Step6 - Apply DecisionTreeClassifier to predict the values

In [35]:
from sklearn.tree import DecisionTreeClassifier

# Train the classifier on the transformed training features.
# random_state is fixed (same seed as the train/test split) because the tree
# builder shuffles the features at each split; without a seed, repeated runs
# on identical data can produce different trees, a different accuracy and a
# different exported model.
DecisionTreeClassifierObj = DecisionTreeClassifier(random_state=42)
DecisionTreeClassifierObj.fit(x_train_transformed,y_train)

In [37]:
# Predict the values for the transformed test features and store them in y_predict

y_predict = DecisionTreeClassifierObj.predict(x_test_transformed)
# Display the shape as a sanity check: one prediction per test row
y_predict.shape

(179,)

In [39]:
# Measure the accuracy of the model by comparing the predictions
# with the true labels of the test split.

from sklearn.metrics import accuracy_score

accuracy_score(y_true=y_test, y_pred=y_predict)

0.776536312849162

#### Step7 - Export the fitted encoders, scalers and model for the website
- The website accepts all the inputs (age, gender, etc.) and predicts the outcome

In [42]:
import os
import pickle

# Fitted objects written to disk, in the same order as before.
# Each key is the file stem; the resulting paths ('models/<name>.pk1') are
# unchanged so that existing loaders keep working.
objects_to_export = {
    'OneHotEncoder_SexObj': OneHotEncoder_SexObj,
    'OneHotEncoder_EmbarkedObj': OneHotEncoder_EmbarkedObj,
    'DecisionTreeClassifierObj': DecisionTreeClassifierObj,
    'MinMaxScaler_Pclass_Obj': MinMaxScaler_Pclass_Obj,
    'MinMaxScaler_Age_Obj': MinMaxScaler_Age_Obj,
    'MinMaxScaler_SibSp_Obj': MinMaxScaler_SibSp_Obj,
    'MinMaxScaler_Parch_Obj': MinMaxScaler_Parch_Obj,
    'MinMaxScaler_Fare_Obj': MinMaxScaler_Fare_Obj,
}

# open(..., 'wb') does not create missing folders, so make sure 'models' exists.
os.makedirs('models', exist_ok=True)

for object_name, fitted_object in objects_to_export.items():
    # The with-block closes the file as soon as the dump is done, so the
    # handle is not leaked and the bytes are flushed to disk.
    with open(f'models/{object_name}.pk1','wb') as model_file:
        pickle.dump(fitted_object, model_file)

# NOTE(review): SimpleImputerObj_Age and SimpleImputerObj_Embarked are not
# exported. That is fine as long as the website always supplies Age and
# Embarked; confirm, or export them too if inputs may be missing.
# NOTE(review): only unpickle these files from a trusted location; loading
# a pickle executes code embedded in the file.