In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [2]:
train_path = '../input/titanic/train.csv'
test_path = '../input/titanic/test.csv'
submission_path = '../input/titanic/gender_submission.csv'

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)
submission_df = pd.read_csv(submission_path)

print(train_df.columns)
display(train_df.head())
display(submission_df.head())

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


Problem: Binary Classification </br>
Methodology: Impelment classification model to predict whether 'Survived' or not based </br>
    1. ticket class(Pclass)</br>
    2. sex (Sex)</br>
    3. # of siblings (SibSp)</br>
    4. # of parents (Parch)</br>
    5. Fare</br>
    6. Cabin #</br>
    7. Port of Embarkation (Embarked)</br>
    
'Survived' column will be used as target value (0=No, 1=Yes)



# Data Exploration

In [3]:
import plotly.express as px
import matplotlib.pyplot as plt

In [4]:
#Checking the Survived Distribution
fig_Sex = px.pie(train_df, 
                 names = train_df['Survived'].value_counts().reset_index().index,
                 values = train_df['Survived'].value_counts().reset_index().Survived, 
                 title = 'Survived', width=400, height=400) 
fig_Sex.show()

It is about 60:40 ratio and it is not strictly imbalanced dataset. </br>
We may measure the classification with either F-1 score or ROC AUC

In [5]:
print("---------Sex------------")
print(pd.pivot_table(train_df, index='Survived', columns='Sex', values='PassengerId', aggfunc='count'))
#Checking the Age of the population
fig_Sex = px.histogram(train_df, x="Sex", color="Survived",
                       title = 'Sex', width=400, height=400, barmode='group') 
fig_Sex.show()

---------Sex------------
Sex       female  male
Survived              
0             81   468
1            233   109


In [6]:
age_range = 10
if max(train_df['Age'])%age_range == 0:
    max_bin = int(max(train_df['Age']))+1
else:
    max_bin = int((max(train_df['Age'])//age_range+1)*age_range)+1

counts, bins = np.histogram(train_df['Age'], bins=range(0, max_bin, age_range))
print(f"Total number of counts    : {sum(counts)}")
print(f"Missing ages              : {len(train_df[train_df['Age'].isnull()])}")
print(f"Total number of passengers: {len(train_df)}")
bins = 0.5 * (bins[:-1] + bins[1:])
fig_Age = px.bar(x=bins, y=counts, labels={'x':'Age', 'y':'count'}, 
             title='Age', width=800, height=400)
fig_Age.show()

Total number of counts    : 714
Missing ages              : 177
Total number of passengers: 891


In [7]:
print("---------PClass---------")
print(pd.pivot_table(train_df, index='Survived', columns='Pclass', values='PassengerId', aggfunc='count'))
#Checking the Age of the population
fig_Pclass = px.histogram(train_df, x="Pclass", color="Survived",
                       title = 'Pclass (Ticket Class)', width=400, height=300, barmode='group') 
fig_Pclass.show()
fig_Fare = px.histogram(train_df, x='Fare', color="Pclass", nbins=20, 
             title="Fare by ticket class", width=800, height=300)
fig_Fare.show()

---------PClass---------
Pclass      1   2    3
Survived              
0          80  97  372
1         136  87  119


In [8]:
print("---------Embarked-------")
print(pd.pivot_table(train_df, index='Survived', columns='Embarked', values='PassengerId', aggfunc='count'))
fig_Embarked = px.histogram(train_df, x="Embarked", color="Survived",
                       title = 'Embarked', width=400, height=300, barmode='group') 
fig_Embarked.show()

---------Embarked-------
Embarked   C   Q    S
Survived             
0         75  47  427
1         93  30  217


In [9]:
fig = px.violin(train_df, x='Embarked', y="Pclass", box=True,
                title = 'Pclass by port of Embarkation', width=400, height=300,
                hover_data=train_df.columns)
fig.show()

# Feature Engineering

## Feature Extraction

In [10]:
def feature_extraction(df):
    df['CabinLetter'] = df['Cabin'].apply(lambda x: "None" if pd.isna(x) else str(x)[0])
    df['numCabins'] = df['Cabin'].apply(lambda x: 0 if pd.isna(x) else len(x.split(' ')))
    df['Name_title'] = df['Name'].apply(lambda x: "None" if pd.isna(x) else x.split(',')[1].split('.')[0].strip())
    return df

train_df = feature_extraction(train_df)
test_df = feature_extraction(test_df)

In [11]:
print("---------Cabin Letter-----------")
print(pd.pivot_table(train_df, index='Survived', columns='CabinLetter', values='PassengerId', aggfunc='count'))

print("---------Number of Cabins-------")
print(pd.pivot_table(train_df, index='Survived', columns='numCabins', values='PassengerId', aggfunc='count'))

---------Cabin Letter-----------
CabinLetter    A     B     C     D     E    F    G   None    T
Survived                                                      
0            8.0  12.0  24.0   8.0   8.0  5.0  2.0  481.0  1.0
1            7.0  35.0  35.0  25.0  24.0  8.0  2.0  206.0  NaN
---------Number of Cabins-------
numCabins      0      1    2    3    4
Survived                              
0          481.0   58.0  7.0  3.0  NaN
1          206.0  122.0  9.0  3.0  2.0


In [12]:
train_df['Name_title'].unique()

array(['Mr', 'Mrs', 'Miss', 'Master', 'Don', 'Rev', 'Dr', 'Mme', 'Ms',
       'Major', 'Lady', 'Sir', 'Mlle', 'Col', 'Capt', 'the Countess',
       'Jonkheer'], dtype=object)

In [13]:
print(train_df['Name_title'].value_counts())
print("\n---------Number of Cabins-------")
print(pd.pivot_table(train_df, index='Survived', columns='Name_title', values='PassengerId', aggfunc='count'))

Mr              517
Miss            182
Mrs             125
Master           40
Dr                7
Rev               6
Mlle              2
Major             2
Col               2
the Countess      1
Capt              1
Ms                1
Sir               1
Lady              1
Mme               1
Don               1
Jonkheer          1
Name: Name_title, dtype: int64

---------Number of Cabins-------
Name_title  Capt  Col  Don   Dr  Jonkheer  Lady  Major  Master   Miss  Mlle  \
Survived                                                                      
0            1.0  1.0  1.0  4.0       1.0   NaN    1.0    17.0   55.0   NaN   
1            NaN  1.0  NaN  3.0       NaN   1.0    1.0    23.0  127.0   2.0   

Name_title  Mme     Mr   Mrs   Ms  Rev  Sir  the Countess  
Survived                                                   
0           NaN  436.0  26.0  NaN  6.0  NaN           NaN  
1           1.0   81.0  99.0  1.0  NaN  1.0           1.0  


Title extracted from Name represents pretty much same thing that Sex attribute represents

## Handle Missing values
Pclass -> Imputation - Replace with mode </br>
Sex -> Imputation - Replace with mode </br>
Age -> Imputation - Replace with mean </br>
SibSp -> Imputation - Replace with 0 </br>
Parch -> Imputation - Replace with 0 </br>
Fare -> Imputation - Replace with median value based on Pclass </br>
Cabin -> Imputation - Replace with 0 </br>
Embarked -> Imputation - Replace with mode </br>

In [14]:
train_y = train_df['Survived']
train_id = train_df['PassengerId']
test_id = test_df['PassengerId']

train_x = train_df.drop(['PassengerId', 'Survived', 'Name', 'Name_title', 'Ticket', 'Cabin'], axis=1)
test_x = test_df.drop(['PassengerId', 'Name', 'Name_title', 'Ticket', 'Cabin'], axis=1)

print(f"Train X columns = Test X columns : {list(train_x.columns) == list(test_x.columns)}")
print("Train X.........")
display(train_x.head())
print("Test X..........")
display(test_x.head())

Train X columns = Test X columns : True
Train X.........


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,CabinLetter,numCabins
0,3,male,22.0,1,0,7.25,S,,0
1,1,female,38.0,1,0,71.2833,C,C,1
2,3,female,26.0,0,0,7.925,S,,0
3,1,female,35.0,1,0,53.1,S,C,1
4,3,male,35.0,0,0,8.05,S,,0


Test X..........


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,CabinLetter,numCabins
0,3,male,34.5,0,0,7.8292,Q,,0
1,3,female,47.0,1,0,7.0,S,,0
2,2,male,62.0,0,0,9.6875,Q,,0
3,3,male,27.0,0,0,8.6625,S,,0
4,3,female,22.0,1,1,12.2875,S,,0


In [15]:
from sklearn.impute import SimpleImputer

def impute(df, imputers=None, fareDict=None, isTrain=True):
    if isTrain:
        # Impute using SimpleImputer
        imputers = dict()
        imputers['Pclass'] = SimpleImputer(strategy='most_frequent')
        imputers['Sex'] = SimpleImputer(strategy='most_frequent')
        imputers['Age'] = SimpleImputer(strategy='mean')
        imputers['SibSp'] = SimpleImputer(strategy='most_frequent')
        imputers['Parch'] = SimpleImputer(strategy='most_frequent')
        imputers['Embarked'] = SimpleImputer(strategy='most_frequent')
        for col in imputers.keys():
            df[col] = imputers[col].fit_transform(df[col].values.reshape(-1,1))
        
        # Impute Fare Column by using Median values for each Pclass
        fareDict = round(df.groupby('Pclass')['Fare'].median(), 0)
        for key in fareDict.keys():
            rows_Pclass = df['Pclass'] == key
            df.loc[rows_Pclass,'Fare'] = df.loc[rows_Pclass,'Fare'].fillna(fareDict[key])

    else:
        for col in imputers.keys():
            df[col] = imputers[col].transform(df[col].values.reshape(-1,1))
        for key in fareDict.keys():
            rows_Pclass = df['Pclass'] == key
            df.loc[rows_Pclass,'Fare'] = df.loc[rows_Pclass,'Fare'].fillna(fareDict[key])
    df['Age'] = df['Age'].round(0)
    
    return df, imputers, fareDict

In [16]:
print("Train X check before Impute")
print(train_x.isnull().any())
print("\nTest X check before Impute")
print(test_x.isnull().any())

Train X check before Impute
Pclass         False
Sex            False
Age             True
SibSp          False
Parch          False
Fare           False
Embarked        True
CabinLetter    False
numCabins      False
dtype: bool

Test X check before Impute
Pclass         False
Sex            False
Age             True
SibSp          False
Parch          False
Fare            True
Embarked       False
CabinLetter    False
numCabins      False
dtype: bool


Train data have some NaN values in columns of 'Age', 'Cabin', and 'Embarked'. </br>
Test data have some NaN values in columns of 'Age', 'Fare', and 'Cabin'.

In [17]:
train_x_imputed, imputers, fareDict = impute(train_x, imputers=None, fareDict=None, isTrain=True)
test_x_imputed, imputers, fareDict = impute(test_x, imputers, fareDict, False)

In [18]:
print("Train X check")
print(train_x_imputed.isnull().any())
print("\nTest X check")
print(test_x_imputed.isnull().any())

Train X check
Pclass         False
Sex            False
Age            False
SibSp          False
Parch          False
Fare           False
Embarked       False
CabinLetter    False
numCabins      False
dtype: bool

Test X check
Pclass         False
Sex            False
Age            False
SibSp          False
Parch          False
Fare           False
Embarked       False
CabinLetter    False
numCabins      False
dtype: bool


## Label Encode

In [19]:
from sklearn import preprocessing
label_enc = dict()
col_labels = ["Sex", "Embarked", "CabinLetter"]
for col in col_labels:
    le = preprocessing.LabelEncoder()
    train_x_imputed[col] = le.fit_transform(train_x_imputed[col])
    test_x_imputed[col] = le.transform(test_x_imputed[col])
    label_enc[col] = le

In [20]:
display(train_x_imputed.head())
display(test_x_imputed.head())

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,CabinLetter,numCabins
0,3,1,22.0,1,0,7.25,2,7,0
1,1,0,38.0,1,0,71.2833,0,2,1
2,3,0,26.0,0,0,7.925,2,7,0
3,1,0,35.0,1,0,53.1,2,2,1
4,3,1,35.0,0,0,8.05,2,7,0


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,CabinLetter,numCabins
0,3,1,34.0,0,0,7.8292,1,7,0
1,3,0,47.0,1,0,7.0,2,7,0
2,2,1,62.0,0,0,9.6875,1,7,0
3,3,1,27.0,0,0,8.6625,2,7,0
4,3,0,22.0,1,1,12.2875,2,7,0


# Model Deployment

## Model Selection
Below 6 classifiers were tested
1. Random Forest
2. Gradient Boosting
3. Logistic Regression
4. Naive Bayes 
5. KNN
6. Support Vector Machine


In [21]:
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
import statistics

In [22]:
cv_scores = dict()
model = RandomForestClassifier()
cv_scores['Random_Forest'] = statistics.mean(cross_val_score(model, train_x_imputed, train_y, cv=5))
model = LGBMClassifier()
cv_scores['Gradient_Boosting'] = statistics.mean(cross_val_score(model, train_x_imputed, train_y, cv=5))
model = LogisticRegression()
cv_scores['Logistic_Regression'] = statistics.mean(cross_val_score(model, train_x_imputed, train_y, cv=5))
model = GaussianNB()
cv_scores['Naive_Bayes'] = statistics.mean(cross_val_score(model, train_x_imputed, train_y, cv=5))
model = KNeighborsClassifier()
cv_scores['KNN'] = statistics.mean(cross_val_score(model, train_x_imputed, train_y, cv=5))
model = SVC()
cv_scores['Support_Vector_Machine'] = statistics.mean(cross_val_score(model, train_x_imputed, train_y, cv=5))


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to th

In [23]:
from operator import itemgetter
#sorted(cv_scores.items(), key=itemgetter(1), reverse=True)

cv_scores_list = []
for key in cv_scores.keys():
    cv_scores_list.append({"classifier": key, "score": cv_scores[key]})
cv_scores_df = pd.DataFrame(cv_scores_list)
cv_scores_df.sort_values(by='score', ascending=False, ignore_index=True)

Unnamed: 0,classifier,score
0,Gradient_Boosting,0.820451
1,Random_Forest,0.810363
2,Logistic_Regression,0.793484
3,Naive_Bayes,0.737468
4,KNN,0.708236
5,Support_Vector_Machine,0.671245


The result shows that Gradient Boosting outperformed the others and we will deploy the model with Gradient Boosting

## Find the best hyperparameters for Gradient Boosting(Light) model

In [24]:
from sklearn.model_selection import GridSearchCV

gridParams = {
    'boosting_type': ['gbdt'],
    'learning_rate': [0.5, 0.1, 0.05],
    'n_estimators': [50, 100, 500, 1000],
    'max_depth': [3, 5, 10],
    'objective' :['binary'],
    'colsample_bytree' : [0.5, 0.75, 1.0],
    'subsample' : [0.75, 1.0],
    'reg_alpha' : [0, 1, 1.2],
    'reg_lambda' : [0, 1,1.2,1.4],  
    'random_state' : [42]
}

In [25]:
model = LGBMClassifier()
lgvmclf_cv = GridSearchCV(model, gridParams, verbose=1, cv=10, n_jobs=-1)
# Run the grid
lgvmclf_cv_model = lgvmclf_cv.fit(train_x_imputed, train_y)

# Print the best parameters found
print(lgvmclf_cv.best_params_)
print(lgvmclf_cv.best_score_)

Fitting 10 folds for each of 2592 candidates, totalling 25920 fits
{'boosting_type': 'gbdt', 'colsample_bytree': 0.75, 'learning_rate': 0.5, 'max_depth': 10, 'n_estimators': 50, 'objective': 'binary', 'random_state': 42, 'reg_alpha': 1.2, 'reg_lambda': 1.2, 'subsample': 0.75}
0.8462671660424469


## Train and Predict with hyper-parameters searched

In [26]:
best_lgbmClf_params = {'boosting_type': 'gbdt', 
                       'colsample_bytree': 0.5, 
                       'learning_rate': 0.1, 
                       'n_estimators': 500, 
                       'objective': 'binary', 
                       'random_state': 42, 
                       'reg_alpha': 1, 
                       'reg_lambda': 0, 
                       'subsample': 0.75}
#model = LGBMClassifier(**best_lgbmClf_params)
#model.fit(train_x_imputed, train_y)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits </br>
{'boosting_type': 'gbdt', 'colsample_bytree': 0.64, 'learning_rate': 0.1, 'n_estimators': 500, 'objective': 'binary', 'reg_alpha': 1, 'reg_lambda': 1.4, 'subsample': 0.7}
0.8406628585776159

In [27]:
best_lgbm_model = lgvmclf_cv_model.best_estimator_
y_pred = best_lgbm_model.predict(test_x_imputed)

In [28]:
submission = pd.DataFrame({
        "PassengerId": test_df["PassengerId"],
        "Survived": y_pred
    })
submission.to_csv('./submission.csv', index=False)

In [29]:
submission

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0
