In [None]:
# Import Libraries
import numpy as np 
import pandas as pd 

# Visualisation
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
%matplotlib inline

# Data Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics 
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import learning_curve
from sklearn.model_selection import KFold 
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import RandomizedSearchCV 

# Models

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

# Ensemble
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import StackingClassifier

# Warnings
import warnings
warnings.filterwarnings('ignore')


In [None]:
# Read the data
# Loads the three Kaggle Titanic CSV files from the fixed Kaggle input directory:
# the labelled training set, the test set, and the sample submission file.

train = pd.read_csv('/kaggle/input/titanic/train.csv')
test = pd.read_csv('/kaggle/input/titanic/test.csv')
sample_submission = pd.read_csv('/kaggle/input/titanic/gender_submission.csv')

In [None]:
# Inspect the first rows of the training dataset

train.head()

In [None]:
# Inspect the first rows of the test dataset
test.head()

In [None]:
# Inspect the first rows of the sample submission file
sample_submission.head()

In [None]:
# Stack train and test so that feature engineering is applied to both in one pass.
# ignore_index=True gives every row a unique 0..n-1 label. Without it the test rows
# reuse the labels 0..len(test)-1 already used by train, and any label-based
# operation (e.g. DataFrame.drop(label)) silently acts on two rows at once.
# Row order is unchanged: the first len(train) rows are still the training rows.

all_data = pd.concat([train, test], ignore_index=True)

all_data.head()

In [None]:
# Show column dtypes and non-null counts for the combined dataset

all_data.info()

In [None]:
# Summary statistics of the numeric columns

all_data.describe()

In [None]:
# Number of missing values in each column of the combined dataset

all_data.isna().sum()

In [None]:
# Pie chart of the share of each 'Survived' value (the target variable)

survival = all_data['Survived'].value_counts()

fig, ax = plt.subplots()
ax.pie(survival, labels=survival.index, autopct="%1.1f%%")

ax.legend(title='Survived')
ax.set_title('The percentage of Survival')

In [None]:
# Look at some example values of the 'Name' column

all_data['Name']

In [None]:
# Derive 'Title' from 'Name': take the text between the first comma and the
# following period, with surrounding whitespace removed.

all_data['Title'] = (
    all_data['Name']
    .str.split(',').str[1]
    .str.split('.').str[0]
    .str.strip()
)

In [None]:
# Count how many passengers carry each raw title

all_data['Title'].value_counts()

In [None]:
# Collapse the raw titles into a few groups.
# Each key is the grouped title; the list holds the raw titles folded into it.

title_groups = {
    'Miss': ['Miss', 'Ms', 'Mlle'],
    'Mrs': ['Mrs', 'Mme'],
    'Officer': ['Capt', 'Col', 'Major', 'Dr', 'Rev'],
    'Royalty': ['Sir', 'Lady', 'the Countess', 'Jonkheer', 'Dona', 'Don'],
}

for grouped_title, raw_titles in title_groups.items():
    all_data['Title'] = all_data['Title'].replace(raw_titles, grouped_title)

In [None]:
# Count passengers per title after grouping

all_data['Title'].value_counts()

In [None]:
# Count rows for each (Title, Survived) combination

all_data.groupby(['Title','Survived'])['Title'].value_counts()

In [None]:
# Same breakdown as a pivot table: rows are Survived, columns are Title.
# aggfunc='count' counts the non-null 'Fare' values in each cell, so 'Fare' only
# serves as the column being counted.

pd.pivot_table(all_data, index = 'Survived', columns = 'Title', values = 'Fare' ,aggfunc ='count')

In [None]:
# Same breakdown with crosstab: rows are Survived, columns are Title

pd.crosstab(all_data['Survived'], all_data['Title'])

In [None]:
# Bar chart: passenger counts per Title, split by Survived

sns.countplot(data=all_data, x='Title', hue='Survived')\
    .set_title('Distribution of Survived by Title')

In [None]:
# Bar chart: passenger counts per Sex, split by Survived

sns.countplot(data=all_data, x='Sex', hue='Survived')\
    .set_title('Distribution of Survived by Sex')

In [None]:
# Number of missing values in the 'Embarked' column

all_data['Embarked'].isnull().sum()

In [None]:
# Show the rows whose 'Embarked' value is missing

all_data.loc[all_data['Embarked'].isna()]

In [None]:
# Fill missing 'Embarked' values with the most frequent value (the mode).
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the column selection: that form mutates an
# intermediate object, raises chained-assignment warnings in pandas 2.x and
# has no effect on all_data once Copy-on-Write is enabled.

all_data['Embarked'] = all_data['Embarked'].fillna(all_data['Embarked'].mode()[0])

# Check the missing values again

all_data['Embarked'].isnull().sum()

In [None]:
# Count rows for each (Embarked, Survived) combination

all_data.groupby(['Embarked','Survived'])['Survived'].value_counts()

In [None]:
# Same breakdown with crosstab: rows are Survived, columns are Embarked

pd.crosstab(all_data['Survived'], all_data['Embarked'])

In [None]:
# Bar chart: passenger counts per Embarked value, split by Survived

sns.countplot(data=all_data, x='Embarked', hue='Survived')\
    .set_title('Distribution of Survived by Embarked')

In [None]:
# Number of missing values in the 'Cabin' column

all_data['Cabin'].isnull().sum()

In [None]:
# Percentage of rows whose 'Cabin' value is missing, rounded to 2 decimals

round(all_data['Cabin'].isna().mean() * 100, 2)

In [None]:
# Distinct values of the 'Cabin' column

all_data['Cabin'].unique()

In [None]:
# Number of distinct non-null 'Cabin' values

all_data['Cabin'].nunique()

In [None]:
# Reduce 'Cabin' to its first character and mark missing cabins with 'X'.
# Re-running this on already-reduced values leaves them unchanged.

all_data['Cabin'] = all_data['Cabin'].str[0].fillna('X')

In [None]:
# The previous cell already reduced 'Cabin' to a single letter ('X' = missing),
# so this cell no longer repeats that transformation (it was an exact copy).
# Instead, verify the invariant and show the resulting distribution.

assert all_data['Cabin'].str.len().eq(1).all(), "Cabin should hold one letter per row"

all_data['Cabin'].value_counts()

In [None]:
# Count rows for each (Cabin, Survived) combination

all_data.groupby(['Cabin','Survived'])['Survived'].value_counts()

In [None]:
# Same breakdown with crosstab: rows are Survived, columns are Cabin

pd.crosstab(all_data['Survived'], all_data['Cabin'])

In [None]:
# Bar chart: passenger counts per Cabin letter, split by Survived

sns.countplot(data=all_data, x='Cabin', hue='Survived')\
    .set_title('Distribution of Survived by Cabin')

In [None]:
# Number of missing values in the 'Age' column

all_data['Age'].isnull().sum()

In [None]:
# Percentage of rows whose 'Age' value is missing, rounded to 2 decimals

round(all_data['Age'].isna().mean() * 100, 2)

In [None]:
# Mean 'Age' for each 'Title' group

all_data.groupby('Title')['Age'].mean()

In [None]:
# Fill missing ages with the mean age of the passenger's Title group.
# transform("mean") returns one value per row (the group mean), so it lines up
# with the 'Age' column. The result is assigned back rather than using
# fillna(inplace=True) on the column selection, which warns in pandas 2.x and
# does not modify all_data under Copy-on-Write.

title_mean_age = all_data.groupby(['Title'])['Age'].transform("mean")
all_data['Age'] = all_data['Age'].fillna(title_mean_age)

# Check the missing values again

all_data['Age'].isnull().sum()

In [None]:
# Create a categorical age band.
# Bands: Young < 18 <= Adult < 40 <= Middle age < 56 <= Senior.
# Rows matching no band (e.g. a missing Age) keep the '' default.
# Uses all_data.loc[mask, column] = value; the previous
# all_data[column].loc[mask] = value form is chained assignment, which raises
# SettingWithCopyWarning and stops writing through under Copy-on-Write.

age = all_data['Age']

all_data['AgeGroup'] = ''
all_data.loc[age < 18, 'AgeGroup'] = 'Young'
all_data.loc[(age >= 18) & (age < 40), 'AgeGroup'] = 'Adult'
all_data.loc[(age >= 40) & (age < 56), 'AgeGroup'] = 'Middle age'
all_data.loc[age >= 56, 'AgeGroup'] = 'Senior'

# Survived counts per age group (rows: Survived, columns: AgeGroup)

pd.crosstab(all_data['AgeGroup'], all_data['Survived']).T

In [None]:
# Bar chart: passenger counts per age group (youngest to oldest), split by Survived

order = ['Young', 'Adult', 'Middle age', 'Senior']

sns.countplot(data=all_data, x='AgeGroup', hue='Survived', order=order)\
    .set_title('Distribution of Survived by Age AgeGroup')

In [None]:
# Split 'Age' into 4 quantile-based bins (each value becomes an interval)

all_data['AgeBin'] = pd.qcut(all_data['Age'], q=4)

# Survived counts per age bin (rows: Survived, columns: AgeBin)

pd.crosstab(all_data['Survived'], all_data['AgeBin'])

In [None]:
# Replace the AgeBin values with integer codes produced by LabelEncoder

label = LabelEncoder()

age_bins = all_data['AgeBin']
all_data['AgeBin'] = label.fit(age_bins).transform(age_bins)

# Check the table again, now with integer bin codes as columns

pd.crosstab(all_data['Survived'], all_data['AgeBin'])

In [None]:
# Bar chart: passenger counts per encoded age bin, split by Survived

sns.countplot(data=all_data, x='AgeBin', hue='Survived')\
    .set_title('Distribution of Survived by Age Bin')

In [None]:
# Number of missing values in the 'Fare' column

all_data['Fare'].isnull().sum()

In [None]:
# Show the rows whose 'Fare' value is missing

all_data.loc[all_data['Fare'].isna()]

In [None]:
# Fill missing fares with the median fare.
# The result is assigned back rather than using fillna(inplace=True) on the
# column selection, which warns in pandas 2.x and does not modify all_data
# under Copy-on-Write.

all_data['Fare'] = all_data['Fare'].fillna(all_data['Fare'].median())

# Histogram of Fare with a KDE overlay, coloured by Survived

sns.histplot(data = all_data, x = 'Fare', hue = 'Survived',kde = True)

In [None]:
# Split 'Fare' into 5 quantile-based bins (each value becomes an interval)

all_data['FareBin'] = pd.qcut(all_data['Fare'], q=5)

# Survived counts per fare bin (rows: Survived, columns: FareBin)

pd.crosstab(all_data['Survived'], all_data['FareBin'])

In [None]:
# Replace the FareBin values with integer codes produced by LabelEncoder

label = LabelEncoder()

fare_bins = all_data['FareBin']
all_data['FareBin'] = label.fit(fare_bins).transform(fare_bins)

# Check the table again, now with integer bin codes as columns

pd.crosstab(all_data['Survived'], all_data['FareBin'])

In [None]:
# Bar chart: passenger counts per encoded fare bin, split by Survived

sns.countplot(data=all_data, x='FareBin', hue='Survived')\
    .set_title('Distribution of Survived by FareBin')

In [None]:
# Bar chart: passenger counts per 'Pclass' (ticket class), split by Survived

sns.countplot(data=all_data, x='Pclass', hue='Survived')\
    .set_title('Distribution of Survived by Ticket class')

In [None]:
# Bar chart: passenger counts per Pclass, split by Title

sns.countplot(data=all_data, x='Pclass', hue='Title')\
    .set_title('Distribution of Pclass by Title')

In [None]:
# Contingency table of Pclass versus Sex (row counts per combination;
# this is a frequency table, not a correlation coefficient)

pd.crosstab(all_data['Pclass'], all_data['Sex'])

In [None]:
# Bar chart: passenger counts per Pclass, split by Sex

sns.countplot(data=all_data, x='Pclass', hue='Sex')\
    .set_title('Distribution of Pclass by Sex')

In [None]:
# Joint plot of 'Parch' (parents/children) against 'SibSp' (siblings/spouses)

sns.jointplot(data=all_data, x='Parch', y='SibSp', color='blue')


In [None]:
# 'FamilySize' = parents/children + siblings/spouses + 1 (the passenger)

all_data['FamilySize'] = all_data['Parch'].add(all_data['SibSp']).add(1)

all_data.head()

In [None]:
# Survived counts per family size (rows: Survived, columns: FamilySize)

pd.crosstab(all_data['Survived'], all_data['FamilySize'])

In [None]:
# Bar chart: passenger counts per family size, split by Survived

sns.countplot(data=all_data, x='FamilySize', hue='Survived')\
    .set_title('Distribution of Survived by Family Size')

In [None]:
# Bucket family sizes into named groups.
# Range bins replace the earlier hard-coded {size: label} dictionary, which
# returned NaN for any size it did not list (e.g. 9, 10 or 12+).
# With right-closed bins:
#   (0, 1] -> Alone, (1, 4] -> Small, (4, 6] -> Medium, (6, inf) -> Large
# which gives the same labels as before for sizes 1-8 and 11.

family_bins = [0, 1, 4, 6, np.inf]
family_labels = ['Alone', 'Small', 'Medium', 'Large']

# astype(object) keeps plain string labels, as the dictionary mapping produced
all_data['FamilySizeGroup'] = pd.cut(all_data['FamilySize'],
                                     bins=family_bins,
                                     labels=family_labels).astype(object)

all_data.head()

In [None]:
# Bar chart: passenger counts per family-size group (smallest to largest),
# split by Survived

order = ['Alone', 'Small', 'Medium', 'Large']

sns.countplot(data=all_data, x='FamilySizeGroup', hue='Survived', order=order)\
    .set_title('Distribution of Survived by Family Size Group')

In [None]:
# Look at some example values of the 'Ticket' column

all_data['Ticket']

In [None]:
# Summary of 'Ticket': count, number of unique values, most frequent value
all_data['Ticket'].describe()

In [None]:
# Derive 'LastName' from 'Name': the text before the first comma

all_data['LastName'] = all_data['Name'].map(lambda full_name: full_name.split(',')[0])

# Count how many passengers share each last name

all_data['LastName'].value_counts()


In [None]:
# Summary of 'LastName': count, number of unique values, most frequent value
all_data['LastName'].describe()

In [None]:
# Collect every passenger whose ticket number is shared with someone else.
# One groupby pass replaces the earlier loop, which rescanned the whole frame
# once per unique ticket. sort=False keeps groups in order of first appearance
# and rows in their original order within a group, matching the old output.

ticket_columns = ['Ticket', 'LastName', 'Fare', 'FamilySize', 'Survived']

shared_ticket_groups = [
    group[ticket_columns]
    for _, group in all_data.groupby('Ticket', sort=False)
    if len(group) > 1
]

# pd.concat raises on an empty list, so fall back to an empty frame with the
# same columns when no ticket is shared.
if shared_ticket_groups:
    duplicate_ticket = pd.concat(shared_ticket_groups)
else:
    duplicate_ticket = all_data.iloc[0:0][ticket_columns]

duplicate_ticket.head(14)

In [None]:
# Create the ConnectedSurvival column.
# For each passenger, look at the OTHER passengers on the same ticket:
#   1   -> at least one of them is known to have survived
#   0   -> none is known to have survived, but at least one is known to have died
#   0.5 -> nothing is known (ticket not shared, or the others have no 'Survived' value)
#
# Computed from per-ticket totals minus the passenger's own contribution.
# The earlier version excluded the passenger with data_same.drop(ind), which
# drops by index label and therefore removes more than one row whenever labels
# are duplicated; this version does not depend on the index at all.

survived = all_data['Survived']
per_ticket = all_data.groupby('Ticket')['Survived']

# Known outcomes among the others = ticket total minus this passenger's own.
others_known = per_ticket.transform('count') - survived.notna().astype(int)
others_survived = per_ticket.transform('sum') - survived.fillna(0)
others_died = others_known - others_survived

all_data['ConnectedSurvival'] = 0.5  #(Default values)

# Order matters: a known survivor takes precedence over a known death,
# so the 0 is written first and the 1 overwrites it where both apply.
all_data.loc[(others_died > 0).to_numpy(), 'ConnectedSurvival'] = 0
all_data.loc[(others_survived > 0).to_numpy(), 'ConnectedSurvival'] = 1

all_data.groupby('ConnectedSurvival')['Survived'].mean()

In [None]:
# Survived counts per ConnectedSurvival value (rows: Survived)
pd.crosstab(all_data['Survived'], all_data['ConnectedSurvival'])

In [None]:
# Preview the combined dataset with all engineered columns
all_data.head()

In [None]:
# Column dtypes and non-null counts after feature engineering
all_data.info()

In [None]:
# List all available column names
all_data.columns

In [None]:
# Keep only the columns used for modelling: the target plus the chosen features.
# (Dropping the unwanted columns would work as well; edit this list to try
# other feature sets and compare the results.)

useful_columns = [
    'Survived',
    'Pclass', 'AgeBin', 'FareBin', 'Sex',
    'FamilySize', 'ConnectedSurvival',
]

data_final = all_data.loc[:, useful_columns]

data_final.head()

In [None]:
# Create dummy (one-hot) variables from the categorical columns
# (OneHotEncoder could be used instead).
# NOTE(review): no category is dropped here, so a two-valued column such as
# 'Sex' presumably yields two perfectly complementary dummy columns - confirm
# whether that is intended for the linear models.

data_final = pd.get_dummies(data_final)

data_final.head()

In [None]:
# Heatmap of the pairwise correlations between all model columns

correlations = data_final.corr()
sns.heatmap(correlations, cmap="coolwarm")

In [None]:
# Correlation of every column with 'Survived', strongest positive first
# (the first row is 'Survived' against itself)

data_final.corr()['Survived'].sort_values(ascending=False).to_frame()

In [None]:
# Split the combined frame back into train and test parts.
# The first len(train) rows are the training rows, the remainder the test rows.
# Both results are independent frames: the previous version called
# drop(inplace=True) on a slice of data_final, which raises
# SettingWithCopyWarning and is unreliable under Copy-on-Write.

n_train = len(train)

df_train = data_final.iloc[:n_train].copy()
df_test = data_final.iloc[n_train:].drop(columns='Survived')

df_train.head()

In [None]:
# Target variable: the 'Survived' column of the training frame

y = df_train['Survived']

# Features: every remaining column

X = df_train.drop(columns='Survived')

In [None]:
# # Feature Importances

# importances = pd.DataFrame(rf.feature_importances_,index = X.columns)
# importances.sort_values(by = 0, inplace=True, ascending = False)

# plt.figure(figsize=(8, 5)) 
# sns.barplot(x=0, y=importances.index, data=importances).set_title('Feature Importances')

In [None]:
# Compare how the candidate models perform with default hyper-parameters,
# using 10-fold cross-validation on the training data.
#
# Names and estimators live in one dict so the two can never get out of step
# (they used to be two parallel lists). Estimators that use randomness get a
# fixed random_state so the table is reproducible between runs.

SEED = 0

candidate_models = {
    'Logistic Regression': LogisticRegression(),
    'KNN': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(random_state=SEED),
    'Naive Bayes': GaussianNB(),
    'Random Forest': RandomForestClassifier(random_state=SEED),
    'SVC': SVC(),
    'XGB': XGBClassifier(random_state=SEED),
    'GradientB': GradientBoostingClassifier(random_state=SEED),
    'AdaB': AdaBoostClassifier(random_state=SEED),
    'ExtraTrees': ExtraTreesClassifier(random_state=SEED),
}

# Kept under their original names for any later cell that uses them.
classifiers = list(candidate_models.keys())
models = list(candidate_models.values())

cv_mean = []
cv_std = []

for model in models:
    fold_scores = cross_val_score(model, X, y, cv=10)
    cv_std.append(fold_scores.std())
    cv_mean.append(fold_scores.mean())

# One row per model, best mean cross-validation score first
cv_dataframe = pd.DataFrame({'CV Mean': cv_mean, 'Std': cv_std}, index=classifiers)
cv_dataframe = cv_dataframe.sort_values(by=['CV Mean'], ascending=False)
cv_dataframe

In [None]:
# # # Extra Trees Classifier

# parameters = {'n_estimators':[100,200,300,500,1000],
#                'max_depth':range(1,10),
#                'criterion': ['gini', 'entropy']
#                 }

# et_tune = GridSearchCV(estimator=ExtraTreesClassifier(),
#                                   param_grid=parameters,
#                                   cv=10,
#                                   n_jobs = -1)
# et_tune.fit(X,y) 

# et_model = et_tune.best_estimator_

# print (et_tune.best_score_)
# print (et_tune.best_params_)
# print (et_tune.best_estimator_)

In [None]:
# # After running the search above, hard-code the best parameters it reported

# et_model = ExtraTreesClassifier(criterion='gini',
#                                 n_estimators=300,
#                                 max_depth=6,
#                                 random_state=0)

# et_model.fit(X,y)

# et_cv =cross_val_score(et_model,X,y, cv = 10,scoring = "accuracy").mean()

# et_cv

In [None]:
# # Decision Tree Classifier

# parameters = {'max_depth':range(1,30), 
#               'criterion': ["entropy", "gini"]}

# dt_tune = GridSearchCV(estimator=DecisionTreeClassifier(), 
#                         param_grid=parameters, 
#                         verbose=False, 
#                         cv=10,
#                         n_jobs = -1)

# dt_tune.fit(X, y) 

# dt_model = dt_tune.best_estimator_

# print (dt_tune.best_score_)
# print (dt_tune.best_params_)
# print (dt_tune.best_estimator_)

In [None]:
# # Support Vector Classifier

# parameters = {'kernel': ['rbf'], 
#               'gamma': [0.01,0.02,0.05,0.1,0.5],
#               'C': [0.1, 0.5,1,2,3,4,5,6,7,8]},

# svc_tune = GridSearchCV(SVC(probability=True), 
#                          param_grid=parameters, 
#                          cv=10) 

# svc_tune.fit(X,y)

# svc_model = svc_tune.best_estimator_

# print(svc_tune.best_score_)
# print(svc_tune.best_params_)
# print(svc_tune.best_estimator_)

In [None]:
# # K Neighbors Classifier

# parameters = {'n_neighbors':range(1,41), 
#               'weights':['uniform','distance']}

# knn_tune = GridSearchCV(estimator=KNeighborsClassifier(), 
#                          param_grid=parameters,
#                          cv=10,
#                          verbose = False, 
#                          n_jobs=-1)

# knn_tune.fit(X,y)

# knn_model = knn_tune.best_estimator_

# print(knn_tune.best_score_)
# print(knn_tune.best_params_)
# print(knn_tune.best_estimator_)

In [None]:
# # Extreme Gradient Boosting Classifier

# parameters = {'n_estimators':[100,200,300,400,500],
#               'max_depth':range(1,10),
#               'learning_rate': [0.01,0.02,0.05,1],
#               'colsample_bytree': [0.75,0.8,0.85],
#               'subsample': [0.55, 0.6, 0.65,0.7],
#               'min_child_weight':[0.01,0.1]
#              }

# xgb_tune = GridSearchCV(estimator=XGBClassifier(),
#                          param_grid=parameters,
#                          cv=10,
#                          n_jobs = -1)
# xgb_tune.fit(X,y) 

# print ('XGB Best score :', xgb_tune.best_score_)
# print ('XGB Best parameter :', xgb_tune.best_params_)
# print ('XGB Best estimator :', xgb_tune.best_estimator_)

In [None]:
# # After running the search above, hard-code the best parameters it reported

# xgb_model = XGBClassifier(n_estimators=100,
#                           learning_rate= 0.01,
#                           max_depth=6,
#                           colsample_bytree= 0.75,
#                           min_child_weight= 0.01,
#                           subsample= 0.6,
#                           random_state=0)

# xgb_model.fit(X,y)

# xgb_cv =cross_val_score(xgb_model,X,y, cv = 10,scoring = "accuracy").mean()
# xgb_cv

In [None]:
# # Random Forest Classifier

# parameters = {'n_estimators':[100,200,300,400,500],
#               'max_depth':range(1,10),
#               'min_samples_leaf': [2,4,6,8,10],
#               'min_samples_split': [2,4,6,8,10,12,14,16,20],
#               'criterion': ['gini', 'entropy']
#         }

# rf_tune = GridSearchCV(estimator=RandomForestClassifier(),
#                         param_grid=parameters,
#                         cv=10,
#                         n_jobs = -1)

# rf_tune.fit(X,y) 

# print ('RF best score :' , rf_tune.best_score_)
# print ('RF Best parameter :' , rf_tune.best_params_)
# print ('RF Best estimator :',rf_tune.best_estimator_)

In [None]:
# # After running the search above, hard-code the best parameters it reported

# rf_model = RandomForestClassifier(criterion='entropy',
#                                   n_estimators=100,
#                                   max_depth=7,
#                                   min_samples_split=20,
#                                   min_samples_leaf=6,
#                                   oob_score=True,
#                                   random_state=0,
#                                   n_jobs=-1,
#                                   verbose=0) 

# rf_model.fit(X,y)

# rf_cv = cross_val_score(rf_model,X,y, cv = 10,scoring = "accuracy").mean()
# rf_cv

In [None]:
# # Gradient Boosting Classifier

# parameters = {'n_estimators':[100,200,300,400,500],
#               'max_depth':range(1,10),
#               'learning_rate': [0.01,0.02,0.05,1]
#               }

# gbm_tune = GridSearchCV(estimator=GradientBoostingClassifier(),
#                          param_grid=parameters,
#                          cv=10,
#                          n_jobs = -1)

# gbm_tune.fit(X,y) 

# print ('GBM best score :', gbm_tune.best_score_)
# print ('GBM best parameters :', gbm_tune.best_params_)
# print ('GBM best estimator :', gbm_tune.best_estimator_)

In [None]:
# # After running the search above, hard-code the best parameters it reported

# gbm_model = GradientBoostingClassifier(n_estimators=100,
#                           learning_rate= 0.01,
#                           max_depth=3,
#                           random_state=0)

# gbm_model.fit(X,y)

# gbm_cv =cross_val_score(gbm_model,X,y, cv = 10,scoring = "accuracy").mean()
# gbm_cv

In [None]:
# # Ada Boost Classifier

# parameters = {'n_estimators':[100,200,300,500,1000],
#               'learning_rate':[0.01,0.02,0.5]
#               }

# adb_tune = GridSearchCV(AdaBoostClassifier(),
#                                  param_grid=parameters,
#                                  cv=10,
#                                  n_jobs = -1)
# adb_tune.fit(X,y) 

# adb_model = adb_tune.best_estimator_

# print (adb_tune.best_score_)
# print (adb_tune.best_params_)
# print (adb_tune.best_estimator_)

In [None]:
# Logistic Regression: grid search over penalty type and regularization strength

## C is the INVERSE of the regularization strength: smaller C means stronger
## regularization (a simpler model), larger C means weaker regularization.
## C must be a positive number.

C_vals = [0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1,2,3,4,5,6,7,8,9,10,12,13,14,15,16,16.5,17,17.5,18]

## Penalties to try: Lasso-style (l1) or Ridge-style (l2)
penalties = ['l1','l2']

## Parameter grid for GridSearchCV
parameters = {'penalty': penalties,
              'C': C_vals}

## liblinear supports both 'l1' and 'l2'. The default solver does not support
## 'l1', so every l1 candidate fails to fit and is scored as NaN.
logreg = LogisticRegression(solver='liblinear')

## Search with the liblinear estimator defined above. Previously a fresh
## LogisticRegression() was passed here, leaving `logreg` unused and silently
## failing the l1 half of the grid.
lo_tune = GridSearchCV(estimator=logreg,
                       param_grid = parameters,
                       cv = 10,
                       scoring = 'accuracy',
                       n_jobs =-1,
                      )

## Fitting the model
lo_tune.fit(X, y)

print(lo_tune.best_score_)
print(lo_tune.best_params_)
print(lo_tune.best_estimator_)

In [None]:
# Logistic Regression
# Final model with fixed hyper-parameters (C=0.9, l2 penalty), fitted on the
# full training data; fit() returns the estimator itself.

lo_model = LogisticRegression(penalty='l2', C=0.9).fit(X, y)

# Mean accuracy over 10-fold cross-validation
lo_cv = cross_val_score(lo_model, X, y, scoring="accuracy", cv=10).mean()

lo_cv

In [None]:
# # Compare model after Hyperparameter tuning

# scores_list=[]

# scores_list = [["XGB",xgb_cv],
#                ["GBM",gbm_cv],
#                ["ET",et_cv],
#                ["RF",rf_cv], 
#                ["KNN",knn_tune.best_score_],
#                ["SVC",svc_tune.best_score_],
#                ["DT",dt_tune.best_score_],
#                ["ADB",adb_tune.best_score_],
#                ["LO",lo_cv]
#               ]

# scores = (pd.DataFrame(scores_list,columns = ["Model", "Score"])).sort_values(by=["Score"], ascending=False )

# scores

In [None]:
# # Define estimators
# estimators = [('RF',rf_model),
#               ('GBM',gbm_model),
#               #('DT',dt_model),
#               ('SVC',svc_model), 
#               #('XGB', xgb_model),
#               #('KNN',knn_model),
#               #('ET',et_model),
#               #('ADB',adb_model),
#               #('LO',lo_model)
#              ]

# voting_model = VotingClassifier(estimators=estimators,
#                                 voting='soft')

# voting_soft = voting_model.fit(X, y)

# voting_cv = cross_val_score(voting_model,X,y, cv = 10,
#                             scoring = "accuracy").mean()

# voting_cv


In [None]:
# Make predictions on the test dataset.
# The predictions get their own name: previously they were assigned back to
# `lo_model`, which replaced the fitted model with an array, so re-running this
# cell failed and `lo_model` no longer meant "the model" afterwards.

lo_predictions = lo_model.predict(df_test).astype(int)

# Create a submission file: one row per test passenger with the predicted label

lo_output = pd.DataFrame({'PassengerId': test['PassengerId'],
                               'Survived': lo_predictions})
# NOTE(review): the file name looks misspelled ('sumbmission'); it is kept
# unchanged here in case something downstream expects this exact name.
lo_output.to_csv('sumbmission_lo.csv', index=False)
lo_output.head()