# Titanic Dataset

In [None]:
# Import Basic Packages for data exploration and wrangling
import numpy as np
import pandas as pd
from scipy.stats import norm
from scipy import stats

# Import visualization packages
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import seaborn as sns
import matplotlib.pyplot as plt
import datetime       # Added for version 4 (to post output file with date/time)

# sklearn pre-processing
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest,f_regression
from sklearn.metrics import classification_report,confusion_matrix,make_scorer,f1_score
# sklearn models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import VotingClassifier


# Data Load and Exploration

In [None]:
# Read the training and test CSVs in one pass, then display the test frame
titanic_tr, titanic_tst = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/titanic/train.csv', '../input/titanic/test.csv')
)
titanic_tst

In [None]:
# Preview the first 50 rows of the training data
titanic_tr.head(50)

**Initial Column Types (prior to deep inspection):**

Categorical Variables:
* Pclass
* Sex (might be binary)
* SibSp
* Parch
* Embarked

Continuous Variables:
* Age
* Fare

Natural Language:
* Name
* Cabin

In [None]:
# Verify column types: info() prints each column's dtype and non-null count
titanic_tr.info()

In [None]:
# Verified data ranges for age and fare
# Information shows null values; summarize the number of null values per column
titanic_tr.isna().sum()

In [None]:
# Show the split between those who survived and those who did not
titanic_tr['Survived'].value_counts()

In [None]:
# List the distinct values (with their frequencies) of each categorical column
cat_vars = ['Pclass','Sex','SibSp','Parch','Embarked']

# Walk the selected columns as (name, Series) pairs
for var, column in titanic_tr[cat_vars].items():
    print(var)
    print('==============================')
    print(column.value_counts())
    # Two blank lines separate one variable's report from the next
    print('')
    print('')


In [None]:
# Plot the distribution of each continuous variable as a violin
cont_var = ['Age','Fare']

fig = make_subplots(rows=1, cols=2)

# One violin per variable, each in its own subplot column (columns are 1-based)
for col_pos, var in enumerate(cont_var, start=1):
    violin = go.Violin(
        y=titanic_tr[var],
        x0=var,
        points='all',
        box_visible=True,
        meanline_visible=True,
        line_color='white',
        fillcolor='darkturquoise',
        opacity=0.5,
    )
    fig.add_trace(violin, row=1, col=col_pos)

fig.update_layout(
    title_text='Continuous Variable Distribution',
    template='plotly_dark',
    height=800,
    width=1200,
    showlegend=False,
)

In [None]:
# There are clear outliers on the Fare variable, though they are likely valid due to the high cost of first class tickets.

# View split violin plots, showing those that survived and those that did not.
fig = make_subplots(rows = 1, cols = 2)

# (Survived value, legend label, fill colour, which half of the violin to draw)
survival_halves = [
    (1, 'Yes', 'darkturquoise', 'negative'),
    (0, 'No', 'darkorange', 'positive'),
]

for idx, var in enumerate(cont_var):
    for survived, label, fill, side in survival_halves:
        # Each outcome gets its own legend group so the legend toggles "Yes" and
        # "No" independently (the "No" traces used to be filed under the "Yes" group).
        fig.add_trace(go.Violin(y=titanic_tr[var][titanic_tr['Survived'] == survived],
                                line_color='white', fillcolor=fill, opacity=0.5,
                                legendgroup=label, name=label, side=side, scalegroup=label,
                                x0=var), 1, idx + 1)

fig.update_traces(meanline_visible=True)
fig.update_layout(height=800, width=1200, violinmode='overlay',
                  title_text = 'Continuous Variable Distribution - Survived Compared',
                  template="plotly_dark")
fig.show()

The graph above helps to determine how to manage outliers.  Generally speaking, those above the age of 70 did not survive.  Also, generally speaking, those who paid a fare above $100 survived.

In [None]:
# Pairwise scatter/histogram grid over every numeric column except the row identifier
plt.style.use('dark_background')
titanic_num = titanic_tr.drop(columns=['PassengerId']).select_dtypes('number')
sns.pairplot(titanic_num)

In [None]:
# Show the pairwise correlations of the numeric columns as a heat map
corr = titanic_num.corr()
fig = px.imshow(corr).update_layout(
    title_text='Correlation Heat Map - Numerical Fields',
    template='plotly_dark',
)
fig.show()

# Data Transformation and Pre-Processing

In [None]:
# Stack train and test so every pre-processing step is applied to both at once.
# Row counts are kept so the combined data can be split apart again later.
train_rows, test_rows = len(titanic_tr), len(titanic_tst)
print('Rows in the train dataset: ', train_rows)
print('Rows in the test dataset: ', test_rows)

# Skip the first two training columns and the first test column positionally;
# the frames are stacked with their original row labels (no re-indexing).
train_features = titanic_tr.iloc[:, 2:]
test_features = titanic_tst.iloc[:, 1:]
titanic_cmb = pd.concat([train_features, test_features])

#### New columns added: "Cab", "Title", "FamilySize" (with values "Solo", "SmFam", "LgFam"), "numeric_ticket", "IsMale"

In [None]:
# Number of cabins listed for each passenger: 0 when the Cabin field is missing,
# otherwise the count of space-separated entries
titanic_cmb['Cabin_multiple'] = titanic_cmb['Cabin'].apply(
    lambda cabin: len(cabin.split(' ')) if not pd.isna(cabin) else 0
)


In [None]:
# Manage outliers by capping Age at 70 and Fare at 100 (the extremes noted above).
# clip() writes through a plain column assignment; the earlier chained form
# (frame.column[mask] = value) may modify a temporary copy instead of the frame.
titanic_cmb['Age'] = titanic_cmb['Age'].clip(upper=70)
titanic_cmb['Fare'] = titanic_cmb['Fare'].clip(upper=100)

# Create categories based on the cabin letter.
# A missing cabin is treated as its own category, 'U'.
titanic_cmb['Cabin'] = titanic_cmb['Cabin'].fillna('U')
titanic_cmb['Cab'] = titanic_cmb.Cabin.apply(lambda x: str(x)[0])

# Add "FamilyCt" (passenger + relatives aboard) and bucket it into "FamilySize"
titanic_cmb['FamilyCt'] = titanic_cmb['Parch'] + titanic_cmb['SibSp'] + 1
titanic_cmb['FamilySize'] = np.select(
    [
        titanic_cmb['FamilyCt'] == 1,
        # between() is inclusive on both ends by default; the boolean
        # `inclusive=True` spelling is not accepted by current pandas.
        titanic_cmb['FamilyCt'].between(2, 4),
        titanic_cmb['FamilyCt'] >= 5,
    ],
    ['Solo', 'SmFam', 'LgFam'],
    # A string default keeps the result a single dtype; np.select's implicit
    # default is the integer 0, which does not mix with string choices.
    default='Unknown',
)

# Replace empty Embarked fields using the most common value (S)
titanic_cmb['Embarked'] = titanic_cmb['Embarked'].fillna('S')

# Feature engineering on the person's title: the text between the comma and the
# first period of the name, with surrounding whitespace removed
titanic_cmb['Title'] = titanic_cmb.Name.apply(lambda x: x.split(',')[1].split('.')[0].strip())

# Organize by sex-based honorifics.
# 'the Countess' is listed as well as 'Countess': the parsing above keeps any
# article in front of the title, so a name written "..., the Countess. ..."
# yields 'the Countess' and would otherwise stay in a category of its own.
rep = ['the Countess','Countess','Lady','Sir','Don','Jonkheer','Dona','Rev','Dr','Col','Major','Capt','Mrs','Ms','Mme','Mr','Miss','Mlle','Master']
wth = ['Royalty','Royalty','Royalty','Royalty','Royalty','Royalty','Royalty','Officer','Officer','Officer','Officer','Officer','Mrs','Mrs','Mrs','Mr','Miss','Miss','Master']

titanic_cmb['Title2'] = titanic_cmb['Title'].replace(rep,wth)

# Categorize tickets as numeric (1) or non-numeric (0)
titanic_cmb['numeric_ticket'] = titanic_cmb.Ticket.apply(lambda x:1 if x.isnumeric() else 0)

# Ticket prefix: everything before the last space-separated token, with '.' and
# '/' stripped and lower-cased; 0 when the ticket has no prefix
titanic_cmb['ticket_letters'] = titanic_cmb.Ticket.apply(lambda x:''.join(x.split(' ')[:-1]).replace('.','').replace('/','').lower() if len(x.split(' ')[:-1]) > 0 else 0)

# Break the fare into quantile bands.
# The band edges come from the training fares, but the bands are then assigned
# from each row's OWN fare. Assigning pd.qcut(titanic_tr['Fare'], ...) directly
# aligned on row labels, and because the combined frame repeats the labels of
# the training frame, every test row received the band of an unrelated
# training passenger.
fare_labels = ["1st", "2nd", "3rd", "4th", "5th"]
_, fare_edges = pd.qcut(titanic_tr['Fare'], [0, .3, .6, .8, .9, 1], retbins=True)
# Open the outer edges so fares outside the training range still get a band
fare_edges[0], fare_edges[-1] = -np.inf, np.inf
# A missing fare would leave the band empty; use the training median for binning only
fare_for_binning = titanic_cmb['Fare'].fillna(titanic_tr['Fare'].median())
titanic_cmb['FareCut'] = pd.cut(fare_for_binning, bins=fare_edges, labels=fare_labels)

# Convert the Sex column to binary IsMale
titanic_cmb['IsMale'] = np.where(titanic_cmb.Sex == 'male', 1, 0)

titanic_cmb.head(10)

#### There is a gradual decrease in survival rate going from Cabin A to the U (unassigned) Cabin

In [None]:
# Compare survival rate by cabin letter for the training dataset.
# Take as many leading rows of the combined frame as there are training rows,
# rather than hard-coding the count (891).
titanic_train = titanic_cmb.iloc[:len(titanic_tr)]
print(titanic_train['Cab'].value_counts())
pd.pivot_table(titanic_train, index = titanic_tr['Survived'], columns = titanic_train['Cab'], values = 'Name', aggfunc = 'count')

In [None]:
# Frequency of each cabin count among the training rows
titanic_train.Cabin_multiple.value_counts()

In [None]:
# Passenger counts by survival outcome (rows) and number of cabins listed (columns)
titanic_train.pivot_table(
    index=titanic_tr['Survived'],
    columns=titanic_train['Cabin_multiple'],
    values='Ticket',
    aggfunc='count',
)

#### The "Title" column shows that passengers titled Mrs, Miss, Ms, or Master (women and children) survived at a higher rate than those titled Mr (men)

In [None]:
# Survival by title on the training rows: title frequencies first, then the
# survived-by-title count table
title_counts = titanic_train['Title'].value_counts()
print(title_counts)

titanic_train.pivot_table(
    index=titanic_tr['Survived'],
    columns=titanic_train['Title'],
    values='Name',
    aggfunc='count',
)

#### The new column "numeric_ticket" has two categories: numeric tickets (1), which contain only digits, and non-numeric tickets (0), which mix letters and numbers. The survival rate is the same for numeric and non-numeric tickets.

In [None]:
# Survival by ticket type on the training rows: frequencies first, then the
# survived-by-ticket-type count table
ticket_type_counts = titanic_train['numeric_ticket'].value_counts()
print(ticket_type_counts)

titanic_train.pivot_table(
    index=titanic_tr['Survived'],
    columns=titanic_train['numeric_ticket'],
    values='Name',
    aggfunc='count',
)

#### The new column "FamilySize" shows that passengers with a small family had a higher survival rate

In [None]:
# Survival by family-size bucket on the training rows: frequencies first, then
# the survived-by-bucket count table
family_size_counts = titanic_train['FamilySize'].value_counts()
print(family_size_counts)

titanic_train.pivot_table(
    index=titanic_tr['Survived'],
    columns=titanic_train['FamilySize'],
    values='Name',
    aggfunc='count',
)

In [None]:
# (rows, columns) of the engineered training slice
titanic_train.shape

In [None]:
# Put the target next to the engineered training features and correlate them
T = pd.concat([titanic_tr.loc[:,'Survived'], titanic_train.iloc[0:,0:]], axis=1)
# Select the numeric columns explicitly: the frame also holds text and
# categorical columns, which current pandas refuses to correlate.
cor = T.select_dtypes('number').corr()
print(cor)

In [None]:
# Group the model inputs by how they will be pre-processed
cont_vars = ['Age']        # continuous
bin_vars = ['IsMale']      # binary flag
cat_vars = [               # categorical
    'FareCut',
    'Embarked',
    'Pclass',
    'Cab',
    'Title2',
    'FamilySize',
]

In [None]:
# Continuous columns: fill gaps from the 2 nearest neighbours, then standardize
cont_pipeline = Pipeline([
    ('imputer', KNNImputer(n_neighbors=2)),
    ('std_scaler', StandardScaler()),
])

# Categorical columns: one-hot encode (no imputation step in this pipeline)
cat_pipeline = Pipeline([
    ('encoder', OneHotEncoder()),
])

# Route each column group to its transformer; binary columns pass through unchanged.
# sparse_threshold=0 makes the combined output a dense array.
full_pipeline = ColumnTransformer([
    ('continuous', cont_pipeline, cont_vars),
    ('binary', 'passthrough', bin_vars),
    ('category', cat_pipeline, cat_vars)
], sparse_threshold=0)

# Get a list of column names.
# get_feature_names_out is the current scikit-learn API; fall back to the
# older get_feature_names only when running on a release that predates it.
fitted_encoder = OneHotEncoder().fit(titanic_cmb[cat_vars])
if hasattr(fitted_encoder, 'get_feature_names_out'):
    cat_col_names = fitted_encoder.get_feature_names_out(cat_vars)
else:
    cat_col_names = fitted_encoder.get_feature_names(cat_vars)
col_names = [*cont_vars, *bin_vars, *cat_col_names]

In [None]:
# Transform the combined frame once, then split it back at the train/test boundary
X = full_pipeline.fit_transform(titanic_cmb)

# The first `train_rows` rows are the training passengers; the rest are the test set
X_train, X_test = X[:train_rows], X[train_rows:]
y_train = titanic_tr['Survived']

# Model Creation and Tracking

In [None]:
# Settings shared by every model below
cross_val = 10    # number of cross-validation folds
rnd_st = 2020     # random seed passed to the estimators

# Empty table that collects one row per tuned model for comparison
result_columns = ['Model Type', 'Model Name', 'Accuracy', 'Hyperparameters']
results = pd.DataFrame(columns=result_columns)

## Logistic Regression

In [None]:
# Base estimator; the remaining settings are chosen by the grid search
log_grd = LogisticRegression(random_state=rnd_st)

# Two sub-grids because each penalty family is paired with its own solver:
# liblinear for l1/l2, saga (with an l1_ratio and a raised max_iter) for elastic net
param_grid_log = [
    {
        'penalty': ['l1', 'l2'],
        'C': [1, 5, 10, 15],
        'class_weight': ['balanced', None],
        'solver': ['liblinear'],
    },
    {
        'penalty': ['elasticnet'],
        'C': [1, 5, 10, 15],
        'class_weight': ['balanced', None],
        'solver': ['saga'],
        'max_iter': [10000],
        'l1_ratio': [0.25, 0.5, 0.75],
    },
]

grid_search_log = GridSearchCV(
    estimator=log_grd,
    param_grid=param_grid_log,
    scoring='accuracy',
    cv=cross_val,
    return_train_score=True,
)

grid_search_log.fit(X_train, y_train)

In [None]:
# Hyperparameter combination with the best cross-validated score
grid_search_log.best_params_

In [None]:
# Print the mean cross-validated score next to every parameter combination tried
cvres = grid_search_log.cv_results_
for position, params in enumerate(cvres['params']):
    print(round(cvres['mean_test_score'][position], 4), params)

In [None]:
# Record the best cross-validated score and parameters for this model.
# pd.concat replaces DataFrame.append, which current pandas no longer provides.
log_row = pd.DataFrame([{'Model Type' : 'Logistic Regression',
                         'Model Name' : 'grid_search_log',
                         'Accuracy' : grid_search_log.best_score_ ,
                         'Hyperparameters' : grid_search_log.best_params_}])
results = pd.concat([results, log_row], ignore_index=True)

## Decision Tree

In [None]:
# Base estimator; the remaining settings are chosen by the grid search
tree_grd = DecisionTreeClassifier(random_state = rnd_st)

# 'auto' is intentionally not searched: for a decision tree classifier it was an
# alias of 'sqrt' (so it only duplicated candidates) and newer scikit-learn
# releases reject it.
param_grid_tree = [
    {'splitter' : ['best','random'],
    'max_depth' : [3,5,7,9],
    'max_features' : ['sqrt','log2']},
]

grid_search_tree = GridSearchCV(tree_grd, param_grid_tree, cv = cross_val, scoring='accuracy',
                               return_train_score=True)
grid_search_tree.fit(X_train, y_train)

In [None]:
# Hyperparameter combination with the best cross-validated score
grid_search_tree.best_params_

In [None]:
# Print the mean cross-validated score next to every parameter combination tried
cvres = grid_search_tree.cv_results_
for position, params in enumerate(cvres['params']):
    print(round(cvres['mean_test_score'][position], 4), params)

In [None]:
# Record the best cross-validated score and parameters for this model.
# pd.concat replaces DataFrame.append, which current pandas no longer provides.
tree_row = pd.DataFrame([{'Model Type' : 'Decision Tree',
                          'Model Name' : 'grid_search_tree',
                          'Accuracy' : grid_search_tree.best_score_ ,
                          'Hyperparameters' : grid_search_tree.best_params_}])
results = pd.concat([results, tree_row], ignore_index=True)

# View results so far
results

## Support Vector Machine

In [None]:
# Base estimator; the kernel settings are chosen by the grid search
svm_grd = SVC(random_state = rnd_st)

# Search only the settings that can change the fitted model:
# - coef0 is only significant for the 'poly' and 'sigmoid' kernels, so it is
#   not varied for 'linear' and 'rbf';
# - decision_function_shape is ignored by SVC for a two-class target, so it is
#   not searched (NOTE(review): assumes Survived stays a 0/1 label).
# This gives the same distinct models as the full cross-product with fewer fits.
param_grid_svm = [
    {'kernel' : ['linear','rbf']},
    {'kernel' : ['poly','sigmoid'],
     'coef0' : [0, 1]},
]

grid_search_svm = GridSearchCV(svm_grd, param_grid_svm, cv = cross_val, scoring='accuracy',
                               return_train_score=True)

grid_search_svm.fit(X_train, y_train)

In [None]:
# Display the hyperparameters of the best estimator
grid_search_svm.best_params_

In [None]:
# Print the mean cross-validated score next to every parameter combination tried
cvres = grid_search_svm.cv_results_
for position, params in enumerate(cvres['params']):
    print(round(cvres['mean_test_score'][position], 4), params)

In [None]:
# Record the best cross-validated score and parameters for this model.
# pd.concat replaces DataFrame.append, which current pandas no longer provides.
svm_row = pd.DataFrame([{'Model Type' : 'Support Vector Machine',
                         'Model Name' : 'grid_search_svm',
                         'Accuracy' : grid_search_svm.best_score_ ,
                         'Hyperparameters' : grid_search_svm.best_params_}])
results = pd.concat([results, svm_row], ignore_index=True)

# View results so far
results

## Random Forest

In [None]:
# Forest of 100 gini trees; depth and class weighting are chosen by the grid search
rf_grd = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    random_state=rnd_st,
)

param_grid_rf = [
    {
        'max_depth': [3, 5, 7, 9],
        'class_weight': ['balanced', 'balanced_subsample'],
    },
]

grid_search_rf = GridSearchCV(
    estimator=rf_grd,
    param_grid=param_grid_rf,
    scoring='accuracy',
    cv=cross_val,
    return_train_score=True,
)

grid_search_rf.fit(X_train, y_train)

In [None]:
# Hyperparameter combination with the best cross-validated score
grid_search_rf.best_params_

In [None]:
# Print the mean cross-validated score next to every parameter combination tried
cvres = grid_search_rf.cv_results_
for position, params in enumerate(cvres['params']):
    print(round(cvres['mean_test_score'][position], 4), params)

In [None]:
# Record the best cross-validated score and parameters for this model.
# pd.concat replaces DataFrame.append, which current pandas no longer provides.
rf_row = pd.DataFrame([{'Model Type' : 'Random Forest',
                        'Model Name' : 'grid_search_rf',
                        'Accuracy' : grid_search_rf.best_score_ ,
                        'Hyperparameters' : grid_search_rf.best_params_}])
results = pd.concat([results, rf_row], ignore_index=True)

# View results so far
results

In [None]:
# Bagged decision trees: each tree is trained on a bootstrap sample of the rows
bag_grd = BaggingClassifier(
    DecisionTreeClassifier(),
    bootstrap=True,
    random_state=rnd_st,
)

# Search the ensemble size and the number of rows drawn for each tree
param_grid_bag = [
    {
        'n_estimators': [10, 100, 500],
        'max_samples': [10, 100, 500],
    },
]

grid_search_bag = GridSearchCV(
    estimator=bag_grd,
    param_grid=param_grid_bag,
    scoring='accuracy',
    cv=cross_val,
    return_train_score=True,
)

grid_search_bag.fit(X_train, y_train)

In [None]:
# Hyperparameter combination with the best cross-validated score
grid_search_bag.best_params_

In [None]:
# Record the best cross-validated score and parameters for this model.
# pd.concat replaces DataFrame.append, which current pandas no longer provides.
bag_row = pd.DataFrame([{'Model Type' : 'Random Forest with Bagging',
                         'Model Name' : 'grid_search_bag',
                         'Accuracy' : grid_search_bag.best_score_ ,
                         'Hyperparameters' : grid_search_bag.best_params_}])
results = pd.concat([results, bag_row], ignore_index=True)

# View results so far
results

## Neural Network

In [None]:
# Multi-layer perceptron classifier (trained with backpropagation)
mlp_grd = MLPClassifier(
    alpha=1e-5,
    max_iter=10000,
    random_state=rnd_st,
)

# Search the hidden layer width, the optimizer and the learning-rate schedule
param_grid_mlp = [
    {
        'hidden_layer_sizes': [1, 2, 4],
        'solver': ['lbfgs', 'sgd', 'adam'],
        'learning_rate': ['constant', 'invscaling', 'adaptive'],
    },
]

grid_search_mlp = GridSearchCV(
    estimator=mlp_grd,
    param_grid=param_grid_mlp,
    scoring='accuracy',
    cv=cross_val,
    return_train_score=True,
)

grid_search_mlp.fit(X_train, y_train)

In [None]:
# Hyperparameter combination with the best cross-validated score
grid_search_mlp.best_params_

In [None]:
# Print the mean cross-validated score next to every parameter combination tried
cvres = grid_search_mlp.cv_results_
for position, params in enumerate(cvres['params']):
    print(round(cvres['mean_test_score'][position], 4), params)

In [None]:
# Record the best cross-validated score and parameters for this model.
# pd.concat replaces DataFrame.append, which current pandas no longer provides.
mlp_row = pd.DataFrame([{'Model Type' : 'Multi-layer Perception NN',
                         'Model Name' : 'grid_search_mlp',
                         'Accuracy' : grid_search_mlp.best_score_ ,
                         'Hyperparameters' : grid_search_mlp.best_params_}])
results = pd.concat([results, mlp_row], ignore_index=True)

# View results so far
results

## Gaussian Naive Bayes

In [None]:
# Gaussian naive Bayes; only the variance smoothing term is tuned
gnb_grd = GaussianNB()

param_grid_gnb = [
    {'var_smoothing': [.0001, 1e-9]},
]

grid_search_gnb = GridSearchCV(
    estimator=gnb_grd,
    param_grid=param_grid_gnb,
    scoring='accuracy',
    cv=cross_val,
    return_train_score=True,
)

grid_search_gnb.fit(X_train, y_train)

In [None]:
# Hyperparameter combination with the best cross-validated score
grid_search_gnb.best_params_

In [None]:
# Print the mean cross-validated score next to every parameter combination tried
cvres = grid_search_gnb.cv_results_
for position, params in enumerate(cvres['params']):
    print(round(cvres['mean_test_score'][position], 4), params)

In [None]:
# Record the best cross-validated score and parameters for this model.
# pd.concat replaces DataFrame.append, which current pandas no longer provides.
gnb_row = pd.DataFrame([{'Model Type' : 'Gaussian Naive Bayes',
                         'Model Name' : 'grid_search_gnb',
                         'Accuracy' : grid_search_gnb.best_score_ ,
                         'Hyperparameters' : grid_search_gnb.best_params_}])
results = pd.concat([results, gnb_row], ignore_index=True)

# View results so far
results

## Ensemble Model

In [None]:
# Create the list of (name, model) pairs that will vote in the ensemble
estimators = [
    ('Logistic Regression', grid_search_log),
    ('Decision Tree', grid_search_tree),
    ('Support Vector Machine', grid_search_svm),
    ('Random Forest', grid_search_rf),
    # This entry must point at the bagging search; it previously reused
    # grid_search_rf, so the random forest voted twice and the bagging model
    # never took part.
    ('Random Forest w Bagging', grid_search_bag),
    ('MLP NN', grid_search_mlp),
]

In [None]:
# Build a voting classifier over the collected models and fit it on the training data
from sklearn.ensemble import VotingClassifier

ensemble = VotingClassifier(estimators=estimators)
ensemble.fit(X_train, y_train)

In [None]:
# Score on the same rows the ensemble was fitted on, so this is a training
# score rather than an estimate of performance on unseen data
ensemble.score(X_train, y_train)

# Export the predictions

In [None]:
# Predict the label of every row in the transformed test set
y_predict = ensemble.predict(X_test)

In [None]:
# Pair each test passenger id with its predicted label and write the submission file
pred = pd.DataFrame({
    'PassengerId': titanic_tst['PassengerId'],
    'Survived': y_predict,
})
pred.to_csv('submission.csv', index=False)