ivancloudvm@gmail.com

# Titanic using Naive-Bayes

In [43]:
# Data wrangling
import pandas as pd
import numpy as np
from collections import Counter
import datetime as dt
import re

# Data preprocessing
from sklearn.preprocessing import StandardScaler


# Data visualisation
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
print(plt.style.available)
plt.style.use('fivethirtyeight')

# Machine learning models
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

#Cluster model
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# Model evaluation
from sklearn.model_selection import cross_val_score

# Hyperparameter tuning
from sklearn.model_selection import GridSearchCV

# Remove warnings
import warnings
warnings.filterwarnings('ignore')

print("Libraries loaded")

['Solarize_Light2', '_classic_test_patch', 'bmh', 'classic', 'dark_background', 'fast', 'fivethirtyeight', 'ggplot', 'grayscale', 'seaborn', 'seaborn-bright', 'seaborn-colorblind', 'seaborn-dark', 'seaborn-dark-palette', 'seaborn-darkgrid', 'seaborn-deep', 'seaborn-muted', 'seaborn-notebook', 'seaborn-paper', 'seaborn-pastel', 'seaborn-poster', 'seaborn-talk', 'seaborn-ticks', 'seaborn-white', 'seaborn-whitegrid', 'tableau-colorblind10']
Libraries loaded


In [44]:
eval_data = pd.read_csv("../input/titanic/test.csv")
train_data = pd.read_csv("../input/titanic/train.csv")
#total_data = pd.concat([test_data,train_data])

print(f"eval_data shape: {eval_data.shape}")
print(f"train_data shape: {train_data.shape}")
#print(f"total_data shape: {total_data.shape}")
train_data.head()

eval_data shape: (418, 11)
train_data shape: (891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [45]:
df = train_data[["Pclass","Sex","Age","Fare","Survived"]]
df_eval = eval_data[["Pclass","Sex","Age","Fare"]]

df.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Survived
0,3,male,22.0,7.25,0
1,1,female,38.0,71.2833,1
2,3,female,26.0,7.925,1
3,1,female,35.0,53.1,1
4,3,male,35.0,8.05,0


In [46]:
target = df.Survived
inputs = df.drop("Survived", axis= 1)


In [47]:
dummies = pd.get_dummies(inputs.Sex)
dummies_eval = pd.get_dummies(df_eval.Sex)
dummies.head()

Unnamed: 0,female,male
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1


In [48]:
inputs = pd.concat([inputs,dummies], axis = 1)
df_eval = pd.concat([df_eval, dummies_eval], axis= 1) 
inputs.head()

Unnamed: 0,Pclass,Sex,Age,Fare,female,male
0,3,male,22.0,7.25,0,1
1,1,female,38.0,71.2833,1,0
2,3,female,26.0,7.925,1,0
3,1,female,35.0,53.1,1,0
4,3,male,35.0,8.05,0,1


In [49]:
inputs = inputs.drop('Sex', axis = 1)
df_eval = df_eval.drop('Sex', axis = 1)
inputs.head()

Unnamed: 0,Pclass,Age,Fare,female,male
0,3,22.0,7.25,0,1
1,1,38.0,71.2833,1,0
2,3,26.0,7.925,1,0
3,1,35.0,53.1,1,0
4,3,35.0,8.05,0,1


In [50]:
inputs.isnull().sum()

Pclass      0
Age       177
Fare        0
female      0
male        0
dtype: int64

In [51]:
df_eval.isnull().sum()

Pclass     0
Age       86
Fare       1
female     0
male       0
dtype: int64

In [52]:
inputs.Age = inputs.Age.fillna(inputs.Age.mean())
df_eval.Age = df_eval.Age.fillna(df_eval.Age.mean())
inputs.head()

Unnamed: 0,Pclass,Age,Fare,female,male
0,3,22.0,7.25,0,1
1,1,38.0,71.2833,1,0
2,3,26.0,7.925,1,0
3,1,35.0,53.1,1,0
4,3,35.0,8.05,0,1


In [53]:
df_eval.head()

Unnamed: 0,Pclass,Age,Fare,female,male
0,3,34.5,7.8292,0,1
1,3,47.0,7.0,1,0
2,2,62.0,9.6875,0,1
3,3,27.0,8.6625,0,1
4,3,22.0,12.2875,1,0


In [54]:
len(inputs)

891

In [55]:
len(df_eval)

418

In [69]:
X_train, X_test, y_train, y_test = train_test_split(inputs, target, test_size = 0.2, random_state = 1)

# Model

In [77]:
model = GaussianNB()

model.fit(X_train, y_train)

accuracy = round(model.score(X_test, y_test),2)

print("Accuracy model: ", accuracy)

Accuracy model:  0.78


Comparamos y_test con la prediccion del modelo en X_test:

In [105]:
y_test

862    1
223    0
84     1
680    0
535    1
      ..
796    1
815    0
629    0
421    0
448    1
Name: Survived, Length: 179, dtype: int64

In [115]:
y_test_df = pd.DataFrame(y_test)

y_test_df.reset_index(drop=True, inplace = True)

y_test_df

Unnamed: 0,Survived
0,1
1,0
2,1
3,0
4,1
...,...
174,1
175,0
176,0
177,0


In [94]:
predict_series = pd.Series(model.predict(X_test), name = "Predictions")

predict_series

0      1
1      0
2      1
3      1
4      1
      ..
174    1
175    0
176    0
177    0
178    1
Name: Predictions, Length: 179, dtype: int64

In [116]:
predict_df = pd.DataFrame(predict_series)
predict_df.reset_index(drop=True, inplace = True)

predict_df

Unnamed: 0,Predictions
0,1
1,0
2,1
3,1
4,1
...,...
174,1
175,0
176,0
177,0


In [117]:
#We can see the probabilities for each row:
array_prob = model.predict_proba(X_test)

array_prob.tolist()

[[0.0064147143818383085, 0.9935852856181618],
 [0.9905574724354776, 0.009442527564522579],
 [0.021623351166609902, 0.9783766488333907],
 [0.06341095728471584, 0.9365890427152844],
 [0.013750342480922243, 0.9862496575190781],
 [0.9890773506235213, 0.010922649376478893],
 [0.9768982266366406, 0.023101773363358977],
 [0.004371480302949657, 0.9956285196970505],
 [0.6915142266150291, 0.3084857733849716],
 [0.06308945191087975, 0.9369105480891198],
 [0.9899360258683005, 0.010063974131699696],
 [0.06061564008907778, 0.9393843599109218],
 [0.8440962330115618, 0.15590376698843772],
 [0.9778507691590587, 0.022149230840940774],
 [0.06337078293298795, 0.9366292170670123],
 [0.9140822771238417, 0.08591772287615858],
 [0.9829396479345628, 0.017060352065437212],
 [0.9896813074089971, 0.01031869259100373],
 [0.9908410038810194, 0.009158996118981366],
 [0.025079075416820418, 0.9749209245831796],
 [0.9907304471938407, 0.009269552806158751],
 [0.9900797894356007, 0.009920210564398953],
 [0.00428898262729

In [118]:
array_prob_series = pd.Series(array_prob.tolist(), name = "Prob")

array_prob_series

0      [0.0064147143818383085, 0.9935852856181618]
1       [0.9905574724354776, 0.009442527564522579]
2       [0.021623351166609902, 0.9783766488333907]
3        [0.06341095728471584, 0.9365890427152844]
4       [0.013750342480922243, 0.9862496575190781]
                          ...                     
174     [0.006287273958482196, 0.9937127260415183]
175      [0.9189424720174514, 0.08105752798254912]
176     [0.9905546921119001, 0.009445307888100078]
177     [0.9890749440666049, 0.010925055933394549]
178     [0.030819499041882643, 0.9691805009581174]
Name: Prob, Length: 179, dtype: object

In [119]:
prob_df = pd.DataFrame(array_prob_series)

prob_df

Unnamed: 0,Prob
0,"[0.0064147143818383085, 0.9935852856181618]"
1,"[0.9905574724354776, 0.009442527564522579]"
2,"[0.021623351166609902, 0.9783766488333907]"
3,"[0.06341095728471584, 0.9365890427152844]"
4,"[0.013750342480922243, 0.9862496575190781]"
...,...
174,"[0.006287273958482196, 0.9937127260415183]"
175,"[0.9189424720174514, 0.08105752798254912]"
176,"[0.9905546921119001, 0.009445307888100078]"
177,"[0.9890749440666049, 0.010925055933394549]"


In [120]:
df_tot = pd.concat([prob_df,y_test_df,predict_df])

df_tot.head()

Unnamed: 0,Prob,Survived,Predictions
0,"[0.0064147143818383085, 0.9935852856181618]",,
1,"[0.9905574724354776, 0.009442527564522579]",,
2,"[0.021623351166609902, 0.9783766488333907]",,
3,"[0.06341095728471584, 0.9365890427152844]",,
4,"[0.013750342480922243, 0.9862496575190781]",,


In [121]:
df_tot.isnull().sum()

Prob           358
Survived       358
Predictions    358
dtype: int64

In [None]:
df = pd.DataFrame(my_array, columns = ['Column_A','Column_B','Column_C'])