In [1]:
import pandas as pd
import numpy as np

### Read Train Dataset

In [2]:
# Load the Titanic table from CSV. The path is relative, so it resolves
# against the notebook's current working directory.
# NOTE(review): the file name suggests this is the output of an earlier EDA
# step -- confirm it is already cleaned and numerically encoded.
df = pd.read_csv('../data/titanic_EDA.csv')

# Machine Learning

## Logistic Regression

In [3]:
# Target vector: the survival label.
y = df['Survived']
# Feature matrix: every remaining column of the table.
X = df.drop(columns='Survived')

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
# Hold out 20% of the rows as a test set. The fixed random_state makes the
# split repeatable, so every model below is scored on the same rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=101)

In [6]:
from sklearn.linear_model import LogisticRegression

In [7]:
# Logistic regression with scikit-learn's default settings (no arguments passed).
# NOTE(review): if fit() emits a ConvergenceWarning on this data, consider
# scaling the features or raising max_iter.
logmodel = LogisticRegression()

In [8]:
# Fit on the training split only; the test split stays unseen until evaluation.
logmodel.fit(X_train, y_train)

In [9]:
# Predicted class labels for the held-out test rows.
# NOTE: `predictions` is reassigned for every model below, so it always refers
# to the most recently evaluated model.
predictions = logmodel.predict(X_test)

In [10]:
from sklearn.metrics import confusion_matrix

In [11]:
# Confusion matrix on the test split. scikit-learn puts the true classes on
# the rows and the predicted classes on the columns.
confusion_matrix(y_test, predictions)

array([[155,  11],
       [ 25,  71]], dtype=int64)

In [12]:
from sklearn.metrics import accuracy_score

In [13]:
# Fraction of test rows classified correctly (accuracy_score normalizes by default).
log_reg_accuracy = accuracy_score(y_true=y_test, y_pred=predictions)

In [14]:
from sklearn.metrics import precision_score

In [15]:
# Precision of the logistic model on the test split.
log_reg_precision = precision_score(y_true=y_test, y_pred=predictions)

In [16]:
from sklearn.metrics import f1_score

In [17]:
# F1 score of the logistic model on the test split.
log_reg_f1 = f1_score(y_true=y_test, y_pred=predictions)

In [18]:
# One result row for the comparison table; 'hp' is empty because no
# hyperparameters were set explicitly for this model.
dic = dict(
    model='Logistic',
    accuracy=log_reg_accuracy,
    precision=log_reg_precision,
    f1=log_reg_f1,
    hp='',
)

In [19]:
# Start the model-comparison table with the logistic-regression row.
# index=[0] is required because every value in `dic` is a scalar.
df_results = pd.DataFrame(dic, index=[0])

In [20]:
df_results

Unnamed: 0,model,accuracy,precision,f1,hp
0,Logistic,0.862595,0.865854,0.797753,


## K Nearest Neighbors Classification

In [21]:
from sklearn.neighbors import KNeighborsClassifier

In [22]:
# KNN baseline with a hand-picked k=10 (tuned by grid search further down).
# NOTE(review): KNN is distance-based and the features are passed in as-is
# here -- confirm whether they were scaled upstream, otherwise consider
# standardising them.
knn = KNeighborsClassifier(n_neighbors=10)

In [23]:
# Fit the k=10 baseline on the training split.
knn.fit(X_train, y_train)

In [24]:
# Overwrites `predictions` with the KNN baseline's labels for the test split.
predictions = knn.predict(X_test)

In [25]:
# Confusion matrix of the KNN baseline (rows: true class, columns: predicted class).
confusion_matrix(y_test, predictions)

array([[146,  20],
       [ 35,  61]], dtype=int64)

In [26]:
# Fraction of test rows the KNN baseline classified correctly.
knn_accuracy = accuracy_score(y_true=y_test, y_pred=predictions)

In [27]:
# Precision of the KNN baseline on the test split.
knn_precision = precision_score(y_true=y_test, y_pred=predictions)

In [28]:
# F1 score of the KNN baseline on the test split.
knn_f1 = f1_score(y_true=y_test, y_pred=predictions)

In [29]:
# Hyperparameters of the KNN baseline, kept as a dict so they can be
# serialised into the results table.
hp = dict(n_neighbors=10)

In [30]:
import json

In [31]:
# Result row for the KNN baseline; the hyperparameters are stored as a JSON string.
dic = dict(
    model='KNN',
    accuracy=knn_accuracy,
    precision=knn_precision,
    f1=knn_f1,
    hp=json.dumps(hp),
)

In [32]:
# Append the KNN row; ignore_index=True renumbers the rows 0..n-1.
# NOTE: df_results is rebuilt from itself, so re-running this cell appends a
# duplicate row.
df_results = pd.concat([df_results, pd.DataFrame(dic, index=[0])], ignore_index=True)

In [33]:
df_results

Unnamed: 0,model,accuracy,precision,f1,hp
0,Logistic,0.862595,0.865854,0.797753,
1,KNN,0.790076,0.753086,0.689266,"{""n_neighbors"": 10}"


In [34]:
from sklearn.model_selection import GridSearchCV

In [35]:
# Candidate neighbourhood sizes for the KNN grid search: k = 1 .. 10.
parameters = {'n_neighbors': list(range(1, 11))}

In [36]:
# Unconfigured KNN estimator used as the template for the grid search.
knn_1 = KNeighborsClassifier()

In [37]:
# Exhaustive cross-validated search over the `parameters` grid. cv and scoring
# are not passed, so scikit-learn's defaults apply.
knn_cv = GridSearchCV(knn_1, parameters)

In [38]:
# Search on the training split only, so the test split stays untouched for the
# final evaluation.
knn_cv.fit(X_train, y_train)

In [39]:
knn_cv.best_params_

{'n_neighbors': 9}

In [40]:
# Rebuild KNN with k=9, the value reported by knn_cv.best_params_.
# NOTE(review): the value is typed by hand; unpacking knn_cv.best_params_
# would keep it from drifting if the data or the grid changes.
knn = KNeighborsClassifier(n_neighbors=9)

In [41]:
# Fit the tuned KNN on the training split.
knn.fit(X_train, y_train)

In [42]:
# Score the tuned KNN on the held-out test split.
predictions = knn.predict(X=X_test)

# Overwrite the baseline KNN metrics with the tuned model's metrics.
knn_accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
knn_precision = precision_score(y_true=y_test, y_pred=predictions)
knn_f1 = f1_score(y_true=y_test, y_pred=predictions)

In [43]:
# Hyperparameters of the tuned KNN, serialised into the results row below.
hp = dict(n_neighbors=9)

dic = dict(
    model='KNN',
    accuracy=knn_accuracy,
    precision=knn_precision,
    f1=knn_f1,
    hp=json.dumps(hp),
)

# Append the tuned-KNN row (re-running this cell appends it again).
df_results = pd.concat(
    (df_results, pd.DataFrame(dic, index=[0])),
    ignore_index=True,
)

In [44]:
df_results

Unnamed: 0,model,accuracy,precision,f1,hp
0,Logistic,0.862595,0.865854,0.797753,
1,KNN,0.790076,0.753086,0.689266,"{""n_neighbors"": 10}"
2,KNN,0.793893,0.728261,0.712766,"{""n_neighbors"": 9}"


## DecisionTree Classification

In [45]:
from sklearn.tree import DecisionTreeClassifier

In [46]:
# Baseline decision tree with default hyperparameters.
# random_state pins the tree's internal randomness (same seed as the
# train/test split) so the scores below are identical on every run.
tree = DecisionTreeClassifier(random_state=101)

In [47]:
# Fit the baseline tree on the training split.
tree.fit(X_train, y_train)

In [48]:
# Overwrites `predictions` with the baseline tree's labels for the test split.
predictions = tree.predict(X_test)

In [49]:
# Test-split metrics for the baseline decision tree.
dt_accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
dt_precision = precision_score(y_true=y_test, y_pred=predictions)
dt_f1 = f1_score(y_true=y_test, y_pred=predictions)

In [50]:
# Result row for the baseline tree; 'hp' is empty because no hyperparameters
# were recorded for it.
dic = dict(
    model='DecisionTree',
    accuracy=dt_accuracy,
    precision=dt_precision,
    f1=dt_f1,
    hp='',
)

# Append the row (re-running this cell appends it again).
df_results = pd.concat(
    (df_results, pd.DataFrame(dic, index=[0])),
    ignore_index=True,
)

In [51]:
df_results

Unnamed: 0,model,accuracy,precision,f1,hp
0,Logistic,0.862595,0.865854,0.797753,
1,KNN,0.790076,0.753086,0.689266,"{""n_neighbors"": 10}"
2,KNN,0.793893,0.728261,0.712766,"{""n_neighbors"": 9}"
3,DecisionTree,0.828244,0.822785,0.742857,


In [52]:
from sklearn.metrics import classification_report

In [53]:
# Per-class precision / recall / F1 for the current `predictions`, which at
# this point come from the baseline decision tree.
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.83      0.92      0.87       166
           1       0.82      0.68      0.74        96

    accuracy                           0.83       262
   macro avg       0.83      0.80      0.81       262
weighted avg       0.83      0.83      0.82       262



In [54]:
# Template estimator for the decision-tree grid search.
# random_state is fixed (same seed as the train/test split) so that the tree's
# internal randomness cannot change which grid point wins between runs.
tree_1 = DecisionTreeClassifier(random_state=101)

In [55]:
# Grid of leaf-size / split-size limits to try for the decision tree.
parameters = dict(
    min_samples_leaf=[1, 2, 4],
    min_samples_split=[2, 5],
)

In [56]:
# Cross-validated search over the tree grid; cv and scoring are left at
# scikit-learn's defaults.
dt_cv = GridSearchCV(tree_1, parameters)

In [57]:
# Search on the training split only; the test split is kept for the final score.
dt_cv.fit(X_train, y_train)

In [58]:
dt_cv.best_params_

{'min_samples_leaf': 4, 'min_samples_split': 5}

In [59]:
tree = DecisionTreeClassifier(min_samples_leaf=4, min_samples_split=2)

In [60]:
tree.fit(X_train, y_train)

In [61]:
predictions = tree.predict(X_test)

In [62]:
dt_accuracy = accuracy_score(y_test, predictions, normalize=True)

dt_precision = precision_score(y_test, predictions)

dt_f1 = f1_score(y_test, predictions)

In [63]:
hp = {'min_samples_leaf': 4, 'min_samples_split': 2}

dic = {'model': 'DecisionTree', 'accuracy': dt_accuracy, 'precision': dt_precision, 'f1': dt_f1, 'hp': json.dumps(hp)}

df_results = pd.concat([df_results, pd.DataFrame(dic, index=[0])], ignore_index=True)

In [64]:
df_results

Unnamed: 0,model,accuracy,precision,f1,hp
0,Logistic,0.862595,0.865854,0.797753,
1,KNN,0.790076,0.753086,0.689266,"{""n_neighbors"": 10}"
2,KNN,0.793893,0.728261,0.712766,"{""n_neighbors"": 9}"
3,DecisionTree,0.828244,0.822785,0.742857,
4,DecisionTree,0.858779,0.847059,0.79558,"{""min_samples_leaf"": 4, ""min_samples_split"": 2}"


## Random Forest Classification

In [65]:
from sklearn.ensemble import RandomForestClassifier

In [66]:
# Baseline random forest with default hyperparameters.
# A forest is built from random bootstrap samples and random feature subsets,
# so random_state is fixed (same seed as the train/test split) to make the
# scores below repeatable.
rfc = RandomForestClassifier(random_state=101)

In [67]:
# Fit the baseline forest on the training split.
rfc.fit(X_train, y_train)

In [68]:
# Overwrites `predictions` with the baseline forest's labels for the test split.
predictions = rfc.predict(X_test)

In [69]:
# Test-split metrics for the baseline random forest.
rfc_accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
rfc_precision = precision_score(y_true=y_test, y_pred=predictions)
rfc_f1 = f1_score(y_true=y_test, y_pred=predictions)

In [70]:
# Read the hyperparameters back from the fitted baseline forest rather than
# hard-coding what the library defaults are assumed to be; the recorded values
# then always match the model that produced the scores (the default for
# n_estimators, for example, has changed between scikit-learn releases).
hp = {
    name: rfc.get_params()[name]
    for name in ('min_samples_leaf', 'min_samples_split', 'n_estimators')
}

dic = {'model': 'RandomForest', 'accuracy': rfc_accuracy,
       'precision': rfc_precision, 'f1': rfc_f1, 'hp': json.dumps(hp)}

# Append the baseline-forest row (re-running this cell appends it again).
df_results = pd.concat([df_results, pd.DataFrame(dic, index=[0])], ignore_index=True)

In [71]:
# Grid of leaf-size, split-size and forest-size values to try for the random forest.
parameters = dict(
    min_samples_leaf=[1, 2, 4],
    min_samples_split=[2, 5, 10],
    n_estimators=[10, 20, 30],
)

In [72]:
# Cross-validated search over the forest grid, run on the training split only.
# cv and scoring are left at scikit-learn's defaults.
# NOTE(review): GridSearchCV works on clones of the estimator it is given, so
# the already-fitted `rfc` should be left unchanged -- confirm for the
# installed scikit-learn version.
rfc_cv = GridSearchCV(rfc, parameters)
rfc_cv.fit(X_train, y_train)

In [73]:
rfc_cv.best_params_

{'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 20}

In [74]:
rfc = RandomForestClassifier(min_samples_leaf=4, min_samples_split=5, n_estimators=30)

In [75]:
rfc.fit(X_train, y_train)

In [76]:
predictions = rfc.predict(X_test)

In [77]:
rfc_accuracy = accuracy_score(y_test, predictions, normalize=True)

rfc_precision = precision_score(y_test, predictions)

rfc_f1 = f1_score(y_test, predictions)

In [78]:
hp = {'min_samples_leaf': 4, 'min_samples_split': 5, 'n_estimators': 30}

dic = {'model': 'RandomForest', 'accuracy': rfc_accuracy, 
       'precision': rfc_precision, 'f1': rfc_f1, 'hp': json.dumps(hp)}

df_results = pd.concat([df_results, pd.DataFrame(dic, index=[0])], ignore_index=True)

In [79]:
df_results

Unnamed: 0,model,accuracy,precision,f1,hp
0,Logistic,0.862595,0.865854,0.797753,
1,KNN,0.790076,0.753086,0.689266,"{""n_neighbors"": 10}"
2,KNN,0.793893,0.728261,0.712766,"{""n_neighbors"": 9}"
3,DecisionTree,0.828244,0.822785,0.742857,
4,DecisionTree,0.858779,0.847059,0.79558,"{""min_samples_leaf"": 4, ""min_samples_split"": 2}"
5,RandomForest,0.870229,0.907895,0.802326,"{""min_samples_leaf"": 1, ""min_samples_split"": 2..."
6,RandomForest,0.870229,0.8875,0.806818,"{""min_samples_leaf"": 4, ""min_samples_split"": 5..."


Based on these results, the best model for this dataset is RandomForest.<br>
So, let's get more information about it.<br>
First, let's use K-Fold cross-validation.

## K-Fold Cross Validation

In [80]:
from sklearn.model_selection import cross_val_score
# Build a fresh, unfitted forest with the same hyperparameters as the tuned
# `rfc` from the previous section. Copying them with get_params() avoids
# typing the values a second time, where they could drift from the model that
# was actually scored in the results table.
rfc = RandomForestClassifier(**rfc.get_params())
# Fit on the training split so feature_importances_ is available further down.
rfc.fit(X_train, y_train)

In [81]:
# 10-fold cross-validated accuracy of the forest on the training split.
accuracy_scores = cross_val_score(estimator=rfc, X=X_train, y=y_train, scoring="accuracy", cv=10)

# Per-fold scores, followed by their mean and spread.
print("Scores:", accuracy_scores)
print("Mean:", np.mean(accuracy_scores))
print("Standard Deviation:", np.std(accuracy_scores))

Scores: [0.88571429 0.86666667 0.85714286 0.86666667 0.88571429 0.81904762
 0.86666667 0.85576923 0.85576923 0.84615385]
Mean: 0.8605311355311356
Standard Deviation: 0.018345944585317187


The cross-validated **accuracy** is about **0.86**, with a standard deviation of about **0.02**.

In [82]:
# 10-fold cross-validated precision of the forest on the training split.
precision_scores = cross_val_score(estimator=rfc, X=X_train, y=y_train, scoring="precision", cv=10)

# Per-fold scores, followed by their mean and spread.
print("Scores:", precision_scores)
print("Mean:", np.mean(precision_scores))
print("Standard Deviation:", np.std(precision_scores))

Scores: [0.82926829 0.90322581 0.84615385 0.86486486 0.88888889 0.79411765
 0.85714286 0.80487805 0.82051282 0.82857143]
Mean: 0.8437624501108557
Standard Deviation: 0.033407107549473763


The cross-validated **precision** is about **0.84**, with a standard deviation of about **0.03**.

In [83]:
# 10-fold cross-validated F1 score of the forest on the training split.
f1_scores = cross_val_score(estimator=rfc, X=X_train, y=y_train, scoring="f1", cv=10)

# Per-fold scores, followed by their mean and spread.
print("Scores:", f1_scores)
print("Mean:", np.mean(f1_scores))
print("Standard Deviation:", np.std(f1_scores))

Scores: [0.825      0.78873239 0.77333333 0.82051282 0.83783784 0.72
 0.78378378 0.81012658 0.81578947 0.75      ]
Mean: 0.7925116225796665
Standard Deviation: 0.03502800405077116


Finally, the cross-validated **F1** score is about **0.79**, with a standard deviation of about **0.04**.

Now, let's take a look at the feature importances.

In [84]:
# Pair each training column with the forest's importance for it, rounded to
# three decimals.
importances = pd.DataFrame(
    {
        'feature': X_train.columns,
        'importance': rfc.feature_importances_.round(3),
    }
)

In [85]:
# Show the features from most to least important.
importances.sort_values(by='importance', ascending=False)

Unnamed: 0,feature,importance
1,Sex,0.64
5,Fare,0.079
2,Age,0.067
8,sector,0.055
0,Pclass,0.044
3,SibSp,0.036
4,Parch,0.029
7,nCabins,0.025
6,Embarked,0.024


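This result shows that **Sex, Fare, Age** are the most important features for predicting passenger survival in this model, while **nCabins, Embarked** have the least effect. Note that feature importance measures how much the model relies on a feature, not that feature's correlation with survival.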