<a href="https://colab.research.google.com/github/jgonzet/Kaggle-Competitions/blob/main/Titanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn.pipeline
import sklearn.ensemble
%matplotlib inline

# Load Data

In [2]:
train_df = pd.read_csv("/content/train.csv")
test_df  = pd.read_csv("/content/test.csv")
#pd.set_option('display.max_rows', train_df.shape[0]+1)

# EDA

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [None]:
train_df.Age.describe()

count    714.000000
mean      29.699118
std       14.526497
min        0.420000
25%       20.125000
50%       28.000000
75%       38.000000
max       80.000000
Name: Age, dtype: float64

In [None]:
total = train_df.isnull().sum().sort_values(ascending=False) #
percent_1 = train_df.isnull().sum()/train_df.isnull().count()*100 #total de nulls sobre el total
percent_2 = (round(percent_1, 1)).sort_values(ascending=False)
missing_data = pd.concat([total, percent_2], axis=1, keys=['Total', '%'])
missing_data.head(5)

Unnamed: 0,Total,%
Cabin,687,77.1
Age,177,19.9
Embarked,2,0.2
Fare,0,0.0
Ticket,0,0.0


# Feature Engineering


In [5]:
# Fill Embarked NaN with most frequen embarked ('S'):

train_df['Embarked'] = train_df['Embarked'].fillna('S')
test_df['Embarked'] = test_df['Embarked'].fillna('S')

# Mapping ports:

ports = {"S": 0, "C": 1, "Q": 2}
train_df['Embarked'] = train_df['Embarked'].map(ports)
test_df['Embarked'] = test_df['Embarked'].map(ports)

In [168]:
# Transform Fare to int (fillna is necessary because of errors):

train_df['Fare'] = train_df['Fare'].fillna(train_df.Fare.median()).astype(int)
test_df['Fare']  = test_df['Fare'].fillna(test_df.Fare.median()).astype(int)

In [169]:
# Family size = siblings + parch:

data = [train_df, test_df]

for dataset in data:
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1

#print (train_df[['FamilySize', 'Survived']].groupby(['FamilySize'], as_index=False).mean())

In [170]:
for dataset in data:
    dataset['IsAlone'] = 0
    dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1

#print (train_df[['IsAlone', 'Survived']].groupby(['IsAlone'], as_index=False).mean())

In [171]:
data = [train_df, test_df]
for dataset in data:
    dataset['relatives'] = dataset['SibSp'] + dataset['Parch']
    dataset.loc[dataset['relatives'] > 0, 'travelled_alone'] = 'No'
    dataset.loc[dataset['relatives'] == 0, 'travelled_alone'] = 'Yes'

In [173]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,relatives,travelled_alone,FamilySize,IsAlone
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7,,0,1,No,2,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71,C85,1,1,No,2,0
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7,,0,0,Yes,1,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53,C123,0,1,No,2,0
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8,,0,0,Yes,1,1


In [8]:
# Mapping male/female to 0/1:

genders = {"male": 0, "female": 1}
train_df['Sex'] = train_df['Sex'].map(genders)
test_df['Sex']  = test_df['Sex'].map(genders)

In [9]:
# FillNa Age

train_df['Age'] = train_df['Age'].fillna(train_df['Age'].mean())
test_df['Age'] = test_df['Age'].fillna(test_df['Age'].mean())

In [10]:
X_train = train_df.drop(columns = ['Survived','PassengerId','Name','Cabin','Ticket'])
Y_train = train_df['Survived']

X_test = test_df.drop(columns = ['PassengerId','Name','Cabin','Ticket'])

# Training Machine Learning Classifiers

## Random Forest

In [None]:
# Classifier building:

classifier = sklearn.ensemble.RandomForestClassifier(n_estimators=100)
classifier = sklearn.pipeline.make_pipeline(sklearn.preprocessing.StandardScaler(),classifier) 

In [None]:
# Grid Search

param_grid = { "criterion" : ["gini", "entropy"], "min_samples_leaf" : [1,2,3],
               "min_samples_split" : [2,4,6,8,9,10], "n_estimators": [100]}


#rf = sklearn.ensemble.RandomForestClassifier(n_estimators=100, max_features='auto', oob_score=True, random_state=1, n_jobs=-1)
rf = sklearn.ensemble.RandomForestClassifier(random_state=42)
clf = sklearn.model_selection.GridSearchCV(estimator=rf, param_grid=param_grid, n_jobs=-1)
clf.fit(X_train, Y_train)
clf.best_params_

{'criterion': 'gini',
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'n_estimators': 100}

In [None]:
# Random Forest with best hyperparameters:

random_forest = sklearn.ensemble.RandomForestClassifier(criterion = "gini", 
                                                        min_samples_leaf = 2, 
                                                        min_samples_split = 2,   
                                                        n_estimators=100, 
                                                        max_features='auto', 
                                                        oob_score=True, 
                                                        random_state=42, 
                                                        n_jobs=-1)


#random_forest = sklearn.pipeline.make_pipeline(sklearn.preprocessing.StandardScaler(),random_forest) 
random_forest.fit(X_train, Y_train)

random_forest.score(X_train, Y_train)

print("oob score:", round(random_forest.oob_score_, 4)*100, "%")

oob score: 82.27 %


In [None]:
# Cross validation fitting:

scores = sklearn.model_selection.cross_val_score(random_forest,X_train,Y_train,cv=10,scoring='accuracy')
print("%0.3f accuracy with a standard deviation of %0.3f" % (scores.mean(), scores.std()))

0.833 accuracy with a standard deviation of 0.048


In [None]:
# Confusion Matrix evaluation:

predictions = sklearn.model_selection.cross_val_predict(random_forest, X_train, Y_train, cv=10)
sklearn.metrics.confusion_matrix(Y_train, predictions)

array([[493,  56],
       [ 96, 246]])

In [None]:
# Precision and Recall:

print("Precision:", sklearn.metrics.precision_score(Y_train, predictions))
print("Recall:",sklearn.metrics.recall_score(Y_train, predictions))

Precision: 0.8145695364238411
Recall: 0.7192982456140351


## SVM

In [148]:
classifier = sklearn.svm.SVC(kernel='rbf')
classifier = sklearn.pipeline.make_pipeline(sklearn.preprocessing.StandardScaler(),classifier)  

In [144]:
# Grid Search

param_grid = { 'C' : [0.1,0.09,0.11], 'kernel':['linear']}


#rf = sklearn.ensemble.RandomForestClassifier(n_estimators=100, max_features='auto', oob_score=True, random_state=1, n_jobs=-1)
svm = sklearn.svm.SVC()
clf = sklearn.model_selection.GridSearchCV(estimator=svm, param_grid=param_grid, n_jobs=-1)
clf.fit(X_train, Y_train)
clf.best_params_

{'C': 0.1, 'kernel': 'linear'}

In [149]:
# SVM classifier with best hyperparameters

svmc = sklearn.svm.SVC(C=0.1,kernel='linear') 
                       
svmc.fit(X_train, Y_train)
svmc.score(X_train, Y_train)

0.7867564534231201

In [146]:
# Cross validation accuracy evaluation:

scores = sklearn.model_selection.cross_val_score(svmc,X_train,Y_train,cv=10,scoring='accuracy')
print("%0.3f accuracy with a standard deviation of %0.3f" % (scores.mean(), scores.std()))

0.787 accuracy with a standard deviation of 0.029


In [147]:
# Confusion Matrix evaluation:

predictions = sklearn.model_selection.cross_val_predict(svmc, X_train, Y_train, cv=10)
sklearn.metrics.confusion_matrix(Y_train, predictions)

KeyboardInterrupt: ignored

In [None]:
# Precision and Recall:

print("Precision:", sklearn.metrics.precision_score(Y_train, predictions))
print("Recall:",sklearn.metrics.recall_score(Y_train, predictions))

# KNN

In [152]:
#class sklearn.neighbors.KNeighborsClassifier(n_neighbors=5, *, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None, **kwargs)

knn = sklearn.neighbors.KNeighborsClassifier()
knn = sklearn.pipeline.make_pipeline(sklearn.preprocessing.StandardScaler(),classifier)  

In [48]:
# Grid Search

param_grid = { 'n_neighbors' : [1,2,3,4,5]}


knn = sklearn.neighbors.KNeighborsClassifier()
clf = sklearn.model_selection.GridSearchCV(estimator=knn, param_grid=param_grid, n_jobs=-1)
clf.fit(X_train, Y_train)
clf.best_params_

{'n_neighbors': 1}

In [153]:
# SVM classifier with best hyperparameters

knn = sklearn.neighbors.KNeighborsClassifier(n_neighbors=3)
                       
knn.fit(X_train, Y_train)
knn.score(X_train, Y_train)

0.8361391694725028

In [93]:
# Cross validation accuracy evaluation:

scores = sklearn.model_selection.cross_val_score(knn,X_train,Y_train,cv=10,scoring='accuracy')
print("%0.3f accuracy with a standard deviation of %0.3f" % (scores.mean(), scores.std()))

0.708 accuracy with a standard deviation of 0.061


In [94]:
# Confusion Matrix evaluation:

predictions = sklearn.model_selection.cross_val_predict(knn, X_train, Y_train, cv=10)
sklearn.metrics.confusion_matrix(Y_train, predictions)

array([[442, 107],
       [153, 189]])

In [95]:
# Precision and Recall:

print("Precision:", sklearn.metrics.precision_score(Y_train, predictions))
print("Recall:",sklearn.metrics.recall_score(Y_train, predictions))

Precision: 0.6385135135135135
Recall: 0.5526315789473685


# Catboost

In [98]:
#!pip install catboost
from catboost import CatBoostClassifier


Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/20/37/bc4e0ddc30c07a96482abf1de7ed1ca54e59bba2026a33bca6d2ef286e5b/catboost-0.24.4-cp36-none-manylinux1_x86_64.whl (65.7MB)
[K     |████████████████████████████████| 65.8MB 59kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.24.4


In [124]:
# Grid Search

param_grid = { 'learning_rate':[1.2],'depth':[6],'iterations':[1,2,3,4,5,6,7,8,9,10]}

catb = CatBoostClassifier()

clf = sklearn.model_selection.GridSearchCV(estimator=catb, param_grid=param_grid, n_jobs=-1).fit(X_train, Y_train)

#clf.fit(X_train, Y_train)

clf.best_params_

0:	learn: 0.4338271	total: 607us	remaining: 2.43ms
1:	learn: 0.4124672	total: 1.7ms	remaining: 2.55ms
2:	learn: 0.3964180	total: 2.59ms	remaining: 1.73ms
3:	learn: 0.3799868	total: 3.47ms	remaining: 867us
4:	learn: 0.3732998	total: 4.26ms	remaining: 0us


learning rate is greater than 1. You probably need to decrease learning rate.


{'depth': 6, 'iterations': 5, 'learning_rate': 1.2}

In [137]:
# Catboost classifier with best hyperparameters:

model = CatBoostClassifier(iterations=1000,
                           learning_rate=1,
                           depth=6)
                       
model.fit(X_train, Y_train)
model.score(X_train, Y_train)

0:	learn: 0.4157529	total: 1.47ms	remaining: 1.47s
1:	learn: 0.3886375	total: 3.23ms	remaining: 1.61s
2:	learn: 0.3719580	total: 4.96ms	remaining: 1.65s
3:	learn: 0.3641652	total: 6.66ms	remaining: 1.66s
4:	learn: 0.3512545	total: 8.56ms	remaining: 1.7s
5:	learn: 0.3472822	total: 10.4ms	remaining: 1.72s
6:	learn: 0.3407679	total: 12.6ms	remaining: 1.78s
7:	learn: 0.3283216	total: 14.4ms	remaining: 1.79s
8:	learn: 0.3156594	total: 18.9ms	remaining: 2.08s
9:	learn: 0.3058334	total: 20.7ms	remaining: 2.05s
10:	learn: 0.2927127	total: 22.5ms	remaining: 2.02s
11:	learn: 0.2806873	total: 24.5ms	remaining: 2.02s
12:	learn: 0.2744072	total: 26.5ms	remaining: 2.02s
13:	learn: 0.2688355	total: 28.4ms	remaining: 2s
14:	learn: 0.2615008	total: 31.2ms	remaining: 2.05s
15:	learn: 0.2497351	total: 33ms	remaining: 2.03s
16:	learn: 0.2438392	total: 34.2ms	remaining: 1.98s
17:	learn: 0.2239080	total: 35.3ms	remaining: 1.92s
18:	learn: 0.2159860	total: 36.3ms	remaining: 1.87s
19:	learn: 0.2128369	total: 

0.9652076318742986

In [138]:
# Cross validation accuracy evaluation:

scores = sklearn.model_selection.cross_val_score(model,X_train,Y_train,cv=10,scoring='accuracy')
print("%0.3f accuracy with a standard deviation of %0.3f" % (scores.mean(), scores.std()))

[1;30;43mSe han truncado las últimas 5000 líneas del flujo de salida.[0m
1:	learn: 0.3856555	total: 2.03ms	remaining: 1.01s
2:	learn: 0.3659151	total: 3.13ms	remaining: 1.04s
3:	learn: 0.3455894	total: 4.13ms	remaining: 1.03s
4:	learn: 0.3289673	total: 5.12ms	remaining: 1.02s
5:	learn: 0.3220103	total: 6.07ms	remaining: 1s
6:	learn: 0.3102977	total: 7.06ms	remaining: 1s
7:	learn: 0.3078296	total: 7.82ms	remaining: 970ms
8:	learn: 0.2975787	total: 8.79ms	remaining: 968ms
9:	learn: 0.2956112	total: 9.71ms	remaining: 961ms
10:	learn: 0.2908052	total: 10.7ms	remaining: 960ms
11:	learn: 0.2855103	total: 11.5ms	remaining: 951ms
12:	learn: 0.2733176	total: 12.5ms	remaining: 951ms
13:	learn: 0.2540758	total: 13.5ms	remaining: 952ms
14:	learn: 0.2400004	total: 14.4ms	remaining: 948ms
15:	learn: 0.2288714	total: 15.4ms	remaining: 947ms
16:	learn: 0.2251124	total: 16.4ms	remaining: 946ms
17:	learn: 0.2134817	total: 17.4ms	remaining: 950ms
18:	learn: 0.2072537	total: 18.5ms	remaining: 956ms
19:	

In [139]:
# Confusion Matrix evaluation:

predictions = sklearn.model_selection.cross_val_predict(model, X_train, Y_train, cv=10)
sklearn.metrics.confusion_matrix(Y_train, predictions)

[1;30;43mSe han truncado las últimas 5000 líneas del flujo de salida.[0m
0:	learn: 0.4130358	total: 1.25ms	remaining: 1.25s
1:	learn: 0.3856555	total: 2.43ms	remaining: 1.21s
2:	learn: 0.3659151	total: 3.81ms	remaining: 1.27s
3:	learn: 0.3455894	total: 4.87ms	remaining: 1.21s
4:	learn: 0.3289673	total: 5.99ms	remaining: 1.19s
5:	learn: 0.3220103	total: 7.14ms	remaining: 1.18s
6:	learn: 0.3102977	total: 8.22ms	remaining: 1.17s
7:	learn: 0.3078296	total: 9.13ms	remaining: 1.13s
8:	learn: 0.2975787	total: 10.3ms	remaining: 1.14s
9:	learn: 0.2956112	total: 11.4ms	remaining: 1.13s
10:	learn: 0.2908052	total: 12.5ms	remaining: 1.13s
11:	learn: 0.2855103	total: 13.6ms	remaining: 1.12s
12:	learn: 0.2733176	total: 14.7ms	remaining: 1.12s
13:	learn: 0.2540758	total: 16ms	remaining: 1.13s
14:	learn: 0.2400004	total: 17ms	remaining: 1.12s
15:	learn: 0.2288714	total: 18.1ms	remaining: 1.11s
16:	learn: 0.2251124	total: 19.3ms	remaining: 1.12s
17:	learn: 0.2134817	total: 20.4ms	remaining: 1.11s
18:

array([[478,  71],
       [ 90, 252]])

In [140]:
# Precision and Recall:

print("Precision:", sklearn.metrics.precision_score(Y_train, predictions))
print("Recall:",sklearn.metrics.recall_score(Y_train, predictions))

Precision: 0.7801857585139319
Recall: 0.7368421052631579


# LDA

In [158]:
# Instantiate our model
logreg = sklearn.linear_model.LogisticRegression()
# Fit our model to the training data
logreg.fit(X_train, Y_train)

# Output builder

In [159]:
classifier = logreg # [random_forest, svmc]

Y_prediction = classifier.predict(X_test)
Y_prediction_df = pd.DataFrame(Y_prediction)

Y_prediction_df = Y_prediction_df.reset_index()
Y_prediction_df.columns = ['PassengerId','Survived']
Y_prediction_df['PassengerId'] = Y_prediction_df['PassengerId'] + 892
Y_prediction_df = Y_prediction_df.set_index('PassengerId')

In [160]:
Y_prediction_df.to_csv("outputs.csv")