<a href="https://colab.research.google.com/github/jgonzet/Kaggle-Competitions/blob/main/Titanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Libraries

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn.ensemble
import sklearn.metrics
import sklearn.model_selection
import sklearn.neighbors
import sklearn.pipeline
import sklearn.preprocessing
import sklearn.svm
%matplotlib inline

# Load Data

In [2]:
# Load the Kaggle Titanic train/test splits from the Colab working directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("/content/train.csv", "/content/test.csv")
)
# Uncomment to show every row when a frame is displayed:
#pd.set_option('display.max_rows', train_df.shape[0]+1)

# EDA

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Overview of the training frame: column names, non-null counts and dtypes.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [None]:
# Summary statistics of Age; `count` only covers the non-null entries.
train_df.Age.describe()

count    714.000000
mean      29.699118
std       14.526497
min        0.420000
25%       20.125000
50%       28.000000
75%       38.000000
max       80.000000
Name: Age, dtype: float64

In [None]:
# Per-column missing-value report: absolute count and percentage of rows.
null_mask = train_df.isnull()
null_counts = null_mask.sum()
null_share = null_counts / null_mask.count() * 100  # nulls over total rows, in percent

missing_data = pd.concat(
    [
        null_counts.sort_values(ascending=False),
        null_share.round(1).sort_values(ascending=False),
    ],
    axis=1,
    keys=['Total', '%'],
)
missing_data.head(5)

Unnamed: 0,Total,%
Cabin,687,77.1
Age,177,19.9
Embarked,2,0.2
Fare,0,0.0
Ticket,0,0.0


# Feature Engineering


In [5]:
# Fill missing Embarked values with the most frequent port ('S').

for split_df in (train_df, test_df):
    split_df['Embarked'] = split_df['Embarked'].fillna('S')

In [6]:
# Encode the embarkation port letter as an integer code.
# NOTE: values that are not keys of `ports` become NaN, so running this cell
# a second time on the already-encoded column would blank it out.

ports = {"S": 0, "C": 1, "Q": 2}
for split_df in (train_df, test_df):
    split_df['Embarked'] = split_df['Embarked'].map(ports)


In [7]:
# Cast Fare to int. Missing fares have to be filled first because
# astype(int) cannot convert NaN.
# The fill value is learned from the training split only and reused for the
# test split, so both splits go through exactly the same transformation and
# no test-set statistic enters the preprocessing.

fare_fill = train_df['Fare'].mean()
train_df['Fare'] = train_df['Fare'].fillna(fare_fill).astype(int)
test_df['Fare']  = test_df['Fare'].fillna(fare_fill).astype(int)

In [8]:
# Encode Sex as a binary feature: male -> 0, female -> 1.

genders = {"male": 0, "female": 1}
for split_df in (train_df, test_df):
    split_df['Sex'] = split_df['Sex'].map(genders)

In [9]:
# Impute missing ages with the mean age.
# The mean is computed on the training split only and applied to both splits,
# so the test set is transformed with the same value the model was trained on
# instead of with its own statistic.

age_fill = train_df['Age'].mean()
train_df['Age'] = train_df['Age'].fillna(age_fill)
test_df['Age'] = test_df['Age'].fillna(age_fill)

In [10]:
# Split into target and feature matrices, dropping the columns that are not
# used as model inputs.
non_feature_cols = ['PassengerId','Name','Cabin','Ticket']

Y_train = train_df['Survived']
X_train = train_df.drop(columns = ['Survived'] + non_feature_cols)

X_test = test_df.drop(columns = non_feature_cols)

# Training Machine Learning Classifiers

## Random Forest

In [None]:
# Baseline pipeline: standardize the features, then a 100-tree Random Forest.

classifier = sklearn.pipeline.make_pipeline(
    sklearn.preprocessing.StandardScaler(),
    sklearn.ensemble.RandomForestClassifier(n_estimators=100),
)

In [None]:
# Exhaustive grid search over Random Forest hyperparameters.
# GridSearchCV's default cross-validation and scoring are used.

param_grid = {
    "criterion": ["gini", "entropy"],
    "min_samples_leaf": [1, 2, 3],
    "min_samples_split": [2, 4, 6, 8, 9, 10],
    "n_estimators": [100],
}

rf = sklearn.ensemble.RandomForestClassifier(random_state=42)
clf = sklearn.model_selection.GridSearchCV(rf, param_grid, n_jobs=-1)
clf.fit(X_train, Y_train)
clf.best_params_

{'criterion': 'gini',
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'n_estimators': 100}

In [None]:
# Random Forest refit with the hyperparameters selected by the grid search.
# max_features='sqrt' is the explicit spelling of what 'auto' meant for
# classifiers; 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3,
# where passing it raises an error.

random_forest = sklearn.ensemble.RandomForestClassifier(
    criterion="gini",
    min_samples_leaf=2,
    min_samples_split=2,
    n_estimators=100,
    max_features='sqrt',
    oob_score=True,      # keep out-of-bag predictions for an accuracy estimate
    random_state=42,
    n_jobs=-1,
)
random_forest.fit(X_train, Y_train)

# Accuracy on the very data the forest was fitted on (optimistic by construction).
print("train accuracy:", round(random_forest.score(X_train, Y_train) * 100, 2), "%")

# Scale to percent before rounding so no floating-point tail is printed.
print("oob score:", round(random_forest.oob_score_ * 100, 2), "%")

oob score: 82.27 %


In [None]:
# 10-fold cross-validated accuracy of the tuned Random Forest.

scores = sklearn.model_selection.cross_val_score(
    estimator=random_forest,
    X=X_train,
    y=Y_train,
    scoring='accuracy',
    cv=10,
)
mean_accuracy, std_accuracy = scores.mean(), scores.std()
print("%0.3f accuracy with a standard deviation of %0.3f" % (mean_accuracy, std_accuracy))

0.833 accuracy with a standard deviation of 0.048


In [None]:
# Confusion matrix from 10-fold out-of-fold predictions of the Random Forest
# (rows: true class, columns: predicted class).

predictions = sklearn.model_selection.cross_val_predict(
    estimator=random_forest, X=X_train, y=Y_train, cv=10
)
sklearn.metrics.confusion_matrix(y_true=Y_train, y_pred=predictions)

array([[493,  56],
       [ 96, 246]])

In [None]:
# Precision and recall of the out-of-fold predictions computed in the previous cell.

for metric_label, metric_fn in (
    ("Precision:", sklearn.metrics.precision_score),
    ("Recall:", sklearn.metrics.recall_score),
):
    print(metric_label, metric_fn(Y_train, predictions))

Precision: 0.8145695364238411
Recall: 0.7192982456140351


## SVM

In [None]:
# Pipeline: standardize the features, then an RBF-kernel SVM.
classifier = sklearn.pipeline.make_pipeline(
    sklearn.preprocessing.StandardScaler(),
    sklearn.svm.SVC(kernel='rbf'),
)

In [37]:
# Grid search over the regularization strength C of a linear-kernel SVM.
# GridSearchCV's default cross-validation and scoring are used.

param_grid = {
    'C': [0.1, 0.09, 0.11],
    'kernel': ['linear'],
}

svm = sklearn.svm.SVC()
clf = sklearn.model_selection.GridSearchCV(svm, param_grid, n_jobs=-1)
clf.fit(X_train, Y_train)
clf.best_params_

{'C': 0.1, 'kernel': 'linear'}

In [24]:
# Linear SVM with the hyperparameters selected by the grid search;
# the displayed value is the accuracy on the training data itself.

svmc = sklearn.svm.SVC(kernel='linear', C=0.1).fit(X_train, Y_train)
svmc.score(X_train, Y_train)

0.7867564534231201

In [38]:
# 10-fold cross-validated accuracy of the linear SVM.

scores = sklearn.model_selection.cross_val_score(
    estimator=svmc,
    X=X_train,
    y=Y_train,
    scoring='accuracy',
    cv=10,
)
mean_accuracy, std_accuracy = scores.mean(), scores.std()
print("%0.3f accuracy with a standard deviation of %0.3f" % (mean_accuracy, std_accuracy))

0.787 accuracy with a standard deviation of 0.029


In [40]:
# Confusion matrix from 10-fold out-of-fold predictions of the linear SVM
# (rows: true class, columns: predicted class).

predictions = sklearn.model_selection.cross_val_predict(
    estimator=svmc, X=X_train, y=Y_train, cv=10
)
sklearn.metrics.confusion_matrix(y_true=Y_train, y_pred=predictions)

array([[468,  81],
       [109, 233]])

In [41]:
# Precision and recall of the out-of-fold predictions computed in the previous cell.

for metric_label, metric_fn in (
    ("Precision:", sklearn.metrics.precision_score),
    ("Recall:", sklearn.metrics.recall_score),
):
    print(metric_label, metric_fn(Y_train, predictions))

Precision: 0.7420382165605095
Recall: 0.6812865497076024


## KNN

In [44]:
# Signature reference:
# sklearn.neighbors.KNeighborsClassifier(n_neighbors=5, *, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None, **kwargs)

# Pipeline: standardize the features, then k-nearest neighbours.
# The KNN estimator itself must be the final step; wrapping `classifier` here
# would silently build a second SVM pipeline under the name `knn`.
knn = sklearn.neighbors.KNeighborsClassifier()
knn = sklearn.pipeline.make_pipeline(sklearn.preprocessing.StandardScaler(), knn)

In [48]:
# Grid search over the number of neighbours of a KNN classifier.
# GridSearchCV's default cross-validation and scoring are used.

param_grid = {'n_neighbors': [1, 2, 3, 4, 5]}

knn = sklearn.neighbors.KNeighborsClassifier()
clf = sklearn.model_selection.GridSearchCV(knn, param_grid, n_jobs=-1)
clf.fit(X_train, Y_train)
clf.best_params_

{'n_neighbors': 1}

In [84]:
# KNN classifier with the hyperparameters selected by the grid search;
# the displayed value is the accuracy on the training data itself.

knn = sklearn.neighbors.KNeighborsClassifier(n_neighbors=1).fit(X_train, Y_train)
knn.score(X_train, Y_train)

0.9595959595959596

In [85]:
# 10-fold cross-validated accuracy of the KNN classifier.

scores = sklearn.model_selection.cross_val_score(
    estimator=knn,
    X=X_train,
    y=Y_train,
    scoring='accuracy',
    cv=10,
)
mean_accuracy, std_accuracy = scores.mean(), scores.std()
print("%0.3f accuracy with a standard deviation of %0.3f" % (mean_accuracy, std_accuracy))

0.700 accuracy with a standard deviation of 0.044


In [86]:
# Confusion matrix from 10-fold out-of-fold predictions of the KNN classifier
# (rows: true class, columns: predicted class).

predictions = sklearn.model_selection.cross_val_predict(
    estimator=knn, X=X_train, y=Y_train, cv=10
)
sklearn.metrics.confusion_matrix(y_true=Y_train, y_pred=predictions)

array([[434, 115],
       [152, 190]])

In [87]:
# Precision and recall of the out-of-fold predictions computed in the previous cell.

for metric_label, metric_fn in (
    ("Precision:", sklearn.metrics.precision_score),
    ("Recall:", sklearn.metrics.recall_score),
):
    print(metric_label, metric_fn(Y_train, predictions))

Precision: 0.6229508196721312
Recall: 0.5555555555555556


## Catboost

*TODO: this section is empty — no CatBoost model has been added yet.*

# Output builder

In [None]:
# Build the submission frame from the chosen model's predictions on the test set.
classifier = svmc # one of: random_forest, svmc

Y_prediction = classifier.predict(X_test)

# Take the ids from the test file itself instead of reconstructing them as
# "row position + 892": that only holds while the file starts at id 892 and
# is contiguous and unshuffled, and mislabels every row otherwise.
# X_test keeps test_df's row order, so ids and predictions line up by position.
Y_prediction_df = pd.DataFrame({
    'PassengerId': test_df['PassengerId'].to_numpy(),
    'Survived': Y_prediction,
}).set_index('PassengerId')

In [None]:
# Write the predictions to outputs.csv in the current working directory;
# with to_csv's defaults the frame's index is written as the first column.
Y_prediction_df.to_csv("outputs.csv")