In [163]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder

train = pd.read_csv('data/train.csv')
X_truetest = pd.read_csv('data/test.csv')
index_id = X_truetest.PassengerId
print('Setup completed')

Setup completed


In [128]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


On observe que dans la but de prédire la survie des passagers, notre colonne des labels y va être 'Survived'. On est ici sur un problème de classification, nous devons prédire dans quelle classe se trouve nos passagers, c'est-à-dire si ils ont survécu ou non. L'idée ici va être de commencer par analyser nos colonnes afin de voir quels features nous allons garder, et ensuite de faire une classification dummy et comparer nos modèles entre eux.

**Important à noter :** le dataset fourni par Kaggle a déjà éliminé certaines colonnes: boat et body. Ces colonnes pouvaient avoir la tendance de spoiler le modèle, il est donc plus sage qu'elles soient absentes, si elles ne l'étaient pas nous aurions dû le faire nous même.

In [129]:
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [130]:
train.isnull().sum()  #On veut savoir le nombre de valeurs manquantes dans notre dataset

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

Ce projet n'inclue pas de traitement du langage naturel, ainsi notre modèle ne pourra tirer profit d'une colonne dont les valeurs sont toutes différentes. On doit logiquement éliminer les colonnes 'Name', 'Ticket', 'Cabin' et 'PassengerId' (on aurait pu importer le df directement avec l'ID en index). On s'enlève par la même occasion le problème des données manquantes dans la colonne 'Cabin'.

In [131]:
train=train.drop(columns=['Cabin', 'Name', 'Ticket', 'PassengerId'])
train.columns

Index(['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
       'Embarked'],
      dtype='object')

On va créer de nouvelles colonnes en encodant les lieux d'embarcations et le sexe des personnes à bord. On a deux options : le get_dummies de pandas ou le OneHotEncoder de sklearn. Comme on explore les données et qu'on ne veut pas faire une pipeline tout de suite, on va utiliser le get_dummies.

In [132]:
train=pd.get_dummies(train, drop_first=True)
train.columns

Index(['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_male',
       'Embarked_Q', 'Embarked_S'],
      dtype='object')

In [133]:
X = train.drop(columns='Survived')
y=train.Survived

In [134]:
from sklearn import model_selection
X_train, X_test, y_train, y_test = model_selection.train_test_split(X ,y, test_size=0.3, random_state=4)

On doit désormais imputer nos données car il nous en manque, notamment dans la colonne 'Age'. Pour cela on va utiliser les imputers de sklearn.

In [135]:
from sklearn.experimental import enable_iterative_imputer
from sklearn import impute
num_cols=['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
imputer=impute.IterativeImputer()
imputer.fit(X_train[num_cols])
imputed=imputer.transform(X_train[num_cols])
X_train.loc[:,num_cols]=imputed
imputed_t=imputer.transform(X_test[num_cols])
X_test.loc[:,num_cols]=imputed_t

On peut désormais standardiser nos données, toutefois on ne va pas standardiser nos colonnes factices.

In [136]:
from sklearn import preprocessing
scaler=preprocessing.StandardScaler()
cols= "Pclass,Age,SibSp,Parch,Fare".split(",")

scaler.fit(X_train[cols])
X_train.loc[:,cols] = scaler.transform(X_train[cols])
X_test.loc[:,cols] = scaler.transform(X_test[cols])

#We modified only the columns we were interested in, all of the columns except the created ones



 -1.53154368  0.83314458 -0.34919955 -1.53154368  0.83314458 -0.34919955
  0.83314458  0.83314458 -1.53154368  0.83314458  0.83314458 -0.34919955
  0.83314458  0.83314458  0.83314458  0.83314458  0.83314458 -0.34919955
 -0.34919955  0.83314458  0.83314458 -1.53154368 -1.53154368 -1.53154368
  0.83314458  0.83314458  0.83314458  0.83314458  0.83314458 -1.53154368
 -1.53154368  0.83314458  0.83314458 -0.34919955 -1.53154368  0.83314458
  0.83314458  0.83314458 -1.53154368  0.83314458  0.83314458  0.83314458
 -1.53154368  0.83314458  0.83314458 -0.34919955  0.83314458 -1.53154368
 -0.34919955  0.83314458 -0.34919955  0.83314458 -1.53154368 -1.53154368
 -1.53154368  0.83314458 -1.53154368  0.83314458  0.83314458 -1.53154368
  0.83314458  0.83314458 -1.53154368  0.83314458  0.83314458 -1.53154368
  0.83314458 -1.53154368  0.83314458 -1.53154368 -1.53154368  0.83314458
  0.83314458  0.83314458  0.83314458  0.83314458 -1.53154368  0.83314458
  0.83314458  0.83314458 -0.34919955  0.83314458  0

On peut créer notre algorithme de référence avec le DummyClassifier dans le but de pouvoir comparer nos modèles par la suite.

In [137]:
from sklearn.dummy import DummyClassifier
bm=DummyClassifier()
bm.fit(X_train, y_train)
bm.score(X_test, y_test)

0.664179104477612

On va désormais tester différentes familles d'algorithmes, seulement nous allons faire des validations à k-plis donc on va redonner tout le dataset à étudier, on doit donc concaténer nos données à nouveau.

In [138]:
X_train=pd.DataFrame(X_train, columns=X_train.columns)
X_test=pd.DataFrame(X_test, columns=X_test.columns)
X=pd.concat([X_train, X_test]) 
y=pd.concat([y_train,y_test])

In [139]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import xgboost

In [140]:
for model in [DummyClassifier, LogisticRegression, DecisionTreeClassifier, KNeighborsClassifier, GaussianNB, SVC, RandomForestClassifier, xgboost.XGBClassifier]:
    cls = model()
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=4)
    s=model_selection.cross_val_score(cls, X, y, scoring="roc_auc", cv=kfold)
    print(
        f"{model.__name__:22} AUC:"
        f"{s.mean():.3f}    STD:  {s.std():.2f}"
    )

DummyClassifier        AUC:0.500    STD:  0.00
LogisticRegression     AUC:0.846    STD:  0.05
DecisionTreeClassifier AUC:0.767    STD:  0.05
KNeighborsClassifier   AUC:0.839    STD:  0.05
GaussianNB             AUC:0.829    STD:  0.05
SVC                    AUC:0.851    STD:  0.04
RandomForestClassifier AUC:0.867    STD:  0.04
XGBClassifier          AUC:0.861    STD:  0.06


Looking at the results, we want to pursue the idea of a Random Forest Classifier. We are going now to create the model and try to optimize through the values of the hyperparameters, and to explore it we are going to use a Grid Search.

In [141]:
#We test again a model in order to have the value of accuracy or 'precision'
from sklearn import metrics
rf1=RandomForestClassifier(random_state=4)
rf1.fit(X_train,y_train)
print(rf1.score(X_test,y_test))
print(metrics.precision_score(y_test, rf1.predict(X_test)))

0.8097014925373134
0.7294117647058823


Now we are optimizing with the hyperparameters !

In [142]:
rf=RandomForestClassifier(random_state=4)

param_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': [3, 5, 10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [0.1, 1, 2, 5],
    'max_features': ['sqrt', 'log2'],
    'random_state': [42]
}

grid = model_selection.GridSearchCV(rf, param_grid=param_grid, cv=kfold)

grid.fit(X_train,y_train)

KeyboardInterrupt: 

In [143]:
print(grid.best_params_)

AttributeError: 'GridSearchCV' object has no attribute 'best_params_'

In [144]:
rf_final = RandomForestClassifier ( max_depth=10, max_features='log2', min_samples_leaf=1, min_samples_split=5, n_estimators=100, random_state=42)
rf_final.fit(X_train, y_train)
rf_final.score(X_test, y_test)

0.8544776119402985

In [145]:
X.shape[0] == y.shape[0]

True

In [146]:
rf_final.fit(X,y)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,10
,min_samples_split,5
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'log2'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


On doit désormais refaire le même preprocessing pour notre dataset de test :

In [162]:
def process(df):
    df=df.drop(columns=['Cabin', 'Name', 'Ticket', 'PassengerId'])
    df=pd.get_dummies(df, drop_first=True)
    num_cols=['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
    df[num_cols] = df[num_cols].astype(float)
    imputed=imputer.transform(df[num_cols])
    df.loc[:,num_cols]=imputed

    cols= "Pclass,Age,SibSp,Parch,Fare".split(",")
    df.loc[:,cols] = scaler.transform(df[cols])

    return df

In [174]:
import pickle
pic=pickle.dumps(rf_final)
rf_final=pickle.loads(pic)
#X_truetest=process(X_truetest)
y_pred=rf_final.predict(X_truetest)

pred=pd.DataFrame(y_pred, index=index_id, columns=["Survived"])
pred.to_csv('titanic_pred.csv')
pred.head()


Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,0
894,0
895,0
896,1
