In [1]:
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt

import xgboost as xgb
from xgboost import plot_tree
from xgboost import plot_importance

from sklearn import preprocessing    
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from lazypredict.Supervised import LazyClassifier



In [2]:
# import des données

df = pd.read_csv("titanic_clean.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
0,0,0001_01,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False
1,1,0002_01,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True
2,2,0003_01,Europa,False,TRAPPIST-1e,58.0,True,43.0,396.5,0.0,226.5,49.0,False
3,3,0003_02,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,226.5,193.0,False
4,4,0004_01,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True


In [3]:
# Suppression des 2 premières colonnes

df.drop(columns = ["Unnamed: 0", "PassengerId"], inplace = True)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8492 entries, 0 to 8491
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    8492 non-null   object 
 1   CryoSleep     8492 non-null   bool   
 2   Destination   8492 non-null   object 
 3   Age           8492 non-null   float64
 4   VIP           8492 non-null   bool   
 5   RoomService   8492 non-null   float64
 6   FoodCourt     8492 non-null   float64
 7   ShoppingMall  8492 non-null   float64
 8   Spa           8492 non-null   float64
 9   VRDeck        8492 non-null   float64
 10  Transported   8492 non-null   bool   
dtypes: bool(3), float64(6), object(2)
memory usage: 555.8+ KB


In [5]:
# Déterminons le coefficient de skewness pour voir si les données suivent la loi normale

df[["Age", "RoomService", "FoodCourt", "ShoppingMall",
    "Spa", "VRDeck"]].skew(axis = 0, skipna = True).sort_values(ascending = False)

FoodCourt      3.49
VRDeck         3.20
Spa            3.18
ShoppingMall   2.99
RoomService    2.91
Age            0.29
dtype: float64

## transformtions des variables catégorielles

In [6]:
# encodons les variables

list_cat = ['HomePlanet', 'Destination' ]

df_dummies =  pd.get_dummies(
    df, columns = list_cat, prefix="Column", dtype = float)

In [7]:
df_dummies

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Column_Earth,Column_Europa,Column_Mars,Column_55 Cancri e,Column_PSO J318.5-22,Column_TRAPPIST-1e
0,False,39.00,False,0.00,0.00,0.00,0.00,0.00,False,0.00,1.00,0.00,0.00,0.00,1.00
1,False,24.00,False,109.00,9.00,25.00,549.00,44.00,True,1.00,0.00,0.00,0.00,0.00,1.00
2,False,58.00,True,43.00,396.50,0.00,226.50,49.00,False,0.00,1.00,0.00,0.00,0.00,1.00
3,False,33.00,False,0.00,1283.00,371.00,226.50,193.00,False,0.00,1.00,0.00,0.00,0.00,1.00
4,False,16.00,False,303.00,70.00,151.00,565.00,2.00,True,1.00,0.00,0.00,0.00,0.00,1.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8487,False,41.00,True,0.00,396.50,0.00,1643.00,74.00,False,0.00,1.00,0.00,1.00,0.00,0.00
8488,True,18.00,False,0.00,0.00,0.00,0.00,0.00,False,1.00,0.00,0.00,0.00,1.00,0.00
8489,False,26.00,False,0.00,0.00,195.00,1.00,0.00,True,1.00,0.00,0.00,0.00,0.00,1.00
8490,False,32.00,False,0.00,1049.00,0.00,353.00,260.00,False,0.00,1.00,0.00,1.00,0.00,0.00


In [8]:
df_dummies

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Column_Earth,Column_Europa,Column_Mars,Column_55 Cancri e,Column_PSO J318.5-22,Column_TRAPPIST-1e
0,False,39.00,False,0.00,0.00,0.00,0.00,0.00,False,0.00,1.00,0.00,0.00,0.00,1.00
1,False,24.00,False,109.00,9.00,25.00,549.00,44.00,True,1.00,0.00,0.00,0.00,0.00,1.00
2,False,58.00,True,43.00,396.50,0.00,226.50,49.00,False,0.00,1.00,0.00,0.00,0.00,1.00
3,False,33.00,False,0.00,1283.00,371.00,226.50,193.00,False,0.00,1.00,0.00,0.00,0.00,1.00
4,False,16.00,False,303.00,70.00,151.00,565.00,2.00,True,1.00,0.00,0.00,0.00,0.00,1.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8487,False,41.00,True,0.00,396.50,0.00,1643.00,74.00,False,0.00,1.00,0.00,1.00,0.00,0.00
8488,True,18.00,False,0.00,0.00,0.00,0.00,0.00,False,1.00,0.00,0.00,0.00,1.00,0.00
8489,False,26.00,False,0.00,0.00,195.00,1.00,0.00,True,1.00,0.00,0.00,0.00,0.00,1.00
8490,False,32.00,False,0.00,1049.00,0.00,353.00,260.00,False,0.00,1.00,0.00,1.00,0.00,0.00


## Séparation du jeu de données 

In [9]:
# Séparation du jeu de données en training set et test set

train_set, test_set = train_test_split(df_dummies, test_size = 0.2, random_state = 1)

In [10]:
X_train = train_set.drop(['Transported'], axis=1)
Y_train = train_set['Transported']

In [11]:
X_test = test_set.drop(['Transported'], axis=1)
Y_test = test_set['Transported']

## Normalisation des données

In [12]:
scaler = StandardScaler()
scaler.fit(X_train)
X1 = scaler.transform(X_train)

In [13]:
scaler_ = StandardScaler()
scaler_.fit(X_test)
X2 = scaler.transform(X_test)

## Entraînement des données

In [14]:
reg = LazyClassifier(verbose=0,ignore_warnings=False, custom_metric=None)
models,predictions = reg.fit(X_train, X_test, Y_train, Y_test)
print(models)

 24%|██▍       | 7/29 [00:02<00:05,  3.72it/s]

CategoricalNB model failed to execute
Negative values in data passed to CategoricalNB (input X)


 90%|████████▉ | 26/29 [00:13<00:01,  2.13it/s]

StackingClassifier model failed to execute
__init__() missing 1 required positional argument: 'estimators'


100%|██████████| 29/29 [00:13<00:00,  2.08it/s]

                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
AdaBoostClassifier                 0.79               0.79     0.79      0.79   
LGBMClassifier                     0.79               0.79     0.79      0.78   
XGBClassifier                      0.78               0.78     0.78      0.78   
NuSVC                              0.78               0.78     0.78      0.78   
RandomForestClassifier             0.77               0.77     0.77      0.77   
SVC                                0.77               0.77     0.77      0.77   
LabelSpreading                     0.76               0.76     0.76      0.76   
BaggingClassifier                  0.76               0.76     0.76      0.76   
ExtraTreesClassifier               0.76               0.76     0.76      0.76   
LabelPropagation                   0.75               0.75     0.75      0.75   
BernoulliNB                 




In [16]:
clf = xgb.XGBClassifier(objective='binary:logistic')

param_grid = {
        'learning_rate': [0.01,0.02,0.05, 0.1, 0.2],
        'subsample': [0.5, 0.6, 0.7, 0.8,0.9,1],
        'min_child_weight': [0.5, 1.0, 3.0, 5.0, 7.0],
        'max_depth':[5,6,7,8,9,10,12,15,20],
        'n_estimators': [100,200,500,800,1000,1200,1500,2000]}


rs_clf = RandomizedSearchCV(clf, param_grid, n_iter=20,
                            n_jobs=1, verbose=2, cv=2,                            
                            #early_stopping_rounds= 10, eval_set=[(X_test, Y_test)],
                            scoring = 'accuracy', refit=False, random_state=42)
print("Randomized search..")

rs_clf.fit(X_train, Y_train,eval_metric='logloss')

Randomized search..
Fitting 2 folds for each of 20 candidates, totalling 40 fits
[CV] subsample=0.9, n_estimators=800, min_child_weight=1.0, max_depth=8, learning_rate=0.1 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  subsample=0.9, n_estimators=800, min_child_weight=1.0, max_depth=8, learning_rate=0.1, total=   1.3s
[CV] subsample=0.9, n_estimators=800, min_child_weight=1.0, max_depth=8, learning_rate=0.1 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.3s remaining:    0.0s


[CV]  subsample=0.9, n_estimators=800, min_child_weight=1.0, max_depth=8, learning_rate=0.1, total=   1.3s
[CV] subsample=0.7, n_estimators=2000, min_child_weight=3.0, max_depth=8, learning_rate=0.01 
[CV]  subsample=0.7, n_estimators=2000, min_child_weight=3.0, max_depth=8, learning_rate=0.01, total=   3.1s
[CV] subsample=0.7, n_estimators=2000, min_child_weight=3.0, max_depth=8, learning_rate=0.01 
[CV]  subsample=0.7, n_estimators=2000, min_child_weight=3.0, max_depth=8, learning_rate=0.01, total=   3.5s
[CV] subsample=0.7, n_estimators=500, min_child_weight=3.0, max_depth=9, learning_rate=0.05 
[CV]  subsample=0.7, n_estimators=500, min_child_weight=3.0, max_depth=9, learning_rate=0.05, total=   1.0s
[CV] subsample=0.7, n_estimators=500, min_child_weight=3.0, max_depth=9, learning_rate=0.05 
[CV]  subsample=0.7, n_estimators=500, min_child_weight=3.0, max_depth=9, learning_rate=0.05, total=   1.1s
[CV] subsample=0.6, n_estimators=200, min_child_weight=5.0, max_depth=8, learning_rat

[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:   55.5s finished


RandomizedSearchCV(cv=2,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None, gamma=None,
                                           gpu_id=None, importance_type='gain',
                                           interaction_constraints=None,
                                           learning_rate=None,
                                           max_delta_step=None, max_depth=None,
                                           min_child_weight=None, missing=nan,
                                           monotone_constraints=None,
                                           n_estimators=100,...
                                           subsample=None, tree_method=None,
                                           validate_parameters=None,
                                   

In [17]:
rs_clf.best_params_

{'subsample': 0.9,
 'n_estimators': 200,
 'min_child_weight': 3.0,
 'max_depth': 5,
 'learning_rate': 0.05}

In [18]:
xgb_model = xgb.XGBClassifier(n_estimators= 200,learning_rate=0.05,subsample= 0.9, min_child_weight=3.0,
                      max_depth=5,objective='binary:logistic')

model = xgb_model.fit(X1, Y_train,eval_metric='logloss')

In [19]:
print("La précision de prédiction sur les données d'entrainement est de %.2f%%" % (model.score(X1, Y_train)))

La précision de prédiction sur les données d'entrainement est de 0.83%


In [20]:
# Vérifions que l'algorithme ne surajuste pas
y_pred = model.predict(X2)
predictions = [value for value in y_pred]
accuracy = accuracy_score(Y_test, predictions)
print("La précision de prediction sur les données de test est de %.2f%%" % (accuracy * 100.0))

La précision de prediction sur les données de test est de 78.93%


In [21]:
len(predictions)

1699

## Prédictions sur les données de validation

In [25]:
# phase de prétraitement des données

df_valid = pd.read_csv("titanic_validation.csv")
df_valid

Unnamed: 0.1,Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
0,0,0013_01,Earth,True,TRAPPIST-1e,27.00,False,0.00,0.00,0.00,0.00,0.00
1,1,0018_01,Earth,False,TRAPPIST-1e,19.00,False,0.00,9.00,0.00,234.00,0.00
2,2,0019_01,Europa,True,55 Cancri e,31.00,False,0.00,0.00,0.00,0.00,0.00
3,3,0021_01,Europa,False,TRAPPIST-1e,38.00,False,0.00,420.00,0.00,181.00,585.00
4,4,0023_01,Earth,False,TRAPPIST-1e,20.00,False,10.00,0.00,635.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...
4185,4185,9266_02,Earth,True,TRAPPIST-1e,34.00,False,0.00,0.00,0.00,0.00,0.00
4186,4186,9269_01,Earth,False,TRAPPIST-1e,42.00,False,0.00,847.00,17.00,10.00,144.00
4187,4187,9271_01,Mars,True,55 Cancri e,27.91,False,0.00,0.00,0.00,0.00,0.00
4188,4188,9273_01,Europa,False,TRAPPIST-1e,27.91,False,0.00,2680.00,0.00,0.00,523.00


In [26]:
df_valid.drop(columns=["Unnamed: 0"], inplace = True)

In [27]:
df_valid.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4190 entries, 0 to 4189
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   4190 non-null   object 
 1   HomePlanet    4190 non-null   object 
 2   CryoSleep     4190 non-null   bool   
 3   Destination   4190 non-null   object 
 4   Age           4190 non-null   float64
 5   VIP           4190 non-null   bool   
 6   RoomService   4190 non-null   float64
 7   FoodCourt     4190 non-null   float64
 8   ShoppingMall  4190 non-null   float64
 9   Spa           4190 non-null   float64
 10  VRDeck        4190 non-null   float64
dtypes: bool(2), float64(6), object(3)
memory usage: 302.9+ KB


In [32]:
df1 = df_valid.drop(columns=["PassengerId"])

In [33]:
# encodons les variables

list_cat = ['HomePlanet', 'Destination' ]

df_dum =  pd.get_dummies(
    df1, columns = list_cat, prefix="Column", dtype = float)

# normalisons les données
scaler = StandardScaler()
scaler.fit(df_dum)
X = scaler.transform(df_dum)
X[0]

array([ 1.32810099, -0.06988236, -0.13315914, -0.44009297, -0.41662301,
       -0.41569501, -0.42268491, -0.40353654,  0.92278087, -0.5606279 ,
       -0.53226667, -0.49290426, -0.31626927,  0.63351227])

In [36]:
Y = model.predict(X)

4190

In [37]:
# créons un dataframe avec les numéro d'dentifiant et la prediction(Transported)

PassengerId = df_valid['PassengerId']
submission = pd.DataFrame({'PassengerId':PassengerId,'Transported':Y})
submission.head(5)

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True


In [38]:
# Enregistrons notre soumission

submission.to_csv("submission.csv", index=False)