# 1.Data preprocessing

## Importaion des bibliothèques

In [2531]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold, cross_val_score, RandomizedSearchCV, train_test_split
from xgboost import XGBClassifier
from catboost import CatBoostClassifier, Pool

## Lecture des fichiers CSV

In [2532]:
test = pd.read_csv('../titanic-project/titanic_data/test.csv') 
train = pd.read_csv('../titanic-project/titanic_data/train.csv')
train.head()
train = train.drop(columns='Cabin')
test = test.drop(columns='Cabin')
print('Train columns with null values:\n', train.isnull().sum())
print("-"*10)

print('Test/Validation columns with null values:\n', test.isnull().sum())
print("-"*10)

train.describe(include = 'all')

Train columns with null values:
 PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Embarked         2
dtype: int64
----------
Test/Validation columns with null values:
 PassengerId     0
Pclass          0
Name            0
Sex             0
Age            86
SibSp           0
Parch           0
Ticket          0
Fare            1
Embarked        0
dtype: int64
----------


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
count,891.0,891.0,891.0,891,891,714.0,891.0,891.0,891.0,891.0,889
unique,,,,891,2,,,,681.0,,3
top,,,,"Braund, Mr. Owen Harris",male,,,,1601.0,,S
freq,,,,1,577,,,,7.0,,644
mean,446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,
std,257.353842,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,
25%,223.5,0.0,2.0,,,20.125,0.0,0.0,,7.9104,
50%,446.0,0.0,3.0,,,28.0,0.0,0.0,,14.4542,
75%,668.5,1.0,3.0,,,38.0,1.0,0.0,,31.0,


In [2533]:
women = train.loc[train.Sex == 'female']["Survived"]
rate_women = sum(women)/len(women)

print("% of women who survived:", rate_women)

% of women who survived: 0.7420382165605095


In [2534]:
men = train.loc[train.Sex == 'male']["Survived"]
rate_men = sum(men)/len(men)

print("% of men who survived:", rate_men)

% of men who survived: 0.18890814558058924


In [2535]:
for pclass in [1, 2, 3]:
    survived = train[train.Pclass == pclass]["Survived"]
    rate = survived.mean()
    print(f"% of passengers who survived in class {pclass}: {rate:.2f}")

% of passengers who survived in class 1: 0.63
% of passengers who survived in class 2: 0.47
% of passengers who survived in class 3: 0.24


In [2536]:
children = train[train["Age"] < 10]["Survived"]
rate_children = children.mean()

print("% of children under 10 who survived:", rate_children)

% of children under 10 who survived: 0.6129032258064516


In [2537]:
high_fare = train[train["Fare"] > 100]["Survived"]
rate_high_fare = high_fare.mean()

print("% of passengers with Fare > 100 who survived:", rate_high_fare)

% of passengers with Fare > 100 who survived: 0.7358490566037735


In [2538]:
for port in train["Embarked"].dropna().unique():
    rate = train[train["Embarked"] == port]["Survived"].mean()
    print(f"% of passengers from port {port} who survived: {rate:.2f}")


% of passengers from port S who survived: 0.34
% of passengers from port C who survived: 0.55
% of passengers from port Q who survived: 0.39


In [2539]:
# Extraire le titre du nom
train['Title'] = train['Name'].str.extract(' ([A-Za-z]+)', expand=False)
test['Title'] = test['Name'].str.extract(' ([A-Za-z]+)', expand=False)

# Simplifier les titres rares
for df in [train, test]:
    df['Title'] = df['Title'].replace(['Lady', 'Countess','Capt','Col','Don','Dr', 'Major','Rev','Sir','Jonkheer','Dona'], 'Rare')
    df['Title'] = df['Title'].replace(['Mlle','Ms'], 'Miss')
    df['Title'] = df['Title'].replace('Mme', 'Mrs')
train['Title']


0        Mr
1       Mrs
2      Miss
3       Mrs
4        Mr
       ... 
886    Rare
887    Miss
888    Miss
889      Mr
890      Mr
Name: Title, Length: 891, dtype: object

In [2540]:
# Si une personne n'a pas de frères/sœurs (SibSp) ou de parents/enfants (Parch), elle est seule.
train['IsAlone'] = (train['SibSp'] + train['Parch'] == 0).astype(int)
test['IsAlone'] = (test['SibSp'] + test['Parch'] == 0).astype(int)

train['IsAlone']


0      0
1      0
2      1
3      0
4      1
      ..
886    1
887    1
888    0
889    1
890    1
Name: IsAlone, Length: 891, dtype: int64

In [2541]:
# FamilySize - taille de la famille (y compris le passager)
train['FamilySize'] = train['SibSp'] + train['Parch'] + 1
test['FamilySize'] = test['SibSp'] + test['Parch'] + 1

train['FamilySize']

0      2
1      2
2      1
3      2
4      1
      ..
886    1
887    1
888    4
889    1
890    1
Name: FamilySize, Length: 891, dtype: int64

## Entraînment du modele 

##

In [None]:
y = train["Survived"]

# features
features = ["Pclass", "Sex", "SibSp", "Parch", "IsAlone", "Title", "Embarked", "FamilySize", "Age"]
cat_features = ["Pclass", "Sex", "Title", "Embarked", "IsAlone"]

# Sélectionner les données dont nous avons besoin
X = train[features].copy()
X_test = test[features].copy()

# Remplir title  
for col in cat_features:
    X[col] = X[col].fillna("missing").astype(str)
    X_test[col] = X_test[col].fillna("missing").astype(str)

# La décomposition en formation et en validation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=1)

# CatBoost Pool
train_pool = Pool(X_train, y_train, cat_features=cat_features)
val_pool = Pool(X_val, y_val, cat_features=cat_features)
test_pool = Pool(X_test, cat_features=cat_features)

# Créer le modèle CatBoost
model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    eval_metric='Accuracy',
    class_weights=[1, 2],
    random_seed=42,
    verbose=0,
    early_stopping_rounds=50
)

model.fit(train_pool, eval_set=val_pool)

val_pred = model.predict(val_pool)
print(classification_report(y_val, val_pred))

# Prédictions sur un test
test_pred = model.predict(test_pool)
output = pd.DataFrame({'PassengerId': test.PassengerId, 'Survived': test_pred.astype(int)})
output.to_csv('submission.csv', index=False)

0:	learn: 0.7604485	test: 0.7460317	best: 0.7460317 (0)	total: 2.91ms	remaining: 2.91s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.7817460317
bestIteration = 7

Shrink model to first 8 iterations.
              precision    recall  f1-score   support

           0       0.81      0.90      0.85       106
           1       0.82      0.70      0.76        73

    accuracy                           0.82       179
   macro avg       0.82      0.80      0.80       179
weighted avg       0.82      0.82      0.81       179

