# Modélisation

In [3]:
import sys
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
# Load configuration
sys.path.append('../')

from utils import RANDOM_STATE, TEST_SIZE

In [4]:
# Load the data

df = pd.read_csv('../data/train_data.csv')
df.head()

Unnamed: 0,ID,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,37765,15794860,Ch'eng,627,France,Male,28.0,7,131694.04,1,1.0,1.0,161205.61,0
1,130453,15728005,Hargreaves,597,France,Male,34.0,2,0.0,2,0.0,1.0,181419.29,0
2,77297,15686810,Ts'ui,724,France,Male,39.0,7,0.0,2,1.0,1.0,100862.54,0
3,40858,15760244,Trevisano,663,Germany,Female,56.0,5,118577.24,3,1.0,0.0,61164.45,1
4,19804,15810563,French,627,France,Female,33.0,5,0.0,2,1.0,1.0,103737.82,0


In [5]:
y = df.pop('Exited')
X = df.copy()


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE)

print(f"The X_train set contains: {X_train.shape} elements")
print(f"The y_train set contains: {y_train.shape} elements")
print(f"The X_test set contains: {X_test.shape} elements")
print(f"The y_test set contains: {y_test.shape} elements")

The X_train set contains: (114863, 13) elements
The y_train set contains: (114863,) elements
The X_test set contains: (28716, 13) elements
The y_test set contains: (28716,) elements


## Pipeline

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures
from sklearn.pipeline import make_pipeline, make_union
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Binarizer

from sklearn import set_config
set_config(transform_output = "pandas")


In [10]:
def create_pipeline(model):
    numerical_features = ['CreditScore', 'Age', 'Tenure', 'Balance', 'EstimatedSalary']
    catagorical_features = ['Geography', 'NumOfProducts', 'Gender']


    numerical_transformer = ColumnTransformer(
        transformers=[
            ('Scaler', MinMaxScaler(), numerical_features)
        ],
        verbose_feature_names_out=False
    )

    numerical_transformer = make_pipeline(numerical_transformer, PolynomialFeatures(degree=3, include_bias=False), SelectKBest(f_classif, k=5))

    categorical_transformer = ColumnTransformer(
        transformers=[
            ('Encoder', OneHotEncoder(drop='first', sparse_output=False), catagorical_features),
        ],
        verbose_feature_names_out=False
    )

    pipeline = make_union(numerical_transformer, categorical_transformer)
    pipeline = Pipeline(steps=[('processor', pipeline), ('classifier', model)])
    return pipeline

In [11]:
pipeline = create_pipeline(KNeighborsClassifier(n_neighbors=6))
cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='f1')
print(f"Scores de validation croisée : {cv_scores}")
print(f"Moyenne des scores de validation croisée : {cv_scores.mean()}")

Scores de validation croisée : [0.51740286 0.52179438 0.51404386 0.51368904 0.50555987]
Moyenne des scores de validation croisée : 0.514498001601932


In [None]:
# Define the parameter grid
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier


param_grid = [
    {
        'classifier': [LogisticRegression(solver='liblinear')],
        'classifier__penalty': ['l1', 'l2'],
        'classifier__C': [0.1, 1.0, 10.0]
    },
    {
        'classifier': [RandomForestClassifier()],
        'classifier__n_estimators': [50, 100, 200],
        'classifier__max_depth': [None, 10, 20]
    },
    {
        'classifier': [DecisionTreeClassifier()],
        'classifier__max_depth': [None, 10, 20],
        'classifier__min_samples_split': [2, 10, 20]
    },
    {
        'classifier': [GradientBoostingClassifier()],
        'classifier__n_estimators': [50, 100, 200],
        'classifier__learning_rate': [0.01, 0.1, 0.5]
    },
    {
        'classifier': [KNeighborsClassifier()],
        'classifier__n_neighbors': [3, 5, 7],
        'classifier__weights': ['uniform', 'distance']
    },
    {
        'classifier': [GaussianNB()],
        'classifier__var_smoothing': [1e-9, 1e-8, 1e-7]
    }
]

# Create GridSearchCV
grid_search = GridSearchCV(create_pipeline(LogisticRegression()),
                           param_grid,
                           scoring='f1',
                           cv=3,  # Adjust cross-validation folds as needed
                           verbose=2)

# Fit GridSearchCV
grid_search.fit(X_train, y_train)

# Print the best parameters and score
print("Best parameters:", grid_search.best_params_)
print("Best F1-score:", grid_search.best_score_)

# Get the best model
best_model = grid_search.best_estimator_

Fitting 3 folds for each of 42 candidates, totalling 126 fits
[CV] END classifier=LogisticRegression(solver='liblinear'), classifier__C=0.1, classifier__penalty=l1; total time=   1.1s
[CV] END classifier=LogisticRegression(solver='liblinear'), classifier__C=0.1, classifier__penalty=l1; total time=   1.1s
[CV] END classifier=LogisticRegression(solver='liblinear'), classifier__C=0.1, classifier__penalty=l1; total time=   1.0s
[CV] END classifier=LogisticRegression(solver='liblinear'), classifier__C=0.1, classifier__penalty=l2; total time=   0.2s
[CV] END classifier=LogisticRegression(solver='liblinear'), classifier__C=0.1, classifier__penalty=l2; total time=   0.2s
[CV] END classifier=LogisticRegression(solver='liblinear'), classifier__C=0.1, classifier__penalty=l2; total time=   0.2s
[CV] END classifier=LogisticRegression(solver='liblinear'), classifier__C=1.0, classifier__penalty=l1; total time=   1.8s
[CV] END classifier=LogisticRegression(solver='liblinear'), classifier__C=1.0, class

In [9]:
# Évaluation du modèle sur le jeu de test
test_score = best_model.score(X_test, y_test)
print(f"Test set score: {test_score}")

Test set score: 0.8478200306449366


In [10]:
# On entraine le modèle sur l'ensemble des données
best_model.fit(X, y)

In [11]:
test_df = pd.read_csv('../data/test_data.csv')
test_df.head()

Unnamed: 0,ID,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,67897,15585246,Bancks,585,France,Female,53.0,2,0.0,2,1.0,1.0,91830.75
1,163075,15604551,Robb,606,France,Male,24.0,2,0.0,1,0.0,1.0,90876.95
2,134760,15729040,Ugochukwu,633,Germany,Male,44.0,1,118907.6,1,1.0,0.0,47777.15
3,68707,15792329,Hs?,602,Germany,Male,40.0,2,149961.99,2,1.0,1.0,82696.84
4,3428,15617166,Genovesi,645,France,Male,31.0,4,132351.29,1,1.0,0.0,151887.16


In [12]:
result = best_model.predict(test_df)

In [13]:
submission_df = pd.concat([test_df['ID'], pd.DataFrame(result, columns=['Exited'])], axis=1).set_index('ID')
submission_df.head()

Unnamed: 0_level_0,Exited
ID,Unnamed: 1_level_1
67897,0
163075,0
134760,1
68707,0
3428,0


In [14]:
submission_df.to_csv('../data/submission.csv')
check_df = pd.read_csv('../data/submission.csv')
check_df.head()

Unnamed: 0,ID,Exited
0,67897,0
1,163075,0
2,134760,1
3,68707,0
4,3428,0


In [None]:
from sklearn.discriminant_analysis import StandardScaler
from sklearn.pipeline import FeatureUnion, FunctionTransformer

def discretise_zero_balance_with_2_products(X):
    X['zero_balance_with_2_products'] = (X['Balance'] == 0) & (X['NumOfProducts'] == 2)
    return X
    
def create_pipeline(model):
    numerical_features = ['CreditScore', 'Age', 'Tenure', 'EstimatedSalary']
    catagorical_features = ['Geography', 'NumOfProducts', 'Gender', 'IsActiveMember']

    # On discrétise la variable 'Balance' en 2 classes. Solde est nulle ou non.
    balance_discretizer = ColumnTransformer(
        transformers=[
            ('balance_binarizer', Binarizer(), ['Balance'])])
    
    # On discrétise la variable 'CreditScore' en 2 classes. threshold <=849.
    credit_score_discretizer = ColumnTransformer(
        transformers=[
            ('credit_score_binarizer', Binarizer(threshold=849), ['CreditScore'])])
    
    # On discrétise la variable 'Balance' est nulle et le client détient 2 produits.
    zero_balance_with_2_products_discretizer = ColumnTransformer(
        transformers=[
            ('discretizer', FunctionTransformer(discretise_zero_balance_with_2_products), ['Balance', 'NumOfProducts'])
            ])

    numerical_transformer = ColumnTransformer(
        transformers=[
            ('scaler', StandardScaler(), numerical_features),

        ],
        verbose_feature_names_out=False
    )

    numerical_transformer = Pipeline(steps=[
        ('numerical_transformer', numerical_transformer),
        ('feature_engineering', PolynomialFeatures(degree=3, include_bias=False)), 
        ('feature_selection', SelectKBest(f_classif, k=5)), 
        ('feature_scaler', MinMaxScaler())
        ])


    categorical_transformer = ColumnTransformer(
        transformers=[
            ('Encoder', OneHotEncoder(drop='first', sparse_output=False), catagorical_features),
        ],
        verbose_feature_names_out=False
    )

    pipeline = FeatureUnion(
        transformer_list=[('numerical',numerical_transformer), ('categorical', categorical_transformer), ('solde', balance_discretizer), ('risque', credit_score_discretizer), ('zero_balance_with_2_products', zero_balance_with_2_products_discretizer)]
        )
    pipeline = Pipeline(steps=[('preprocessor', pipeline), ('classifier', model)])
    return pipeline

In [77]:
from sklearn.linear_model import LogisticRegression
pipeline = create_pipeline(LogisticRegression(solver='liblinear'))
pipeline

In [78]:
pipeline.get_params()

{'memory': None,
 'steps': [('preprocessor',
   FeatureUnion(transformer_list=[('numerical',
                                   Pipeline(steps=[('numerical_transformer',
                                                    ColumnTransformer(transformers=[('scaler',
                                                                                     StandardScaler(),
                                                                                     ['CreditScore',
                                                                                      'Age',
                                                                                      'Tenure',
                                                                                      'EstimatedSalary'])],
                                                                      verbose_feature_names_out=False)),
                                                   ('feature_engineering',
                                                    Po

In [79]:
# Define the parameter grid
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# best parameters: {'classifier': GradientBoostingClassifier(), 'classifier__criterion': 'friedman_mse', 'classifier__learning_rate': 0.1, 'classifier__loss': 'exponential', 'classifier__n_estimators': 200}

param_grid = [
    {
        'classifier': [GradientBoostingClassifier(criterion='friedman_mse', loss='exponential', learning_rate=0.1)],
        'classifier__n_estimators': [200],
        'preprocessor__numerical__numerical_transformer__scaler': [MinMaxScaler()],
        'preprocessor__numerical__feature_selection__k': [7],
        'preprocessor__numerical__feature_scaler': [MinMaxScaler()],
        'preprocessor__numerical__feature_engineering__degree': [2],
    }
]

# Create GridSearchCV
grid_search = GridSearchCV(create_pipeline(LogisticRegression()),
                           param_grid,
                           scoring='f1',
                           cv=3,  # Adjust cross-validation folds as needed
                           verbose=2)

# Fit GridSearchCV
grid_search.fit(X_train, y_train)

# Print the best parameters and score
print("Best parameters:", grid_search.best_params_)
print("Best F1-score:", grid_search.best_score_)

# Get the best model
best_model = grid_search.best_estimator_

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] END classifier=GradientBoostingClassifier(loss='exponential'), classifier__n_estimators=200, preprocessor__numerical__feature_engineering__degree=2, preprocessor__numerical__feature_scaler=MinMaxScaler(), preprocessor__numerical__feature_selection__k=7, preprocessor__numerical__numerical_transformer__scaler=MinMaxScaler(); total time=  19.5s
[CV] END classifier=GradientBoostingClassifier(loss='exponential'), classifier__n_estimators=200, preprocessor__numerical__feature_engineering__degree=2, preprocessor__numerical__feature_scaler=MinMaxScaler(), preprocessor__numerical__feature_selection__k=7, preprocessor__numerical__numerical_transformer__scaler=MinMaxScaler(); total time=  19.7s
[CV] END classifier=GradientBoostingClassifier(loss='exponential'), classifier__n_estimators=200, preprocessor__numerical__feature_engineering__degree=2, preprocessor__numerical__feature_scaler=MinMaxScaler(), preprocessor__numerical__feature_

In [68]:
# Évaluation du modèle sur le jeu de test
test_score = best_model.score(X_test, y_test)
print(f"Test set score: {test_score}")

Test set score: 0.8625156707062265


In [69]:
# On entraine le modèle sur l'ensemble des données
best_model.fit(X, y)

In [70]:
test_df = pd.read_csv('../data/test_data.csv')
result = best_model.predict(test_df)
submission_df = pd.concat([test_df['ID'], pd.DataFrame(result, columns=['Exited'])], axis=1).set_index('ID')
submission_df.head()

Unnamed: 0_level_0,Exited
ID,Unnamed: 1_level_1
67897,0
163075,0
134760,1
68707,0
3428,0


In [71]:
submission_df.to_csv('../data/submission.csv')
check_df = pd.read_csv('../data/submission.csv')
check_df.head()

Unnamed: 0,ID,Exited
0,67897,0
1,163075,0
2,134760,1
3,68707,0
4,3428,0


In [57]:
import os
import xgboost as xgb

# Set the environment variable (if it's not already set)
if 'DYLD_LIBRARY_PATH' not in os.environ or '/opt/homebrew/lib' not in os.environ['DYLD_LIBRARY_PATH']:
    os.environ['DYLD_LIBRARY_PATH'] = '/opt/homebrew/lib:' + os.environ.get('DYLD_LIBRARY_PATH', '')

from sklearn.metrics import f1_score

# Initialize XGBoost classifier
xgb_classifier = xgb.XGBClassifier(objective='binary:logistic',  # For binary classification
                                   n_estimators=100,             # Number of boosting rounds
                                   learning_rate=0.1,            # Step size shrinkage
                                   max_depth=3,                  # Maximum depth of a tree
                                   random_state=RANDOM_STATE)   # Random seed for reproducibility
pipeline = create_pipeline(xgb_classifier)

# Train the model
pipeline.fit(X_train, y_train)

# Make predictions on the test set
y_pred = pipeline.predict(X_test)

# Evaluate the model
f1 = f1_score(y_test, y_pred)
print(f"F1 Score: {f1}")

F1 Score: 0.5983874064119793


In [58]:
# Define the parameter grid
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# best parameters: {'classifier': GradientBoostingClassifier(), 'classifier__criterion': 'friedman_mse', 'classifier__learning_rate': 0.1, 'classifier__loss': 'exponential', 'classifier__n_estimators': 200}

param_grid = [
    {
        'classifier__learning_rate': [0.01, 0.1, 0.5],
        'classifier__n_estimators': [100, 200, 300],
        'classifier__max_depth': [3, 5, 7],
        'preprocessor__numerical__numerical_transformer__scaler': [MinMaxScaler()],
        'preprocessor__numerical__feature_selection__k': [7],
        'preprocessor__numerical__feature_scaler': [MinMaxScaler()],
        'preprocessor__numerical__feature_engineering__degree': [2],
    }
]

# Create GridSearchCV
xgb_classifier = xgb.XGBClassifier(objective='binary:logistic',  # For binary classification
                                   n_estimators=100,             # Number of boosting rounds
                                   learning_rate=0.1,            # Step size shrinkage
                                   random_state=RANDOM_STATE)   # Random seed for reproducibility

grid_search = GridSearchCV(create_pipeline(xgb_classifier),
                           param_grid,
                           scoring='f1',
                           cv=3,  # Adjust cross-validation folds as needed
                           verbose=2)

# Fit GridSearchCV
grid_search.fit(X_train, y_train)

# Print the best parameters and score
print("Best parameters:", grid_search.best_params_)
print("Best F1-score:", grid_search.best_score_)

# Get the best model
best_model = grid_search.best_estimator_

Fitting 3 folds for each of 27 candidates, totalling 81 fits
[CV] END classifier__learning_rate=0.01, classifier__max_depth=3, classifier__n_estimators=100, preprocessor__numerical__feature_engineering__degree=2, preprocessor__numerical__feature_scaler=MinMaxScaler(), preprocessor__numerical__feature_selection__k=7, preprocessor__numerical__numerical_transformer__scaler=MinMaxScaler(); total time=   0.3s
[CV] END classifier__learning_rate=0.01, classifier__max_depth=3, classifier__n_estimators=100, preprocessor__numerical__feature_engineering__degree=2, preprocessor__numerical__feature_scaler=MinMaxScaler(), preprocessor__numerical__feature_selection__k=7, preprocessor__numerical__numerical_transformer__scaler=MinMaxScaler(); total time=   0.3s
[CV] END classifier__learning_rate=0.01, classifier__max_depth=3, classifier__n_estimators=100, preprocessor__numerical__feature_engineering__degree=2, preprocessor__numerical__feature_scaler=MinMaxScaler(), preprocessor__numerical__feature_sele

In [59]:
# Évaluation du modèle sur le jeu de test
test_score = best_model.score(X_test, y_test)
print(f"Test set score: {test_score}")

Test set score: 0.8550285555091238
