# Modélisation

In [2]:
import sys
import pandas as pd
from sklearn.model_selection import train_test_split

# Load configuration
sys.path.append("../")

from utils import RANDOM_STATE, TEST_SIZE

In [3]:
# Load the data

df = pd.read_csv("../data/train_data.csv")
df.head()

Unnamed: 0,ID,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,37765,15794860,Ch'eng,627,France,Male,28.0,7,131694.04,1,1.0,1.0,161205.61,0
1,130453,15728005,Hargreaves,597,France,Male,34.0,2,0.0,2,0.0,1.0,181419.29,0
2,77297,15686810,Ts'ui,724,France,Male,39.0,7,0.0,2,1.0,1.0,100862.54,0
3,40858,15760244,Trevisano,663,Germany,Female,56.0,5,118577.24,3,1.0,0.0,61164.45,1
4,19804,15810563,French,627,France,Female,33.0,5,0.0,2,1.0,1.0,103737.82,0


In [4]:
y = df.pop("Exited")
X = df.copy()


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

print(f"The X_train set contains: {X_train.shape} elements")
print(f"The y_train set contains: {y_train.shape} elements")
print(f"The X_test set contains: {X_test.shape} elements")
print(f"The y_test set contains: {y_test.shape} elements")

The X_train set contains: (114863, 13) elements
The y_train set contains: (114863,) elements
The X_test set contains: (28716, 13) elements
The y_test set contains: (28716,) elements


## Pipeline

In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.discriminant_analysis import StandardScaler
from sklearn.pipeline import FeatureUnion, FunctionTransformer
from sklearn.preprocessing import power_transform, OneHotEncoder

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Binarizer

from sklearn import set_config

set_config(transform_output="pandas")

In [6]:
from sklearn.preprocessing import RobustScaler


def discretise_zero_balance_with_2_products(X):
    X["zero_balance_with_2_products"] = (X["Balance"] == 0) & (X["NumOfProducts"] == 2)
    return X


def power_transform_numerical(X):
    tranform_data = power_transform(X, method="yeo-johnson")
    X["Age"] = tranform_data["Age"]
    return X


def create_pipeline(model):
    numerical_features = ["CreditScore", "Age", "EstimatedSalary"]
    catagorical_features = ["Geography", "NumOfProducts", "Gender", "IsActiveMember"]

    # On discrétise la variable 'Balance' en 2 classes. Solde est nulle ou non.
    balance_discretizer = ColumnTransformer(
        transformers=[("balance_binarizer", Binarizer(), ["Balance"])]
    )

    # On discrétise la variable 'CreditScore' en 2 classes. threshold <=849.
    credit_score_discretizer = ColumnTransformer(
        transformers=[
            ("credit_score_binarizer", Binarizer(threshold=849), ["CreditScore"])
        ]
    )

    # On discrétise la variable 'Balance' est nulle et le client détient 2 produits.
    zero_balance_with_2_products_discretizer = ColumnTransformer(
        transformers=[
            (
                "discretizer",
                FunctionTransformer(discretise_zero_balance_with_2_products),
                ["Balance", "NumOfProducts"],
            )
        ]
    )

    numerical_transformer = ColumnTransformer(
        transformers=[
            ("scaler", StandardScaler(), numerical_features),
        ],
        verbose_feature_names_out=False,
    )

    numerical_transformer = Pipeline(
        steps=[
            ("numerical_transformer", numerical_transformer),
            ("power_transformation", FunctionTransformer(power_transform_numerical)),
            ("feature_engineering", PolynomialFeatures(degree=3, include_bias=False)),
            ("feature_selection", SelectKBest(f_classif, k=5)),
            ("feature_scaler", RobustScaler()),
        ]
    )

    categorical_transformer = ColumnTransformer(
        transformers=[
            (
                "Encoder",
                OneHotEncoder(drop="first", sparse_output=False),
                catagorical_features,
            ),
        ],
        verbose_feature_names_out=False,
    )

    pipeline = FeatureUnion(
        transformer_list=[
            ("numerical", numerical_transformer),
            ("categorical", categorical_transformer),
            ("solde", balance_discretizer),
            ("risque", credit_score_discretizer),
            ("zero_balance_with_2_products", zero_balance_with_2_products_discretizer),
        ]
    )
    pipeline = Pipeline(steps=[("preprocessor", pipeline), ("classifier", model)])
    return pipeline

In [47]:
from sklearn.base import BaseEstimator, TransformerMixin, check_is_fitted
from sklearn.mixture import GaussianMixture


class GaussianMixtureTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, n_components=2, random_state=42):
        self.n_components = n_components
        self.random_state = random_state
        self.gm = GaussianMixture(
            n_components=self.n_components, random_state=self.random_state
        )

    def fit(self, X, y=None):
        self.gm.fit(X)
        return self

    def transform(self, X):
        X_transformed = X.copy()
        return self.gm.predict(X_transformed).reshape(-1, 1)

In [54]:
from sklearn.ensemble import GradientBoostingClassifier


numerical_features = ["CreditScore", "EstimatedSalary", "Age", "Balance"]
catagorical_features = ["Geography", "NumOfProducts", "Gender", "IsActiveMember"]

n_components_dict = {
    "Age": 2,
    "CreditScore": 9,
    "EstimatedSalary": 5,
    "Balance": 2,
}

# Create separate pipelines for each feature


transformers = []
for feature in numerical_features:
    pipeline = Pipeline(
        steps=[
            # ("scaler", StandardScaler()),
            # ("power_transformation", FunctionTransformer(power_transform_numerical)),
            (
                "feature_engineering",
                GaussianMixtureTransformer(n_components=n_components_dict[feature]),
            ),
        ]
    )
    transformers.append((f"{feature}_pipeline", pipeline, [feature]))

base_numerical_pipeline = Pipeline(
    steps=[
        ("power_transformation", FunctionTransformer(power_transform_numerical)),
        # ("scaler", StandardScaler()),
    ]
)

transformers.append(("base_numerical_pipeline", base_numerical_pipeline, ["Age"]))

numerical_transformer = ColumnTransformer(
    transformers=[("base", base_numerical_pipeline, numerical_features)],
    verbose_feature_names_out=False,
)

# categorical_pipeline = Pipeline(
#     steps=[
#         ("encoder", OneHotEncoder(drop="first", sparse_output=False)),
#     ]
# )

# transformers.append(
#     ("categorical_pipeline", categorical_pipeline, catagorical_features)
# )
categorical_pipeline = ColumnTransformer(
    transformers=[
        (
            "encoder",
            OneHotEncoder(drop="first", sparse_output=False),
            catagorical_features,
        ),
    ],
    verbose_feature_names_out=False,
)

# Create the ColumnTransformer
numerical_preprocessor = ColumnTransformer(
    transformers=transformers, verbose_feature_names_out=False
)

numerical_pipeline = Pipeline(
    steps=[
        ("preprocessor", numerical_preprocessor),
        ("scaler", StandardScaler()),
        # ("feature_engineering", PolynomialFeatures(degree=3, include_bias=False)),
        # ("feature_selection", SelectKBest(f_classif, k=5)),
        # ("feature_scaler", RobustScaler()),
    ]
)

preprocessor = FeatureUnion(
    transformer_list=[
        ("numerical", numerical_pipeline),
        ("categorical", categorical_pipeline),
    ]
)

estimator = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        (
            "classifier",
            GradientBoostingClassifier(
                criterion="friedman_mse",
                loss="exponential",
                learning_rate=0.1,
                n_estimators=200,
            ),
        ),
    ]
)

In [56]:
from sklearn.model_selection import GridSearchCV


param_grid = [
    {
        "classifier__n_estimators": [200],
    }
]

# Create GridSearchCV
grid_search = GridSearchCV(
    estimator,
    param_grid,
    scoring="f1",
    cv=3,  # Adjust cross-validation folds as needed
    verbose=2,
)

# Fit GridSearchCV
grid_search.fit(X_train, y_train)

# Print the best parameters and score
print("Best parameters:", grid_search.best_params_)
print("Best F1-score:", grid_search.best_score_)

# Get the best model
best_model = grid_search.best_estimator_

# Évaluation du modèle sur le jeu de test
test_score = best_model.score(X_test, y_test)
print(f"Test set score: {test_score}")

Fitting 3 folds for each of 1 candidates, totalling 3 fits




[CV] END .......................classifier__n_estimators=200; total time=   6.0s




[CV] END .......................classifier__n_estimators=200; total time=   5.8s




[CV] END .......................classifier__n_estimators=200; total time=   6.3s
Best parameters: {'classifier__n_estimators': 200}
Best F1-score: 0.6354824186360789
Test set score: 0.8629335562055996




In [57]:
# On entraine le modèle sur l'ensemble des données
best_model.fit(X, y)

In [58]:
test_df = pd.read_csv("../data/test_data.csv")
result = best_model.predict(test_df)
submission_df = pd.concat(
    [test_df["ID"], pd.DataFrame(result, columns=["Exited"])], axis=1
).set_index("ID")
submission_df.head()



Unnamed: 0_level_0,Exited
ID,Unnamed: 1_level_1
67897,0
163075,0
134760,1
68707,0
3428,0


In [59]:
submission_df.to_csv("../data/submission.csv")
check_df = pd.read_csv("../data/submission.csv")
check_df.head()

Unnamed: 0,ID,Exited
0,67897,0
1,163075,0
2,134760,1
3,68707,0
4,3428,0
