In [3]:
warnings.filterwarnings("ignore")
from dotenv import load_dotenv
import os

import pandas as pd
pd.options.display.max_columns = None
import seaborn as sns
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import precision_recall_fscore_support as score
import mlflow
import datetime
import warnings
import os

In [4]:
load_dotenv("../backend/src/.env")
DagsHub_username = os.getenv("DagsHub_username")
DagsHub_token=os.getenv("DagsHub_token") 


In [5]:
print(DagsHub_username)

rami4real


In [6]:
os.environ['MLFLOW_TRACKING_USERNAME']= DagsHub_username
os.environ["MLFLOW_TRACKING_PASSWORD"] = DagsHub_token

In [7]:
mlflow.set_tracking_uri('https://dagshub.com/rami4real/mymlproject.mlflow') #your mlfow tracking uri
mlflow.set_experiment("churn-experiment")

2024/12/09 09:22:44 INFO mlflow.tracking.fluent: Experiment with name 'churn-experiment' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/566ae890f7ca4c2db92c2a642aca7189', creation_time=1733732561769, experiment_id='0', last_update_time=1733732561769, lifecycle_stage='active', name='churn-experiment', tags={}>

In [20]:
data_train = pd.read_csv("../data/customer_churn_train.csv")
data_test = pd.read_csv("../data/customer_churn_test.csv")

In [21]:
x_train = data_train.drop(['Churn'],axis = 1)  # All rows except the last two
y_train = data_train['Churn']   # The last row
x_test = data_test.drop(['Churn'],axis = 1)   # All rows except the last two
y_test = data_test['Churn'] 
x_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 404164 entries, 0 to 404163
Data columns (total 10 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Age                404164 non-null  float64
 1   Gender             404164 non-null  int64  
 2   Tenure             404164 non-null  float64
 3   Usage Frequency    404164 non-null  float64
 4   Support Calls      404164 non-null  float64
 5   Payment Delay      404164 non-null  float64
 6   Subscription Type  404164 non-null  int64  
 7   Contract Length    404164 non-null  int64  
 8   Total Spend        404164 non-null  float64
 9   Last Interaction   404164 non-null  float64
dtypes: float64(7), int64(3)
memory usage: 30.8 MB


In [22]:
x_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101042 entries, 0 to 101041
Data columns (total 10 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Age                101042 non-null  float64
 1   Gender             101042 non-null  int64  
 2   Tenure             101042 non-null  float64
 3   Usage Frequency    101042 non-null  float64
 4   Support Calls      101042 non-null  float64
 5   Payment Delay      101042 non-null  float64
 6   Subscription Type  101042 non-null  int64  
 7   Contract Length    101042 non-null  int64  
 8   Total Spend        101042 non-null  float64
 9   Last Interaction   101042 non-null  float64
dtypes: float64(7), int64(3)
memory usage: 7.7 MB


In [23]:
x_test

Unnamed: 0,Age,Gender,Tenure,Usage Frequency,Support Calls,Payment Delay,Subscription Type,Contract Length,Total Spend,Last Interaction
0,1.601809,0,1.429999,0.033086,-0.585052,-1.478710,2,0,-0.897088,-0.884100
1,-0.765883,1,0.327749,1.309289,-0.904173,0.296190,2,0,-0.142193,-1.464937
2,-1.081576,1,0.791855,0.961234,-0.904173,-0.058790,1,2,0.208778,0.742241
3,0.812578,1,-1.238606,0.613178,0.053192,-0.295443,0,2,-0.334555,-0.651766
4,-0.371268,0,-0.716488,0.845215,0.372314,-0.058790,2,0,-0.664738,-0.651766
...,...,...,...,...,...,...,...,...,...,...
101037,0.417963,0,-1.586686,-0.314970,-1.223295,0.177864,1,0,0.158802,-0.070929
101038,-0.213422,0,1.197947,0.729197,-0.585052,-0.295443,2,0,0.817822,-1.581104
101039,0.102271,1,-1.238606,-1.011081,0.053192,0.296190,2,0,-0.263423,-1.348769
101040,-1.160499,0,0.733841,-1.591174,-0.904173,-1.360383,0,0,0.233155,-0.303264


In [24]:
y_test

0         1
1         0
2         0
3         0
4         1
         ..
101037    0
101038    0
101039    0
101040    0
101041    0
Name: Churn, Length: 101042, dtype: int64

In [25]:
def train_and_evaluate_model(model, x_train, y_train, x_test, y_test):
    """
    Train the model and evaluate it on test data.
    Returns the classification report as a dictionary.
    """
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    report = classification_report(y_test, y_pred, output_dict=True)
    return report

In [26]:
models = {
    "Linear SVC": LinearSVC(),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "MLP Classifier": MLPClassifier(random_state=42)
}

In [27]:
results = {}

# Train and evaluate each model
for model_name, model in models.items():
    print(f"Training model: {model_name}...")
    report = train_and_evaluate_model(model, x_train, y_train, x_test, y_test)
    results[model_name] = {
        "F1-Score (0)": report["0"]["f1-score"],
        "F1-Score (1)": report["1"]["f1-score"]
    }

Training model: Linear SVC...
Training model: Naive Bayes...
Training model: Random Forest...
Training model: XGBoost...
Training model: K-Nearest Neighbors...
Training model: MLP Classifier...


In [31]:
comparison = pd.DataFrame([
    {"Model": model_name, "F1 Score": max(scores.values())}  # Use max F1-Score for comparison
    for model_name, scores in results.items()
]).sort_values(by="F1 Score", ascending=False)

print("\nModel Comparison (sorted by F1 Score):")
print(comparison)



Model Comparison (sorted by F1 Score):
                 Model  F1 Score
2        Random Forest  0.945193
3              XGBoost  0.943054
5       MLP Classifier  0.933756
4  K-Nearest Neighbors  0.908235
1          Naive Bayes  0.866768
0           Linear SVC  0.832492


In [32]:
# Create a comparative table
results_df = pd.DataFrame(results)
print("\nComparative Table of F1-Scores:")

# Display the table
results_df


Comparative Table of F1-Scores:


Unnamed: 0,Linear SVC,Naive Bayes,Random Forest,XGBoost,K-Nearest Neighbors,MLP Classifier
F1-Score (0),0.801039,0.844923,0.922662,0.920188,0.883508,0.909644
F1-Score (1),0.832492,0.866768,0.945193,0.943054,0.908235,0.933756


## Best Performers:
For Class 0, Random Forest achieves the highest F1-score of 0.922662, followed closely by XGBoost (0.920188) and MLP Classifier (0.909644).

For Class 1, Random Forest leads with an F1-score of 0.945193, closely followed by XGBoost (0.943054) and MLP Classifier (0.933756).
## Ensemble methods like Random Forest and XGBoost consistently outperform other algorithms in both classes, suggesting their robustness for handling churn data.

### XGBoost

In [35]:
with mlflow.start_run(run_name='XGBoost'):
    mlflow.log_param("data_train","../data/data_train.csv")
    mlflow.log_param("data_test","../data/data_test.csv")
    mlflow.log_param("data_version","v1.0")
    xgb_model = XGBClassifier()
    params = xgb_model.get_params()
    mlflow.log_params(params)
    mlflow.set_tag(key="model", value="XGBoost")
    xgb_model.fit(x_train, y_train)
    train_features_name = f'{x_train=}'.split('=')[0]
    train_label_name = f'{y_train=}'.split('=')[0]
    mlflow.set_tag(key="train_features_name", value=train_features_name)
    mlflow.set_tag(key="train_label_name", value=train_label_name)
    predicted = xgb_model.predict(x_test)
    precision, recall, fscore, support = score(y_test, predicted, average='macro')
    
    mlflow.log_metric("Precision_test", precision)
    mlflow.log_metric("Recall_test", recall)
    mlflow.log_metric("F1_score_test", fscore)
    
    # Log the trained model as an artifact
    mlflow.sklearn.log_model(xgb_model, artifact_path="ML_models")



🏃 View run XGBoost at: https://dagshub.com/rami4real/mymlproject.mlflow/#/experiments/0/runs/d8addf2a337742b89bdce0347e794e3e
🧪 View experiment at: https://dagshub.com/rami4real/mymlproject.mlflow/#/experiments/0


In [36]:

with mlflow.start_run(run_name='Random Forest'):
    # Log dataset information
    mlflow.log_param("data_train", "../data/data_train.csv")
    mlflow.log_param("data_test", "../data/data_test.csv")
    mlflow.log_param("data_version", "v1.0")
    
    # Initialize and log Random Forest model parameters
    rf_model = RandomForestClassifier()
    params = rf_model.get_params()
    mlflow.log_params(params)
    mlflow.set_tag(key="model", value="Random Forest")
    
    # Train the Random Forest model
    rf_model.fit(x_train, y_train)
    
    # Log feature and label names as tags
    train_features_name = f'{x_train=}'.split('=')[0]
    train_label_name = f'{y_train=}'.split('=')[0]
    mlflow.set_tag(key="train_features_name", value=train_features_name)
    mlflow.set_tag(key="train_label_name", value=train_label_name)
    
    # Make predictions and calculate metrics
    predicted = rf_model.predict(x_test)
    precision, recall, fscore, support = score(y_test, predicted, average='macro')
    
    # Log metrics
    mlflow.log_metric("Precision_test", precision)
    mlflow.log_metric("Recall_test", recall)
    mlflow.log_metric("F1_score_test", fscore)
    
    # Log the trained model as an artifact
    mlflow.sklearn.log_model(rf_model, artifact_path="ML_models")



🏃 View run Random Forest at: https://dagshub.com/rami4real/mymlproject.mlflow/#/experiments/0/runs/b59bd7cdce7b4984bdab1f60b1dd9916
🧪 View experiment at: https://dagshub.com/rami4real/mymlproject.mlflow/#/experiments/0


## Finetuning

# Test avec SMOTE

In [38]:
from imblearn.over_sampling import SMOTE

method= SMOTE()

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Collecting imbalanced-learn (from imblearn)
  Downloading imbalanced_learn-0.12.4-py3-none-any.whl.metadata (8.3 kB)
Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Downloading imbalanced_learn-0.12.4-py3-none-any.whl (258 kB)
Installing collected packages: imbalanced-learn, imblearn
Successfully installed imbalanced-learn-0.12.4 imblearn-0.0


In [45]:

# Start MLflow run for XGBoost with SMOTE
with mlflow.start_run(run_name='XGBoost with SMOTE'):
    mlflow.log_param("data_train", "../data/customer_churn_train.csv")
    mlflow.log_param("data_test", "../data/customer_churn_test.csv")
    mlflow.log_param("data_version", "v1.0")
    smote = SMOTE(random_state=42)

    # Apply SMOTE to the training data
    X_train_smote, y_train_smote = smote.fit_resample(x_train, y_train)

    # Initialize XGBoost model
    xgb = XGBClassifier(random_state=42)


    # Fit the model with SMOTE
    xgb.fit(X_train_smote, y_train_smote)

    # Make predictions
    y_pred = xgb.predict(x_test)

    # Compute evaluation metrics
    conf_matrix = confusion_matrix(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)
    f1 = f1_score(y_test, y_pred, average='weighted')

    # Log model parameters, metrics, and model itself in MLflow
    mlflow.set_tag(key="model", value="XGBoost with SMOTE")
    mlflow.log_params(xgb.get_params())
    precision, recall, fscore, support = score(y_test, predicted, average='macro')
    
    # Log metrics
    mlflow.log_metric("Precision_test", precision)
    mlflow.log_metric("Recall_test", recall)
    mlflow.log_metric("F1_score_test", fscore)

    # Log the trained model
    mlflow.xgboost.log_model(xgb, artifact_path="ML_models")

    # Print results
    print("Model: XGBoost with SMOTE")
    print("Confusion Matrix:\n", conf_matrix)
    print("Classification Report:\n", classification_report(y_test, y_pred))



Model: XGBoost with SMOTE
Confusion Matrix:
 [[38754  6332]
 [  568 55388]]
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.86      0.92     45086
           1       0.90      0.99      0.94     55956

    accuracy                           0.93    101042
   macro avg       0.94      0.92      0.93    101042
weighted avg       0.94      0.93      0.93    101042

🏃 View run XGBoost with SMOTE at: https://dagshub.com/rami4real/mymlproject.mlflow/#/experiments/0/runs/18ed0b70aced4fffa283013bc52cf070
🧪 View experiment at: https://dagshub.com/rami4real/mymlproject.mlflow/#/experiments/0


In [46]:
# Start MLflow run for Random Forest with SMOTE
with mlflow.start_run(run_name='RandomForest with SMOTE'):
    mlflow.log_param("data_train", "../data/customer_churn_train.csv")
    mlflow.log_param("data_test", "../data/customer_churn_test.csv")
    mlflow.log_param("data_version", "v1.0")
    
    # Apply SMOTE to the training data
    smote = SMOTE(random_state=42)
    X_train_smote, y_train_smote = smote.fit_resample(x_train, y_train)

    # Initialize Random Forest model
    rf = RandomForestClassifier(random_state=42)

    # Fit the model with SMOTE
    rf.fit(X_train_smote, y_train_smote)

    # Make predictions
    y_pred = rf.predict(x_test)

    # Compute evaluation metrics
    conf_matrix = confusion_matrix(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)
    f1 = f1_score(y_test, y_pred, average='weighted')

    # Log model parameters, metrics, and model itself in MLflow
    mlflow.set_tag(key="model", value="RandomForest with SMOTE")
    mlflow.log_params(rf.get_params())
    
    # Compute precision, recall, and F1 score
    precision, recall, fscore, support = score(y_test, y_pred, average='macro')
    
    # Log metrics
    mlflow.log_metric("Precision_test", precision)
    mlflow.log_metric("Recall_test", recall)
    mlflow.log_metric("F1_score_test", fscore)

    # Log the trained model
    mlflow.sklearn.log_model(rf, artifact_path="ML_models")

    # Print results
    print("Model: RandomForest with SMOTE")
    print("Confusion Matrix:\n", conf_matrix)
    print("Classification Report:\n", classification_report(y_test, y_pred))




Model: RandomForest with SMOTE
Confusion Matrix:
 [[38665  6421]
 [   65 55891]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.86      0.92     45086
           1       0.90      1.00      0.95     55956

    accuracy                           0.94    101042
   macro avg       0.95      0.93      0.93    101042
weighted avg       0.94      0.94      0.94    101042

🏃 View run RandomForest with SMOTE at: https://dagshub.com/rami4real/mymlproject.mlflow/#/experiments/0/runs/3895f5666d544625a0123a97665733aa
🧪 View experiment at: https://dagshub.com/rami4real/mymlproject.mlflow/#/experiments/0
