In [1]:
import pandas as pd
import seaborn as sns
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_recall_fscore_support as score
import mlflow
import datetime
import pickle
import warnings
import numpy as np
from sklearn.model_selection import train_test_split
from arize.pandas.logger import Client, Schema
import datetime as dt
from arize.utils.types import ModelTypes, Environments
warnings.filterwarnings("ignore")
import mlflow.pyfunc
import mlflow.pyfunc


In [2]:
# Notebook-level configuration.
# `version` is a free-form tag; `data_url` is the CSV path read into `raw_train` below.
version: str = "v2.1"
data_url: str = "../data/customer_churn_dataset.csv"

In [3]:
import sys

# Put the backend source directory first on the import path so that the
# project modules imported in the next cell can be resolved.
BACKEND_SRC_DIR = '../backend/src'
sys.path.insert(0, BACKEND_SRC_DIR)

In [14]:
from data_preprocessing_monitoring import transform_data
from clean_data_csv import clean_data

In [15]:
import os

from dotenv import load_dotenv

# Load the key/value pairs of the backend .env file into the process environment.
load_dotenv("../backend/src/.env")

# DagsHub credentials; either value is None when the variable is not defined.
DagsHub_username, DagsHub_token = (
    os.getenv(key) for key in ("DagsHub_username", "DagsHub_token")
)

In [16]:
import os

# Expose the DagsHub credentials under the variable names MLflow reads for
# HTTP basic authentication against the tracking server.
_mlflow_credentials = {
    'MLFLOW_TRACKING_USERNAME': DagsHub_username,
    'MLFLOW_TRACKING_PASSWORD': DagsHub_token,
}

# os.getenv() returns None for an undefined variable, and assigning None into
# os.environ fails with an unhelpful "str expected, not NoneType" TypeError.
# Fail early with a message that says what is actually missing.
_missing_credentials = [name for name, value in _mlflow_credentials.items() if not value]
if _missing_credentials:
    raise ValueError(
        "Missing DagsHub credentials for: "
        + ", ".join(_missing_credentials)
        + ". Define DagsHub_username and DagsHub_token in ../backend/src/.env"
    )

os.environ.update(_mlflow_credentials)

In [17]:
# Point MLflow at the DagsHub-hosted tracking server and select the experiment.
MLFLOW_TRACKING_URI = 'https://dagshub.com/rami4real/mymlproject.mlflow'  # your MLflow tracking URI
MLFLOW_EXPERIMENT_NAME = "churn-experiment"

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
# Kept as the last expression so the notebook displays the returned Experiment.
mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)

<Experiment: artifact_location='mlflow-artifacts:/566ae890f7ca4c2db92c2a642aca7189', creation_time=1733732561769, experiment_id='0', last_update_time=1733732561769, lifecycle_stage='active', name='churn-experiment', tags={}>

In [18]:
# Read the raw churn dataset from the CSV path configured in `data_url`.
raw_train = pd.read_csv(filepath_or_buffer=data_url)

In [19]:
# Preview the first three raw rows.
raw_train.head(n=3)

Unnamed: 0,CustomerID,Age,Gender,Tenure,Usage Frequency,Support Calls,Payment Delay,Subscription Type,Contract Length,Total Spend,Last Interaction,Churn
0,1.0,22.0,Female,25.0,14.0,4.0,27.0,Basic,Monthly,598.0,9.0,1.0
1,2.0,41.0,Female,28.0,28.0,7.0,13.0,Standard,Monthly,584.0,20.0,0.0
2,3.0,47.0,Male,27.0,10.0,2.0,29.0,Premium,Annual,757.0,21.0,0.0


In [33]:
# Run the project's preprocessing helper on the raw frame; it returns the
# feature frame together with the target.
X_with_id_and_target, y = transform_data(raw_train)

# The model input must not contain the customer identifier or the target column.
X = X_with_id_and_target.drop(columns=['CustomerID', 'Churn'])


In [34]:
import mlflow.pyfunc

# Search every experiment on the tracking server for runs whose logged test
# F1 score is strictly below 1 (runs scoring exactly 1 are excluded by the filter).
all_experiments = [exp.experiment_id for exp in mlflow.search_experiments()]
df_mlflow = mlflow.search_runs(
    experiment_ids=all_experiments,
    filter_string="metrics.F1_score_test <1",
)

# Without this guard an empty result surfaces as an obscure KeyError/idxmax
# error on the line below instead of explaining what went wrong.
if df_mlflow.empty:
    raise ValueError(
        "No MLflow run with metrics.F1_score_test < 1 was found; "
        "cannot select a model to load."
    )

# Keep the run with the highest test F1 score.
best_run_index = df_mlflow['metrics.F1_score_test'].idxmax()
run_id = df_mlflow.loc[best_run_index, 'run_id']

# The model is loaded directly from that run's "ML_models" artifact path
# (a runs:/ URI), not from a Model Registry stage.
logged_model = f'runs:/{run_id}/ML_models'

# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model)
print(loaded_model)

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

mlflow.pyfunc.loaded_model:
  artifact_path: ML_models
  flavor: mlflow.sklearn
  run_id: 50f98fe38f354532a6a1f7a4caf466b6



## Transform the training data before sending it to Arize AI

In [76]:
# Columns of the raw frame kept for the Arize baseline: the customer id,
# the model inputs, and the ground-truth 'Churn' column.
selected_cols = [
    'CustomerID',
    'Age',
    'Gender',
    'Tenure',
    'Usage Frequency',
    'Support Calls',
    'Payment Delay',
    'Subscription Type',
    'Contract Length',
    'Total Spend',
    'Last Interaction',
    'Churn',
]

In [77]:
# Baseline (reference) frame for Arize: the raw training rows restricted to
# `selected_cols`.
#
# The former `raw_train.drop(...)` / `dropna(inplace=True)` statements were
# dead code: their result was immediately overwritten by this selection, so no
# column or row was ever removed. They are dropped to stop suggesting otherwise.
# NOTE(review): rows are intentionally still not filtered here, because the
# predictions computed from `X` are attached to this frame in a later cell;
# confirm that transform_data keeps every row of raw_train.
#
# .copy() makes `baseline` an independent frame, so the column assignments in
# the following cells neither touch `raw_train` nor trigger chained-assignment
# warnings.
baseline = raw_train[selected_cols].copy()

In [78]:

# Columns to cast to integer type
columns_to_convert = ['Age', 'Tenure', 'Usage Frequency', 'Support Calls','Churn']
baseline[columns_to_convert] = baseline[columns_to_convert].astype(int)

# Restore the fitted preprocessing objects saved next to the notebook.
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# come from a trusted source.
with open("label_encodersFinal.pkl", "rb") as le_file:
    label_encoders = pickle.load(le_file)

with open("scalerFinal.pkl", "rb") as dc_file:
    scaler = pickle.load(dc_file)

# Numerical columns passed to the scaler
numerical_cols = [
    'Age',
    'Tenure',
    'Usage Frequency',
    'Support Calls',
    'Payment Delay',
    'Total Spend',
    'Last Interaction',
]

# Categorical columns passed to the per-column label encoders
categorical_cols = ["Gender", 'Subscription Type', "Contract Length"]

# Scale the numerical columns with the unpickled scaler
baseline[numerical_cols] = scaler.transform(baseline[numerical_cols])

# Encode each categorical column with its own encoder; fail loudly when the
# column or its encoder is missing.
for col in categorical_cols:
    if col not in baseline.columns:
        raise ValueError(f"Missing categorical column: {col}")
    if col not in label_encoders:
        raise ValueError(f"No LabelEncoder found for column: {col}")
    le = label_encoders[col]
    baseline[col] = le.transform(baseline[col])


In [79]:
# The ground-truth column is logged to Arize under the name 'actual_label'.
baseline = baseline.rename(columns={'Churn': 'actual_label'})

In [80]:
# Mapping from the binary class id to its human-readable label.
transform_bin_str = {
    0: 'non_Churn',
    1: 'Churn',
}

# Replace the 0/1 ground truth by its text label (values absent from the
# mapping become NaN).
actual_label_text = baseline['actual_label'].map(transform_bin_str)
baseline['actual_label'] = actual_label_text

In [81]:
# Score the transformed training features with the model loaded from MLflow.
preds = loaded_model.predict(X)

In [82]:
# Store the raw model predictions alongside the baseline rows.
baseline = baseline.assign(prediction_label=preds)

In [83]:
# Convert the predicted class ids to the same text labels used for 'actual_label'.
prediction_label_text = baseline['prediction_label'].map(transform_bin_str)
baseline['prediction_label'] = prediction_label_text

In [84]:
import uuid


# Prediction ID is required for all datasets
def generate_prediction_ids(X):
    """Return one random UUID4 string per row of ``X``.

    Parameters
    ----------
    X : pandas.DataFrame or pandas.Series
        Object whose length and index define the result.

    Returns
    -------
    pandas.Series
        Series of UUID strings that shares the index of ``X``.
    """
    row_ids = [str(uuid.uuid4()) for _ in X.index]
    return pd.Series(row_ids, index=X.index)

In [85]:
# Give every baseline row its own prediction id.
prediction_ids = generate_prediction_ids(baseline)
baseline["prediction_id"] = prediction_ids

In [86]:
# Preview the first three rows of the frame that will be sent to Arize.
baseline.head(n=3)

Unnamed: 0,CustomerID,Age,Gender,Tenure,Usage Frequency,Support Calls,Payment Delay,Subscription Type,Contract Length,Total Spend,Last Interaction,actual_label,prediction_label,prediction_id
0,1.0,-1.397268,0,-0.368409,-0.198951,0.053192,1.597784,0,1,-0.089976,-0.651766,Churn,Churn,c41ea4a9-b68e-4a52-96fc-ed67d2518b10
1,2.0,0.102271,0,-0.194369,1.425308,1.010558,-0.05879,2,1,-0.147044,0.626074,non_Churn,non_Churn,ff10ae36-04e1-4fba-a840-c599822002fe
2,3.0,0.575809,1,-0.252383,-0.663026,-0.585052,1.834437,1,0,0.55816,0.742241,non_Churn,Churn,a909b0f7-dfd5-4f35-aad4-165e34aba224


In [87]:
import os

# Arize credentials are read from the environment (for example from the .env
# file loaded earlier in this notebook) instead of being written in the
# notebook, where they would be committed and shared with every copy.
# NOTE(review): the keys that used to be embedded here should be considered
# leaked and rotated in the Arize console.
SPACE_KEY = os.getenv("ARIZE_SPACE_KEY")
API_KEY = os.getenv("ARIZE_API_KEY")

model_id = (
    "Churn-dector-model"  # This is the model name that will show up in Arize
)
model_version = "v2"  # Version of model - can be any string

# Validate the credentials before building the client so that a configuration
# problem is reported before any Arize object is created.
if not SPACE_KEY or not API_KEY:
    raise ValueError(
        "Set the ARIZE_SPACE_KEY and ARIZE_API_KEY environment variables before running this cell"
    )
if SPACE_KEY == "SPACE_KEY" or API_KEY == "API_KEY":
    raise ValueError("❌ NEED TO CHANGE SPACE AND/OR API_KEY")

arize_client = Client(space_key=SPACE_KEY, api_key=API_KEY)
print("✅ Arize setup complete!")

✅ Arize setup complete!


## Send the training data to Arize AI
The training data will serve as the reference (baseline) dataset later in production.

In [88]:
# Every baseline column except the id / prediction / actual columns is
# declared as a feature.
# NOTE(review): 'CustomerID' is still part of `baseline` and therefore ends up
# in this list, although it was dropped from X before prediction; confirm it
# should be logged as a feature.
non_feature_cols = ["prediction_id", "prediction_label", "actual_label"]
features = list(baseline.columns.drop(non_feature_cols))

# The original chained assignment bound this second name to the same list.
feature_column_names = features

In [92]:
# Tell Arize which baseline columns hold the features, the prediction id,
# the predicted label and the actual label.
training_schema = Schema(
    feature_column_names=features,
    prediction_id_column_name="prediction_id",
    prediction_label_column_name="prediction_label",
    actual_label_column_name="actual_label",
)

# Upload the baseline frame to the TRAINING environment of the model.
training_response = arize_client.log(
    dataframe=baseline,
    schema=training_schema,
    model_id=model_id,
    model_version=model_version,
    model_type=ModelTypes.SCORE_CATEGORICAL,
    environment=Environments.TRAINING,
)

# A status code of 200 is treated as success; anything else is reported
# together with the response text.
if training_response.status_code == 200:
    print("✅ You have successfully logged training set to Arize")
else:
    print(
        f"logging failed with response code {training_response.status_code}, {training_response.text}"
    )

[38;21m  arize.utils.logging | INFO | Success! Check out your data at https://app.arize.com/organizations/QWNjb3VudE9yZ2FuaXphdGlvbjoxMjc1OTphcUND/spaces/U3BhY2U6MTMzNzg6RlFldw==/models/modelName/Churn-dector-model?selectedTab=performance[0m
✅ You have successfully logged training set to Arize
