In [1]:
import os
import mlflow
import mlflow.sklearn
import numpy as np
import joblib
from imblearn.over_sampling import SMOTE
from mlflow.models.signature import infer_signature
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, log_loss
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score


In [2]:
import pandas as pd
df = pd.read_csv("../data/telecom_customer_churn_clean.csv")

In [3]:
df.head()

Unnamed: 0,Gender,Age,Married,Number of Dependents,City_0,City_1,City_2,City_3,City_4,City_5,...,Streaming TV,Streaming Movies,Streaming Music,Unlimited Data,Contract,Total Charges,Total Refunds,Total Extra Data Charges,Total Long Distance Charges,Customer Status
0,0,37,1,0,0,0,0,0,0,0,...,1,0,0,1,2,593.3,0.0,0,381.51,0
1,1,46,0,0,0,0,0,0,0,0,...,0,1,1,0,1,542.4,38.33,10,96.21,0
2,1,50,0,0,0,0,0,0,0,0,...,0,0,0,1,1,280.85,0.0,0,134.6,1
3,1,78,1,0,0,0,0,0,0,0,...,1,1,0,1,1,1237.85,0.0,0,361.66,1
4,0,75,1,0,0,0,0,0,0,0,...,1,0,0,1,1,267.4,0.0,0,22.14,1


In [4]:
mlruns_path = os.path.join(os.getcwd(), "mlruns")
os.makedirs(mlruns_path, exist_ok=True)
mlflow.set_tracking_uri(f"file:///{mlruns_path}")

In [5]:
X = df.drop("Customer Status", axis=1).values
y = df["Customer Status"].values

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
experiment_name = "SVM_model"
mlflow.set_experiment(experiment_name)
print(f"Experiment set to: {experiment_name}")

Experiment set to: SVM_model


  return FileStore(store_uri, store_uri)


### SVM Model

In [9]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [10]:
C_value = 0.01
kernel_type = 'linear'

# mlfow run name
run_name = f"SVM_C={C_value}_kernel={kernel_type}_without_GS"

with mlflow.start_run(run_name=run_name):

    # Train model
    model = SVC(C=C_value, kernel=kernel_type, probability=True)
    model.fit(X_train, y_train)

    # Evaluate
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)
    
    # Metrics (accuracy , F1 score, precision, recall , loss)
    test_accuracy = accuracy_score(y_test, y_pred)
    f1  = f1_score(y_test, y_pred, average='weighted')
    precision = precision_score(y_test, y_pred, average='weighted')
    recall    = recall_score(y_test, y_pred, average='weighted')
    loss      = log_loss(y_test, y_prob)

    # Log param and metric
    mlflow.log_param("C", C_value)
    mlflow.log_param("kernel", kernel_type)
    mlflow.log_metric("accuracy", test_accuracy)
    mlflow.log_metric("f1_score", f1)
    mlflow.log_metric("precision", precision)
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("loss", loss)

    # Model signature
    signature = infer_signature(X_train, model.predict(X_train))
    
    # Log model
    mlflow.sklearn.log_model(sk_model=model, name="svm_model_noGS",
                            signature=signature,
                            input_example=X_train[:5])


print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Loss: {loss:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"Precision: {precision:.4f}")



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Test Accuracy: 0.8399
Recall: 0.8399
Loss: 0.3425
F1-Score: 0.8393
Precision: 0.8389


### SVM model with GridSearch

In [9]:
param_grid = {
    "svm__C": [0.001, 0.01, 0.1, 1, 10],
    "svm__kernel": ["linear", "rbf", "poly"],
    "svm__gamma": ["scale", "auto"]
}

with mlflow.start_run(run_name="SVM_GridSearch00"):
    
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('svm', SVC(probability=True))
    ])
      
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        scoring="recall",
        n_jobs=-1,
        cv=5,
        verbose=1
    )
    
    grid_search.fit(X_train, y_train)
    
    # Best model 
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_
    best_cv_score = grid_search.best_score_
    
    
    #  Evaluation on test set
    
    y_pred = best_model.predict(X_test)
    y_prob = best_model.predict_proba(X_test) 
    
    test_accuracy = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred, average='weighted')  
    loss = log_loss(y_test, y_prob)
    f1 = f1_score(y_test, y_pred, average='weighted')
    precision = precision_score(y_test, y_pred, average='weighted')

    # Log best hyperparameters
    mlflow.log_params(best_params)

    # CV metrics
    mlflow.log_metric("best_cv_recall", best_cv_score)

    # Test metrics
    mlflow.log_metric("test_accuracy", test_accuracy)
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("loss", loss)
    mlflow.log_metric("test_f1", f1)
    mlflow.log_metric("test_precision", precision)
    
    signature = infer_signature(X_train, best_model.predict(X_train))

    # Log model
    mlflow.sklearn.log_model( sk_model=best_model, 
                              name="best_svm_model00",
                              signature=signature,
                              input_example = X_test[:5]
                            )


print(f"Best Parameters: {best_params}")
print(f"CV Recall: {best_cv_score:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Loss: {loss:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"Precision: {precision:.4f}")

Fitting 5 folds for each of 30 candidates, totalling 150 fits


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Best Parameters: {'svm__C': 10, 'svm__gamma': 'scale', 'svm__kernel': 'linear'}
CV Recall: 0.7524
Test Accuracy: 0.8475
Recall: 0.8475
Loss: 0.3323
F1-Score: 0.8478
Precision: 0.8481
