In [1]:
import pandas as pd

df = pd.read_csv('normalized_data.csv')

In [2]:
X = df.drop('ChurnLabel', axis=1)
y = df['ChurnLabel']

In [3]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

# Encode target variable ('No' -> 0, 'Yes' -> 1)
y_encoded = le.fit_transform(y)

In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y)

In [6]:
import dagshub

# dagshub.init(repo_owner='hrootscraft', repo_name='my-book', mlflow=True)
DAGSHUB_TOKEN = dagshub.auth.get_token()

In [11]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import numpy as np
import math

class CustomPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.columns_to_drop = ['Country', 'State', 'CustomerID', 'Latitude',
                               'Longitude', 'ChurnScore', 'CLTV', 
                               'ChurnReason', 'ZipCode', 'City']
        self.numeric_features = ['TenureMonths', 'MonthlyCharges', 'TotalCharges']
        self.categorical_features = None  # Will be set during fit
        self.scaler = StandardScaler()
        self.encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        self.encoded_feature_names = None

    def _impute_total_charges(self, X):
        X_ = X.copy()
        for index, row in X_.iterrows():
            if math.isnan(row['TotalCharges']):
                X_.at[index, 'TotalCharges'] = (
                    X_.at[index, 'MonthlyCharges'] * X_.at[index, 'TenureMonths']
                )
        return X_

    def fit(self, X, y=None):
        # Set categorical features (excluding target)
        self.categorical_features = [col for col in X.select_dtypes(include=['object']).columns 
                                   if col not in self.columns_to_drop + ['ChurnLabel']]
        
        X_ = X.copy()
        X_ = self._impute_total_charges(X_)
        X_ = X_.drop(columns=self.columns_to_drop, errors='ignore')
        X_['TotalCharges'] = np.log1p(X_['TotalCharges'])
        
        self.scaler.fit(X_[self.numeric_features])
        self.encoder.fit(X_[self.categorical_features])
        
        self.encoded_feature_names = []
        for i, feature in enumerate(self.categorical_features):
            feature_categories = self.encoder.categories_[i]
            self.encoded_feature_names.extend([f"{feature}_{cat}" for cat in feature_categories])
        
        return self

    def transform(self, X):
        X_ = X.copy()
        X_ = self._impute_total_charges(X_)
        X_ = X_.drop(columns=self.columns_to_drop, errors='ignore')
        X_['TotalCharges'] = np.log1p(X_['TotalCharges'])
        
        X_scaled = pd.DataFrame(
            self.scaler.transform(X_[self.numeric_features]),
            columns=self.numeric_features,
            index=X_.index
        )
        
        X_encoded = pd.DataFrame(
            self.encoder.transform(X_[self.categorical_features]),
            columns=self.encoded_feature_names,
            index=X_.index
        )
        
        return pd.concat([X_scaled, X_encoded], axis=1)

In [12]:
from sklearn.metrics import f1_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.model_selection import GridSearchCV, cross_validate
import mlflow
from sklearn.pipeline import Pipeline
import time, os
from mlflow.models import infer_signature

# Set MLflow tracking URI and credentials
MLFLOW_TRACKING_URI = "https://dagshub.com/hrootscraft/my-book.mlflow"
os.environ['MLFLOW_TRACKING_USERNAME'] = 'hrootscraft'
os.environ['MLFLOW_TRACKING_PASSWORD'] = DAGSHUB_TOKEN
mlflow.set_tracking_uri(uri=MLFLOW_TRACKING_URI)
mlflow.set_experiment("churn_prediction")


def train_logistic_regression(X_train, X_test, y_train, y_test, cv_folds=5):
    """
    Trains and evaluates the model with cross-validation and hyperparameter tuning
    """
    with mlflow.start_run(run_name="logistic_regression"):
        t1 = time.time()
        
        # Create pipeline
        pipeline = Pipeline([
            ('preprocessor', CustomPreprocessor()),
            ('classifier', LogisticRegression())
        ])
        
        # Define parameter grid
        param_grid = {
            'classifier__C': [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4],
            'classifier__solver': ['liblinear'],
            'classifier__class_weight': ['balanced']
        }
        
        # Perform GridSearchCV
        grid_search = GridSearchCV(
            pipeline, param_grid, 
            scoring='f1', 
            cv=cv_folds, 
            n_jobs=-1, 
            verbose=1
        )
        
        # Fit the model
        grid_search.fit(X_train, y_train)
        
        # Log best parameters
        mlflow.log_params(grid_search.best_params_)
        
        # Perform cross-validation with best model
        cv_results = cross_validate(
            grid_search.best_estimator_,
            X_train, y_train,
            cv=cv_folds,
            scoring='f1',
            return_train_score=True
        )
        
        # Log cross-validation results
        mlflow.log_metric("cv_f1_mean", cv_results['test_score'].mean())
        mlflow.log_metric("cv_f1_std", cv_results['test_score'].std())
        
        # Get predictions on test set
        y_pred = grid_search.predict(X_test)
        
        # Calculate metrics
        f1 = f1_score(y_test, y_pred)
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
        
        # Log metrics
        metrics = {
            "test_f1": f1,
            "true_negatives": tn,
            "false_positives": fp,
            "false_negatives": fn,
            "true_positives": tp
        }
        mlflow.log_metrics(metrics)
        
        # Log model
        signature = infer_signature(X_test, y_pred)
        mlflow.sklearn.log_model(
            sk_model=grid_search.best_estimator_,
            artifact_path="churn_model",
            signature=signature,
            input_example=X_test.iloc[0:1],
            registered_model_name="LogisticRegression"
        )
        
        t2 = time.time()
        print(f"Training completed in {t2-t1:.2f} seconds")
        print(f"Best parameters: {grid_search.best_params_}")
        print(f"Best CV f1-score: {grid_search.best_score_:.4f}")
        print(f"Test f1-score: {f1:.4f}")
        print("\nConfusion Matrix:")
        print(f"TN: {tn}, FP: {fp}")
        print(f"FN: {fn}, TP: {tp}")
        
        return grid_search.best_estimator_

In [13]:
logi_model = train_logistic_regression(X_train, X_test, y_train, y_test)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


Registered model 'LogisticRegression' already exists. Creating a new version of this model...
2024/12/22 16:39:51 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LogisticRegression, version 2
Created version '2' of model 'LogisticRegression'.


Training completed in 21.17 seconds
Best parameters: {'classifier__C': 100.0, 'classifier__class_weight': 'balanced', 'classifier__solver': 'liblinear'}
Best CV f1-score: 0.6498
Test f1-score: 0.6253

Confusion Matrix:
TN: 756, FP: 279
FN: 77, TP: 297
🏃 View run logistic_regression at: https://dagshub.com/hrootscraft/my-book.mlflow/#/experiments/0/runs/b4f0a6c358c44a288b7b4175c430d12f
🧪 View experiment at: https://dagshub.com/hrootscraft/my-book.mlflow/#/experiments/0


In [14]:
def train_ridge_classifier(X_train, X_test, y_train, y_test, cv_folds=5):
    """
    Trains and evaluates Ridge Classifier with cross-validation and hyperparameter tuning
    """
    with mlflow.start_run(run_name="ridge_classifier"):
        t1 = time.time()
        # Create pipeline
        pipeline = Pipeline([
            ('preprocessor', CustomPreprocessor()),
            ('classifier', RidgeClassifier())
        ])
        
        # Define parameter grid
        param_grid = {
            'classifier__alpha': [0.1, 1.0, 10.0, 100.0],
            'classifier__class_weight': ['balanced'],
            'classifier__solver': ['auto', 'svd', 'cholesky']
        }
        
        # Perform GridSearchCV
        grid_search = GridSearchCV(
            pipeline, param_grid,
            scoring='f1',
            cv=cv_folds,
            n_jobs=-1,
            verbose=1
        )
        
        # Rest of the code follows the same pattern as your logistic regression
        grid_search.fit(X_train, y_train)
        mlflow.log_params(grid_search.best_params_)
        
        cv_results = cross_validate(
            grid_search.best_estimator_,
            X_train, y_train,
            cv=cv_folds,
            scoring='f1',
            return_train_score=True
        )
        
        # Log cross-validation results
        mlflow.log_metric("cv_f1_mean", cv_results['test_score'].mean())
        mlflow.log_metric("cv_f1_std", cv_results['test_score'].std())
        
        y_pred = grid_search.predict(X_test)
        f1 = f1_score(y_test, y_pred)
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
        
        metrics = {
            "test_f1": f1,
            "true_negatives": tn,
            "false_positives": fp,
            "false_negatives": fn,
            "true_positives": tp
        }
        mlflow.log_metrics(metrics)
        
        signature = infer_signature(X_test, y_pred)
        mlflow.sklearn.log_model(
            sk_model=grid_search.best_estimator_,
            artifact_path="ridge_model",
            signature=signature,
            input_example=X_test.iloc[0:1],
            registered_model_name="RidgeClassifier"
        )
        
        t2 = time.time()
        print(f"Training completed in {t2-t1:.2f} seconds")
        print(f"Best parameters: {grid_search.best_params_}")
        print(f"Best CV f1-score: {grid_search.best_score_:.4f}")
        print(f"Test f1-score: {f1:.4f}")
        print("\nConfusion Matrix:")
        print(f"TN: {tn}, FP: {fp}")
        print(f"FN: {fn}, TP: {tp}")
        
        return grid_search.best_estimator_

In [15]:
ridge_model = train_ridge_classifier(X_train, X_test, y_train, y_test)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


Registered model 'RidgeClassifier' already exists. Creating a new version of this model...
2024/12/22 16:41:32 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: RidgeClassifier, version 2
Created version '2' of model 'RidgeClassifier'.


Training completed in 23.11 seconds
Best parameters: {'classifier__alpha': 100.0, 'classifier__class_weight': 'balanced', 'classifier__solver': 'auto'}
Best CV f1-score: 0.6464
Test f1-score: 0.6184

Confusion Matrix:
TN: 741, FP: 294
FN: 75, TP: 299
🏃 View run ridge_classifier at: https://dagshub.com/hrootscraft/my-book.mlflow/#/experiments/0/runs/07aadac068bd4083b1ff5caddf932996
🧪 View experiment at: https://dagshub.com/hrootscraft/my-book.mlflow/#/experiments/0


In [18]:
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer

def train_xgb_classifier(X_train, X_test, y_train, y_test, cv_folds=5):
    """
    Trains and evaluates XGBoost Classifier with cross-validation and hyperparameter tuning
    """
    with mlflow.start_run(run_name="xgboost_classifier"):
        t1 = time.time()
        
        # Create XGBoost classifier with updated parameters
        xgb_clf = xgb.XGBClassifier(
            objective='binary:logistic',
            random_state=42,
            use_label_encoder=False,  # Add this to prevent warning
            eval_metric='logloss'     # Add this to prevent warning
        )
        
        pipeline = Pipeline([
            ('preprocessor', CustomPreprocessor()),
            ('classifier', xgb_clf)
        ])
        
        # Simplified parameter grid to reduce complexity
        param_grid = {
            'classifier__max_depth': [3, 5],
            'classifier__learning_rate': [0.01, 0.1],
            'classifier__n_estimators': [100],
            'classifier__min_child_weight': [1],
            'classifier__subsample': [0.8]
        }
        
        # Use make_scorer to specify pos_label
        scorer = make_scorer(f1_score, pos_label=1)
        
        grid_search = GridSearchCV(
            pipeline, 
            param_grid,
            scoring=scorer,
            cv=cv_folds,
            n_jobs=-1,
            verbose=1
        )
        
        # Fit the model
        grid_search.fit(X_train, y_train)
        mlflow.log_params(grid_search.best_params_)
        
        cv_results = cross_validate(
            grid_search.best_estimator_,
            X_train, y_train,
            cv=cv_folds,
            scoring=scorer,
            return_train_score=True
        )
        
        # Log cross-validation results
        mlflow.log_metric("cv_f1_mean", cv_results['test_score'].mean())
        mlflow.log_metric("cv_f1_std", cv_results['test_score'].std())
        
        y_pred = grid_search.predict(X_test)
        f1 = f1_score(y_test, y_pred, pos_label=1)
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
        
        metrics = {
            "test_f1": f1,
            "true_negatives": tn,
            "false_positives": fp,
            "false_negatives": fn,
            "true_positives": tp
        }
        mlflow.log_metrics(metrics)
        
        signature = infer_signature(X_test, y_pred)
        mlflow.sklearn.log_model(
            sk_model=grid_search.best_estimator_,
            artifact_path="xgboost_model",
            signature=signature,
            input_example=X_test.iloc[0:1],
            registered_model_name="XGBoostClassifier"
        )
        
        t2 = time.time()
        print(f"Training completed in {t2-t1:.2f} seconds")
        print(f"Best parameters: {grid_search.best_params_}")
        print(f"Best CV f1-score: {grid_search.best_score_:.4f}")
        print(f"Test f1-score: {f1:.4f}")
        print("\nConfusion Matrix:")
        print(f"TN: {tn}, FP: {fp}")
        print(f"FN: {fn}, TP: {tp}")
        
        return grid_search.best_estimator_

In [19]:
xgb_model = train_xgb_classifier(X_train, X_test, y_train, y_test)

Fitting 10 folds for each of 4 candidates, totalling 40 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Traceback (most recent call last):
  File "/home/rutuja/miniconda3/envs/jupyterbook/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 949, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/home/rutuja/miniconda3/envs/jupyterbook/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 288, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
  File "/home/rutuja/miniconda3/envs/jupyterbook/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 380, in _score
    y_pred = method_caller(
  File "/home/rutuja/miniconda3/envs/

🏃 View run xgboost_classifier at: https://dagshub.com/hrootscraft/my-book.mlflow/#/experiments/0/runs/041698b34a21426d8673c5dd7a1296ca
🧪 View experiment at: https://dagshub.com/hrootscraft/my-book.mlflow/#/experiments/0


AttributeError: 'super' object has no attribute '__sklearn_tags__'

In [22]:
def train_random_forest_classifier(X_train, X_test, y_train, y_test, cv_folds=5):
    """
    Trains and evaluates Random Forest Classifier with cross-validation and hyperparameter tuning
    """
    with mlflow.start_run(run_name="random_forest_classifier"):
        t1 = time.time()
        pipeline = Pipeline([
            ('preprocessor', CustomPreprocessor()),
            ('classifier', RandomForestClassifier(random_state=42))
        ])
        
        param_grid = {
            'classifier__n_estimators': [100, 200, 300],
            'classifier__max_depth': [10, 20, 30, None],
            'classifier__min_samples_split': [2, 5, 10],
            'classifier__min_samples_leaf': [1, 2, 4],
            'classifier__class_weight': ['balanced']
        }
        
        grid_search = GridSearchCV(
            pipeline, param_grid,
            scoring='f1',
            cv=cv_folds,
            n_jobs=-1,
            verbose=1
        )
        
        # Rest follows the same pattern
        grid_search.fit(X_train, y_train)
        mlflow.log_params(grid_search.best_params_)
        
        cv_results = cross_validate(
            grid_search.best_estimator_,
            X_train, y_train,
            cv=cv_folds,
            scoring='f1',
            return_train_score=True
        )
        
        mlflow.log_metric("cv_f1_mean", cv_results['test_score'].mean())
        mlflow.log_metric("cv_f1_std", cv_results['test_score'].std())
        
        y_pred = grid_search.predict(X_test)
        f1 = f1_score(y_test, y_pred)
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
        
        metrics = {
            "test_f1": f1,
            "true_negatives": tn,
            "false_positives": fp,
            "false_negatives": fn,
            "true_positives": tp
        }
        mlflow.log_metrics(metrics)
        
        signature = infer_signature(X_test, y_pred)
        mlflow.sklearn.log_model(
            sk_model=grid_search.best_estimator_,
            artifact_path="random_forest_model",
            signature=signature,
            input_example=X_test.iloc[0:1],
            registered_model_name="RandomForestClassifier"
        )
        
        t2 = time.time()
        print(f"Training completed in {t2-t1:.2f} seconds")
        print(f"Best parameters: {grid_search.best_params_}")
        print(f"Best CV f1-score: {grid_search.best_score_:.4f}")
        print(f"Test f1-score: {f1:.4f}")
        print("\nConfusion Matrix:")
        print(f"TN: {tn}, FP: {fp}")
        print(f"FN: {fn}, TP: {tp}")
        
        return grid_search.best_estimator_

In [23]:
xgb_model = train_random_forest_classifier(X_train, X_test, y_train, y_test)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


Successfully registered model 'RandomForestClassifier'.
2024/12/22 16:56:35 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: RandomForestClassifier, version 1
Created version '1' of model 'RandomForestClassifier'.


Training completed in 317.67 seconds
Best parameters: {'classifier__class_weight': 'balanced', 'classifier__max_depth': 10, 'classifier__min_samples_leaf': 4, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 100}
Best CV f1-score: 0.6501
Test f1-score: 0.6317

Confusion Matrix:
TN: 796, FP: 239
FN: 91, TP: 283
🏃 View run random_forest_classifier at: https://dagshub.com/hrootscraft/my-book.mlflow/#/experiments/0/runs/08f0bb33f86c495ea532140756bcf870
🧪 View experiment at: https://dagshub.com/hrootscraft/my-book.mlflow/#/experiments/0
