# Churn Model Training with Experiment Tracking
- This notebook trains a logistic regression model for churn prediction using K-Fold cross-validation and tracks performance across different regularization strengths (C).
- Refactored training pipeline: the regularization strength `C` is passed to the training function as a parameter, so each experiment run can use a different value.



In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split               # --> data splitting
from sklearn.model_selection import KFold                           # --> create folds

from sklearn.feature_extraction import DictVectorizer             # --> handle categorical variables
from sklearn.linear_model import LogisticRegression               # --> logistic model
from sklearn.metrics import roc_auc_score                         # --> evaluate with auc_roc_score    

from tqdm.auto import tqdm                           #----> show the iteration

In [2]:
DATA_PATH = '../03-churn-project/WA_Fn-UseC_-Telco-Customer-Churn.csv'

# Empty and single-space cells are parsed as missing values (NaN).
df = pd.read_csv(DATA_PATH, na_values=['', ' '])

df.columns = df.columns.str.lower()  # lower-case the column names

# Text columns: classic object dtype, or pandas' dedicated string dtype
# (which a plain `dtype == "object"` comparison would silently miss).
categorical_columns = [
    col for col in df.columns
    if pd.api.types.is_object_dtype(df[col]) or pd.api.types.is_string_dtype(df[col])
]

# Normalise the *values* of every text column (not the column names):
# lower-case them and replace spaces with underscores.
for col in categorical_columns:
    df[col] = df[col].str.lower().str.replace(' ', '_', regex=False)

# Fill missing totalcharges entries with the column median.
# NOTE(review): the median is computed over all rows, before the train/test
# split, so held-out rows contribute to the imputed value.
df['totalcharges'] = df['totalcharges'].fillna(df['totalcharges'].median())

# Encode the target as an integer: 1 where churn == 'yes', 0 otherwise.
df['churn'] = (df['churn'] == 'yes').astype(int)

In [3]:
# Hold out 20% of the rows as the final test set; the fixed seed keeps the
# split reproducible between runs.
df_full_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=1,
)

In [4]:
# Feature columns: train() and predict() select `numerical + categorical`
# from the DataFrame before vectorizing.
numerical = [
    'tenure',
    'monthlycharges',
    'totalcharges',
]

# NOTE(review): DictVectorizer one-hot encodes only string values; confirm
# every column listed here actually holds strings after loading.
categorical = [
    'gender', 'seniorcitizen', 'partner', 'dependents',
    'phoneservice', 'multiplelines', 'internetservice',
    'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
    'streamingtv', 'streamingmovies',
    'contract', 'paperlessbilling', 'paymentmethod',
]


In [5]:
# Training step of the pipeline.

def train(df_train, y_train, C):
    """Fit a DictVectorizer and a logistic regression model on the training data.

    Parameters
    ----------
    df_train : DataFrame that contains every column named in the module-level
        `numerical` and `categorical` lists.
    y_train : target values, one per row of `df_train`.
    C : value passed to LogisticRegression as `C` (in scikit-learn, the
        inverse of the regularization strength).

    Returns
    -------
    (dv, model) : the fitted DictVectorizer and the fitted LogisticRegression.
    """
    feature_columns = numerical + categorical
    records = df_train[feature_columns].to_dict(orient='records')

    # sparse=False makes the vectorizer return a dense array.
    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(records)

    model = LogisticRegression(C=C, max_iter=10000)
    model.fit(X_train, y_train)

    return dv, model

In [6]:
# Prediction step of the pipeline.

def predict(df_val, dv, model):
    """Score the rows of `df_val` with an already-fitted vectorizer and model.

    Parameters
    ----------
    df_val : DataFrame that contains every column named in the module-level
        `numerical` and `categorical` lists.
    dv : fitted DictVectorizer (only `transform` is called, never `fit`).
    model : fitted classifier exposing `predict_proba`.

    Returns
    -------
    The second column of `predict_proba`, one value per row of `df_val`
    (the probability of class 1 when the model was fit on 0/1 labels).
    """
    feature_columns = numerical + categorical
    records = df_val[feature_columns].to_dict(orient='records')

    probabilities = model.predict_proba(dv.transform(records))
    return probabilities[:, 1]

In [7]:
# Run cross-validation for every candidate C and store the results.
n_splits = 5
c_values = [0.001, 0.01, 0.1, 0.5, 1, 5, 10]

# One splitter is enough for all candidates: with a fixed integer
# random_state every call to split() yields the same folds, so each C is
# evaluated on identical train/validation partitions.
kfold = KFold(n_splits=n_splits, shuffle=True, random_state=1)

cv_results = {}  # C -> mean validation AUC across the folds

for C in tqdm(c_values):
    scores = []

    for train_idx, val_idx in kfold.split(df_full_train):
        df_train = df_full_train.iloc[train_idx]
        df_val = df_full_train.iloc[val_idx]

        y_train = df_train.churn.values
        y_val = df_val.churn.values

        # The vectorizer is fitted inside train() on the training fold only.
        dv, model = train(df_train, y_train, C=C)
        y_pred = predict(df_val, dv, model)

        scores.append(roc_auc_score(y_val, y_pred))

    mean_auc = np.mean(scores)
    cv_results[C] = mean_auc
    print('C=%s %.3f +- %.3f' % (C, mean_auc, np.std(scores)))

# Select the C with the highest mean AUC; on an exact tie max() keeps the
# first such C in insertion order.
best_C = max(cv_results, key=cv_results.get)
print()
print(f"Best C: {best_C} with AUC: {cv_results[best_C]:.3f}")
print()

  0%|          | 0/7 [00:00<?, ?it/s]

C=0.001 0.825 +- 0.009
C=0.01 0.840 +- 0.008
C=0.1 0.842 +- 0.007
C=0.5 0.842 +- 0.007
C=1 0.842 +- 0.007
C=5 0.842 +- 0.007
C=10 0.842 +- 0.007

Best C: 0.5 with AUC: 0.842



In [11]:
# Refit on all of df_full_train (the 80% portion kept by train_test_split,
# i.e. every cross-validation fold combined) using the selected C.
dv_final, model_final = train(
    df_full_train,
    df_full_train['churn'].values,
    C=best_C,
)


In [10]:
# Score the held-out test set with the final vectorizer and model.
# TODO(review): the saved execution counts show this cell ran as In [10],
# before the final-training cell (In [11]). Restart the kernel and run all
# cells in order so the reported AUC belongs to the model trained above,
# then remove this note.
y_pred_test = predict(df_test, dv_final, model_final)
auc_test = roc_auc_score(
    df_test['churn'].values,
    y_pred_test,
)
print(f"Final test AUC: {auc_test:.3f}")

Final test AUC: 0.858
