<a href="https://colab.research.google.com/github/harshita23sharma/CustomerChurn/blob/main/customer_churn/notebooks/pipelines.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
import pandas as pd
from datetime import datetime
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

import time
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv("/content/drive/MyDrive/projects/churn_prediction/WA_Fn-UseC_-Telco-Customer-Churn.csv")

In [18]:
import os
from pathlib import Path


REPO_DIR = Path(os.path.realpath(""))
REPO_DIR

PosixPath('/content')

In [23]:
import os
from pathlib import Path


REPO_DIR = Path(os.path.realpath(""))
INFERENCE_DATA_PATH = REPO_DIR / "drive/MyDrive/projects/churn_prediction/data/sample_for_inference.csv"
TRAINING_DATA_PATH = REPO_DIR / "drive/MyDrive/projects/churn_prediction/data/WA_Fn-UseC_-Telco-Customer-Churn.csv"

DATA_PATH = "drive/MyDrive/projects/churn_prediction/data"

MODEL_PATH = "drive/MyDrive/projects/churn_prediction/model"

class PreprocessConfig:
    train_path = REPO_DIR / DATA_PATH / "preprocessed/train.csv"
    test_path = REPO_DIR / DATA_PATH / "preprocessed/test.csv"
    batch_path = REPO_DIR / DATA_PATH / "preprocessed/batch.csv"

In [88]:
#Pipeline Building

#1st Component DataPreprocessor

import pandas as pd
from sklearn.preprocessing import StandardScaler,OrdinalEncoder
from sklearn.pipeline import Pipeline

class DataPreprocessor:
  def __init__(self, is_train:bool, data_path: str):
    self.is_train = is_train
    self.data_path = data_path

  def run(self):
    self.data = pd.read_csv(self.data_path)

    if self.is_train:
      self.data = self.data.dropna()
      data = DataPreprocessor._clean_df(self.data)
      data = DataPreprocessor._preprocess(data)
      data = DataPreprocessor._split_train_tes(data)
    else:
      data = DataPreprocessor._clean_df(self.data)
      data = DataPreprocessor._preprocess(data)
    return data

  @staticmethod
  def _preprocess(df: pd.DataFrame) -> pd.DataFrame:
    scaler = StandardScaler()
    df[["tenure","MonthlyCharges","TotalCharges"]] = scaler.fit_transform(df[["tenure","MonthlyCharges","TotalCharges"]])
    categorical = df.select_dtypes("object")
    number = df.select_dtypes("number").reset_index(drop=True)
    encoder = OrdinalEncoder().fit(categorical)
    encoded = encoder.transform(categorical)
    cate = pd.DataFrame(encoded.astype("int64"),columns=categorical.columns).reset_index(drop=True)
    df_final = pd.concat([number,cate],axis=1)
    return df_final

  @staticmethod
  def _clean_df(df: pd.DataFrame) -> pd.DataFrame:
      df["TotalCharges"] = (pd.to_numeric(df["TotalCharges"],errors="coerce"))
      df.drop(labels=df[df['tenure'] == 0].index, axis=0, inplace=True)
      numeric_cols = ['tenure', 'MonthlyCharges','TotalCharges']
      #Replacing 'No internet service' with 'No'
      cols = ["OnlineSecurity","OnlineBackup","DeviceProtection","TechSupport","StreamingTV","StreamingMovies"]
      for i in cols:
          df[i].replace("No internet service","No",inplace=True)
      return df

  @staticmethod
  def _split_train_tes(df: pd.DataFrame) -> pd.DataFrame:
    X = df.drop(columns="Churn")
    y = df["Churn"]
    X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=0,test_size=0.25)
    train_indices = X_train.index
    test_indices = X_test.index
    df.loc[train_indices].to_csv(PreprocessConfig.train_path,index=False)
    df.loc[test_indices].to_csv(PreprocessConfig.test_path,index=False)
    return df




In [89]:
processed_df = DataPreprocessor(is_train=True, data_path="/content/drive/MyDrive/projects/churn_prediction/WA_Fn-UseC_-Telco-Customer-Churn.csv").run()

In [90]:
processed_df

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,customerID,gender,Partner,Dependents,PhoneService,MultipleLines,...,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,Churn
0,0,-1.280248,-1.161694,-0.994194,5365,0,1,0,0,1,...,0,1,0,0,0,0,0,1,2,0
1,0,0.064303,-0.260878,-0.173740,3953,1,0,0,1,0,...,1,0,1,0,0,0,1,0,3,0
2,0,-1.239504,-0.363923,-0.959649,2558,1,0,0,1,0,...,1,1,0,0,0,0,0,1,3,1
3,0,0.512486,-0.747850,-0.195248,5524,1,0,0,0,1,...,1,0,1,1,0,0,1,0,0,0
4,0,-1.239504,0.196178,-0.940457,6500,0,0,0,1,0,...,0,0,0,0,0,0,0,1,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7027,0,-0.343137,0.664868,-0.129180,4843,1,1,1,1,2,...,1,0,1,1,1,1,1,1,3,0
7028,0,1.612573,1.276493,2.241056,1524,0,1,1,1,2,...,0,1,1,0,1,1,1,1,1,0
7029,0,-0.872808,-1.170004,-0.854514,3358,0,1,1,0,1,...,1,0,0,0,0,0,0,1,2,0
7030,1,-1.158016,0.319168,-0.872095,5923,1,1,0,1,2,...,0,0,0,0,0,0,0,1,3,1


In [91]:
# class DataSplitter:
#   def __init__(self, preprocessed_data_path: str, split_ratio= 0.2):
#     self.data_path = preprocessed_data_path
#     self.split_ratio = split_ratio

#   def run(self):
#     self.data = pd.read_csv(self.data_path)
#     self.X = self.data.drop(columns="Churn")
#     self.y = self.data["Churn"]

In [None]:
!pip install mlflow==2.9.2

In [112]:
from typing import Dict, Any
from pathlib import Path

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, roc_auc_score, precision_score, recall_score
import pandas as pd
import mlflow


class MlFlowConfig:
    uri = "http://0.0.0.0:8000"
    experiment_name = "churn_predictor"
    artifact_path = "model-artifact"
    registered_model_name = "churn_predictor"

class TrainerConfig:
    model_name ="logistic-regression"
    random_state = 42
    train_size = 0.2
    shuffle = True
    params = {
        'C': [0.001, 0.01, 0.1, 1, 10, 100],
        'penalty': ['l1', 'l2'],
    }

MODEL_PATH = "drive/MyDrive/projects/churn_prediction/model"
class MlFlowConfig:
    model_path = REPO_DIR / MODEL_PATH / "model.sav"
    uri = "http://0.0.0.0:8000"
    experiment_name = "churn_predictor"
    artifact_path = "model-artifact"
    registered_model_name = "churn_predictor"


In [111]:
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split,GridSearchCV,KFold
from sklearn.metrics import f1_score,classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

import joblib


In [113]:
class TrainingPipeline:
  def __init__(self, params: Dict[str, Any], model_name: str = TrainerConfig.model_name) -> None:
    self.params = params
    self.model_name = model_name

    # self.model = model

  def run(self, train_path, test_path, target):
    # self.train_path = train_path
    # self.test_path = test_path
    # self.target = target
    # mlflow.set_tracking_uri(MlFlowConfig.uri)
    # mlflow.set_experiment(MlFlowConfig.experiment_name)
    # with mlflow.start_run():
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    X_train, y_train, X_test, y_test = train_df.drop(columns=[target]), train_df[target], test_df.drop(columns=[target]), test_df[target]
    lr_clf = LogisticRegression()
    lr_model, train_lr, test_lr, f1_lr, pred_lr, time_lr = self.parameter_finder(lr_clf, self.params, X_train, y_train, X_test, y_test)


    model = lr_model
    # Evaluate
    y_test = test_df[target]
    y_pred = model.predict(test_df.drop(target, axis=1))

    # Metrics
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    print(classification_report(y_test, y_pred))

    metrics = {
        "precision": precision,
        "recall": recall,
        "roc_auc": roc_auc
    }
    # mlflow.sklearn.log_model(
    #     sk_model=model,
    #     artifact_path=MlFlowConfig.artifact_path,
    # )
    # save the model to disk
    joblib.dump(model, filename=MlFlowConfig.model_path)
    model = joblib.load(filename=MlFlowConfig.model_path)
    return model

  def parameter_finder(self, model, parameters, X_train, y_train, X_test, y_test):
    start = time.time()

    grid = GridSearchCV(model,
                        param_grid = parameters,
                        refit = True,
                        cv = KFold(shuffle = True, random_state = 1),
                        n_jobs = -1)
    grid_fit = grid.fit(X_train, y_train)
    best = grid_fit.best_estimator_
    y_pred = best.predict(X_test)

    train_score = best.score(X_train, y_train)
    test_score = best.score(X_test, y_test)
    F1_score = f1_score(y_test, y_pred).round(2)

    model_name = str(model).split('(')[0]

    end = time.time()
    takes_time = np.round(end - start,2)

    print(f"The best parameters for {model_name} model is: {grid_fit.best_params_}")
    print("--" * 10)
    print(f"(R2 score) in the training set is {train_score:0.2%} for {model_name} model.")
    print(f"(R2 score) in the testing set is {test_score:0.2%} for {model_name} model.")
    print(f"F1 score is {F1_score:,} for {model_name} model.")
    print("--" * 10)
    print(f"Runtime of the program is: {end - start:0.2f}")


    return best,train_score, test_score, F1_score, y_pred, takes_time

In [114]:
lr_model = TrainingPipeline(TrainerConfig.params, TrainerConfig.model_name )

In [115]:
model = lr_model.run(PreprocessConfig.train_path, PreprocessConfig.test_path, "Churn")

The best parameters for LogisticRegression model is: {'C': 0.1, 'penalty': 'l2'}
--------------------
(R2 score) in the training set is 80.02% for LogisticRegression model.
(R2 score) in the testing set is 80.55% for LogisticRegression model.
F1 score is 0.57 for LogisticRegression model.
--------------------
Runtime of the program is: 2.75
              precision    recall  f1-score   support

           0       0.84      0.91      0.87      1311
           1       0.65      0.51      0.57       447

    accuracy                           0.81      1758
   macro avg       0.75      0.71      0.72      1758
weighted avg       0.79      0.81      0.80      1758



30 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

        nan 0.79199817        nan 0.79

In [124]:
from typing import List
class InferencePipeline:
  def __init__(self, model_path: Path) -> None:
    self.model_path = model_path
    self.model = joblib.load(filename=self.model_path)
  def run(self, batch_path: Path) -> List[int]:
    batch = pd.read_csv(batch_path)
    batch = batch.drop(columns=["Churn"])
    prediction = self.model.predict(batch)
    return prediction

In [125]:
preds = InferencePipeline(MlFlowConfig.model_path).run(PreprocessConfig.test_path)

In [130]:
np.sum(preds), len(preds)

(353, 1758)