# Telco Customer Churn Prediction Kubeflow Pipeline

This notebook builds a Kubeflow pipeline for the [Kaggle Telco Customer Churn competition](https://www.kaggle.com/datasets/blastchar/telco-customer-churn), which is described as follows:

>In this competition, your goal is to analyze behavior to retain customers and predict churning. You can analyze all relevant customer data and develop focused customer retention programs.

# Install relevant libraries


>Update pip `pip install --user --upgrade pip`

>Install and upgrade kubeflow sdk `pip install kfp --upgrade --user --quiet`

You may need to restart your notebook kernel after installing the Kubeflow Pipelines (kfp) SDK.

In [1]:
!pip install --user --upgrade pip



In [2]:
!pip install kfp --upgrade --user --quiet

In [3]:
# confirm the kfp sdk
! pip show kfp

Name: kfp
Version: 1.8.11
Summary: KubeFlow Pipelines SDK
Home-page: https://github.com/kubeflow/pipelines
Author: The Kubeflow Authors
Author-email: 
License: UNKNOWN
Location: /home/jovyan/.local/lib/python3.6/site-packages
Requires: absl-py, click, cloudpickle, dataclasses, Deprecated, docstring-parser, fire, google-api-python-client, google-auth, google-cloud-storage, jsonschema, kfp-pipeline-spec, kfp-server-api, kubernetes, protobuf, pydantic, PyYAML, requests-toolbelt, strip-hints, tabulate, typer, typing-extensions, uritemplate
Required-by: kubeflow-kale


# Imports

In this section we import the kfp methods we need for this example. Make it a habit to gather your imports in a single place.

In [4]:
import kfp
import kfp.components as comp
import kfp.dsl as dsl
from kfp.components import InputPath, OutputPath
from typing import NamedTuple

# Kubeflow pipeline component creation

## Download and load the dataset

In [5]:
# load data step
def load_data(download_link: str, data_path: OutputPath(str)):
    """Download the churn CSV and store it as a pickled DataFrame.

    Args:
        download_link: URL the CSV file is downloaded from.
        data_path: Output directory for this step. It receives the raw
            ``Telco-Customer-Churn.csv`` and a pickle file named ``data``
            holding the parsed DataFrame.
    """
    import os
    import pickle
    import subprocess
    import sys

    # Install dependencies at run time. Every pip call goes through
    # sys.executable so the packages land in the interpreter running this step.
    subprocess.run([sys.executable, "-m", "pip", "install", "--upgrade", "pip"])
    subprocess.run([sys.executable, "-m", "pip", "install", "pandas"])
    subprocess.run([sys.executable, "-m", "pip", "install", "wget"])

    # These imports are only possible after the installs above.
    import pandas as pd
    import wget

    # create the data_path directory (no error if it already exists)
    os.makedirs(data_path, exist_ok=True)

    # download and read the data
    csv_file = f'{data_path}/Telco-Customer-Churn.csv'
    wget.download(download_link, csv_file)
    data = pd.read_csv(csv_file)

    # Save data as a pickle file to be used by the transform_data component.
    with open(f'{data_path}/data', 'wb') as f:
        pickle.dump(data, f)

    print('Done!')

## Transform Data

In [6]:
# transform data step

def transform_data(data_path: InputPath(str),
                   transform_data_path: OutputPath(str)):
    """Clean the raw churn data and encode the target column.

    Args:
        data_path: Directory written by the load step; the pickled DataFrame
            is read from the file ``data`` inside it.
        transform_data_path: Output directory for this step; the cleaned
            DataFrame is pickled to the file ``data`` inside it.
    """
    import os
    import pickle
    import subprocess
    import sys

    # Install dependencies at run time with the interpreter running this step.
    # Only pandas is needed here: nothing in this step uses scipy or numpy
    # directly.
    subprocess.run([sys.executable, "-m", "pip", "install", "--upgrade", "pip"])
    subprocess.run([sys.executable, "-m", "pip", "install", "pandas"])

    import pandas as pd  # noqa: F401  (needed to unpickle the DataFrame)

    # load data from data_path
    with open(f'{data_path}/data', 'rb') as f:
        data = pickle.load(f)

    # remove rows whose TotalCharges value is a single space
    data = data[data['TotalCharges'] != ' '].copy()

    # convert TotalCharges column datatype to float
    data['TotalCharges'] = data['TotalCharges'].astype(float)

    # Convert the Churn target from strings to integers.
    # NOTE: 'No' becomes 1 and 'Yes' becomes 0, so class 1 means the customer
    # did NOT churn.
    data['Churn'] = data['Churn'].replace({'No':1, 'Yes':0})

    # create the transform_data_path directory
    os.makedirs(transform_data_path, exist_ok=True)

    # Save data as a pickle file to be used by the feature_engineering component.
    with open(f'{transform_data_path}/data', 'wb') as f:
        pickle.dump(data, f)

    print('Done!')

# Feature Engineering

Group the tenure, monthly charges and total charges columns into discrete segments.

In [7]:
# feature engineering step

def feature_engineering(transform_data_path: InputPath(str),
                        feat_eng_path: OutputPath(str)):
    """Derive new features, encode, split and scale the churn data.

    The scaler is fitted on the training split only and then applied to the
    test split, so no statistics of the test rows leak into training.

    Args:
        transform_data_path: Directory written by the transform step; the
            pickled DataFrame is read from the file ``data`` inside it.
        feat_eng_path: Output directory for this step; the tuple
            ``(X_train, X_test, y_train, y_test)`` is pickled to the file
            ``split_data`` inside it.
    """
    import os
    import pickle
    import subprocess
    import sys

    # Install dependencies at run time with the interpreter running this step.
    subprocess.run([sys.executable, "-m", "pip", "install", "--upgrade", "pip"])
    subprocess.run([sys.executable, "-m", "pip", "install", "pandas"])
    subprocess.run([sys.executable, "-m", "pip", "install", "scikit-learn"])

    import pandas as pd
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import MinMaxScaler

    # loading the data
    with open(f'{transform_data_path}/data', 'rb') as f:
        data = pickle.load(f)

    def yearly_tenure(tenure):
        """Return the 1-6 index of the 12-wide band containing ``tenure``.

        Values above 72 (and NaN) match no band and yield None.
        """
        for band, upper in enumerate((12, 24, 36, 48, 60, 72), start=1):
            if tenure <= upper:
                return band
        return None

    def monthly_charge_plan(charge):
        """Label a monthly charge as 'Basic', 'Advanced' or 'Premium'."""
        if charge <= 35:
            return 'Basic'
        if charge <= 80:
            return 'Advanced'
        if charge > 80:
            return 'Premium'
        return None  # only reached for NaN

    def total_charge_status(charge):
        """Label a total charge with one of five size categories."""
        if charge <= 250:
            return 'V-low'
        if charge <= 450:
            return 'low'
        if charge <= 1500:
            return 'medium'
        if charge <= 3500:
            return 'High'
        if charge > 3500:
            return 'V-High'
        return None  # only reached for NaN

    # generating new features by combining pairs of service columns
    data['OnlineSecurityBackup'] = data['OnlineSecurity'] + data['OnlineBackup']
    data['OnlineSecurityDevice'] = data['OnlineSecurity'] + data['DeviceProtection']
    data['Streaming'] = data['StreamingMovies'] + data['StreamingTV']

    # applying grouping functions to create new features
    # (done on the raw, unscaled values)
    data['yearly_tenure'] = data['tenure'].apply(yearly_tenure)
    data['MonthlyChargesplan'] = data['MonthlyCharges'].apply(monthly_charge_plan)
    data['TotalChargestatus'] = data['TotalCharges'].apply(total_charge_status)

    # drop customerID
    data = data.drop('customerID', axis=1)

    # one-hot encode categorical variables
    X = pd.get_dummies(data.drop(columns=['Churn']), drop_first=True)
    y = data['Churn']

    # split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=22, stratify=y)

    # Work on copies so the column assignments below do not write into views.
    X_train = X_train.copy()
    X_test = X_test.copy()

    # Scale 'MonthlyCharges' and 'TotalCharges': fit on the training split
    # only, then reuse the fitted ranges for the test split.
    scaled_columns = ['MonthlyCharges', 'TotalCharges']
    scaler = MinMaxScaler()
    X_train[scaled_columns] = scaler.fit_transform(X_train[scaled_columns])
    X_test[scaled_columns] = scaler.transform(X_test[scaled_columns])

    # creating the feat_eng_path
    os.makedirs(feat_eng_path, exist_ok=True)

    # save the train_test_split data as a pickle file to be used by the modeling components.
    with open(f'{feat_eng_path}/split_data', 'wb') as f:
        pickle.dump((X_train, X_test, y_train, y_test), f)

    print('Done!')

# Modelling

## Catboost

In [36]:
# catboost modeling step

def catboost_modeling(feat_eng_path: InputPath(str),
                      cb_ensemble_path: OutputPath(str),
                      mlpipeline_ui_metadata_path: OutputPath(str)):
    """Train a CatBoost classifier and publish its confusion matrix.

    Args:
        feat_eng_path: Directory written by the feature engineering step;
            ``(X_train, X_test, y_train, y_test)`` is read from the file
            ``split_data`` inside it.
        cb_ensemble_path: Output directory; the test-set predictions are
            pickled to the file ``cb_pred`` inside it.
        mlpipeline_ui_metadata_path: Output file that receives the confusion
            matrix metadata as JSON.

    Returns:
        A ``conf_m_result`` namedtuple whose single field
        ``mlpipeline_ui_metadata`` holds the same metadata as a JSON string.
    """
    import json
    import os
    import pickle
    import subprocess
    import sys

    # Install dependencies at run time with the interpreter running this step.
    subprocess.run([sys.executable, "-m", "pip", "install", "--upgrade", "pip"])
    subprocess.run([sys.executable, "-m", "pip", "install", "pandas"])
    subprocess.run([sys.executable, "-m", "pip", "install", "catboost"])
    subprocess.run([sys.executable, "-m", "pip", "install", "scikit-learn"])

    import numpy as np
    import pandas as pd
    from catboost import CatBoostClassifier
    from collections import namedtuple
    from sklearn.metrics import confusion_matrix
    from sklearn.utils.class_weight import compute_class_weight

    # loading the split data
    with open(f'{feat_eng_path}/split_data', 'rb') as f:
        X_train, X_test, y_train, y_test = pickle.load(f)

    # Compute balanced class weights over all labels (train and test) to
    # handle the class imbalance. The result gets its own name so it does not
    # shadow anything imported from sklearn.
    y = np.append(y_train.values, y_test.values)
    classes = np.unique(y)
    balanced_weights = dict(
        zip(classes, compute_class_weight('balanced', classes=classes, y=y)))

    # creating the cb_ensemble_path directory
    os.makedirs(cb_ensemble_path, exist_ok=True)

    # model initialization
    cb = CatBoostClassifier(class_weights=balanced_weights,
                            n_estimators=150,
                            eval_metric='AUC',
                            learning_rate=0.1043242,
                            max_depth=5,
                            use_best_model=True,
                            random_state=22,
                            allow_writing_files=False,
                            metric_period=20)

    # fitting
    cb.fit(X_train, y_train, eval_set=(X_test, y_test))

    # predict
    cb_pred = cb.predict(X_test)

    # Save the predicted data as a pickle file to be used by the ensembling component.
    with open(f'{cb_ensemble_path}/cb_pred', 'wb') as f:
        pickle.dump(cb_pred, f)

    # confusion matrix flattened to (target, predicted, count) rows
    cm = confusion_matrix(y_test, cb_pred)
    vocab = list(np.unique(y_test))
    rows = [
        (vocab[target_index], vocab[predicted_index], count)
        for target_index, target_row in enumerate(cm)
        for predicted_index, count in enumerate(target_row)
    ]
    df = pd.DataFrame(rows, columns=['target', 'predicted', 'count'])

    # change 'target', 'predicted' to integer strings
    df[['target', 'predicted']] = (df[['target', 'predicted']].astype(int)).astype(str)

    # create kubeflow metric metadata for UI
    metadata = {
        "outputs": [
            {
                "type": "confusion_matrix",
                "format": "csv",
                "schema": [
                    {"name": "target", "type": "CATEGORY"},
                    {"name": "predicted", "type": "CATEGORY"},
                    {"name": "count", "type": "NUMBER"},
                ],
                "source": df.to_csv(header=False, index=False),
                "storage": "inline",
                "labels": ["0", "1"],
            }
        ]
    }

    with open(mlpipeline_ui_metadata_path, 'w') as metadata_file:
        json.dump(metadata, metadata_file)

    conf_m_result = namedtuple('conf_m_result', ['mlpipeline_ui_metadata'])

    return conf_m_result(json.dumps(metadata))

## Xgboost

In [37]:
# xgboost modeling step

def xgboost_modeling(feat_eng_path: InputPath(str),
                     xgb_ensemble_path: OutputPath(str),
                     mlpipeline_ui_metadata_path: OutputPath(str)):
    """Train an XGBoost classifier and publish its confusion matrix.

    Args:
        feat_eng_path: Directory written by the feature engineering step;
            ``(X_train, X_test, y_train, y_test)`` is read from the file
            ``split_data`` inside it.
        xgb_ensemble_path: Output directory; the test-set predictions are
            pickled to the file ``xgb_pred`` inside it.
        mlpipeline_ui_metadata_path: Output file that receives the confusion
            matrix metadata as JSON.

    Returns:
        A ``conf_m_result`` namedtuple whose single field
        ``mlpipeline_ui_metadata`` holds the same metadata as a JSON string.
    """
    import json
    import os
    import pickle
    import subprocess
    import sys

    # Install dependencies at run time with the interpreter running this step.
    subprocess.run([sys.executable, "-m", "pip", "install", "--upgrade", "pip"])
    subprocess.run([sys.executable, "-m", "pip", "install", "pandas"])
    subprocess.run([sys.executable, "-m", "pip", "install", "xgboost"])
    subprocess.run([sys.executable, "-m", "pip", "install", "scikit-learn"])

    import numpy as np
    import pandas as pd
    from collections import namedtuple
    from sklearn.metrics import confusion_matrix
    from xgboost import XGBClassifier

    # loading the split data
    with open(f'{feat_eng_path}/split_data', 'rb') as f:
        X_train, X_test, y_train, y_test = pickle.load(f)

    # creating the ensemble_path directory
    os.makedirs(xgb_ensemble_path, exist_ok=True)

    # model initialization
    # `eval_metric` and `verbosity` are the parameter names XGBClassifier
    # recognises; the former `metrics` / `silent` spellings were not applied,
    # so the AUC metric never reached training.
    xgb = XGBClassifier(scale_pos_weight=0.3627,
                        max_depth=10,
                        learning_rate=0.1043242,
                        n_estimators=600,
                        colsample_bylevel=0.8,
                        reg_alpha=0.8,
                        verbosity=0,
                        eval_metric='auc',
                        random_state=22)

    # fitting: evaluate on train and test, stop after 50 rounds without
    # improvement, and print the evaluation every 50 rounds
    xgb.fit(X_train, y_train,
            eval_set=[(X_train, y_train), (X_test, y_test)],
            early_stopping_rounds=50,
            verbose=50)

    # predict
    xgb_pred = xgb.predict(X_test)

    # Save the predicted data as a pickle file to be used by the ensembling component.
    with open(f'{xgb_ensemble_path}/xgb_pred', 'wb') as f:
        pickle.dump(xgb_pred, f)

    # confusion matrix flattened to (target, predicted, count) rows
    cm = confusion_matrix(y_test, xgb_pred)
    vocab = list(np.unique(y_test))
    rows = [
        (vocab[target_index], vocab[predicted_index], count)
        for target_index, target_row in enumerate(cm)
        for predicted_index, count in enumerate(target_row)
    ]
    df = pd.DataFrame(rows, columns=['target', 'predicted', 'count'])

    # change 'target', 'predicted' to integer strings
    df[['target', 'predicted']] = (df[['target', 'predicted']].astype(int)).astype(str)

    # create kubeflow metric metadata for UI
    metadata = {
        "outputs": [
            {
                "type": "confusion_matrix",
                "format": "csv",
                "schema": [
                    {"name": "target", "type": "CATEGORY"},
                    {"name": "predicted", "type": "CATEGORY"},
                    {"name": "count", "type": "NUMBER"},
                ],
                "source": df.to_csv(header=False, index=False),
                "storage": "inline",
                "labels": ["0", "1"],
            }
        ]
    }

    with open(mlpipeline_ui_metadata_path, 'w') as metadata_file:
        json.dump(metadata, metadata_file)

    conf_m_result = namedtuple('conf_m_result', ['mlpipeline_ui_metadata'])

    return conf_m_result(json.dumps(metadata))

## LightGBM

In [38]:
# lightgbm modeling step

def lightgbm_modeling(feat_eng_path: InputPath(str),
                      lgbm_ensemble_path: OutputPath(str),
                      mlpipeline_ui_metadata_path: OutputPath(str)):
    """Train a LightGBM classifier and publish its confusion matrix.

    Args:
        feat_eng_path: Directory written by the feature engineering step;
            ``(X_train, X_test, y_train, y_test)`` is read from the file
            ``split_data`` inside it.
        lgbm_ensemble_path: Output directory; the tuple
            ``(y_test, lgbm_pred)`` is pickled to the file ``lgbm_pred``
            inside it, so this file also carries the true test labels.
        mlpipeline_ui_metadata_path: Output file that receives the confusion
            matrix metadata as JSON.

    Returns:
        A ``conf_m_result`` namedtuple whose single field
        ``mlpipeline_ui_metadata`` holds the same metadata as a JSON string.
    """
    import json
    import os
    import pickle
    import subprocess
    import sys

    # Install dependencies at run time with the interpreter running this step.
    subprocess.run([sys.executable, "-m", "pip", "install", "--upgrade", "pip"])
    subprocess.run([sys.executable, "-m", "pip", "install", "pandas"])
    subprocess.run([sys.executable, "-m", "pip", "install", "lightgbm"])
    subprocess.run([sys.executable, "-m", "pip", "install", "scikit-learn"])

    import numpy as np
    import pandas as pd
    from collections import namedtuple
    from lightgbm import LGBMClassifier, log_evaluation
    from sklearn.metrics import confusion_matrix

    # loading the split data
    with open(f'{feat_eng_path}/split_data', 'rb') as f:
        X_train, X_test, y_train, y_test = pickle.load(f)

    # creating the ensemble_path directory
    os.makedirs(lgbm_ensemble_path, exist_ok=True)

    # model initialization
    lgbm = LGBMClassifier(random_state=22, scale_pos_weight=0.362)

    # fitting
    # Evaluation logging is switched off with the log_evaluation callback:
    # the `verbose` keyword of fit() is not accepted by current LightGBM
    # releases.
    lgbm.fit(X_train, y_train,
             categorical_feature='auto',
             eval_set=(X_test, y_test),
             feature_name='auto',
             callbacks=[log_evaluation(period=0)])

    # predict
    lgbm_pred = lgbm.predict(X_test)

    # Save the true labels and predictions as a pickle file to be used by the ensembling component.
    with open(f'{lgbm_ensemble_path}/lgbm_pred', 'wb') as f:
        pickle.dump((y_test, lgbm_pred), f)

    # confusion matrix flattened to (target, predicted, count) rows
    cm = confusion_matrix(y_test, lgbm_pred)
    vocab = list(np.unique(y_test))
    rows = [
        (vocab[target_index], vocab[predicted_index], count)
        for target_index, target_row in enumerate(cm)
        for predicted_index, count in enumerate(target_row)
    ]
    df = pd.DataFrame(rows, columns=['target', 'predicted', 'count'])

    # change 'target', 'predicted' to integer strings
    df[['target', 'predicted']] = (df[['target', 'predicted']].astype(int)).astype(str)

    # create kubeflow metric metadata for UI
    metadata = {
        "outputs": [
            {
                "type": "confusion_matrix",
                "format": "csv",
                "schema": [
                    {"name": "target", "type": "CATEGORY"},
                    {"name": "predicted", "type": "CATEGORY"},
                    {"name": "count", "type": "NUMBER"},
                ],
                "source": df.to_csv(header=False, index=False),
                "storage": "inline",
                "labels": ["0", "1"],
            }
        ]
    }

    with open(mlpipeline_ui_metadata_path, 'w') as metadata_file:
        json.dump(metadata, metadata_file)

    conf_m_result = namedtuple('conf_m_result', ['mlpipeline_ui_metadata'])

    return conf_m_result(json.dumps(metadata))

## Ensembling

In [39]:
# ensembling step

def ensembling(lgbm_ensemble_path: InputPath(str),
               xgb_ensemble_path: InputPath(str),
               cb_ensemble_path: InputPath(str),
               mlpipeline_ui_metadata_path: OutputPath(str)) -> NamedTuple('conf_m_result', [('mlpipeline_ui_metadata', 'UI_metadata')]):
    """Combine the three models by majority vote and publish the confusion matrix.

    Args:
        lgbm_ensemble_path: Directory holding the file ``lgbm_pred``, a pickle
            of ``(y_test, lgbm_pred)``; the true test labels come from here.
        xgb_ensemble_path: Directory holding the pickled predictions in the
            file ``xgb_pred``.
        cb_ensemble_path: Directory holding the pickled predictions in the
            file ``cb_pred``.
        mlpipeline_ui_metadata_path: Output file that receives the confusion
            matrix metadata as JSON.

    Returns:
        A ``conf_m_result`` namedtuple whose single field
        ``mlpipeline_ui_metadata`` holds the same metadata as a JSON string.
    """
    import json
    import os
    import pickle
    import subprocess
    import sys

    # Install dependencies at run time with the interpreter running this step.
    subprocess.run([sys.executable, "-m", "pip", "install", "--upgrade", "pip"])
    subprocess.run([sys.executable, "-m", "pip", "install", "pandas"])
    subprocess.run([sys.executable, "-m", "pip", "install", "scipy"])
    subprocess.run([sys.executable, "-m", "pip", "install", "scikit-learn"])

    import numpy as np
    import pandas as pd
    from collections import namedtuple
    from scipy import stats
    from sklearn.metrics import confusion_matrix

    # loading the predictions of the three models
    with open(f'{lgbm_ensemble_path}/lgbm_pred', 'rb') as f:
        (y_test, lgbm_pred) = pickle.load(f)
    with open(f'{xgb_ensemble_path}/xgb_pred', 'rb') as g:
        xgb_pred = pickle.load(g)
    with open(f'{cb_ensemble_path}/cb_pred', 'rb') as h:
        cb_pred = pickle.load(h)

    # create an array of all predictions (one row per model)
    predictions = np.array([cb_pred, xgb_pred, lgbm_pred])

    # Find the most frequent predicted value per sample.
    # Depending on the SciPy version, the mode array either keeps the reduced
    # axis (shape (1, n)) or drops it (shape (n,)); flattening gives one vote
    # per sample in both cases.
    pred_mode = np.ravel(stats.mode(predictions, axis=0)[0])

    # confusion matrix flattened to (target, predicted, count) rows
    cm = confusion_matrix(y_test, pred_mode)
    vocab = list(np.unique(y_test))
    rows = [
        (vocab[target_index], vocab[predicted_index], count)
        for target_index, target_row in enumerate(cm)
        for predicted_index, count in enumerate(target_row)
    ]
    df = pd.DataFrame(rows, columns=['target', 'predicted', 'count'])

    # change 'target', 'predicted' to integer strings
    df[['target', 'predicted']] = (df[['target', 'predicted']].astype(int)).astype(str)

    # create kubeflow metric metadata for UI
    metadata = {
        "outputs": [
            {
                "type": "confusion_matrix",
                "format": "csv",
                "schema": [
                    {"name": "target", "type": "CATEGORY"},
                    {"name": "predicted", "type": "CATEGORY"},
                    {"name": "count", "type": "NUMBER"},
                ],
                "source": df.to_csv(header=False, index=False),
                "storage": "inline",
                "labels": ["0", "1"],
            }
        ]
    }

    with open(mlpipeline_ui_metadata_path, 'w') as metadata_file:
        json.dump(metadata, metadata_file)

    conf_m_result = namedtuple('conf_m_result', ['mlpipeline_ui_metadata'])

    return conf_m_result(json.dumps(metadata))

## Create pipeline components 

using `create_component_from_func`

In [40]:
# Turn each step function into a lightweight component.
# All components share one base image, so it is named once here.
_BASE_IMAGE = "python:3.7.1"

load_op = comp.create_component_from_func(load_data, base_image=_BASE_IMAGE)
transform_op = comp.create_component_from_func(transform_data, base_image=_BASE_IMAGE)
feature_eng_op = comp.create_component_from_func(feature_engineering, base_image=_BASE_IMAGE)
catboost_modeling_op = comp.create_component_from_func(catboost_modeling, base_image=_BASE_IMAGE)
lightgbm_modeling_op = comp.create_component_from_func(lightgbm_modeling, base_image=_BASE_IMAGE)
xgboost_modeling_op = comp.create_component_from_func(xgboost_modeling, base_image=_BASE_IMAGE)
ensembling_op = comp.create_component_from_func(ensembling, base_image=_BASE_IMAGE)

In [66]:
# define pipeline
@dsl.pipeline(name="telco-customer-churn",
              description="Predicting customer churn for a telecom company with an ensemble of gradient-boosting models.")
def telco_customer_churn_pipeline(
                             download_link: str,
                             data_path: str,
                             transform_data_path: str,
                             feat_eng_data_path: str,
                             cb_ensemble_path: str,
                             xgb_ensemble_path: str,
                             lgbm_ensemble_path: str
                            ):
    """Wire the churn steps together: load, transform, engineer, model, ensemble.

    The three modeling steps all consume the feature engineering output, and
    the ensembling step consumes the prediction output of each model.

    Args:
        download_link: URL of the CSV file, passed to the load step.
        data_path: Accepted as a pipeline parameter; not read by this body.
        transform_data_path: Accepted as a pipeline parameter; not read by
            this body.
        feat_eng_data_path: Accepted as a pipeline parameter; not read by
            this body.
        cb_ensemble_path: Accepted as a pipeline parameter; not read by this
            body.
        xgb_ensemble_path: Accepted as a pipeline parameter; not read by this
            body.
        lgbm_ensemble_path: Accepted as a pipeline parameter; not read by
            this body.
    """
    # Create load container.
    load_container = load_op(download_link)
    # Create transform container.
    transform_container = transform_op(load_container.output)
    # Create feature engineering container.
    feature_eng_container = feature_eng_op(transform_container.output)
    # Create catboost modeling container.
    cb_modeling_container = catboost_modeling_op(feature_eng_container.output)
    # Create xgboost modeling container.
    xgb_modeling_container = xgboost_modeling_op(feature_eng_container.output)
    # Create lightgbm modeling container.
    lgbm_modeling_container = lightgbm_modeling_op(feature_eng_container.output)
    # Create ensemble container. The modeling steps have more than one output,
    # so the prediction output is selected by name.
    ensembling_container = ensembling_op(
        lgbm_modeling_container.outputs["lgbm_ensemble"],
        xgb_modeling_container.outputs["xgb_ensemble"],
        cb_modeling_container.outputs["cb_ensemble"],
    )

In [67]:
# Create the client used to communicate with the Pipelines API server.
# No arguments are passed, so kfp.Client() falls back on its own defaults
# for locating the server.
client = kfp.Client()

In [68]:
# Values for the pipeline parameters of telco_customer_churn_pipeline.
# download_link is the URL of the CSV file fetched by the load step.
# NOTE(review): the pipeline body in this notebook only reads download_link;
# the path values below are supplied as arguments but no step consumes them
# — confirm before relying on them.
download_link = "https://github.com/kubeflow/examples/blob/master/telco-customer-churn-kaggle-competition/data/WA_Fn-UseC_-Telco-Customer-Churn.csv?raw=true"
data_path = "data"
transform_data_path = "tdp"
feat_eng_data_path = "feat"
cb_ensemble_path = "cep"                             
xgb_ensemble_path = "xep"
lgbm_ensemble_path = "lep"

In [69]:
# Pipeline to run, plus the experiment and run names shown for it.
pipeline_func = telco_customer_churn_pipeline
experiment_name = 'telco_customer_churn_pipeline_lightweight'
run_name = f'{pipeline_func.__name__} run'

# Pipeline parameter values, keyed by parameter name.
arguments = dict(
    download_link=download_link,
    data_path=data_path,
    transform_data_path=transform_data_path,
    feat_eng_data_path=feat_eng_data_path,
    cb_ensemble_path=cb_ensemble_path,
    xgb_ensemble_path=xgb_ensemble_path,
    lgbm_ensemble_path=lgbm_ensemble_path,
)

# Compile the pipeline into a zip package named after the experiment.
package_file = f'{experiment_name}.zip'
kfp.compiler.Compiler().compile(pipeline_func, package_file)

# Submit a run straight from the pipeline function.
run_result = client.create_run_from_pipeline_func(
    pipeline_func,
    experiment_name=experiment_name,
    run_name=run_name,
    arguments=arguments,
)


{'data': {{pipelineparam:op=load-data;name=data}}}
{'data': {{pipelineparam:op=load-data;name=data}}}
