## Importing libraries

In [25]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import warnings
warnings.filterwarnings("ignore")
import io
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
from sklearn.metrics import roc_auc_score,roc_curve,scorer
from sklearn.metrics import f1_score
import statsmodels.api as sm
from sklearn.metrics import precision_score,recall_score
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import RFE

## Setting Experiment Name

In [26]:
import mlflow
import mlflow.sklearn
"""added line"""  
# Set the experiment name to an experiment in the shared experiments folder
mlflow.set_experiment("/test_mlflow/3churnPrediction")

# <a id='1'>1.Data</a>

In [27]:
telcom = pd.read_csv(r"WA_Fn-UseC_-Telco-Customer-Churn.csv")
#first few rows
telcom.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


# <a id='2'>2. Data Pipeline</a>

In [28]:
#Data Manipulation

#Replacing spaces with null values in total charges column
telcom['TotalCharges'] = telcom["TotalCharges"].replace(" ",np.nan)

#Dropping null values from total charges column which contain .15% missing data 
telcom = telcom[telcom["TotalCharges"].notnull()]
telcom = telcom.reset_index()[telcom.columns]

#convert to float type
telcom["TotalCharges"] = telcom["TotalCharges"].astype(float)

#replace 'No internet service' to No for the following columns
replace_cols = [ 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                'TechSupport','StreamingTV', 'StreamingMovies']
for i in replace_cols : 
    telcom[i]  = telcom[i].replace({'No internet service' : 'No'})
    
#replace values
telcom["SeniorCitizen"] = telcom["SeniorCitizen"].replace({1:"Yes",0:"No"})

#Tenure to categorical column
def tenure_lab(telcom) :
    
    if telcom["tenure"] <= 12 :
        return "Tenure_0-12"
    elif (telcom["tenure"] > 12) & (telcom["tenure"] <= 24 ):
        return "Tenure_12-24"
    elif (telcom["tenure"] > 24) & (telcom["tenure"] <= 48) :
        return "Tenure_24-48"
    elif (telcom["tenure"] > 48) & (telcom["tenure"] <= 60) :
        return "Tenure_48-60"
    elif telcom["tenure"] > 60 :
        return "Tenure_gt_60"
telcom["tenure_group"] = telcom.apply(lambda telcom:tenure_lab(telcom),
                                      axis = 1)

#Separating churn and non churn customers
churn     = telcom[telcom["Churn"] == "Yes"]
not_churn = telcom[telcom["Churn"] == "No"]

#Separating catagorical and numerical columns
Id_col     = ['customerID']
target_col = ["Churn"]
cat_cols   = telcom.nunique()[telcom.nunique() < 6].keys().tolist()
cat_cols   = [x for x in cat_cols if x not in target_col]
num_cols   = [x for x in telcom.columns if x not in cat_cols + target_col + Id_col]



from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

#customer id col
Id_col     = ['customerID']
#Target columns
target_col = ["Churn"]
#categorical columns
cat_cols   = telcom.nunique()[telcom.nunique() < 6].keys().tolist()
cat_cols   = [x for x in cat_cols if x not in target_col]
#numerical columns
num_cols   = [x for x in telcom.columns if x not in cat_cols + target_col + Id_col]
#Binary columns with 2 values
bin_cols   = telcom.nunique()[telcom.nunique() == 2].keys().tolist()
#Columns more than 2 values
multi_cols = [i for i in cat_cols if i not in bin_cols]

#Label encoding Binary columns
le = LabelEncoder()
for i in bin_cols :
    telcom[i] = le.fit_transform(telcom[i])
    
#Duplicating columns for multi value columns
telcom = pd.get_dummies(data = telcom,columns = multi_cols )

#Scaling Numerical columns
std = StandardScaler()
scaled = std.fit_transform(telcom[num_cols])
scaled = pd.DataFrame(scaled,columns=num_cols)

#dropping original values merging scaled values for numerical columns
df_telcom_og = telcom.copy()
telcom = telcom.drop(columns = num_cols,axis = 1)
telcom = telcom.merge(scaled,left_index=True,right_index=True,how = "left")




from imblearn.over_sampling import SMOTE
cols    = [i for i in telcom.columns if i not in Id_col+target_col]

smote_X = telcom[cols]
smote_Y = telcom[target_col]

#Split train and test data
smote_train_X,smote_test_X,smote_train_Y,smote_test_Y = train_test_split(smote_X,smote_Y,
                                                                         test_size = .25 ,
                                                                         random_state = 111)

#oversampling minority class using smote
os = SMOTE(random_state = 0)
os_smote_X,os_smote_Y = os.fit_sample(smote_train_X,smote_train_Y)
os_smote_X = pd.DataFrame(data = os_smote_X,columns=cols)
os_smote_Y = pd.DataFrame(data = os_smote_Y,columns=target_col)








#splitting train and test data 
train,test = train_test_split(telcom,test_size = .25 ,random_state = 111)
    
##seperating dependent and independent variables
cols    = [i for i in telcom.columns if i not in Id_col + target_col]
train_X = train[cols]
train_Y = train[target_col]
test_X  = test[cols]
test_Y  = test[target_col]



# 3. Common function for model prediction

In [29]:
def telecom_churn_prediction(algorithm,training_x,testing_x,training_y,testing_y) :
    
    #model
    algorithm.fit(training_x,training_y)
    predictions   = algorithm.predict(testing_x)
    probabilities = algorithm.predict_proba(testing_x)
        
    model_accuracy = accuracy_score(testing_y,predictions)
    print ("Accuracy   Score : ",model_accuracy,"\n")
    model_roc_auc = roc_auc_score(testing_y,predictions) 
    print ("Area under curve : ",model_roc_auc,"\n")
    model_f1_score = f1_score(testing_y,predictions) 
    print ("F1 score: ",model_f1_score,"\n")
           
    return model_accuracy,model_roc_auc,model_f1_score;

# 3.1. Logistic Regression

In [30]:
import ipywidgets as widgets
from IPython.display import display
slider_C = widgets.IntSlider( description='C:', value=1)
text = widgets.IntText()
display(slider_C,text)
widgets.jslink((slider_C, 'value'), (text, 'value'))

IntSlider(value=1, description='C:')

IntText(value=0)

Link(source=(IntSlider(value=1, description='C:'), 'value'), target=(IntText(value=0), 'value'))

In [43]:

"""added line"""  
with mlflow.start_run(run_name='LogisticRegression'):
        
    logit  = LogisticRegression(C=slider_C.value, class_weight=None, dual=False, fit_intercept=True,intercept_scaling=1, max_iter=6, multi_class='ovr', n_jobs=1,penalty='l1', random_state=None, solver='liblinear', tol=0.0001,verbose=0, warm_start=False)
    model_accuracy,model_roc_auc,model_f1_score=telecom_churn_prediction(logit,train_X,test_X,train_Y,test_Y)
    
    """added line"""  
    mlflow.log_param("C",slider_C.value)
    mlflow.log_metric("AUC", model_roc_auc)
    mlflow.log_metric("Accuracy", model_accuracy)
    mlflow.log_metric("F1", model_f1_score)
    
    mlflow.sklearn.log_model(knn, "log_reg_model")  #for saving log of pickle & MLmodel file in artifacts
    mlflow.sklearn.save_model(knn, "log_reg_model") #for saving pickle & MLmodel file in our file directory

Accuracy   Score :  0.7997724687144482 

Area under curve :  0.7134423485482522 

F1 score:  0.5906976744186047 



# 3.2 XG Boost Model

In [44]:
import ipywidgets as widgets
from IPython.display import display
slider_max_depth = widgets.IntSlider(
    min=1,
    max=10,
    step=1,
    description='max_depth:',
    value=7
)
text = widgets.IntText()
display(slider_max_depth,text)
widgets.jslink((slider_max_depth, 'value'), (text, 'value'))

IntSlider(value=7, description='max_depth:', max=10, min=1)

IntText(value=0)

Link(source=(IntSlider(value=7, description='max_depth:', max=10, min=1), 'value'), target=(IntText(value=0), …

In [45]:

with mlflow.start_run(run_name='xgboost'):

    
    xgc = XGBClassifier(base_score=.9, booster='gbtree', colsample_bylevel=1,colsample_bytree=1, gamma=0, learning_rate=0.9, max_delta_step=0,max_depth = slider_max_depth.value, min_child_weight=1, missing=None, n_estimators=100,n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,silent=True, subsample=1)
    model_accuracy,model_roc_auc, model_f1_score=telecom_churn_prediction(xgc,os_smote_X,test_X,os_smote_Y,test_Y)
    
    mlflow.log_param("max_depth",slider_max_depth.value)          
    mlflow.log_metric("AUC", model_roc_auc)
    mlflow.log_metric("Accuracy", model_accuracy)
    mlflow.log_metric("F1", model_f1_score)
  
    mlflow.sklearn.log_model(knn, "xgboost_model")  #for saving log of pickle & MLmodel file in artifacts
    mlflow.sklearn.save_model(knn, "xgboost_model") #for saving pickle & MLmodel file in our file directory

Accuracy   Score :  0.7337883959044369 

Area under curve :  0.6939966522886758 

F1 score:  0.5584905660377358 



# 3.3 Gaussian Naive Bayes

In [46]:

with mlflow.start_run(run_name='GaussianNB'):
         
    gnb = GaussianNB(priors=None)
    model_accuracy,model_roc_auc, model_f1_score=telecom_churn_prediction(gnb,os_smote_X,test_X,os_smote_Y,test_Y)

           
    mlflow.log_metric("AUC", model_roc_auc)
    mlflow.log_metric("Accuracy", model_accuracy)
    mlflow.log_metric("F1", model_f1_score)
    
    mlflow.sklearn.log_model(knn, "gnb_model")  #for saving log of pickle & MLmodel file in artifacts
    mlflow.sklearn.save_model(knn, "gnb_model") #for saving pickle & MLmodel file in our file directory

Accuracy   Score :  0.7480091012514221 

Area under curve :  0.7645850769329814 

F1 score:  0.6395443449959317 



# 3.4 KNN model

In [47]:
import ipywidgets as widgets
from IPython.display import display
slider_leaf_size = widgets.IntSlider(
    min=1,
    max=150,
    step=5,
    description='leaf_size:',
    value=90
)
text = widgets.IntText()
display(slider_leaf_size,text)
widgets.jslink((slider_leaf_size, 'value'), (text, 'value'))

IntSlider(value=90, description='leaf_size:', max=150, min=1, step=5)

IntText(value=0)

Link(source=(IntSlider(value=90, description='leaf_size:', max=150, min=1, step=5), 'value'), target=(IntText(…

In [49]:

with mlflow.start_run(run_name='KNN'):
        
   
    
    knn = KNeighborsClassifier(algorithm='auto', leaf_size=slider_leaf_size.value, metric='minkowski',metric_params=None, n_jobs=1, n_neighbors=8, p=2,weights='uniform')
    model_accuracy,model_roc_auc, model_f1_score=telecom_churn_prediction(knn,os_smote_X,test_X,os_smote_Y,test_Y)

    mlflow.log_param("leaf_size",slider_leaf_size.value)
    mlflow.log_metric("AUC", model_roc_auc)
    mlflow.log_metric("Accuracy", model_accuracy)
    mlflow.log_metric("F1", model_f1_score)
    
    mlflow.sklearn.log_model(knn, "knn_model")  #for saving log of pickle & MLmodel file in artifacts
    mlflow.sklearn.save_model(knn, "knn_model") #for saving pickle & MLmodel file in our file directory

Accuracy   Score :  0.7445961319681457 

Area under curve :  0.7309148264984228 

F1 score:  0.6044052863436123 

