#Building a Machine Learning supervised Churn detector step 

In [65]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

from scipy.special import expit, logit
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.cluster import KMeans
from sklearn import metrics as ms

import statsmodels.formula.api as smf
import matplotlib as mpl

# matplotlib params
mpl.rcParams['axes.spines.right'] = False
mpl.rcParams['axes.spines.top'] = False
plt.style.use('seaborn-white')

#### Payload functions

In [72]:
# features
FEATS = ['gender',
          'Partner',
          'Dependents',
          'TechSupport',
          'Contract',
          'PaperlessBilling',
          'SeniorCitizen',
          'tenure',
          'TotalCharges']

# target
TARGET = ['Churn']

# handle function for TotalCharges column
def handle_strnumber(point):
  if type(point) == str:
    try:
      point = float(point)
      return point
    except:
      return 'dropme'

# initial preprocess step with mapping
def preprocess(data) -> pd.DataFrame:
  X = data.copy()

  X['TotalCharges'] = X['TotalCharges'].apply(lambda x: handle_strnumber(x))
  X = X[X['TotalCharges'] != 'dropme']
  X['TotalCharges'] = X['TotalCharges'].astype(float)

  X['gender'] = X['gender'].map({'Male':1, 'Female':0})
  X['Partner'] = X['Partner'].map({'Yes':1, 'No':0})
  X['Dependents'] = X['Dependents'].map({'Yes':1, 'No':0})
  X['TechSupport'] = X['TechSupport'].map({'Yes':1, 'No internet service':0, 'No': -1})
  X['Contract'] = X['Contract'].map({'Month-to-month':1, 'Two year':0, 'One year': -1})
  X['PaperlessBilling'] = X['PaperlessBilling'].map({'Yes':1, 'No':0})

  return X[FEATS], X[TARGET]

# KMeans clustering feature engineering labels column
def preprocess2(data, pipeline, kmeans, fit = True) -> np.array:
  """
  <Docstring>
  Remember to first apply this function with fit param = True
  on the training set.
  """
  X = data.copy()
  if fit:
    scld = pipeline.fit_transform(X)
    scld = pd.DataFrame(scld, columns = ['PC1','PC2'])
    labels = kmeans.fit_predict(scld)
  else:
    scld = pipeline.transform(X)
    scld = pd.DataFrame(scld, columns = ['PC1','PC2'])
    labels = kmeans.predict(scld)
  return labels

# Standardizing only numeric features
def preprocess3(data, scaler, fit = True) -> pd.DataFrame:
  X = data.copy()
  cols = ['SeniorCitizen','tenure','TotalCharges']

  if fit:
    numerics = scaler.fit_transform(X[cols].copy())
  else:
    numerics = scaler.transform(X[cols].copy())
  X[cols] = numerics
  return X

In [55]:
# loading and separating test set
data = pd.read_csv('data.csv').drop('customerID', 1)
train, test = data.iloc[:int(data.shape[0]*.7), :], data.iloc[int(data.shape[0]*.7):, :]

  data = pd.read_csv('data.csv').drop('customerID', 1)


In [56]:
# preprocessing sanity-check: ok!
X, y = preprocess(train)
Xtest, ytest = preprocess(test)
X.shape, y.shape, Xtest.shape, ytest.shape

((4922, 9), (4922, 1), (2110, 9), (2110, 1))

In [57]:
# data splitting
Xtrain, Xval, ytrain, yval = train_test_split(
    X, y, stratify = y,
    test_size = .3, random_state = 777
)

The validation schema above is really strong in a way that there will be two out of sample model checking:  


1.   Validation set outside training validation folds
2.   Test set to use as the model final performance ground truth in deployed web app
3.   Both random split and temporal validation are present. Remember that I stored the last 30% of dataset respecting temporality to the step above




In [58]:
print(100*'=')
print(
    f'Training set size: {Xtrain.shape, ytrain.shape}\nValidation set size: {Xval.shape, yval.shape}\nOut of sample test set size: {Xtest.shape, ytest.shape}'
)
print(100*'=')

Training set size: ((3445, 9), (3445, 1))
Validation set size: ((1477, 9), (1477, 1))
Out of sample test set size: ((2110, 9), (2110, 1))


###- Creating KMeans feature

In [73]:
# KMEANS CLUSTERING FOR EDA
kmeans = KMeans(n_clusters = 2, max_iter = 2000, random_state = 77)

# StandardScaler
scaler = StandardScaler()
pca = PCA(n_components = 2)
ohe = OneHotEncoder()
wranglers = [scaler, ohe, pca]

preprocessor1 = ColumnTransformer(transformers=[
    ('scaler', wranglers[0], selector(dtype_exclude=object)),
    ('ohe', wranglers[1], selector(dtype_include=object))])

pipe1 = Pipeline(steps = [('preprocessor', preprocessor1),
                         ('PCA', wranglers[2])])

# scaler for preprocess3 below
scaler2 = StandardScaler()

In [74]:
labels_train = preprocess2(Xtrain, pipe1, kmeans, fit = True)
labels_val = preprocess2(Xval, pipe1, kmeans, fit = False)
labels_test = preprocess2(Xtest, pipe1, kmeans, fit = False)

Xtrain['cluster'] = labels_train
Xval['cluster'] = labels_val
Xtest['cluster'] = labels_test

In [75]:
pipe1

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('scaler', StandardScaler(),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7ff0e507e820>),
                                                 ('ohe', OneHotEncoder(),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7ff0e507e0d0>)])),
                ('PCA', PCA(n_components=2))])

In [76]:
# joblib.dump(pipe1, 'pipeline1.joblib')
# joblib.dump(kmeans, 'kmeans.joblib')

# sanity-check for correct pipeline1 saving
preprocess2(Xtrain, joblib.load('pipeline1.joblib'), joblib.load('kmeans.joblib'), fit = False)

array([1, 0, 0, ..., 1, 0, 0], dtype=int32)

In [77]:
print(100*'=')
print(
    f'Training set size: {Xtrain.shape, ytrain.shape}\nValidation set size: {Xval.shape, yval.shape}\nOut of sample test set size: {Xtest.shape, ytest.shape}'
)
print(100*'=')

Training set size: ((3445, 10), (3445, 1))
Validation set size: ((1477, 10), (1477, 1))
Out of sample test set size: ((2110, 10), (2110, 1))


In [79]:
Xtrain = preprocess3(Xtrain, scaler2)
# joblib.dump(scaler2, 'scaler_numerics.joblib')

Xval = preprocess3(Xval, joblib.load('scaler_numerics.joblib'))
Xtest = preprocess3(Xtest, joblib.load('scaler_numerics.joblib'))

In [85]:
Xtrain.head(3)

Unnamed: 0,gender,Partner,Dependents,TechSupport,Contract,PaperlessBilling,SeniorCitizen,tenure,TotalCharges,cluster
104,1,1,0,1,0,1,-0.436342,1.452213,2.365029,0
3999,0,0,0,-1,1,1,2.291783,-1.239061,-0.952239,1
4521,0,1,0,-1,1,0,-0.436342,0.555122,-0.096114,1


Now that everything from the data exploratory analysis and data wrangling/preprocessing went well, we can jump into Machine Learning modelling without worrying about implementation issues regarding bad practices with features