In [None]:
import pandas as pd
import numpy as np
import random as rd
import itertools as itertools

# SQL
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import _mysql

# Plotting packages
import matplotlib.pyplot as plt
from bokeh.plotting import figure, output_notebook, show
from bokeh.models import Range1d
from bokeh.charts import Bar, output_file, show
from bokeh.sampledata.autompg import autompg as df
%matplotlib inline
output_notebook()

# Curve (plot) smoothing 
from scipy.signal import savgol_filter

# sklearn packages
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc, precision_recall_curve, precision_score,recall_score
from sklearn.metrics import average_precision_score,confusion_matrix, f1_score

# regular expression
import re

# SQLAlchemy URL of the local MySQL database `clientsuccess` (user `root`, empty password).
# NOTE(review): the credentials are hard-coded here; consider loading the URL from the environment.
mysqlFilePath = 'mysql://root:@localhost/clientsuccess?charset=utf8&use_unicode=0'

def replace_special(char):
    '''
    Replace the two special characters that correspond to True and False
    in the MySQL dump file that I received from the startup company that I
    consulted for.

    Parameters
    ----------
    char : any
        A single cell value. Only byte strings and text strings are inspected.

    Returns
    -------
    True if `char` is the one-character string '\\x01', False if it is
    '\\x00', otherwise `char` itself, unchanged.
    '''
    # Compare against a marker of the same string type. The previous
    # implementation relied on the Python-2-only 'string-escape' codec behind a
    # bare `except`, so on Python 3 it silently returned every value unchanged.
    if isinstance(char, bytes):
        true_marker, false_marker = b'\x01', b'\x00'
    elif isinstance(char, type(u'')):
        true_marker, false_marker = u'\x01', u'\x00'
    else:
        # Numbers, None, dates, ... are passed through untouched.
        return char

    if char == true_marker:
        return True
    if char == false_marker:
        return False
    return char

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print and plot a confusion matrix on the current matplotlib figure.

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion matrix; it is indexed as cm[true, predicted] when drawn.
    classes : sequence
        Tick labels, one per class, used on both axes.
    normalize : bool
        When True every row is divided by its row sum before printing and
        plotting. Rows that sum to zero are shown as zeros.
    title : str
        Figure title.
    cmap : matplotlib colormap
        Colormap for the heat map.
    """
    cm = np.asarray(cm)

    # Normalise BEFORE drawing so that the heat-map colours, the printed matrix
    # and the per-cell text all describe the same numbers.
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype='float'),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Integer counts are printed as-is, fractions with two decimals.
    fmt = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2f'
    # Cells darker than half the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Called last so that the axis labels are taken into account by the layout.
    plt.tight_layout()

In [None]:
# Open a connection to the MySQL database described by mysqlFilePath.
engine = create_engine(mysqlFilePath, pool_recycle=3600)
connection = engine.connect()

# Read the whole tenant table and pass every cell through replace_special.
tb_name = "kim_all_tenants"
tb_result = pd.read_sql_query(
    """
    SELECT 
        *
    FROM 
        %s
        """ % tb_name,
    connection,
).applymap(replace_special)

# Keep only rows with a usable sentiment value ...
tb_result = tb_result[
    tb_result['sentiment'].apply(lambda value: not np.isnan(value))
].copy()
# ... and a finite amount_per_day.
tb_res = tb_result[
    tb_result['amount_per_day'].apply(
        lambda value: not np.isnan(value) and not np.isinf(value))
].copy()
# Drop the reference to the intermediate frame.
tb_result = None

### Random Forest 

In [None]:
def RF(features, Xtrain, ytrain, test_size = 0.3, n_estimators = 100, max_depth = 3,
       Xtest = None, ytest = None, random_state = None):
    """
    Fit a random forest classifier and print its parameters and test score.

    Parameters
    ----------
    features : list
        Feature names. Not used by the function; kept for existing callers.
    Xtrain, ytrain : array-like
        Training features and labels the forest is fitted on.
    test_size : float
        Not used by the function; kept for existing callers.
    n_estimators, max_depth : int
        Forwarded to RandomForestClassifier.
    Xtest, ytest : array-like, optional
        Hold-out data used for the printed score. When either is omitted the
        notebook-level `X_test` / `y_test` are used, as before.
    random_state : int, optional
        Forwarded to RandomForestClassifier so a run can be made repeatable.

    Returns
    -------
    The fitted RandomForestClassifier.
    """
    clf = RandomForestClassifier(n_estimators=n_estimators, max_depth = max_depth,
                                 random_state = random_state)
    # Fit on the data that was passed in. The previous version ignored its
    # Xtrain/ytrain arguments and silently used the globals X_train/y_train.
    clf = clf.fit(Xtrain, ytrain)
    coeff = clf.get_params()

    print(coeff)

    if Xtest is None or ytest is None:
        # Backward-compatible fallback for callers that pass no hold-out data.
        Xtest, ytest = X_test, y_test
    print("score: %.4f" % clf.score(Xtest, ytest))

    return clf

In [None]:
def data_preprocessing(df):
    """
    Build the feature matrix and label vector used by the classifiers.

    One 0/1 indicator column named "tenant<id>" is added to `df` IN PLACE for
    every distinct value of df['tenant_id']; later notebook cells read these
    columns from the same frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'tenant_id', 'churned' and the five base feature columns
        listed below.

    Returns
    -------
    X_original : numpy.ndarray
        Values of the base features followed by the tenant indicators.
    y : numpy.ndarray
        Values of df['churned'].
    feature_lst : list of str
        Column names of X_original, in order.
    """
    base_features = ['client_note_total_count', 'sub_duration', 'last_sub_duration',
                     'amount_per_day', 'sentiment']

    lst_tenants = df['tenant_id'].unique()

    tenant_names = []
    for tenant_id in lst_tenants:
        name = "tenant" + str(tenant_id)
        # Vectorised 0/1 indicator instead of an element-by-element Python list.
        df[name] = (df['tenant_id'] == tenant_id).astype('int64')
        tenant_names.append(name)

    feature_lst = base_features + tenant_names
    # `.values` replaces DataFrame.as_matrix(), which newer pandas no longer provides.
    X_original = df[feature_lst].values
    y = np.array(df['churned'])
    return X_original, y , feature_lst

### Confusion Table

In [None]:
def confusion_table_plot(y_test, y_pred):
    """
    Draw the confusion matrix of `y_pred` against `y_test` twice, each in its
    own figure: first with raw counts, then normalized.
    """
    labels = ['Non-Churn', 'Churn']
    matrix = confusion_matrix(y_test, y_pred)
    np.set_printoptions(precision=2)

    variants = (
        (False, 'Confusion matrix, without normalization'),
        (True, 'Normalized confusion matrix'),
    )
    for use_normalization, figure_title in variants:
        plt.figure()
        plot_confusion_matrix(matrix, classes=labels,
                              normalize=use_normalization,
                              title=figure_title)

    plt.show()

In [None]:
# Build the feature matrix and labels; data_preprocessing also adds the tenant indicator columns to tb_res.
X_original, y, feature_lst = data_preprocessing(tb_res)
# Standardise every feature column.
# NOTE(review): the scaler is fitted on all rows before the train/test split below.
scaler = StandardScaler()
X_raw = scaler.fit_transform(X_original)
# Hold out 30% of the rows for evaluation, with a fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(X_raw, y, test_size=.3, random_state = 100)

# Fit the full-feature random forest, predict the hold-out labels and plot the confusion matrices.
clf = RF(feature_lst, X_train, y_train, n_estimators = 1000, max_depth = 10)
y_pred = clf.predict(X_test)
confusion_table_plot(y_test, y_pred)

In [None]:
# Precision of the hold-out predictions.
precision_score(y_test, y_pred) 

### Recall is one of the most important metrics to consider, because the algorithm should not predict unhappy customers as happy ones.

In [None]:
# Recall of the hold-out predictions.
recall_score(y_test, y_pred)

In [None]:
# F1 score of the hold-out predictions.
f1_score(y_test, y_pred)

In [None]:
def plot_importance(feature_importance_lst, feature_lst):
    """
    Show a bokeh bar chart of feature importances.

    Parameters
    ----------
    feature_importance_lst : sequence of float
        One importance score per feature.
    feature_lst : sequence of str
        Feature names, in the same order as the scores; used as the index of
        the frame handed to the chart.
    """
    # Build the one-column frame directly. The previous version first created
    # an unused `figure(...)` that was immediately overwritten, and reused the
    # name `df`, shadowing the notebook-level `df`.
    importance_df = pd.DataFrame(
        {'importance': pd.Series(feature_importance_lst, index = feature_lst)})
    p = Bar(importance_df, values='importance',title="Feature Importance", legend=False,
            plot_width=800, plot_height=300)
    p.xaxis.axis_label = 'Features'
    p.yaxis.axis_label = 'Importance'
    show(p)
                    
# Bar chart of the importances of the model trained on all features.
plot_importance(clf.feature_importances_, feature_lst)

### Precision-recall curve

In [None]:
def plot_precision_recall_curve(clf, X_test, y_test):
    """
    Plot precision against recall for a fitted classifier on hold-out data.

    Column 1 of `clf.predict_proba(X_test)` is used as the score that is
    compared with `y_test`.
    """
    positive_scores = clf.predict_proba(X_test)[:, 1]
    curve = precision_recall_curve(y_test, positive_scores)
    precision_vals, recall_vals = curve[0], curve[1]

    plt.plot(recall_vals, precision_vals, lw = 1, color='navy',label='Precision-Recall curve')
    plt.title("Precision-Recall: Random Forest")
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.show()

In [None]:
# Precision-recall curve of the model trained on all features.
plot_precision_recall_curve(clf, X_test, y_test)

In [None]:
# Average precision summarises the precision-recall curve, so it is computed from the
# predicted class-1 probabilities (the same scores plot_precision_recall_curve uses)
# rather than from the hard 0/1 predictions, which collapse the curve to a single threshold.
average_precision_score(y_test, clf.predict_proba(X_test)[:, 1])

### ROC

In [None]:
def plot_ROC(clf, X_test, y_test):
    """
    Plot the ROC curve of a fitted classifier on hold-out data, with the area
    under the curve shown in the legend and the diagonal drawn for reference.

    Column 1 of `clf.predict_proba(X_test)` is used as the score.
    """
    line_width = 2
    scores = clf.predict_proba(X_test)[:, 1]
    roc_points = roc_curve(y_test, scores)
    false_pos_rate, true_pos_rate = roc_points[0], roc_points[1]
    area = auc(false_pos_rate, true_pos_rate)
    curve_label = 'Random Forest (area = %0.2f)' % area

    plt.figure()
    plt.plot(false_pos_rate, true_pos_rate, color='darkorange',
             lw=line_width, label=curve_label)
    # Dashed diagonal from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], color='navy', lw=line_width, linestyle='--')

    plt.title('Receiver operating characteristic example')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.legend(loc="lower right")
    plt.show()
# ROC curve of the model trained on all features.
plot_ROC(clf, X_test, y_test)

### In the reduced feature space (with only the important features taken into account), we can check whether it is ok to use the reduced feature space or not.

In [None]:
def lst_b_in_lst_a(lst_b, lst_a):
    """
    Return a boolean mask over `lst_a` marking the items that occur in `lst_b`.

    Parameters
    ----------
    lst_b : iterable
        Values to look for.
    lst_a : iterable
        Values to test, in order.

    Returns
    -------
    list of bool
        One entry per element of `lst_a`; True when that element is in `lst_b`.
    """
    lst_b = list(lst_b)
    try:
        # Set lookup makes each membership test O(1) instead of scanning lst_b.
        members = frozenset(lst_b)
        return [item in members for item in lst_a]
    except TypeError:
        # Unhashable items (e.g. lists): fall back to the linear scan.
        return [item in lst_b for item in lst_a]

#### The five base features were kept, together with the tenant indicator features whose importance scores are at least 0.001. The confusion matrix and other metrics were plotted.



In [None]:
# Importance of every feature in the model trained on all features.
df = pd.DataFrame(index = feature_lst)
df['importance'] = pd.Series(clf.feature_importances_ , index = feature_lst)
df_important_ = df[df['importance']>=0.001]

# Flat boolean mask, one entry per row. The previous nested list
# ([[True], [False], ...]) is not a valid row mask.
mask = [bool(re.search('tenant', i)) for i in df_important_.index]
important_tenant_name_lst = df_important_.loc[mask].index

# The five base features are always kept; only the tenant indicators are filtered.
feature_lst_reduced = ['client_note_total_count', 'sub_duration', 'last_sub_duration', 'amount_per_day', 'sentiment']\
    + list(important_tenant_name_lst)
# `.values` replaces DataFrame.as_matrix(), which newer pandas no longer provides.
X_original = tb_res[feature_lst_reduced].values
y = np.array(tb_res['churned'])

scaler = StandardScaler()
X = scaler.fit_transform(X_original)

# Same seed as the full-feature split so that the metrics below are repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state = 100)

clf_reduced = RF(feature_lst_reduced, X_train, y_train, n_estimators = 1000, max_depth = 10)
y_pred = clf_reduced.predict(X_test)

In [None]:
# Bar chart of the importances of the model trained on the reduced feature set.
plot_importance(clf_reduced.feature_importances_, feature_lst_reduced)

### Confusion Table

In [None]:
# Confusion matrices (raw and normalized) of the current hold-out predictions.
confusion_table_plot(y_test, y_pred)

In [None]:
# Precision of the current hold-out predictions.
precision_score(y_test, y_pred) 

### Recall is one of the most important metrics to consider, because the algorithm should not predict unhappy customers as happy ones; such unhappy customers are likely to churn. The F1 score, the harmonic mean of precision and recall, is another important metric to consider.

In [None]:
# Recall of the current hold-out predictions.
recall_score(y_test, y_pred)

In [None]:
# F1 score of the current hold-out predictions.
f1_score(y_test, y_pred)

In [None]:
# Precision-recall curve of the model trained on the reduced feature set.
plot_precision_recall_curve(clf_reduced, X_test, y_test)

In [None]:
# ROC curve of the model trained on the reduced feature set.
plot_ROC(clf_reduced, X_test, y_test)

### The average churn rate over all the clients was ~19%

In [None]:
# Overall churn rate: rows with churned == 1 divided by all rows of tb_res.
num_total_churn = (tb_res['churned']==1).sum()
total_clients = len(tb_res)
churn_rate = num_total_churn/float(total_clients)
# Call-form print with a single formatted string: valid on Python 2 and 3 and
# consistent with the other print(...) calls in this notebook. The output text
# (including the double space) matches the former print statements.
print("total_clients =  %s" % total_clients)
print("churn_rate =  %s" % churn_rate)

### A 19% (annual) churn rate is really high! My algorithm, based on random forest, can correctly predict the churning clients with 78% accuracy. This improved accuracy can help the tenant companies actively target the churning customers. If the tenant companies could keep all the customers from churning, the new churn rate would be

In [None]:
# Number of rows in the current hold-out set.
Total_test_cases = len(X_test)
# Element-wise AND of the true and predicted labels, summed: counts rows where both are set.
# NOTE(review): with 0/1 labels where 1 = churn this is the true-positive count, despite the name `TN` — confirm.
TN = (np.array(y_test) & np.array(y_pred)).sum()
# Fraction of hold-out rows that fall in that group.
TN/float(Total_test_cases)

### Predicted churn was saved in a column 'churned_pred' in the table tb_res.

In [None]:
# Store the full-feature model's prediction for every row of tb_res (X_raw was built from all rows of tb_res).
# NOTE(review): this includes the rows the model was trained on, not only the hold-out rows.
tb_res['churned_pred'] = pd.Series(clf.predict(X_raw), index = tb_res.index)

### I looked into whether there is more room to improve the recall. For every tenant company, the predicted and true churn numbers were saved in the table df_churn.

In [None]:
# Per-tenant tallies: number of clients, true churns, predicted churns and
# churns that were both true and predicted. One entry per tenant, in the
# order the tenants first appear in tb_res.
lst_tenants = tb_res['tenant_id'].unique()
lst_cl, lst_true, lst_pred, lst_corr = [], [], [], []

for tid in lst_tenants:
    df_t = tb_res.loc[tb_res['tenant_id'] == tid]
    num_cl = len(df_t)
    num_true_churn = df_t['churned'].sum()
    num_pred_churn = df_t['churned_pred'].sum()
    num_corr_churn = (df_t['churned'] & df_t['churned_pred']).sum()

    lst_cl.append(num_cl)
    lst_true.append(num_true_churn)
    lst_pred.append(num_pred_churn)
    lst_corr.append(num_corr_churn)

# Assemble the summary table, indexed by tenant id.
df_churn = pd.DataFrame(
    {
        'total_clients': lst_cl,
        'true_churn': lst_true,
        'pred_churn': lst_pred,
        'correctly_pred_churn': lst_corr,
    },
    index = lst_tenants,
    columns = ['total_clients', 'true_churn', 'pred_churn', 'correctly_pred_churn'],
)

### Feature importance scores of individual tenant companies were saved in the table df_churn

In [None]:
# Importance of every feature in the model trained on all features.
df_importances = pd.DataFrame(index = feature_lst)
df_importances['importance'] = clf.feature_importances_

# Flat boolean mask selecting the "tenant<id>" rows. The previous nested list
# ([[True], [False], ...]) is not a valid row mask.
mask = [bool(re.search('tenant', i)) for i in df_importances.index]
df_importances_only_tenants = df_importances.loc[mask].copy(deep=True)

# Re-index by the integer tenant id so the column aligns with df_churn's index.
new_index = [int(re.match(r'^tenant(.*)', i_str).group(1)) for i_str in df_importances_only_tenants.index]
df_importances_only_tenants.index = new_index
df_churn['importance'] = df_importances_only_tenants['importance']

# Scatter of importance score against the number of true churn cases per tenant.
p = figure(plot_width=800, plot_height=200,
           y_range = Range1d(end=0.03), x_range=Range1d(end=60))
p.circle(df_churn['true_churn'], df_churn['importance'])
p.xaxis.axis_label = 'Number of Customer Churn Cases'
p.yaxis.axis_label = 'Importance Scores'
show(p)

### The graph above clearly showed that the importance scores are closely related to the number of churns, actually quite linearly. Thus, I segmented the tenants into two groups with the number of churns less than or larger than 10. The optimal threshold values can be found with manual tuning. The table df_churn showed that for some tenant companies such as ID = 173 and 132, churn was not predicted at all. But, with the segmentation of the tenant companies, you will see the dramatic enhancement of the prediction recall (refer to "classification_large_churn_cases"). 

In [None]:
# Show the per-tenant churn table, tenants with the most true churn cases first.
df_churn.sort_values('true_churn', ascending=False)

### RF for two different groups (many churn tenants vs. small churn tenants)

In [None]:
# Split the tenants by their number of true churn cases: fewer than 10 vs. 10 or more.
df_small_churns = df_churn[df_churn['true_churn']<10].copy(deep=True)
df_large_churns = df_churn[df_churn['true_churn']>=10].copy(deep=True)

In [None]:
# Rows of tb_res whose tenant is in the small-churn group (df_small_churns is
# indexed by tenant id). `isin` builds the row mask in one vectorised pass and
# also works when the group is empty, where the previous pd.concat over an
# empty list raised an error.
mask = tb_res['tenant_id'].isin(df_small_churns.index)
tb_res_small = tb_res[mask].copy(deep=True)
# Everything else belongs to the large-churn group.
tb_res_large = tb_res[~mask].copy(deep=True)

In [None]:
# Write both customer subsets to the database behind `engine`, replacing the tables if they already exist.
tb_res_small.to_sql(con=engine, name='kim_small_churn_tenants1', if_exists='replace')
tb_res_large.to_sql(con=engine, name='kim_large_churn_tenants1', if_exists='replace')