In [1]:
%reload_ext autoreload

In [13]:
# Imports

%load_ext autoreload
%autoreload 2

%matplotlib inline
import numpy as np
import random
from fastai.imports import *
from fastai.structured import *
from matplotlib import pyplot
from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from IPython.display import display
from tabulate import tabulate
from sklearn.model_selection import GridSearchCV
from scipy.stats import trim_mean, kurtosis
from scipy.stats.mstats import mode, gmean, hmean
from imblearn.over_sampling import SMOTE
from sklearn import metrics
from sklearn.externals import joblib
from sklearn.metrics import f1_score,\
    accuracy_score, confusion_matrix,\
    precision_score, recall_score,\
    roc_curve, roc_auc_score,\
    cohen_kappa_score, mean_absolute_error,\
    precision_recall_curve, auc,\
    average_precision_score

set_plot_sizes(12,14,16)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [14]:
def prepare_data(df_raw):
    # Sort data by date
    df_raw = df_raw.sort_values(by='licence_registration_date')
    
    # Convert annual_revenue from a string to a float
    df_raw['annual_revenue'] = pd.to_numeric(df_raw['annual_revenue'].str.replace(',', ''))
    
    # Convert fields to INT and setting any NaNs to the mean of that type
    case_types = ['cases_total','cases_open','cases_closed','cases_age_hours_total','cases_age_hours_average', 'last_login_days']

    for case_type in case_types:
        default_value = df_raw[case_type].fillna(df_raw[case_type].median())
        df_raw[case_type] = df_raw[case_type].fillna(default_value).astype(int)
    
    # Fix missing values for annual revenue, replace with mean/trimmed mean of the plan size they are on
    plan_list = df_raw.plan[~pd.isnull(df_raw.plan)].unique()

    for plan in plan_list:
        mean = round(df_raw.annual_revenue[df_raw.plan == plan].mean(), 2)
        trimmed_mean = trim_mean(df_raw.annual_revenue[df_raw.plan == plan].values, 0.1)
    
        if pd.isnull(mean):
            revenue = 0
        else:
            revenue = mean
        df_raw.loc[df_raw.plan==plan, 'annual_revenue'] = df_raw.loc[df_raw.plan==plan, 'annual_revenue'].fillna(revenue)
        
    # 'bin' last login days

    bins = [1, 3, 7, 14, 30, 60]
    group_names = ['day', 'few_days', 'week', 'fortnight', 'month']

    # need to get the mean of the plan size for last_login_days and set each row to that
    #df_raw.last_login_days = df_raw.last_login_days.fillna(np.mean(df_raw.last_login_days))

    last_login_categories = pd.cut(df_raw['last_login_days'], bins, labels=group_names)
    df_raw['last_login_categories'] = pd.cut(df_raw['last_login_days'], bins, labels=group_names)
    #pd.value_counts(df_raw['last_login_categories'])
    
    # one-hot encode fields
    dummy_columns = ['customer_account_status', 'last_login_categories', 'plan']

    for dummy_column in dummy_columns:
        dummy = pd.get_dummies(df_raw[dummy_column], prefix=dummy_column)
        df_raw = pd.concat([df_raw,dummy], axis=1)
        df_raw = df_raw.drop(columns=dummy_column)
        
    
    # This breaks all the date features up into number columns
    # These steps can only be run once then you need to comment them out
    add_datepart(df_raw, 'licence_registration_date')
    add_datepart(df_raw, 'golive_date')
    
    # Drop columns, some of these create "Data Leakage", some are just to test if it has impact when they are taken out
    df_raw = df_raw.drop(columns=['customer_account_status_Good', 'last_login_concern',
                                  'last_login_days', 'account_status', 'changing_platform', 
                                  'new_platform', 'licence_status', 'canceldate', 
                                  'cancel_details', 'cancel_reason'])
    
    # Set default values for NaN values in NPS
    df_raw.nps = df_raw.nps.fillna(np.nanmean(df_raw.nps))

    # Set NaN to zero
    features = ['churned', 'interactions_total', 'interactions_completed', 'interactions_no_response', 'interactions_no_onboarding', 'interactions_completed_training']

    for feature in features:
        df_raw[feature] = df_raw[feature].fillna(0)
        
    # Complete the transformation of all data into numbers using proc_df and create training dataframes
    train_cats(df_raw)
    
    return df_raw

In [95]:
# Load in data

PATH = "../data/"
df_raw = pd.read_csv(f'{PATH}churn.csv', low_memory=False, 
                     parse_dates=['canceldate', 'licence_registration_date', 'golive_date'])

In [96]:
print(len(df_raw[df_raw.churned == 0]), df_raw[df_raw.churned == 0].shape)
df_processed = prepare_data(df_raw)
print(len(df_processed), df_processed.shape)
df_filtered = df_processed[df_processed.licence_registration_Year >2017]
df_data, y_data, nas = proc_df(df_filtered, 'churned')
print(len(df_data), df_data.shape)

2780 (2780, 38)
5889 (5889, 99)
1351 (1351, 102)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [97]:
joblib_file = "churn_model.pkl"

# Load from file
churn_model = joblib.load(joblib_file)

predictions = churn_model.predict(df_data)
probability = churn_model.predict_proba(df_data)

now = datetime.datetime.now()
array = []
for i in range(len(df_filtered[df_filtered.churned == 0].username)):
    if predictions[i] == 1:
        array.append([now,df_filtered[df_filtered.churned == 0].username.iloc[i],df_filtered[df_filtered.churned == 0].annual_revenue.iloc[i],predictions[i],probability[i][1]])
        

print(len(df_raw[df_raw.churned == 0]), len(array))

2780 353


In [98]:
count = 0

churn_concerns = []

for i in range(len(array)):
    if array[i][4] > .88:
        count += 1
        churn_concerns.append(array[i])

In [99]:
print(count)

49


In [102]:
now = datetime.datetime.now()
import csv

sorted_array = sorted(churn_concerns, key=lambda x: x[2], reverse=True)

for i in range(len(sorted_array)):
    print(f"{now},{sorted_array[i][0]},{sorted_array[i][1]},{sorted_array[i][2]},{sorted_array[i][3]}")
    with open('../data/predictions.csv', 'w') as writeFile:
        writer = csv.writer(writeFile)
        writer.writerows(sorted_array)

writeFile.close()

2019-10-05 13:55:33.754868,2019-10-05 13:51:39.613636,N052273,3161843.85,1
2019-10-05 13:55:33.754868,2019-10-05 13:51:39.613636,N055220,800290.23,1
2019-10-05 13:55:33.754868,2019-10-05 13:51:39.613636,N054211,649310.58,1
2019-10-05 13:55:33.754868,2019-10-05 13:51:39.613636,N052113,485805.76,1
2019-10-05 13:55:33.754868,2019-10-05 13:51:39.613636,N056267,393561.83,1
2019-10-05 13:55:33.754868,2019-10-05 13:51:39.613636,N059921,330861.34,1
2019-10-05 13:55:33.754868,2019-10-05 13:51:39.613636,N055984,295827.07,1
2019-10-05 13:55:33.754868,2019-10-05 13:51:39.613636,N053827,257118.25,1
2019-10-05 13:55:33.754868,2019-10-05 13:51:39.613636,N060326,87983.24,1
2019-10-05 13:55:33.754868,2019-10-05 13:51:39.613636,N047717,73909.26,1
2019-10-05 13:55:33.754868,2019-10-05 13:51:39.613636,N060976,70305.75,1
2019-10-05 13:55:33.754868,2019-10-05 13:51:39.613636,N056363,51103.36,1
2019-10-05 13:55:33.754868,2019-10-05 13:51:39.613636,N061812,49879.88,1
2019-10-05 13:55:33.754868,2019-10-05 13:5