In [0]:
##### Churn Rate Prediction Model #####

In [0]:
# Libraries Import
from __future__ import division
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.linear_model import LogisticRegression as LR
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.metrics import average_precision_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_curve, roc_auc_score
from scipy import interp
import warnings
from plotnine import *
from sklearn.preprocessing import LabelEncoder

In [0]:
# Sampling data
def random_sample(df, percentage):
    """Return a reproducible random subset of the rows of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to draw rows from; it is not modified.
    percentage : float
        Multiplier applied to ``len(df)``; the product is truncated with
        ``int()`` to obtain the number of rows drawn.

    Returns
    -------
    pandas.DataFrame
        The sampled rows, re-indexed from 0 (the old index is discarded).
    """
    # The seed is pinned to 42, so the same input always yields the same rows.
    return (
        df.sample(n=int(len(df) * percentage), random_state=42)
        .reset_index(drop=True)
    )

In [0]:

def run_cv(X,y,clf_class,**kwargs):
    """Produce out-of-fold class predictions with 3-fold cross-validation.

    A fresh ``clf_class(**kwargs)`` is fitted on the training part of each
    fold and its ``predict`` output is written into the held-out positions.

    Parameters
    ----------
    X, y
        Features and labels. They are indexed with the integer index arrays
        produced by ``KFold.split`` -- assumes NumPy-style positional
        indexing (TODO confirm against callers).
    clf_class : callable
        Classifier factory; the instance must expose ``fit`` and ``predict``.
    **kwargs
        Forwarded unchanged to ``clf_class``.

    Returns
    -------
    A copy of ``y`` whose entries have been overwritten by the predictions.

    Notes
    -----
    The folds are shuffled without a fixed ``random_state``, so results can
    differ from one call to the next.
    """
    splitter = KFold(n_splits=3, shuffle=True)
    # Start from a copy of y so the result has y's type and length.
    predictions = y.copy()
    for fit_rows, holdout_rows in splitter.split(X, y):
        features_fit = X[fit_rows]
        features_holdout = X[holdout_rows]
        labels_fit = y[fit_rows]

        # A brand-new estimator per fold: nothing learned carries over.
        model = clf_class(**kwargs)
        model.fit(features_fit, labels_fit)
        predictions[holdout_rows] = model.predict(features_holdout)
    return predictions

In [0]:
# Obtain the accuracy

def accuracy(y_true,y_pred):
    """Return the percentage (0-100) of positions where the labels agree.

    Parameters
    ----------
    y_true, y_pred : array_like
        True and predicted labels. Both are converted with ``np.asarray`` so
        that plain Python lists are compared element by element (``==`` on
        two lists would otherwise collapse to a single bool).

    Returns
    -------
    float
        ``100 * mean(y_true == y_pred)``.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape; without this check
        NumPy broadcasting could silently compare the wrong elements.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            "y_true and y_pred must have the same shape, got %s and %s"
            % (y_true.shape, y_pred.shape)
        )
    # NumPy interprets True and False as 1 and 0
    return np.mean(y_true == y_pred) * 100

In [0]:
# Produce Predictions

def run_prob_cv(X,y,clf_class, roc=False, **kwargs):
    """Produce out-of-fold class probabilities with 3-fold cross-validation.

    Parameters
    ----------
    X, y
        Features and labels, indexed with the integer index arrays produced
        by ``KFold.split`` -- assumes NumPy-style positional indexing
        (TODO confirm against callers).
    clf_class : callable
        Classifier factory; the instance must expose ``fit`` and
        ``predict_proba``.
    roc : bool
        Accepted for interface compatibility; not read anywhere in this
        function.
    **kwargs
        Forwarded unchanged to ``clf_class``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(y), 2)`` holding the ``predict_proba`` output
        of the fold in which each row was held out.

    Notes
    -----
    The folds are shuffled without a fixed ``random_state``, so results can
    differ from one call to the next.
    """
    fold_maker = KFold(n_splits=3, shuffle=True)
    # Two columns are allocated up front, one per class of a binary problem.
    probabilities = np.zeros((len(y), 2))
    for fit_rows, holdout_rows in fold_maker.split(X, y):
        features_fit = X[fit_rows]
        features_holdout = X[holdout_rows]
        labels_fit = y[fit_rows]

        # A brand-new estimator per fold: nothing learned carries over.
        model = clf_class(**kwargs)
        model.fit(features_fit, labels_fit)
        probabilities[holdout_rows] = model.predict_proba(features_holdout)

    return probabilities

In [0]:
# Obtain Calibration

    #NOTE: Calibration measurement for a set of predictions.
    #When predicting events at a given probability, how far is the frequency of positive outcomes from that probability?
    #NOTE: Lower scores are better
    #prob: array_like, float
    #    Probability estimates for a set of events
    #outcome: array_like, bool
    #    Whether the predicted event occurred
    #n_bins: int
    #    Number of judgement categories to perform the calculation over.
    #    Predictions are binned based on probability, since "discrete"
    #    probabilities aren't required.


def calibration(prob,outcome,n_bins=10):
    """Calibration error of a set of probability predictions (lower is better).

    Predictions are grouped into ``n_bins`` equal-width probability bins. For
    each non-empty bin the squared gap between the mean predicted probability
    and the observed outcome frequency is weighted by the bin's size; the
    weighted sum is divided by the number of predictions.

    Parameters
    ----------
    prob : array_like of float
        Predicted probabilities.
    outcome : array_like of bool or 0/1
        Whether the event occurred, aligned with ``prob``.
    n_bins : int
        Number of equal-width bins spanning [0, 1].

    Returns
    -------
    float
        Size-weighted mean squared gap between predicted and observed rates.
    """
    prob=np.array(prob)
    outcome=np.array(outcome)

    c=0.0
    # Bin edges 0, 1/n_bins, ..., 1
    judgement_bins=np.arange(n_bins+1) / n_bins
    # np.digitize uses half-open intervals [edge_i, edge_i+1), so a
    # probability of exactly 1.0 would get index n_bins+1 and form an extra
    # bin of its own. Clipping folds it (and any value below 0) into the
    # nearest edge bin, so there are never more than n_bins categories.
    bin_num = np.clip(np.digitize(prob, judgement_bins), 1, n_bins)
    for j_bin in np.unique(bin_num):
        # Boolean mask of the predictions that fall in this bin
        in_bin=bin_num == j_bin
        # Predicted probability taken as average of preds in bin
        predicted_prob = np.mean(prob[in_bin])
        # How often did events in this bin actually happen?
        true_bin_prob = np.mean(outcome[in_bin])
        # Squared distance between predicted and true times num of obs
        c += np.sum(in_bin) * ((predicted_prob - true_bin_prob) ** 2)
    return c / len(prob)

In [0]:
# Obtain Discrimination

    #NOTE: Discrimination measurement for a set of predictions.
    #For each judgement category, how far from the base probability is the true frequency of that bin?
    #NOTE: High scores are better
    #prob: array_like, float
    #    Probability estimates for a set of events
    #outcome: array_like, bool
    #    Whether the predicted event occurred
    #n_bins: int
    #    Number of judgement categories to perform the calculation over.
    #    Predictions are binned based on probability, since "discrete"
    #    probabilities aren't required.
     



def discrimination(prob, outcome, n_bins=10):
    """Discrimination of a set of probability predictions (higher is better).

    Predictions are grouped into ``n_bins`` equal-width probability bins. For
    each non-empty bin the squared gap between the bin's observed outcome
    frequency and the overall outcome frequency is weighted by the bin's
    size; the weighted sum is divided by the number of predictions.

    Parameters
    ----------
    prob : array_like of float
        Predicted probabilities.
    outcome : array_like of bool or 0/1
        Whether the event occurred, aligned with ``prob``.
    n_bins : int
        Number of equal-width bins spanning [0, 1].

    Returns
    -------
    float
        Size-weighted mean squared gap between per-bin and overall rates.
    """
    prob = np.array(prob)
    outcome = np.array(outcome)

    d = 0.0
    # Base frequency of outcomes
    base_prob = np.mean(outcome)
    # Bin edges 0, 1/n_bins, ..., 1
    judgement_bins = np.arange(n_bins + 1) / n_bins
    # np.digitize uses half-open intervals [edge_i, edge_i+1), so a
    # probability of exactly 1.0 would get index n_bins+1 and form an extra
    # bin of its own. Clipping folds it (and any value below 0) into the
    # nearest edge bin, so there are never more than n_bins categories.
    bin_num = np.clip(np.digitize(prob, judgement_bins), 1, n_bins)
    for j_bin in np.unique(bin_num):
        # Boolean mask of the predictions that fall in this bin
        in_bin = bin_num == j_bin
        true_bin_prob = np.mean(outcome[in_bin])
        # Squared distance between true and base times num of obs
        d += np.sum(in_bin) * ((true_bin_prob - base_prob) ** 2)
    return d / len(prob)

In [0]:
# Get All Three Error Measures

def print_measurements(pred_prob, y_true=None):
    """Print the calibration error and discrimination of churn predictions.

    Parameters
    ----------
    pred_prob : numpy.ndarray
        2-D array of class probabilities; column 1 is read as the churn
        probability.
    y_true : array_like, optional
        True labels; entries equal to 1 count as churn. When omitted, the
        notebook-level global ``y`` is used, which is what this function
        always did before the parameter existed. Passing the labels
        explicitly avoids depending on whichever ``y`` happens to be live in
        the kernel.

    Returns
    -------
    None
        The two scores are printed, not returned.
    """
    if y_true is None:
        # Backward-compatible fallback to the global label vector.
        y_true = y
    churn_prob, is_churn = pred_prob[:, 1], np.asarray(y_true) == 1
    print (" %-20s %.4f" % ("Calibration Error", calibration(churn_prob, is_churn)))
    print (" %-20s %.4f" % ("Discrimination", discrimination(churn_prob, is_churn)))
    print ("Note -- Lower calibration is better, higher discrimination is better")



In [0]:
# ROC graph & AUC

def plot_roc(X, y, clf_class, **kwargs):
    """Plot per-fold and mean ROC curves from 5-fold cross-validation.

    For each fold a fresh ``clf_class(**kwargs)`` is fitted on the training
    rows, and column 1 of its ``predict_proba`` output on the held-out rows
    is used as the positive-class score for that fold's ROC curve.

    Parameters
    ----------
    X, y
        Features and labels, indexed with the integer index arrays produced
        by ``KFold.split`` -- assumes NumPy-style positional indexing
        (TODO confirm against callers). ``y`` is re-encoded with
        ``LabelEncoder`` before use.
    clf_class : callable
        Classifier factory; the instance must expose ``fit`` and
        ``predict_proba``.
    **kwargs
        Forwarded unchanged to ``clf_class``.

    Returns
    -------
    None
        The figure is drawn on the current matplotlib figure and shown.

    Notes
    -----
    * The folds are shuffled without a fixed ``random_state``, so the curves
      can differ from one call to the next.
    * The area reported in the "Mean ROC" legend entry is ``roc_auc_score``
      over the pooled out-of-fold scores, not the area under the averaged
      curve.
    """
    kf = KFold(n_splits=5, shuffle=True)
    y_prob = np.zeros((len(y), 2))
    mean_tpr = 0.0
    # Common FPR grid on which every fold's TPR is resampled for averaging.
    mean_fpr = np.linspace(0, 1, 100)

    le = LabelEncoder()
    y = le.fit_transform(y)

    for i, (train_index, test_index) in enumerate(kf.split(X, y)):
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        clf = clf_class(**kwargs)
        clf.fit(X_train, y_train)

        y_prob[test_index] = clf.predict_proba(X_test)

        fpr, tpr, _ = roc_curve(y[test_index], y_prob[test_index, 1])

        # np.interp is used directly: scipy.interp was only a deprecated
        # alias of it and is not importable from newer SciPy releases.
        mean_tpr += np.interp(mean_fpr, fpr, tpr)

        # Pin the accumulated curve to start at the origin.
        mean_tpr[0] = 0.0
        roc_auc = roc_auc_score(y[test_index], y_prob[test_index, 1])

        plt.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.2f)' % (i, roc_auc))

    # Turn the accumulated sum into an average and pin the end point at (1, 1).
    mean_tpr /= kf.get_n_splits()
    mean_tpr[-1] = 1.0
    mean_auc = roc_auc_score(y, y_prob[:, 1])
    plt.plot(mean_fpr, mean_tpr, 'k--', label='Mean ROC (area = %0.2f)' % mean_auc, lw=2)

    # Diagonal reference line for a random classifier.
    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Random')
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc='lower right')
    plt.show()


In [0]:
##### BG-NBD Model #####