In [27]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
import category_encoders as ce
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import uniform, randint
import numpy as np

In [28]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,state,account_length,area_code,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,number_customer_service_calls,churn
0,OH,107,area_code_415,no,yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,no
1,NJ,137,area_code_415,no,no,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,no
2,OH,84,area_code_408,yes,no,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,no
3,OK,75,area_code_415,yes,no,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,no
4,MA,121,area_code_510,no,yes,24,218.2,88,37.09,348.5,108,29.62,212.6,118,9.57,7.5,7,2.03,3,no


This dataframe is a subset of a Telco churn dataset which records information about customers who have stayed or lef the company. The last column, "churn", indicates whether the customer has left the company or not.

In [29]:
def clean_dataset():
    df_train = pd.read_csv('train.csv')

    hash_encoder = ce.HashingEncoder(cols=['state'])
    df_train = hash_encoder.fit_transform(df_train)
    
    df_train.international_plan.replace(['no', 'yes'], [0,1], inplace=True)
    df_train.voice_mail_plan.replace(['no', 'yes'], [0,1], inplace=True)
    df_train.churn.replace(['no', 'yes'], [0,1], inplace=True)

    onehot_area = OneHotEncoder()
    onehot_area.fit(df_train[['area_code']])
    encoded_values = onehot_area.transform(df_train[['area_code']])
    df_train[onehot_area.categories_[0]] = encoded_values.toarray()
    df_train = df_train.drop('area_code', axis=1)

    features = df_train.drop('churn', axis=1).values
    target = df_train.churn.values

    return features, target

hash_encoder = ce.HashingEncoder(cols=['state'])
df_train = hash_encoder.fit_transform(df_train)

This line creates an instance of the HashingEncoder class from the category_encoders library. It passes in the argument (cols=['state'], indicating that the state column should be encoded using this method. The second line fits the encoder to the data and transforms it in one step. The HashingEncoder maps categorical values into numerical values. For example, states such as 'NJ', 'NYC' will be mapped into unique numerical categories

df_train.international_plan.replace(['no', 'yes'], [0,1], inplace=True)
df_train.voice_mail_plan.replace(['no', 'yes'], [0,1], inplace=True)
df_train.churn.replace(['no', 'yes'], [0,1], inplace=True)

These 3 lines of code performs a value replacement operation on international_plan, voice_mail_plan, and churn columns in the dataframe. In this case, the values 'no' and 'yes' in the columns are being replaced with the values 0 and 1, respectively. In effect, these lines of code are transforming the columns from a categorical variable with string values to a numerical variable with binary values (0 for 'no' and 1 for 'yes')

onehot_area = OneHotEncoder()
onehot_area.fit(df_train[['area_code']])
encoded_values = onehot_area.transform(df_train[['area_code']])
df_train[onehot_area.categories_[0]] = encoded_values.toarray()
df_train = df_train.drop('area_code', axis=1)

The next part of the data cleaning process was to encode the area code values as their numerical value wouldn't provide any use to the models. Each row of the original categorical variable will be transformed into a row of n binary variables, with only one variable having the value of 1 (indicating the presence of the corresponding category) and the rest having the value of 0. By converting categorical variables into binary variables, one-hot encoding allows categorical data to be used as input to these algorithms.

In [30]:
def load_train_split(features, target):
    X_train, X_val, y_train, y_val = train_test_split(features, target, test_size=0.3)
    #sm = SMOTE(sampling_strategy = 1, random_state=1)

    #X_train, y_train = sm.fit_resample(X_train, y_train.ravel())

    #scaler = MinMaxScaler()
    #X_train = scaler.fit_transform(X_train)
    #X_val = scaler.transform(X_val)

    return X_train, X_val, y_train, y_val

X_train, X_val, y_train, y_val = train_test_split(features, target, test_size=0.3)

This function takes in two inputs, features and target and then the train_test_split is used to split the features and the target into training and validation sets. The input data is split into training (70%) and validation (30%) sets by specifying test_size=0.3.

sm = SMOTE(sampling_strategy = 1, random_state=1)

X_train, y_train = sm.fit_resample(X_train, y_train.ravel())

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

The commented out code contains the SMOTE function from the imblearn.over_sampling library which is used to create synthethic samples of the minority class. With the sampling_strategy argument set to 1, the number of synthetic samples to be generated for the minority class will be equal to the number of samples in the majority class. The MinMax Scaler is used to scale the features of the training and validation sets. The accuracy and the precision of the model was reduced however with these features present, hence the comments.  

In [31]:
def random_forest(X_train, X_val, y_train, y_val):
    classifier = RandomForestClassifier(n_estimators=80, min_samples_split=5, max_depth=29, random_state=1)
    classifier = classifier.fit(X_train, y_train)
    y_predictions = classifier.predict(X_val)

    conf_matrix = confusion_matrix(y_val, y_predictions)
    accuracy = cross_val_score(classifier, X_train, y_train, cv=5, scoring='accuracy').mean()
    precision = cross_val_score(classifier, X_train, y_train, cv=5, scoring='precision').mean()

    #param_dist = {
    #'n_estimators': np.arange(10, 100, 10),
    #'max_depth': np.arange(5, 30),
    #'min_samples_split': np.arange(2, 12)}

    #random_search = RandomizedSearchCV(classifier, param_distributions=param_dist, n_iter=100, cv=5, scoring='accuracy')
    #random_search.fit(X_train, y_train)

    #best_params = random_search.best_params_
    #print(best_params)

    print('RF Accuracy Score: ',accuracy)
    print('RF Precision Score: ',precision)
    print('RF Confusion Matrix: ',conf_matrix)
    print('\n')

classifier = RandomForestClassifier(n_estimators=80, min_samples_split=5, max_depth=29, random_state=1) 
classifier = classifier.fit(X_train, y_train)
y_predictions = classifier.predict(X_val)

The use of a Random Forest Model as the initial algorithm for this classification test was chosen due to its versatility and efficacy in handling various factors. As a non-linear model, it remains suitable to handle datasets with complex relationships. Furthermore, it provides feature importance values, which can be useful in understanding which features are the most significant in predicting churn.

conf_matrix = confusion_matrix(y_val, y_predictions)
accuracy = cross_val_score(classifier, X_train, y_train, cv=5, scoring='accuracy').mean()
precision = cross_val_score(classifier, X_train, y_train, cv=5, scoring='precision').mean()

TP (True Positives)    FP (False Positives)
FN (False Negatives)    TN (True Negatives)

[[1080  7][137  51]]

Above, is an example of a confusion matrix. The array [1080  7][137  51]] represents the amount of each condition, in this case being 1080 True Positives, 7 False Positives, 137 False Negatives, and 51 False Negatives. Accuracy is the proportion of correctly classified samples over all samples. It is calculated as (TP + TN) / (TP + TN + FP + FN). Precision is the proportion of correctly classified positive samples over all samples classified as positive. It is calculated as TP / (TP + FP). The cross_val_score function obtains the mean accuracy and precision of the model using 5 folds. 

classifier = RandomForestClassifier(n_estimators=80, min_samples_split=5, max_depth=29)

param_dist = {
'n_estimators': np.arange(10, 100, 10),
'max_depth': np.arange(5, 30),
'min_samples_split': np.arange(2, 12)}

random_search = RandomizedSearchCV(classifier, param_distributions=param_dist, n_iter=100, cv=5,scoring='accuracy')
random_search.fit(X_train, y_train)

best_params = random_search.best_params_
print(best_params)

The commented out code represents hyperparameter tuning using randomized search cross-validation (RandomizedSearchCV) on the RandomForestClassifier model. The param_dist defines the hyperparameters to be tuned and their respective search ranges. The random_search creates an instance of the RandomizedSearchCV the hyperparameters to be tuned and their search ranges defined in param_dist, the number of iterations, the number of folds to use in cross-validation, and the scoring metric to be used. This means that after using RandomizedSearchCV to find the best hyperparameters for the RandomForestClassifier, the values were then incorporated into the model for improved performance as shown in the first line. The n_estimators parameter defines the number of "trees" in the forest, more trees implies a better model performance, but a longer training time. The max_depth parameter determines the maximum depth of each tree in the performance, increasing it could lead to more complex trees but also overfitting. The min_samples_split sets the minimum number of samples required to split an internal node in the tree, however increasing it too much could also lead to overfitting. 

In [32]:
def xgb(X_train, X_val, y_train, y_val):
    classifier = XGBClassifier(max_depth=9, gamma=0.480, learning_rate=0.150, min_child_weight=1, n_estimators=259, subsample=0.5851, colsample_bytree=0.541,random_state=1)
    classifier = classifier.fit(X_train, y_train)
    y_predictions = classifier.predict(X_val)

    #param_grid = {
    #"learning_rate": uniform(0, 1),
    #"max_depth": randint(1, 10),
    #"n_estimators": randint(50, 500),
    #"min_child_weight": randint(1, 10),
    #"subsample": uniform(0.1, 1),
    #"gamma": uniform(0, 1),
    #"colsample_bytree": uniform(0.1, 1)}

    #random_search = RandomizedSearchCV(classifier, param_grid, cv=5, n_iter=100, scoring='accuracy', n_jobs=-1)

    #random_search.fit(X_train, y_train)

    #print(random_search.best_params_)

    conf_matrix = confusion_matrix(y_val, y_predictions)
    accuracy = cross_val_score(classifier, X_train, y_train, cv=5, scoring='accuracy').mean()
    precision = cross_val_score(classifier, X_train, y_train, cv=5, scoring='precision').mean()

    print('\n')
    print('XGB Accuracy Score: ',accuracy)
    print('XGB Precision Score: ',precision)
    print('XGB Confusion Matrix: ',conf_matrix)

classifier = XGBClassifier(max_depth=9, gamma=0.480, learning_rate=0.150, min_child_weight=1, n_estimators=259, subsample=0.5851, colsample_bytree=0.541, random_state=1)
classifier = classifier.fit(X_train, y_train)
y_predictions = classifier.predict(X_val)

The next model chosen was the XGBoost ML algorithim which is a highly flexible and customizable model allowing you to adjust hyperparameters for optimal performance. It is considered a an advanced implentation of the gradient boosting algorithim and has also shown to produce great results. The learning rate hyperparameter controls the step size at which the optimizer makes updates to the model weights. Smaller values will make the model converge slower, but will typically result in a better fit. Subsample specifies the fraction of the training data to use for each tree. Smaller values can result in better models, but at a risk of underfitting the data. Gamma specifies the minimum reduction in the loss function required to split further. Increaing this value can result in fewer splits which could then result in overfitting. The same random_search was applied to this model to generate the optimal hyperparameters for the model. 

In [33]:
def k_neighbors(X_train, X_val, y_train, y_val):
    classifier = KNeighborsClassifier(metric='manhattan', n_neighbors=14, weights='distance')
    classifier = classifier.fit(X_train, y_train)
    y_predictions = classifier.predict(X_val)

    conf_matrix = confusion_matrix(y_val, y_predictions)
    accuracy = cross_val_score(classifier, X_train, y_train, cv=5, scoring='accuracy').mean()
    precision = cross_val_score(classifier, X_train, y_train, cv=5, scoring='precision').mean()

    #param_grid = {'n_neighbors': np.arange(1, 50),
    #          'weights': ['uniform', 'distance'],
    #          'metric': ['euclidean', 'manhattan']}
    
    #rand_search = RandomizedSearchCV(classifier, param_distributions=param_grid, n_iter=100, cv=5, n_jobs=-1)
    #rand_search.fit(X_train, y_train)
    #best_params = rand_search.best_params_
    #best_estimator = rand_search.best_estimator_
    #best_score = rand_search.best_score_

    #print("Best parameters: ", best_params)
    #print("Best score: ", best_score)
    
    print('\n')
    print('KNN Accuracy Score: ',accuracy)
    print('KNN Precision Score: ',precision)
    print('KNN Confusion Matrix: ',conf_matrix)


classifier = KNeighborsClassifier(metric='manhattan', n_neighbors=14, weights='distance')
classifier = classifier.fit(X_train, y_train)
y_predictions = classifier.predict(X_val)

KNeighbors was the next model chosen as it was a non-parametric model, meaning it doesn't make any assumptions about the distribution of data which could be useful when dealing with datasets that have complex relationships. N_neighbors represents the number of nearest neighbors to consider while making predictions and large values of this can result in overfitting the model. The weight function is used to define the distance between the nearest neighbors. 'Distance' assigns weights proportional to the inverse of the distance to the nearest neighbors. Metric is used to compute the distances between the instances. The same random_search was again applied to this model to generate optimal hyperparameters for the model. 

In [26]:
if __name__ == "__main__":   
    features, target = clean_dataset()
    X_train, X_val, y_train, y_val = load_train_split(features, target)
    random_forest(X_train, X_val, y_train, y_val)
    xgb(X_train, X_val, y_train, y_val)
    k_neighbors(X_train, X_val, y_train, y_val)

RF Accuracy Score:  0.9556302521008403
RF Precision Score:  0.9624919655082028
RF Confusion Matrix:  [[1082   13]
 [  59  121]]




XGB Accuracy Score:  0.9542857142857143
XGB Precision Score:  0.9203285653378117
XGB Confusion Matrix:  [[1079   16]
 [  53  127]]


KNN Accuracy Score:  0.891764705882353
KNN Precision Score:  0.8463120052876787
KNN Confusion Matrix:  [[1089    6]
 [ 129   51]]
