In [1]:
import knn
from sklearn import linear_model
from sklearn import neighbors
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from data_util import *

In [2]:
cdata = pd.read_csv('churn_data.csv')
cvalid = pd.read_csv('churn_validation.csv')
        
cdata = pd.get_dummies(cdata, columns=cat_features(cdata))
cdata = cdata.drop(['Churn_No'], axis=1)
cdata.rename(columns = {'Churn_Yes':'Churn'}, inplace = True)

data_x = cdata.drop(['Churn'], axis=1)
data_y = cdata['Churn']

x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, test_size=0.3,
                                                    random_state=4)

### Logistic Regression Model

In [3]:
log_mod = linear_model.LogisticRegression()
log_mod.fit(x_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [4]:
# make predictions - class labels and predictive probabilities
preds = log_mod.predict(x_test) # class labels
pred_probs = log_mod.predict_proba(x_test) # predicted probability 0 and 1 for each test case
#print(pred_probs[:10])
# [prob being 0 prob being a 1]
pred_pos = pred_probs.transpose()[1] # prob of each being a 1
pred_neg = pred_probs.transpose()[0] # prob of being a 0
print(pred_pos[:10])
print(pred_neg[:10])

[0.30739119 0.50113121 0.39115316 0.54526617 0.40375419 0.67087871
 0.76927679 0.4982347  0.39268247 0.68448938]
[0.69260881 0.49886879 0.60884684 0.45473383 0.59624581 0.32912129
 0.23072321 0.5017653  0.60731753 0.31551062]


In [5]:
# look at the results
pred_df = pd.DataFrame({'Actual':y_test, 'Predicted':preds,
                        'P(1)':pred_pos, 'P(0)':pred_neg})
print(pred_df.head(20))

     Actual  Predicted      P(1)      P(0)
5         0          0  0.307391  0.692609
24        1          1  0.501131  0.498869
29        1          0  0.391153  0.608847
61        1          1  0.545266  0.454734
19        0          0  0.403754  0.596246
95        1          1  0.670879  0.329121
2         1          1  0.769277  0.230723
25        0          0  0.498235  0.501765
90        0          0  0.392682  0.607318
78        1          1  0.684489  0.315511
12        0          1  0.678395  0.321605
74        0          1  0.868615  0.131385
34        1          0  0.410420  0.589580
16        1          0  0.248717  0.751283
20        0          0  0.476307  0.523693
84        0          0  0.247611  0.752389
107       0          1  0.685458  0.314542
123       0          0  0.449741  0.550259
18        1          0  0.447265  0.552735
82        1          1  0.844427  0.155573


In [6]:
print_binary_classif_error_report(y_test, preds)

Accuracy: 0.6410256410256411
Precison: 0.7058823529411765
Recall: 0.5714285714285714
F1: 0.6315789473684211
ROC AUC: 0.6468253968253967
Confusion Matrix:
[[13  5]
 [ 9 12]]


### Sklearn KNN

In [7]:
x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, test_size=0.3,
                                                    random_state=4)

ks = [1,3,5,7,9,11,13,15,17,19,21] # Possible numbers for K
for k in ks:
    print('------------- EVALING ' + str(k) + ' --------')
    mod = neighbors.KNeighborsClassifier(n_neighbors=k)
    mod.fit(x_train, y_train)
    
    # make preds on current model
    preds = mod.predict(x_test)
    
    print_binary_classif_error_report(y_test, preds)

    
## A K of Nine is the best score

------------- EVALING 1 --------
Accuracy: 0.5384615384615384
Precison: 0.5789473684210527
Recall: 0.5238095238095238
F1: 0.5500000000000002
ROC AUC: 0.5396825396825397
Confusion Matrix:
[[10  8]
 [10 11]]
------------- EVALING 3 --------
Accuracy: 0.46153846153846156
Precison: 0.5
Recall: 0.5238095238095238
F1: 0.5116279069767442
ROC AUC: 0.4563492063492063
Confusion Matrix:
[[ 7 11]
 [10 11]]
------------- EVALING 5 --------
Accuracy: 0.5897435897435898
Precison: 0.5925925925925926
Recall: 0.7619047619047619
F1: 0.6666666666666666
ROC AUC: 0.5753968253968254
Confusion Matrix:
[[ 7 11]
 [ 5 16]]
------------- EVALING 7 --------
Accuracy: 0.46153846153846156
Precison: 0.5
Recall: 0.5714285714285714
F1: 0.5333333333333333
ROC AUC: 0.4523809523809524
Confusion Matrix:
[[ 6 12]
 [ 9 12]]
------------- EVALING 9 --------
Accuracy: 0.6666666666666666
Precison: 0.6818181818181818
Recall: 0.7142857142857143
F1: 0.6976744186046512
ROC AUC: 0.6626984126984128
Confusion Matrix:
[[11  7]
 [ 6 15]

### My KNN

In [9]:
x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, test_size=0.3,
                                                    random_state=4)
# Doing 9 for my K, since I discovered thats the best one
boi = knn.KNN(9, 'euclidian')
boi.fit(x_train, y_train)
preds = boi.predict(x_test)
print_binary_classif_error_report(y_test, preds)

Accuracy: 0.6666666666666666
Precison: 0.6818181818181818
Recall: 0.7142857142857143
F1: 0.6976744186046512
ROC AUC: 0.6626984126984128
Confusion Matrix:
[[11  7]
 [ 6 15]]


### Churn Validation

In [11]:
cvalid = pd.get_dummies(cvalid, columns=cat_features(cvalid))
cvalid = cvalid.drop(['Churn_No'], axis=1)
cvalid.rename(columns = {'Churn_Yes':'Churn'}, inplace = True)

data_x = cvalid.drop(['Churn'], axis=1)
data_y = cvalid['Churn']

In [15]:
preds2 = boi.predict(data_x)
print_binary_classif_error_report(data_y, preds2)

Accuracy: 0.46875
Precison: 0.4090909090909091
Recall: 0.6923076923076923
F1: 0.5142857142857142
ROC AUC: 0.5040485829959513
Confusion Matrix:
[[ 6 13]
 [ 4  9]]


1. The response variable is Churn, and the predictor variables are all other ones. 

2. I transformed the data by first doing OHE, then following that up by dropping churn_no and renaming churn_yes to just churn.

3. I tried a couple models such as linear regression as well as both the sklearn and my KNN models. I found that mine worked a bit better, so I used it.

4. I used the F1 from print_binary_classif_error_report to measure all models.

5. I constructed my KNN using all predictors available and a K of 9, as that gave the best result.

6. I ran the validation through my KNN, and the error rates were worse than when run using the test set, but that was to be expected.

7. Somewhat. It could definitely be better, 50% definitely isn't good. I could've tried more models, but I'm unfortunately running out of time, and I'm very very tired.