In [1]:
# standard modules
import seaborn as sns
import pandas as pd
import numpy as np
import os
#import math

# Modules for Displaying Figures
import matplotlib.pyplot as plt
import scipy.stats as stats


# Data Science Modules 
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, plot_confusion_matrix, ConfusionMatrixDisplay
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# My modules
import src.acquire as ac
import src.prepare as pp
import src.helper as helper
import src.evaluate as evaluate

# Turn off the red warnings
import warnings
warnings.filterwarnings("ignore")

The following datasets are available:
telco


In [17]:
telco = ac.get_telco_data()

In [19]:
telco.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   payment_type_id           7043 non-null   int64  
 1   internet_service_type_id  7043 non-null   int64  
 2   contract_type_id          7043 non-null   int64  
 3   customer_id               7043 non-null   object 
 4   gender                    7043 non-null   object 
 5   senior_citizen            7043 non-null   int64  
 6   partner                   7043 non-null   object 
 7   dependents                7043 non-null   object 
 8   tenure                    7043 non-null   int64  
 9   phone_service             7043 non-null   object 
 10  multiple_lines            7043 non-null   object 
 11  online_security           7043 non-null   object 
 12  online_backup             7043 non-null   object 
 13  device_protection         7043 non-null   object 
 14  tech_sup

In [3]:
train, validate, test, x_train, y_train, x_validate, y_validate, x_test, y_test = pp.model_telco_data(telco)

In [4]:
# running a function which determines which way to default our target variable
# We are using the 'churn' variable as target because we are attempting to predict
print(f"Baseline accuracy is {round(evaluate.baseline(telco, 'churn')*100,2)}%")

Baseline accuracy is 73.46%


## Running KNN with N = 5 (default setting)

In [5]:
knn = KNeighborsClassifier()
knn.fit(x_train, y_train)

In [6]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(x_train, y_train)
y_predictions = knn.predict(x_train)

In [7]:
y_predictions = knn.predict(x_train)

In [8]:
y_predictions

array([0, 1, 0, ..., 0, 0, 1])

In [9]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn.score(x_train, y_train)))

Accuracy of KNN classifier on training set: 0.84


In [10]:
confusion_matrix(y_train, y_predictions)

array([[2627,  264],
       [ 376,  670]])

In [11]:
print(classification_report(y_train, y_predictions))

              precision    recall  f1-score   support

           0       0.87      0.91      0.89      2891
           1       0.72      0.64      0.68      1046

    accuracy                           0.84      3937
   macro avg       0.80      0.77      0.78      3937
weighted avg       0.83      0.84      0.83      3937



In [12]:
evaluate.print_classification_metrics(y_train, y_predictions)

Accuracy: 83.74
True Positive Rate: 64.05
False Positive Rate: 9.13
True Negative Rate: 90.87
False Negative Rate: 35.95
Precision: 71.73
Recall: 64.05
F1 Score: 67.68
Support (0): 104600
Support (1): 289100


## Running the KNN model with N = 10

In [13]:
knn = KNeighborsClassifier(n_neighbors=10)

In [14]:
knn.fit(x_train, y_train)

In [17]:
# creating baseline based on 
y_preds = knn.predict(x_train)

In [18]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn.score(x_train, y_train)))

Accuracy of KNN classifier on training set: 0.82


In [19]:
print(classification_report(y_train, y_preds))

              precision    recall  f1-score   support

           0       0.84      0.93      0.89      2891
           1       0.74      0.52      0.61      1046

    accuracy                           0.82      3937
   macro avg       0.79      0.73      0.75      3937
weighted avg       0.82      0.82      0.81      3937



In [20]:
evaluate.print_classification_metrics(y_train, y_preds)

Accuracy: 82.35
True Positive Rate: 51.91
False Positive Rate: 6.64
True Negative Rate: 93.36
False Negative Rate: 48.09
Precision: 73.88
Recall: 51.91
F1 Score: 60.98
Support (0): 104600
Support (1): 289100


## editing the independent variables in dataframes

In [7]:
#limited list experiment
'''
feature_list = ['tenure','payment_type_Electronic check','contract_type_Two year','internet_service_type_Fiber optic','charges_lower_quartile', 'charges_higher_quartile', 'mid_charge']
x_train = x_train[feature_list]
x_validate = x_validate[feature_list]
x_test = x_test[feature_list]
'''

#expansive but without monthly charges
drop_list= ['charges_lower_quartile', 'charges_higher_quartile', 'mid_charge']
x_train = x_train.drop(columns=drop_list)
x_validate = x_validate.drop(columns=drop_list)
x_test = x_test.drop(columns=drop_list)

## Evaluating differences in KNN model performance

In [8]:
train_predict = {
    'model': [],
    'accuracy': [],
    'true_positive_rate': [],
    'false_positive_rate': [],
    'true_negative_rate': [],
    'false_negative_rate': [],
    'precision': [],
    'recall': [],
    'f1_score': [],
    'support_0': [],
    'support_1': []
}


for n in [5, 7, 10]:
    
    knn = KNeighborsClassifier(n_neighbors=n)
    '''
    # building baseline predictions 
    knn.fit(x_train, y_train)
    y_predictions = knn.predict(baseline_train)
    train_predict['model'].append(f'baseline_{n}')
    train_predict['accuracy'].append(accuracy)
    train_predict['true_positive_rate'].append(true_positive_rate)
    train_predict['false_positive_rate'].append(false_positive_rate)
    train_predict['true_negative_rate'].append(true_negative_rate)
    train_predict['false_negative_rate'].append(false_negative_rate)
    train_predict['precision'].append(precision)
    train_predict['recall'].append(recall)
    train_predict['f1_score'].append(f1_score)
    train_predict['support_0'].append(support_pos)
    train_predict['support_1'].append(support_neg)
    '''

    # building knn model predictions
    knn.fit(x_train, y_train)
    y_preds = knn.predict(x_train)

    TN, FP, FN, TP = confusion_matrix(y_train, y_preds).ravel()
    ALL = TP + TN + FP + FN

    accuracy = (TP + TN)/ALL
    true_positive_rate = TP/(TP+FN)
    false_positive_rate = FP/(FP+TN)
    true_negative_rate = TN/(TN+FP)
    false_negative_rate = FN/(FN+TP)
    precision = TP/(TP+FP)
    recall = TP/(TP+FN)
    f1_score = 2*(precision*recall)/(precision+recall)
    support_pos = TP + FN
    support_neg = FP + TN

    train_predict['model'].append(f'knn_n_{n}')
    train_predict['accuracy'].append(accuracy)
    train_predict['true_positive_rate'].append(true_positive_rate)
    train_predict['false_positive_rate'].append(false_positive_rate)
    train_predict['true_negative_rate'].append(true_negative_rate)
    train_predict['false_negative_rate'].append(false_negative_rate)
    train_predict['precision'].append(precision)
    train_predict['recall'].append(recall)
    train_predict['f1_score'].append(f1_score)
    train_predict['support_0'].append(support_pos)
    train_predict['support_1'].append(support_neg)
    
train_predict = pd.DataFrame(train_predict).T

In [9]:
train_predict

Unnamed: 0,0,1,2
model,knn_n_5,knn_n_7,knn_n_10
accuracy,0.846584,0.826264,0.821184
true_positive_rate,0.655832,0.616635,0.511472
false_positive_rate,0.0844,0.09789,0.066759
true_negative_rate,0.9156,0.90211,0.933241
false_negative_rate,0.344168,0.383365,0.488528
precision,0.737634,0.695043,0.73489
recall,0.655832,0.616635,0.511472
f1_score,0.694332,0.653495,0.603157
support_0,1046,1046,1046


In [45]:
#train_predict_original=train_predict
train_predict_original

Unnamed: 0,0,1
model,knn_n_5,knn_n_10
accuracy,0.840234,0.824994
true_positive_rate,0.635755,0.524857
false_positive_rate,0.085783,0.066413
true_negative_rate,0.914217,0.933587
false_negative_rate,0.364245,0.475143
precision,0.728368,0.740891
recall,0.635755,0.524857
f1_score,0.678918,0.614438
support_0,1046,1046


## Evaluating Models on validation data set

In [10]:
validate_predict = {
    'model': [],
    'accuracy': [],
    'true_positive_rate': [],
    'false_positive_rate': [],
    'true_negative_rate': [],
    'false_negative_rate': [],
    'precision': [],
    'recall': [],
    'f1_score': [],
    'support_0': [],
    'support_1': []
}

for n in [5, 7, 10]:
    knn = KNeighborsClassifier(n_neighbors=n)
    
    knn.fit(x_train, y_train)
    
    y_preds = knn.predict(x_validate)
    
    TN, FP, FN, TP = confusion_matrix(y_validate, y_preds).ravel()
    ALL = TP + TN + FP + FN
    
    accuracy = (TP + TN)/ALL
    true_positive_rate = TP/(TP+FN)
    false_positive_rate = FP/(FP+TN)
    true_negative_rate = TN/(TN+FP)
    false_negative_rate = FN/(FN+TP)
    precision = TP/(TP+FP)
    recall = TP/(TP+FN)
    f1_score = 2*(precision*recall)/(precision+recall)
    support_pos = TP + FN
    support_neg = FP + TN
    
    validate_predict['model'].append(f'knn_n_{n}')
    validate_predict['accuracy'].append(accuracy)
    validate_predict['true_positive_rate'].append(true_positive_rate)
    validate_predict['false_positive_rate'].append(false_positive_rate)
    validate_predict['true_negative_rate'].append(true_negative_rate)
    validate_predict['false_negative_rate'].append(false_negative_rate)
    validate_predict['precision'].append(precision)
    validate_predict['recall'].append(recall)
    validate_predict['f1_score'].append(f1_score)
    validate_predict['support_0'].append(support_pos)
    validate_predict['support_1'].append(support_neg)
    
validate_predict = pd.DataFrame(validate_predict).T

In [12]:
validate_predict

Unnamed: 0,0,1,2
model,knn_n_5,knn_n_7,knn_n_10
accuracy,0.761848,0.770735,0.777251
true_positive_rate,0.503341,0.518931,0.407572
false_positive_rate,0.144471,0.138015,0.088781
true_negative_rate,0.855529,0.861985,0.911219
false_negative_rate,0.496659,0.481069,0.592428
precision,0.558025,0.576733,0.624573
recall,0.503341,0.518931,0.407572
f1_score,0.529274,0.546307,0.493261
support_0,449,449,449


In [89]:
validate_predict_original

Unnamed: 0,0,1,2
model,knn_n_5,knn_n_7,knn_n_10
accuracy,0.76955,0.747038,0.771327
true_positive_rate,0.481069,0.398664,0.405345
false_positive_rate,0.125908,0.126715,0.096045
true_negative_rate,0.874092,0.873285,0.903955
false_negative_rate,0.518931,0.601336,0.594655
precision,0.580645,0.532738,0.604651
recall,0.481069,0.398664,0.405345
f1_score,0.526188,0.456051,0.485333
support_0,449,449,449


## altering the training set per the findings of best features

In [13]:
compare = pd.concat([train_predict, validate_predict], axis = 1)

In [14]:
compare

Unnamed: 0,0,1,2,0.1,1.1,2.1
model,knn_n_5,knn_n_7,knn_n_10,knn_n_5,knn_n_7,knn_n_10
accuracy,0.846584,0.826264,0.821184,0.761848,0.770735,0.777251
true_positive_rate,0.655832,0.616635,0.511472,0.503341,0.518931,0.407572
false_positive_rate,0.0844,0.09789,0.066759,0.144471,0.138015,0.088781
true_negative_rate,0.9156,0.90211,0.933241,0.855529,0.861985,0.911219
false_negative_rate,0.344168,0.383365,0.488528,0.496659,0.481069,0.592428
precision,0.737634,0.695043,0.73489,0.558025,0.576733,0.624573
recall,0.655832,0.616635,0.511472,0.503341,0.518931,0.407572
f1_score,0.694332,0.653495,0.603157,0.529274,0.546307,0.493261
support_0,1046,1046,1046,449,449,449


- Looks like the N = 10 KNN model works better on the validate data than n = 5 model
- I think I am locking down this model on N = 10. 

In [6]:
target = train.iloc[:,0]
independent = train.iloc[:,1:]


In [71]:
# determining best, strongest features for the model
bestfeatures = SelectKBest(score_func=chi2, k=10)
fit = bestfeatures.fit(independent,target)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(independent.columns)

#concat two dataframes for better visualization 
feature_scores = pd.concat([dfcolumns,dfscores],axis=1)

# renaming the two columns to make sense
feature_scores.columns = ['customer_features','score']


feature_scores = feature_scores.nlargest(10, 'score')
best_features = feature_scores['customer_features']

In [101]:
feature_scores

Unnamed: 0,customer_features,score
1,tenure,9425.628007
25,payment_type_Electronic check,276.262478
21,contract_type_Two year,274.407274
22,internet_service_type_Fiber optic,243.599093
8,online_security_No internet service,158.972011
10,online_backup_No internet service,158.972011
12,device_protection_No internet service,158.972011
14,tech_support_No internet service,158.972011
16,streaming_tv_No internet service,158.972011
18,streaming_movies_No internet service,158.972011


In [100]:
x_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1407 entries, 111 to 2588
Data columns (total 7 columns):
 #   Column                             Non-Null Count  Dtype
---  ------                             --------------  -----
 0   tenure                             1407 non-null   int64
 1   payment_type_Electronic check      1407 non-null   uint8
 2   contract_type_Two year             1407 non-null   uint8
 3   internet_service_type_Fiber optic  1407 non-null   uint8
 4   charges_lower_quartile             1407 non-null   bool 
 5   charges_higher_quartile            1407 non-null   bool 
 6   mid_charge                         1407 non-null   bool 
dtypes: bool(3), int64(1), uint8(3)
memory usage: 30.2 KB


## Testing the N = 10 KNN model

In [15]:
test_predict = {
    'model': [],
    'accuracy': [],
    'true_positive_rate': [],
    'false_positive_rate': [],
    'true_negative_rate': [],
    'false_negative_rate': [],
    'precision': [],
    'recall': [],
    'f1_score': [],
    'support_0': [],
    'support_1': []
}
n = 10

knn = KNeighborsClassifier(n_neighbors=n)

knn.fit(x_train, y_train)

y_preds = knn.predict(x_test)

TN, FP, FN, TP = confusion_matrix(y_test, y_preds).ravel()
ALL = TP + TN + FP + FN

accuracy = (TP + TN)/ALL
true_positive_rate = TP/(TP+FN)
false_positive_rate = FP/(FP+TN)
true_negative_rate = TN/(TN+FP)
false_negative_rate = FN/(FN+TP)
precision = TP/(TP+FP)
recall = TP/(TP+FN)
f1_score = 2*(precision*recall)/(precision+recall)
support_pos = TP + FN
support_neg = FP + TN

test_predict['model'].append(f'knn_n_{n}')
test_predict['accuracy'].append(accuracy)
test_predict['true_positive_rate'].append(true_positive_rate)
test_predict['false_positive_rate'].append(false_positive_rate)
test_predict['true_negative_rate'].append(true_negative_rate)
test_predict['false_negative_rate'].append(false_negative_rate)
test_predict['precision'].append(precision)
test_predict['recall'].append(recall)
test_predict['f1_score'].append(f1_score)
test_predict['support_0'].append(support_pos)
test_predict['support_1'].append(support_neg)

test_predict = pd.DataFrame(test_predict).T

In [None]:
#exported_csv = pd.DataFrame(y_preds, y_test)
#exported_csv.to_csv('exported_csv', index = True)
#gfg_csv_data = df.to_csv('GfG.csv', index = True)

In [16]:
test_predict.T['accuracy'][0]

0.7711442786069652

In [None]:
print(f"{baseline_accuracy} {test_predict.T['accuracy']}")

In [None]:
{round(test_predict.T['accuracy'][0]*100,2)}%

In [None]:
print(f"Baseline accuracy = {round(baseline_accuracy[0]*100,2)}%") 
print(f"KNN model Accuracy = {round(test_predict.T['accuracy'][0]*100,2)}%")