### K-Nearest Neighbour Classification

In [24]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, LabelBinarizer  


In [25]:
df = pd.read_csv('Orange_Telecom_Churn_Data.csv')
df.head(1).T

Unnamed: 0,0
state,KS
account_length,128
area_code,415
phone_number,382-4657
intl_plan,no
voice_mail_plan,yes
number_vmail_messages,25
total_day_minutes,265.1
total_day_calls,110
total_day_charge,45.07


In [26]:
## To remove useless features: 
## state, area_code, phone number.

# Remove extraneous columns
df.drop(['state', 'area_code', 'phone_number'], axis=1, inplace=True)
df.columns
df.head(2)


Unnamed: 0,account_length,intl_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,number_customer_service_calls,churned
0,128,no,yes,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,107,no,yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False


In [27]:
df.head(2)

Unnamed: 0,account_length,intl_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,number_customer_service_calls,churned
0,128,no,yes,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,107,no,yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False


In [29]:
## Encoding the data using LabelBinarizer.
lb = LabelBinarizer()

for col in ['intl_plan', 'voice_mail_plan', 'churned']:
    df[col] = lb.fit_transform(df[col])

In [32]:
## Seperate the target column from input columns.

X_cols = [x for x in df.columns if x != 'churned']

X = df[X_cols]
y = df['churned']

print(X.head(2))
print(y.head(2))

   account_length  intl_plan  voice_mail_plan  number_vmail_messages  \
0             128          0                1                     25   
1             107          0                1                     26   

   total_day_minutes  total_day_calls  total_day_charge  total_eve_minutes  \
0              265.1              110             45.07              197.4   
1              161.6              123             27.47              195.5   

   total_eve_calls  total_eve_charge  total_night_minutes  total_night_calls  \
0               99             16.78                244.7                 91   
1              103             16.62                254.4                103   

   total_night_charge  total_intl_minutes  total_intl_calls  \
0               11.01                10.0                 3   
1               11.45                13.7                 3   

   total_intl_charge  number_customer_service_calls  
0                2.7                              1  
1        

In [55]:
## Applying the model using weights as distance.
n = int(input("Enter the neighbours: "))
print("K value set: ", n)
knn = KNeighborsClassifier(n_neighbors=n, p=1) ## Do both for p=1 and 2 i.e Eucluidian and Manhattan
knn = knn.fit(X, y)
y_pred = knn.predict(X)

## Finding the accuracy model.
def accuracy(real, predict):
    return sum(y == y_pred) / float(real.shape[0])

print(accuracy(y, y_pred))

## For k=3 -> Euclid: 0.9182
## For k=3 -> Manhattan: 0.9164

K value set:  3
0.9182


In [None]:
## Automating KNN algo for different weights.
def autoKNN(k_val):
    score_list = []
    for i in range(1, k_val):
        knn = KNeighborsClassifier(n_neighbors=i, p=1)  
        knn = knn.fit(X, y)
        y_pred = knn.predict(X)
        accr = accuracy(y, y_pred)
        score_list.append((i, accr))
    score_df = pd.DataFrame(score_list, columns=['i', 'accr'])
    print(score_df)

k_val = int(input("Enter the k value end range: "))
autoKNN(k_val)

   i   accr
0  1  0.896
1  2  0.896
2  3  0.896
3  4  0.896
4  5  0.896
