In [6]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import pandas as pd

# Load the telecom churn CSV into a DataFrame and preview the first rows.
churn_csv = "../datasets/supervised_learning_python/telecom_churn_clean.csv"
df = pd.read_csv(churn_csv)
df.head()

Unnamed: 0.1,Unnamed: 0,account_length,area_code,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,customer_service_calls,churn
0,0,128,415,0,1,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,0
1,1,107,415,0,1,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,0
2,2,137,415,0,0,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,0
3,3,84,408,1,0,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,0
4,4,75,415,1,0,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,0


In [15]:
# X holds the features and y the target variable.
# total_day_charge and total_eve_charge are the features used to predict churn.
# to_numpy() is the pandas-recommended way to obtain the underlying NumPy
# arrays (preferred over the older .values attribute).
X = df[["total_day_charge", "total_eve_charge"]].to_numpy()
y = df["churn"].to_numpy()

# Show the shapes of the feature matrix and the target vector.
print(X.shape, y.shape)

# Show the first 5 entries of each array.
print("X:")
print(X[:5])
print("y:")
print(y[:5])

(3333, 2) (3333,)
X:
[[45.07 16.78]
 [27.47 16.62]
 [41.38 10.3 ]
 [50.9   5.26]
 [28.34 12.61]]
y:
[0 0 0 0 0]


In [16]:
# Split the data: the training set fits the model, the testing set evaluates it.
#   test_size=0.3    -> 30% of the rows go to testing, 70% to training
#   stratify=y       -> both splits keep the same churned / non-churned
#                       proportion as the original dataset
#   random_state=42  -> fixed seed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)



In [17]:
# Build a KNN classifier with 6 neighbors and train it on the training set.
# fit() returns the fitted estimator, so construction and training are chained.
knn = KNeighborsClassifier(n_neighbors=6).fit(X_train, y_train)

# Evaluate on the held-out test set: score() is the model's accuracy,
# i.e. the share of test rows whose churn label is predicted correctly.
test_accuracy = knn.score(X_test, y_test)
print(test_accuracy)

0.872
