In [40]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
import math

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import roc_auc_score, roc_curve, precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier

In [18]:
# Load the customer table from CSV and index it by the customerID column.
data = pd.read_csv("../data/customer_data.csv").set_index('customerID')
# Work on a copy so the frame as loaded stays available under `data`.
df = data.copy()

# data preparation for training

In [None]:
def datapreparation(df=df):
    """Encode the raw customer table into a numeric feature frame.

    All work happens on a copy, so the frame passed in (including the
    default, which is bound to the module-level ``df`` at the moment this
    ``def`` is executed) is never modified and the function can be called
    repeatedly on the same input with the same result.

    Encoding steps:
      * ``TotalCharges``: single-space strings become 0, then the column
        is converted to float.
      * ``Partner``, ``Dependents``, ``PaperlessBilling``, ``Churn``,
        ``PhoneService``: "No" -> 0, any other value -> 1.
      * ``gender``: "Male" -> 0, any other value -> 1.
      * ``MultipleLines`` and the six add-on service columns: "Yes" -> 1,
        "No" and "No phone/internet service" -> 0; values outside the
        mapping become NaN.
      * ``InternetService``, ``Contract``, ``PaymentMethod``: one-hot
        encoded with the first level dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw customer table containing the columns listed above.

    Returns
    -------
    pandas.DataFrame
        A new, encoded frame; the input is left untouched.

    Raises
    ------
    ValueError
        If ``TotalCharges`` holds a non-numeric value other than " ".
    """
    # Copy first: the original mutated the caller's frame, which made a
    # second call silently produce all-ones / NaN columns.
    out = df.copy()

    # Convert to numbers *before* filling so the fill value never has to be
    # written into a string-typed column, and avoid chained inplace fillna,
    # which does not reach the parent frame under pandas Copy-on-Write.
    charges = out["TotalCharges"].replace(" ", np.nan)
    out["TotalCharges"] = pd.to_numeric(charges).fillna(0).astype(float)

    # "No" -> 0, everything else -> 1 (vectorised form of the row-wise lambda).
    binary_cols = ['Partner', 'Dependents', 'PaperlessBilling', 'Churn', 'PhoneService']
    for col in binary_cols:
        out[col] = (out[col] != "No").astype("int64")

    out["gender"] = (out["gender"] != "Male").astype("int64")
    out["MultipleLines"] = out["MultipleLines"].map(
        {'No phone service': 0, 'No': 0, 'Yes': 1}
    )

    # Add-on services: "No internet service" is treated the same as "No".
    service_cols = ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                    'TechSupport', 'StreamingTV', 'StreamingMovies']
    service_codes = {'No internet service': 0, 'No': 0, 'Yes': 1}
    for col in service_cols:
        out[col] = out[col].map(service_codes)

    return pd.get_dummies(
        out,
        columns=['InternetService', 'Contract', 'PaymentMethod'],
        drop_first=True,
    )

# Rebind `df` to the encoded frame returned by datapreparation(), called with its default argument.
df = datapreparation()

In [41]:
from sklearn.preprocessing import MinMaxScaler

- The columns are not normally distributed, so we will use `MinMaxScaler`.

### Splitting data
- We've seen that the dataset has imbalanced classes, so we need to keep the same class distribution in both the training and the testing sets. Solution: set the *stratify* parameter to the target column.
- Random state: the random state must be fixed so that the train and test sets are the same every time we split the data. I will set it to an arbitrary but **fixed** integer.

In [None]:
def split_data():
    """Split the module-level ``df`` into stratified train and test parts.

    The target is the ``Churn`` column; every other column is a feature.
    20% of the rows go to the test part, the seed is fixed at 100, and the
    split is stratified on the target.

    Returns
    -------
    list
        The output of ``train_test_split`` in its own order:
        ``[X_train, X_test, y_train, y_test]`` (both feature frames first,
        then both target series).
    """
    target = df["Churn"]
    features = df.drop(columns=["Churn"])  # everything except the target
    return train_test_split(
        features,
        target,
        test_size=0.2,
        random_state=100,
        stratify=target,
    )

# train_test_split yields (X_train, X_test, y_train, y_test): unpack features
# first, then targets, so x_* hold feature frames and y_* hold labels.
x_train, x_predict, y_train, y_predict = split_data()

### Training the model
- The weights are uniform because, from the exploration, we found that some columns are inversely related to churn but are important nevertheless.
- The metric is Hamming because we are dealing mostly with categorical and boolean data.

In [None]:
def train_model():
    """Fit a k-nearest-neighbours classifier on the module-level training split.

    The neighbour count is the integer part of the square root of the number
    of rows in ``x_train``; neighbours vote with uniform weight and distances
    use the Hamming metric.

    Returns
    -------
    KNeighborsClassifier
        The estimator after fitting on ``x_train`` / ``y_train``.
    """
    # Rule-of-thumb neighbourhood size: floor(sqrt(number of training rows)).
    k = int(math.sqrt(x_train.shape[0]))
    knn = KNeighborsClassifier(n_neighbors=k, metric='hamming', weights='uniform')
    # fit() hands back the estimator itself, so it can be returned directly.
    return knn.fit(x_train, y_train)

In [None]:
def best_parameter():
    k_values = range(1, 31)
    cv_scores = []
    for k in k_values:
        