In [None]:
import numpy as np
import pandas as pd
import re
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.decomposition import IncrementalPCA
import seaborn as snb
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

In [None]:
def draw_roc(actual, predicted):
    """Plot the ROC curve for binary labels against predicted scores.

    Parameters
    ----------
    actual : array-like
        True labels, forwarded to ``metrics.roc_curve`` and
        ``metrics.roc_auc_score``.
    predicted : array-like
        Predicted scores (e.g. positive-class probabilities) for the same rows.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``; nothing is returned.
    """
    # Keep every threshold point (drop_intermediate=False) for the curve.
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(
        actual, predicted, drop_intermediate=False
    )
    area_under_curve = metrics.roc_auc_score(actual, predicted)

    plt.figure(figsize=(5, 5))
    plt.plot(
        false_pos_rate,
        true_pos_rate,
        label='ROC curve (area = %0.2f)' % area_under_curve,
    )
    # Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
    plt.plot([0, 1], [0, 1], 'k--')

    plt.title('ROC curve')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.legend(loc="lower right")
    plt.show()

In [None]:
# Load the training set from 'traindata.csv'.
# NOTE(review): the variable name suggests the features were scaled before being saved — confirm.
scaled_churn_data = pd.read_csv('traindata.csv')

In [None]:
# 'Unnamed: 0' is presumably the index column written out when the CSV was saved
# (TODO confirm); it is not a feature, so remove it before modelling.
# errors='ignore' keeps this cell re-runnable after the column has already been dropped.
scaled_churn_data = scaled_churn_data.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Split the target column off from the feature matrix.
churn = scaled_churn_data['churn']
scaled_churn_data = scaled_churn_data.drop(columns=['churn'])

In [None]:
# PCA is imported in this cell rather than in the notebook's import cell.
from sklearn.decomposition import PCA
# Fit a PCA without specifying n_components so the explained-variance profile can be
# inspected below; random_state is fixed because the 'randomized' solver is used.
pca = PCA(svd_solver='randomized', random_state=42)
pca.fit(scaled_churn_data)

In [None]:
%matplotlib inline
fig = plt.figure(figsize = (12,8))
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')
plt.show()

In [None]:
# Same cumulative explained-variance values as plotted above, shown as numbers.
pca.explained_variance_ratio_.cumsum()

Here we see that 57 components are sufficient to explain 95% of the variance.

In [None]:
# Project the training features onto 57 components with IncrementalPCA
# (57 is the component count quoted in the notebook text for 95% explained variance).
# pca_final is reused later to transform the test set.
pca_final = IncrementalPCA(n_components=57)
churn_data_pca = pca_final.fit_transform(scaled_churn_data)
churn_data_pca.shape

In [None]:
# Correlation matrix between the PCA components (columns are the variables).
correlation_mat = np.corrcoef(churn_data_pca, rowvar=False)

In [None]:
# Subtract the diagonal so the max/min below reflect only correlations
# between different components.
corrmat_nodiag = correlation_mat - np.diag(np.diag(correlation_mat))
print(corrmat_nodiag.max())
print(corrmat_nodiag.min())
correlation_mat.shape

The correlations between the components are all close to zero, so we are good to start modelling.

So first, let's get our hands dirty with a simple logistic regression.

In [None]:
# Read the held-out test set and split its target column off from the features.
scaled_churn_data_test = pd.read_csv('testdata.csv')
churn_test = scaled_churn_data_test['churn']
scaled_churn_data_test = scaled_churn_data_test.drop(columns=['churn'])

In [None]:
# Remove the 'Unnamed: 0' column from the test features as well.
scaled_churn_data_test = scaled_churn_data_test.drop(columns=['Unnamed: 0'])

In [None]:
# Apply the PCA that was fitted on the training data (pca_final) to the test features;
# only transform is called here, so nothing is re-fitted on the test set.
churn_test_data = pca_final.transform(scaled_churn_data_test)

In [None]:
# Check the dimensions of the transformed test set.
churn_test_data.shape

In [None]:
# Logistic regression on the PCA components; class_weight='balanced' is used
# presumably because churners are the minority class — confirm against the class counts.
learner_pca = LogisticRegression(random_state=0, class_weight='balanced')
model_pca = learner_pca.fit(churn_data_pca,churn)

In [None]:
# Column 1 of predict_proba is taken as the churn probability for each training row.
pred_prob_train = model_pca.predict_proba(churn_data_pca)[:, 1]
# Label a row as churn (1) when its probability is strictly above 0.5, else 0.
churn_pred = pd.Series(pred_prob_train).gt(0.5).astype('int64')

In [None]:
#help(metrics.confusion_matrix)

In [None]:
# Confusion matrix of the training predictions at the 0.5 cutoff.
confusion = metrics.confusion_matrix(y_true=churn, y_pred=churn_pred)
confusion

In [None]:
# Overall accuracy of the training predictions.
metrics.accuracy_score(y_true=churn, y_pred=churn_pred)

Let's check the sensitivity and specificity of the model.

In [None]:
# Row 1 of the matrix holds the actual churners, row 0 the actual non-churners;
# each rate is the correctly classified count divided by its row total.
sensitivity = confusion[1, 1] / confusion[1].sum()
specificity = confusion[0, 0] / confusion[0].sum()
print(sensitivity)
print(specificity)

In [None]:
# ROC curve of the training-set churn probabilities against the true labels.
draw_roc(churn, pred_prob_train)

In [None]:
# Sweep candidate probability cutoffs on the training predictions and report the
# confusion matrix, sensitivity, specificity and accuracy for each one.
# Note: churn_pred / confusion / sensitivity / specificity are overwritten on every pass.
for cutoff in (0.5, 0.45, 0.4, 0.35, 0.3, 0.25, 0.2):
    churn_pred = pd.Series(pred_prob_train).gt(cutoff).astype('int64')
    confusion = metrics.confusion_matrix(churn, churn_pred)
    sensitivity = confusion[1, 1] / confusion[1].sum()
    specificity = confusion[0, 0] / confusion[0].sum()

    print(f"{cutoff}:")
    print(confusion)
    print("Sensitivity:", sensitivity, sep="\n")
    print("Specificity:", specificity, sep="\n")
    print("Accuracy", metrics.accuracy_score(churn, churn_pred), sep="\n")
    print("")

With this model we can pick a cutoff of 0.45, because sensitivity is more important here: it is important to identify churners. This cutoff gives almost 86% sensitivity and a decent accuracy of 77%.

We see that the model has pretty good sensitivity, specificity and accuracy. However, sensitivity is the most important
metric for our model. These results are for the training data, so let's check the test data.

In [None]:
# Score the test set with the model fitted on the training data and apply the 0.45 cutoff.
pred_prob_test = model_pca.predict_proba(churn_test_data)[:, 1]
churn_pred_test = pd.Series(pred_prob_test).gt(0.45).astype('int64')
confusion = metrics.confusion_matrix(y_true=churn_test, y_pred=churn_pred_test)
print(confusion)

In [None]:
# Test-set sensitivity and specificity from the confusion matrix computed above,
# followed by the overall test accuracy as the cell output.
sensitivity = confusion[1, 1] / confusion[1].sum()
specificity = confusion[0, 0] / confusion[0].sum()
print(sensitivity)
print(specificity)
metrics.accuracy_score(y_true=churn_test, y_pred=churn_pred_test)

The model seems to be doing a good job on the test data as well, with sensitivity, specificity and accuracy very much
on par with the training data set. As of now, additional regularization is not required, as there is no significant difference between the
train and test metrics, i.e. no sign of overfitting.

Now let's try another model, an SVM, and evaluate its performance.

In [None]:
# Linear-kernel SVM with balanced class weights, fitted on the PCA-transformed
# training data; predictions here are made on that same training data.
svc_model = SVC(kernel='linear', C=10, class_weight='balanced')
pred_svm = svc_model.fit(churn_data_pca, churn).predict(churn_data_pca)


In [None]:
# Confusion matrix of the linear SVM's predictions on the training data.
metrics.confusion_matrix(churn,pred_svm)

In [None]:
# 5-fold shuffled cross-validation with a fixed seed so the splits are reproducible.
folds = KFold(n_splits=5, shuffle=True, random_state=4)

# Search grid for the RBF-kernel SVM: kernel width (gamma) and penalty (C).
# The name must match the one passed to param_grid below (it was misspelled before).
hyperparameters = [{
    'gamma': [0.001, 0.01, 0.05, 0.1, 0.5, 10, 30, 50, 100],
    'C': [10, 15, 20, 25, 30, 50, 100],
}]

model_svc = SVC(kernel="rbf", class_weight='balanced')

# GridSearchCV takes the splitter through `cv`; `fold` is not an accepted keyword.
model_grid_search = GridSearchCV(
    estimator=model_svc,
    param_grid=hyperparameters,
    scoring='accuracy',
    cv=folds,
    verbose=1,
    return_train_score=True,
)
model_grid_search.fit(churn_data_pca, churn)