# <font color='blue'>Telecom Churn Case Study</font>
* Institution: IIIT, Bangalore and UpGrad
* Course: PG Diploma in Machine Learning and AI March 2018
* Date: 14-Aug-2018
* Submitted by:
    1. Pandinath Siddineni (ID- APFE187000194)
    2. AKNR Chandra Sekhar (ID- APFE187000315)
    3. Brajesh Kumar       (ID- APFE187000149)
    4. Shweta Tiwari
-----------------------------------

# <font color='blue'>PART 3: LASSO & DECISION TREE</font>

In [None]:
import os.path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt

# Render floats with two decimal places wherever pandas displays them.
pd.set_option('display.float_format', '{:.2f}'.format)

In [None]:
# Read the cleaned telecom churn file (path is relative to the working directory)
master_df = pd.read_csv(
    filepath_or_buffer='telecom_churn_data_clean.csv',
    low_memory=False,
)
# Preview the first five rows
master_df.head(n=5)

In [None]:
# (rows, columns) of the loaded frame
print('Dataframe Shape: ', master_df.shape)

# Column dtypes / non-null counts; info() prints its own report
print("Dataframe Info: \n")
master_df.info()

In [None]:
# Remove the subscriber identifier (MemberID / phone number) column
telecom = master_df.drop(columns=['mobile_number'])

# Predictors kept as a DataFrame: X is later replaced by a scaled array, and
# the column labels are still needed to name the selected features.
df_telecom = telecom.drop(columns=['churn'])

# X (independent variables) and y (dependent variable)
X = telecom.drop(columns=['churn'])
y = telecom['churn']

In [None]:
# Uncomment to list every column name: list(master_df)
# (rows, columns) of the frame as loaded, i.e. still including 'mobile_number' and 'churn'
master_df.shape

### Data Standardization/Normalization

In [None]:
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

# Standardise every predictor column with StandardScaler; X becomes a NumPy
# array from here on (column labels remain available via df_telecom).
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# in the next cell, so test-set statistics influence the scaling. Consider
# splitting first and fitting the scaler on the training rows only.
scaler = StandardScaler()
X = scaler.fit_transform(X)

### Split Data into Train & Test

In [None]:
# Hold out 30% of the rows for testing; random_state fixes the shuffle
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    test_size=0.3,
    random_state=1,
)

In [None]:
# Dimensions of the two partitions
print(
    "X_train Dataframe Shape {}".format(X_train.shape),
    "X_test Dataframe Shape {}".format(X_test.shape),
    sep="\n",
)

# Ratio of non-zero labels to zero labels in each partition
y_train_imb = np.sum(y_train != 0) / np.sum(y_train == 0)
y_test_imb = np.sum(y_test != 0) / np.sum(y_test == 0)
print(
    "Imbalance in Train Data: {}".format(y_train_imb),
    "Imbalance in Test Data: {}".format(y_test_imb),
    sep="\n",
)

### Balance data set by oversampling

In [None]:
# (Training) Balance the data set with SMOTE oversampling.
# Only the training partition is resampled; the test partition is left as is.
from imblearn.over_sampling import SMOTE

# The default SMOTE variant is the "regular" one, so the `kind` argument
# (rejected by newer imbalanced-learn releases) is not needed.
# random_state pins the synthetic samples so repeated runs give the same
# balanced set; 1 matches the seed used for the train/test split.
sm = SMOTE(random_state=1)

# Newer imbalanced-learn exposes fit_resample; fall back to the older
# fit_sample name only when fit_resample is not available.
resample = sm.fit_resample if hasattr(sm, "fit_resample") else sm.fit_sample
X_tr, y_tr = resample(X_train, y_train)

In [None]:
# Dimensions of the resampled training data
print(
    "X_tr Dataframe Shape {}".format(X_tr.shape),
    "y_tr Dataframe Shape {}".format(y_tr.shape),
    sep="\n",
)

# Ratio of non-zero labels to zero labels after oversampling
data_imbalance = np.sum(y_tr != 0) / np.sum(y_tr == 0)
print("Imbalance in Train Data: {}".format(data_imbalance))

### Feature reduction using LASSO

In [None]:
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel

# Fit an L1-penalised linear SVM on the balanced training data
lsvc = LinearSVC(C=0.02, penalty="l1", dual=False)
lsvc.fit(X_tr, y_tr)

# Wrap the already-fitted estimator and let SelectFromModel pick the columns
model = SelectFromModel(lsvc, prefit=True)
X_lasso = model.transform(X_tr)

# Integer positions of the retained columns (reused to subset other arrays)
pos = model.get_support(indices=True)

print(X_lasso.shape)
print(pos)

In [None]:

# Translate the selected column positions back into column names
# (feature vector for the decision tree)
lasso_features = df_telecom.columns[pos].tolist()
print("Features identified by LASSO for model buidling: ", lasso_features)

In [None]:
# From here on the training data is the balanced, feature-reduced set.
# This rebinds X_train / y_train, replacing the arrays produced by the split.
X_train, y_train = X_lasso, y_tr

In [None]:
# Sanity check of the final training matrix: (observations, features)
print("Feature space holds %d observations and %d features" % tuple(X_train.shape))
# Distinct class labels present in the training target
print("Unique target labels:", np.unique(y_train))

### Decision Tree with default hyperparameter

In [None]:
# Importing decision tree classifier from sklearn library
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

# Fit a decision tree with default hyperparameters, apart from max_depth,
# which is capped at 5 so that the tree can be plotted and read.
# random_state is fixed so the fitted tree, and the accuracy reported for it,
# is the same on every run; 100 matches the seed used in the commented-out
# tuning cells further down.
dt_default = DecisionTreeClassifier(max_depth=5, random_state=100)
dt_default.fit(X_train, y_train)

In [None]:
# Let's check the evaluation metrics of our default model
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Making predictions
# The tree was trained on the LASSO-selected columns only, so the test matrix
# is reduced to the same column positions before predicting.
# The guard keeps the cell re-runnable: once X_test has been reduced, applying
# iloc[:, pos] to it again would index past its remaining columns. If every
# column was selected, the subset is a no-op and is skipped as well.
if X_test.shape[1] != len(pos):
    X_test = pd.DataFrame(data=X_test).iloc[:, pos]
y_pred_default = dt_default.predict(X_test)

# Printing classification report
print(classification_report(y_test, y_pred_default))

In [None]:
# Confusion matrix of the default tree on the test set
print(confusion_matrix(y_true=y_test, y_pred=y_pred_default))
# Overall accuracy on the test set
print(accuracy_score(y_true=y_test, y_pred=y_pred_default))

### Hyperparameter Tuning

NOTE: 
1. Hyperparameter tuning is commented out because it needs heavy computing power and time. It can be run by uncommenting the cells below.
2. We are getting about 86% accuracy, which looks pretty good.

### Tuning max_depth

In [None]:
# # GridSearchCV to find optimal max_depth
# from sklearn.model_selection import KFold
# from sklearn.model_selection import GridSearchCV


# # specify number of folds for k-fold CV
# n_folds = 5

# # parameters to build the model on
# parameters = {'max_depth': range(1, 40)}

# # instantiate the model
# dtree = DecisionTreeClassifier(criterion = "gini", 
#                                random_state = 100)

# # fit tree on training data
# tree = GridSearchCV(dtree, parameters, 
#                     cv=n_folds, 
#                    scoring="accuracy")
# tree.fit(X_train, y_train)

In [None]:
# # scores of GridSearch CV
# scores = tree.cv_results_
# pd.DataFrame(scores).head()

In [None]:
# # plotting accuracies with max_depth
# plt.figure()
# plt.plot(scores["param_max_depth"], 
#          scores["mean_train_score"], 
#          label="training accuracy")
# plt.plot(scores["param_max_depth"], 
#          scores["mean_test_score"], 
#          label="test accuracy")
# plt.xlabel("max_depth")
# plt.ylabel("Accuracy")
# plt.legend()
# plt.show()

Conclusion for max_depth:
As max_depth increases, both the training and test scores increase until about max_depth = 10, after which the test score stays constant. Note that the scores are average accuracies across the 5 folds.

We can therefore consider max_depth=10.

### Tuning min_samples_leaf

In [None]:
# # GridSearchCV to find optimal min_samples_leaf
# from sklearn.model_selection import KFold
# from sklearn.model_selection import GridSearchCV


# # specify number of folds for k-fold CV
# n_folds = 5

# # parameters to build the model on
# parameters = {'min_samples_leaf': range(5, 200, 20)}

# # instantiate the model
# dtree = DecisionTreeClassifier(criterion = "gini", 
#                                random_state = 100)

# # fit tree on training data
# tree = GridSearchCV(dtree, parameters, 
#                     cv=n_folds, 
#                    scoring="accuracy")
# tree.fit(X_train, y_train)

In [None]:
# # scores of GridSearch CV
# scores = tree.cv_results_
# pd.DataFrame(scores).head()

In [None]:
# # plotting accuracies with min_samples_leaf
# plt.figure()
# plt.plot(scores["param_min_samples_leaf"], 
#          scores["mean_train_score"], 
#          label="training accuracy")
# plt.plot(scores["param_min_samples_leaf"], 
#          scores["mean_test_score"], 
#          label="test accuracy")
# plt.xlabel("min_samples_leaf")
# plt.ylabel("Accuracy")
# plt.legend()
# plt.show()

Conclusion for min_samples_leaf:
At low values of min_samples_leaf the model appears to overfit. At a value of about 125 the model becomes more stable, and the training and test accuracy start to converge.
We can therefore consider min_samples_leaf=125.

### Tuning min_samples_split

In [None]:
# # GridSearchCV to find optimal min_samples_split
# from sklearn.model_selection import KFold
# from sklearn.model_selection import GridSearchCV


# # specify number of folds for k-fold CV
# n_folds = 5

# # parameters to build the model on
# parameters = {'min_samples_split': range(5, 200, 20)}

# # instantiate the model
# dtree = DecisionTreeClassifier(criterion = "gini", 
#                                random_state = 100)

# # fit tree on training data
# tree = GridSearchCV(dtree, parameters, 
#                     cv=n_folds, 
#                    scoring="accuracy")
# tree.fit(X_train, y_train)

In [None]:
# # scores of GridSearch CV
# scores = tree.cv_results_
# pd.DataFrame(scores).head()

In [None]:
# # plotting accuracies with min_samples_split
# plt.figure()
# plt.plot(scores["param_min_samples_split"], 
#          scores["mean_train_score"], 
#          label="training accuracy")
# plt.plot(scores["param_min_samples_split"], 
#          scores["mean_test_score"], 
#          label="test accuracy")
# plt.xlabel("min_samples_split")
# plt.ylabel("Accuracy")
# plt.legend()
# plt.show()

As min_samples_split increases, the tree overfits less because the model becomes less complex.

In [None]:
# # Create the parameter grid 
# param_grid = {
#     'max_depth': range(5, 15, 5),
#     'min_samples_leaf': range(50, 150, 50),
#     'min_samples_split': range(50, 150, 50),
#     'criterion': ["entropy", "gini"]
# }

# n_folds = 5

# # Instantiate the grid search model
# dtree = DecisionTreeClassifier()
# grid_search = GridSearchCV(estimator = dtree, param_grid = param_grid, 
#                           cv = n_folds, verbose = 1)

# # Fit the grid search to the data
# grid_search.fit(X_train,y_train)

In [None]:
# # cv results
# cv_results = pd.DataFrame(grid_search.cv_results_)
# cv_results

In [None]:
# # printing the optimal accuracy score and hyperparameters
# print("best accuracy", grid_search.best_score_)
# print(grid_search.best_estimator_)

In [None]:
# # model with optimal hyperparameters
# clf_gini = DecisionTreeClassifier(criterion = "gini", 
#                                   random_state = 100,
#                                   max_depth=10, 
#                                   min_samples_leaf=50,
#                                   min_samples_split=50)
# clf_gini.fit(X_train, y_train)

In [None]:
# # accuracy score
# clf_gini.score(X_test,y_test)

# <font color='blue'>SUMMARY PART 3: LASSO & DECISION TREE</font>
OBSERVATIONS
1. Getting around 86.0% accuracy 
2. The confusion matrix shows that a lot of false positives still exist.

NEXT STEPS:
1. Try Random Forest.