# Classification

## Data Importing and Splitting

In [176]:
##Importing the dataset
import pandas as pd

data = pd.read_csv('churn.csv')

##Defining features and target variaible
#Dropping State, Churn (target variable), Total_day_charge, Total_eve_charge, Total_night_charge, and Total_intl_charge (high correlation with minutes)
X = data.drop(columns=['Churn', 'Total_day_charge', 'Total_eve_charge', 'Total_night_charge', 'Total_intl_charge']) 
X = pd.DataFrame(X)

#Defining target variable
y = data['Churn']

##Dividing the dataset into training and test set
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Data Cleaning and Transformation

In [177]:
##Filling null entries of numerical features with the median of each and Z-standardizing it
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

X_train_num = X_train.select_dtypes(include=[np.number])
X_train_cat = X_train[["State", "Area_code", "International_plan", "Voice_mail_plan"]]

num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('std_scaler', StandardScaler()),
    ])

##Transforming categorical features and combining it with the numerical features
from sklearn.compose import ColumnTransformer

num_attribs = list(X_train_num)
cat_attribs = ["State", "Area_code", "International_plan", "Voice_mail_plan"]

full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])

X_train = full_pipeline.fit_transform(X_train)

#Transforming X_train format from sparse matrix to matrix
import scipy.sparse
X_train = X_train.todense()

##PCA transformation
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
pca.fit(X_train)
X_train = pca.transform(X_train)
X_train = pd.DataFrame(X_train)

##Transforming test dataset using pipeline defined above and PCA
X_test_num = X_test.select_dtypes(include=[np.number])
X_test_cat = X_test[["State", "Area_code", "International_plan", "Voice_mail_plan"]]

X_test = full_pipeline.fit_transform(X_test)
X_test = X_test.todense()

pca.fit(X_test)
X_test = pca.transform(X_test)
X_test = pd.DataFrame(X_test)



## Model Selection

In [185]:
##Importing performance measure metrics
import timeit
from sklearn import metrics

##Generating basic dummy classifier model
from sklearn.dummy import DummyClassifier

dummy_clf = DummyClassifier(strategy="most_frequent")

start = timeit.default_timer()
dummy_clf.fit(X_train, y_train)
stop = timeit.default_timer()
dummy_time = stop-start

y_test_pred = dummy_clf.predict(X_test)
dummy_accuracy = metrics.accuracy_score(y_test, y_test_pred)
dummy_precision = metrics.precision_score(y_test, y_test_pred, zero_division=1)
dummy_recall = metrics.recall_score(y_test, y_test_pred, zero_division=1)

dummy_performance = [[dummy_accuracy,dummy_precision,dummy_recall,dummy_time]]
dummy_performance = pd.DataFrame(dummy_performance, columns=['Accuracy','Precision','Recall',
                                                             'Model Training Time'])
print('Dummy classifier performance metrics:')
print(dummy_performance)
print('')

##Generating basic logistic regression model
from sklearn.linear_model import LogisticRegression

log_clf = LogisticRegression()

start = timeit.default_timer()
log_clf.fit(X_train, y_train)
stop = timeit.default_timer()
log_time = stop-start

y_test_pred = log_clf.predict(X_test)
log_accuracy = metrics.accuracy_score(y_test, y_test_pred)
log_precision = metrics.precision_score(y_test, y_test_pred, zero_division=1)
log_recall = metrics.recall_score(y_test, y_test_pred, zero_division=1)

log_performance = [[log_accuracy,log_precision,log_recall,log_time]]
log_performance = pd.DataFrame(log_performance, columns=['Accuracy','Precision','Recall',
                                                         'Model Training Time'])
print('Logistic regression performance metrics:')
print(log_performance)
print('')

##Generating basic random forest model
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier()

start = timeit.default_timer()
rf_clf.fit(X_train, y_train)
stop = timeit.default_timer()
rf_time = stop-start

y_test_pred = rf_clf.predict(X_test)
rf_accuracy = metrics.accuracy_score(y_test, y_test_pred)
rf_precision = metrics.precision_score(y_test, y_test_pred, zero_division=1)
rf_recall = metrics.recall_score(y_test, y_test_pred, zero_division=1)

rf_performance = [[rf_accuracy,rf_precision,rf_recall,rf_time]]
rf_performance = pd.DataFrame(rf_performance, columns=['Accuracy','Precision','Recall',
                                                       'Model Training Time'])
print('Random forest performance metrics:')
print(rf_performance)
print('')

##Generating basic gradient boosting model
from sklearn.ensemble import GradientBoostingClassifier

gb_clf = GradientBoostingClassifier()

start = timeit.default_timer()
gb_clf.fit(X_train, y_train)
stop = timeit.default_timer()
gb_time = stop-start

y_test_pred = gb_clf.predict(X_test)
gb_accuracy = metrics.accuracy_score(y_test, y_test_pred)
gb_precision = metrics.precision_score(y_test, y_test_pred, zero_division=1)
gb_recall = metrics.recall_score(y_test, y_test_pred, zero_division=1)

gb_performance = [[gb_accuracy,gb_precision,gb_recall,gb_time]]
gb_performance = pd.DataFrame(gb_performance, columns=['Accuracy','Precision','Recall',
                                                       'Model Training Time'])
print('Gradient boosting performance metrics:')
print(gb_performance)
print('')

##Generating basic lightgbm model
from lightgbm import LGBMClassifier

lgbm_clf = LGBMClassifier()

start = timeit.default_timer()
lgbm_clf.fit(X_train, y_train)
stop = timeit.default_timer()
lgbm_time = stop-start

y_test_pred = lgbm_clf.predict(X_test)
lgbm_accuracy = metrics.accuracy_score(y_test, y_test_pred)
lgbm_precision = metrics.precision_score(y_test, y_test_pred, zero_division=1)
lgbm_recall = metrics.recall_score(y_test, y_test_pred, zero_division=1)

lgbm_performance = [[lgbm_accuracy,lgbm_precision,lgbm_recall,lgbm_time]]
lgbm_performance = pd.DataFrame(lgbm_performance, columns=['Accuracy','Precision','Recall',
                                                           'Model Training Time'])
print('LightGBM performance metrics:')
print(lgbm_performance)
print('')

##Generating basic gradient boosting model
from xgboost import XGBClassifier

xgb_clf = XGBClassifier()

start = timeit.default_timer()
xgb_clf.fit(X_train, y_train)
stop = timeit.default_timer()
xgb_time = stop-start

y_test_pred = xgb_clf.predict(X_test)
xgb_accuracy = metrics.accuracy_score(y_test, y_test_pred)
xgb_precision = metrics.precision_score(y_test, y_test_pred, zero_division=1)
xgb_recall = metrics.recall_score(y_test, y_test_pred, zero_division=1)

xgb_performance = [[xgb_accuracy,xgb_precision,xgb_recall,xgb_time]]
xgb_performance = pd.DataFrame(xgb_performance, columns=['Accuracy','Precision','Recall',
                                                         'Model Training Time'])
print('XGBoost performance metrics:')
print(xgb_performance)
print('')

Dummy classifier performance metrics:
   Accuracy  Precision  Recall  Model Training Time
0   0.85206        1.0     0.0              0.00136

Logistic regression performance metrics:
   Accuracy  Precision  Recall  Model Training Time
0   0.85206        1.0     0.0             0.004906

Random forest performance metrics:
   Accuracy  Precision    Recall  Model Training Time
0  0.801498   0.153846  0.075949             0.288834

Gradient boosting performance metrics:
   Accuracy  Precision    Recall  Model Training Time
0  0.818352   0.178571  0.063291              0.15907

LightGBM performance metrics:
   Accuracy  Precision    Recall  Model Training Time
0  0.808989   0.151515  0.063291             0.093911





XGBoost performance metrics:
   Accuracy  Precision    Recall  Model Training Time
0  0.807116   0.147059  0.063291             0.717068

