In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)
    
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score

import src.data_cleaning as dc

In [2]:
df = dc.create_df('bigml_59c28831336c6604c800002a.csv')
X = df.drop('churn', axis=1)
y = df['churn']

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [4]:
class Dataprep(BaseEstimator):
    
    def __init__(self):
        pass
        
    def fit(self, X, y=None):
        return self 
    
    def transform(self, X, y=None):
        
        df = X.copy()
        df['international plan'] = df['international plan'] == 'yes'
        df['voice mail plan'] = df['voice mail plan'] == 'yes'
    
        df['account months'] = np.ceil(df['account length']/30).astype(int)
        df['total minutes'] = df['total day minutes']+df['total eve minutes']+df['total night minutes']+df['total intl minutes']
        df['total calls'] = df['total day calls']+df['total eve calls']+df['total night calls']+df['total intl calls']
        df['total cost'] = df['total day charge']+df['total eve charge']+df['total night charge']+df['total intl charge']
        df['cost per day'] = df['total cost']/df['account length']
    
        df.drop(['state', 'phone number', 'area code'], axis=1, inplace=True)
    
        return df

## Testing Random Forest

In [5]:
rfc = RandomForestClassifier(random_state=42, n_estimators=100)

pipe = Pipeline(steps=[
    ("dataprep", Dataprep()),
    ("model", rfc)
])

In [6]:
pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('dataprep', Dataprep()),
                ('model',
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        n_estimators=100, n_jobs=None,
                                        oob_score=False, random_state=42,
                                        verbose=0, warm_start=False))],
         verbose=False)

In [7]:
confusion_matrix(y_train, pipe.predict(X_train))

array([[2141,    0],
       [   0,  358]])

In [8]:
print(classification_report(y_test, pipe.predict(X_test)))

              precision    recall  f1-score   support

       False       0.98      1.00      0.99       709
        True       1.00      0.86      0.93       125

    accuracy                           0.98       834
   macro avg       0.99      0.93      0.96       834
weighted avg       0.98      0.98      0.98       834



In [9]:
roc_auc_score(y_train, pipe.predict(X_train))

1.0

In [10]:
cross_val_score(pipe, X_train, y_train, cv=5)

array([0.974    , 0.968    , 0.984    , 0.96     , 0.9759519])

## Testing Gradient Boosting

In [11]:
gb = GradientBoostingClassifier()

In [12]:
pipe2 = Pipeline(steps=[
    ("dataprep", Dataprep()),
    ("model", gb)
])

In [13]:
pipe2.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('dataprep', Dataprep()),
                ('model',
                 GradientBoostingClassifier(ccp_alpha=0.0,
                                            criterion='friedman_mse', init=None,
                                            learning_rate=0.1, loss='deviance',
                                            max_depth=3, max_features=None,
                                            max_leaf_nodes=None,
                                            min_impurity_decrease=0.0,
                                            min_impurity_split=None,
                                            min_samples_leaf=1,
                                            min_samples_split=2,
                                            min_weight_fraction_leaf=0.0,
                                            n_estimators=100,
                                            n_iter_no_change=None,
                                            presort='deprecated',
           

In [14]:
print(classification_report(y_train, pipe2.predict(X_train)))

              precision    recall  f1-score   support

       False       0.98      1.00      0.99      2141
        True       1.00      0.88      0.93       358

    accuracy                           0.98      2499
   macro avg       0.99      0.94      0.96      2499
weighted avg       0.98      0.98      0.98      2499



In [15]:
roc_auc_score(y_train, pipe2.predict(X_train))

0.9385474860335196

In [16]:
cross_val_score(pipe2, X_train, y_train, cv=5, scoring='roc_auc')

array([0.90111297, 0.93876558, 0.96274663, 0.88820742, 0.92391734])

In [27]:
print(classification_report(y_test, pipe2.predict(X_test)))

              precision    recall  f1-score   support

       False       0.98      1.00      0.99       709
        True       0.99      0.88      0.93       125

    accuracy                           0.98       834
   macro avg       0.99      0.94      0.96       834
weighted avg       0.98      0.98      0.98       834



## Getting Feature Importances

In [31]:
def blurg(X):        
    df = X.copy()
    df['international plan'] = df['international plan'] == 'yes'
    df['voice mail plan'] = df['voice mail plan'] == 'yes'
    
    df['account months'] = np.ceil(df['account length']/30).astype(int)
    df['total minutes'] = df['total day minutes']+df['total eve minutes']+df['total night minutes']+df['total intl minutes']
    df['total calls'] = df['total day calls']+df['total eve calls']+df['total night calls']+df['total intl calls']
    df['total cost'] = df['total day charge']+df['total eve charge']+df['total night charge']+df['total intl charge']
    df['cost per day'] = df['total cost']/df['account length']
    
    df.drop(['state', 'phone number', 'area code'], axis=1, inplace=True)
    
    return df

In [37]:
gb1 = GradientBoostingClassifier()

X_train2 = blurg(X_train)
X_test2 = blurg(X_test)

gb1.fit(X_train2, y_train)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [54]:
yarrr = zip(list(X_train2.columns),list(gb1.feature_importances_))

In [46]:
print(X_train2.columns[-2], X_train2.columns[-5], X_train2.columns[1])

total cost account months international plan


In [47]:
print(X_train2.columns)

Index(['account length', 'international plan', 'voice mail plan',
       'number vmail messages', 'total day minutes', 'total day calls',
       'total day charge', 'total eve minutes', 'total eve calls',
       'total eve charge', 'total night minutes', 'total night calls',
       'total night charge', 'total intl minutes', 'total intl calls',
       'total intl charge', 'customer service calls', 'account months',
       'total minutes', 'total calls', 'total cost', 'cost per day'],
      dtype='object')


In [56]:
dfyarr = pd.DataFrame(yarrr)

In [69]:
dfyarr.sort_values(by=['Weight'], ascending=False, inplace=True)

In [66]:
dfyarr.rename(columns={0:'Feature', 1:'Weight'}, inplace=True)

In [70]:
dfyarr

Unnamed: 0,Feature,Weight
20,total cost,0.45112
16,customer service calls,0.13393
1,international plan,0.111497
2,voice mail plan,0.080141
14,total intl calls,0.070461
3,number vmail messages,0.054885
15,total intl charge,0.037766
13,total intl minutes,0.026017
10,total night minutes,0.006625
5,total day calls,0.005366
