In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)
    
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.pipeline import Pipeline

from sklearn.metrics import confusion_matrix, classification_report

import src.data_cleaning as dc

In [2]:
df = dc.create_df('bigml_59c28831336c6604c800002a.csv')

In [3]:
df['churn'].value_counts()

False    2850
True      483
Name: churn, dtype: int64

In [4]:
X = df.drop('churn', axis=1)
y = df['churn']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [6]:
X_train.dtypes

state                      object
account length              int64
area code                   int64
phone number               object
international plan         object
voice mail plan            object
number vmail messages       int64
total day minutes         float64
total day calls             int64
total day charge          float64
total eve minutes         float64
total eve calls             int64
total eve charge          float64
total night minutes       float64
total night calls           int64
total night charge        float64
total intl minutes        float64
total intl calls            int64
total intl charge         float64
customer service calls      int64
dtype: object

In [7]:
X_train.drop(['state', 'phone number', 'international plan', 'voice mail plan'], axis=1, inplace=True)
X_test.drop(['state', 'phone number', 'international plan', 'voice mail plan'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [8]:
ss = StandardScaler()

X_train_scaled = ss.fit_transform(X_train)
X_test_scaled = ss.transform(X_test)

In [9]:
y_train.value_counts()

False    2141
True      358
Name: churn, dtype: int64

### Dummy Model

In [10]:
dummy = DummyClassifier(random_state=42)

dummy.fit(X_train_scaled, y_train)
preds_dumb = dummy.predict(X_train_scaled)
print(classification_report(y_train, preds_dumb))

              precision    recall  f1-score   support

       False       0.86      0.85      0.86      2141
        True       0.15      0.16      0.16       358

    accuracy                           0.75      2499
   macro avg       0.51      0.51      0.51      2499
weighted avg       0.76      0.75      0.76      2499



### Model 1 : RandomForest

In [11]:
rf1 = RandomForestClassifier(max_depth=20, random_state=42, min_samples_leaf=3, n_estimators=1000)

rf1.fit(X_train_scaled, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=20, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=3, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [12]:
rf1.score(X_train_scaled, y_train)

0.9551820728291317

In [13]:
rf1.feature_importances_

array([0.03242416, 0.00800068, 0.04569875, 0.15844243, 0.03342019,
       0.16587861, 0.07834606, 0.03120913, 0.07994629, 0.03964869,
       0.03688218, 0.0392288 , 0.03852835, 0.02547047, 0.03638024,
       0.15049496])

In [14]:
preds_1 = rf1.predict(X_train_scaled)

confusion_matrix(y_train, preds_1)

array([[2141,    0],
       [ 112,  246]])

In [15]:
cross_val_score(rf1, X_train_scaled, y_train, cv=3)

array([0.91606715, 0.91716687, 0.92307692])

In [16]:
print(classification_report(y_train, preds_1))

              precision    recall  f1-score   support

       False       0.95      1.00      0.97      2141
        True       1.00      0.69      0.81       358

    accuracy                           0.96      2499
   macro avg       0.98      0.84      0.89      2499
weighted avg       0.96      0.96      0.95      2499



In [17]:
# preds1_test = rf1.predict(X_test_scaled)

In [18]:
# print(classification_report(y_test, preds1_test))

### Model 2 : Logistic Regression

In [19]:
logreg = LogisticRegression(random_state=42)

In [20]:
logreg.fit(X_train_scaled, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=42, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [21]:
preds_2 = logreg.predict(X_train_scaled)

In [22]:
print(classification_report(y_train, preds_2))

              precision    recall  f1-score   support

       False       0.87      0.99      0.92      2141
        True       0.49      0.09      0.15       358

    accuracy                           0.86      2499
   macro avg       0.68      0.54      0.53      2499
weighted avg       0.81      0.86      0.81      2499



### Model 3: KNN


In [23]:
knn = KNeighborsClassifier()
knn.fit(X_train_scaled, y_train)

preds_knn = knn.predict(X_train)

print(classification_report(y_train, preds_knn))

              precision    recall  f1-score   support

       False       0.89      0.16      0.27      2141
        True       0.15      0.88      0.25       358

    accuracy                           0.26      2499
   macro avg       0.52      0.52      0.26      2499
weighted avg       0.78      0.26      0.26      2499



### Feature Engineering

In [25]:
X_train

Unnamed: 0,account length,area code,number vmail messages,total day minutes,total day calls,total day charge,total eve minutes,total eve calls,total eve charge,total night minutes,total night calls,total night charge,total intl minutes,total intl calls,total intl charge,customer service calls
367,45,415,0,78.2,127,13.29,253.4,108,21.54,255.0,100,11.48,18.0,3,4.86,1
3103,115,415,0,195.9,111,33.30,227.0,108,19.30,313.2,113,14.09,13.2,1,3.56,2
549,121,408,31,237.1,63,40.31,205.6,117,17.48,196.7,85,8.85,10.1,5,2.73,4
2531,180,415,0,143.3,134,24.36,180.5,113,15.34,184.2,87,8.29,10.1,4,2.73,1
2378,112,510,0,206.2,122,35.05,164.5,94,13.98,140.3,101,6.31,12.6,7,3.40,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1095,106,510,0,274.4,120,46.65,198.6,82,16.88,160.8,62,7.24,6.0,3,1.62,1
1130,122,415,0,35.1,62,5.97,180.8,89,15.37,251.6,58,11.32,12.7,2,3.43,1
1294,66,408,0,87.6,76,14.89,262.0,111,22.27,184.6,125,8.31,9.2,5,2.48,1
860,169,415,0,179.2,111,30.46,175.2,130,14.89,228.6,92,10.29,9.9,6,2.67,2


In [38]:
X['is_intnl_user'] = X['total intl minutes'] > 0
X['charge_per_minute'] = X['total day minutes'] / X['total day charge']
X['has_intnl_plan'] = X['international plan'] == 'yes'
X['has_vm_plan'] = X['voice mail plan'] == 'yes'

In [40]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y, random_state=42)

In [41]:
ss.fit(X_train2)

ValueError: could not convert string to float: 'MD'