In [39]:
# 1. Import things

import pandas as pd
import seaborn as sns
from sklearn.model_selection import cross_validate, StratifiedKFold, GridSearchCV
from sklearn.ensemble import RandomForestClassifier


In [24]:
# 2. Load the data
#
# Read the encoded telecom churn CSV into a DataFrame and display the dtype
# of every column as the cell output.

CSV_PATH = '../data/telecom-customer-churn-encoded.csv'
data = pd.read_csv(CSV_PATH)
data.dtypes

Unnamed: 0            int64
gender              float64
SeniorCitizen         int64
Partner             float64
Dependents          float64
tenure              float64
PhoneService        float64
MultipleLines       float64
InternetService     float64
OnlineSecurity      float64
OnlineBackup        float64
DeviceProtection    float64
TechSupport         float64
StreamingTV         float64
StreamingMovies     float64
Contract            float64
PaperlessBilling    float64
PaymentMethod       float64
MonthlyCharges      float64
TotalCharges        float64
Churn                object
customerID           object
dtype: object

In [29]:
# Discard every row that has a missing value in at least one column.
data = data.dropna(axis="index", how="any")

In [21]:
# Columns used as model inputs. Of the columns listed by `data.dtypes`,
# 'Unnamed: 0', 'Churn' and 'customerID' are left out.
X_columns = [
    'gender',
    'SeniorCitizen',
    'Partner',
    'Dependents',
    'tenure',
    'PhoneService',
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'Contract',
    'PaperlessBilling',
    'PaymentMethod',
    'MonthlyCharges',
    'TotalCharges',
]

In [34]:
# Feature matrix: only the columns named in X_columns.
X = data.loc[:, X_columns]

# Binary target: 1 where Churn is the string "Yes", 0 for anything else.
y = data["Churn"].eq("Yes").astype(int)



In [42]:
# Baseline model. The forest draws random bootstrap samples and feature
# subsets, so a fixed random_state is required for the scores below to be
# reproducible after Restart Kernel -> Run All.
estimator = RandomForestClassifier(random_state=42)

# Metrics computed on every cross-validation fold.
scoring = ["roc_auc", "accuracy", "f1", "precision", "recall"]

# 10 stratified folds; no shuffling is requested.
cv = StratifiedKFold(n_splits=10)

In [53]:
# Score the estimator on each fold of `cv`, then show the per-column mean
# of the timings and test metrics.
fold_scores = cross_validate(estimator, X, y, scoring=scoring, cv=cv)
results = pd.DataFrame(fold_scores)
results.mean()

fit_time          18.757802
score_time         0.032384
test_roc_auc       0.844997
test_accuracy      0.802187
test_f1            0.573392
test_precision     0.672027
test_recall        0.500785
dtype: float64

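How do we check if the model is overfitted? We can't visualize a forest! Instead, we can compare the scores on the training data with the scores on the test data: a large gap between them indicates overfitting.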

In [43]:
# Same cross-validation, but also record the training-set scores so they can
# be compared with the test scores.
fold_scores = cross_validate(
    estimator,
    X,
    y,
    scoring=scoring,
    cv=cv,
    return_train_score=True,
)
results = pd.DataFrame(fold_scores)
results.mean()

fit_time           0.459035
score_time         0.037950
test_roc_auc       0.827152
train_roc_auc      0.999929
test_accuracy      0.795504
train_accuracy     0.997614
test_f1            0.563535
train_f1           0.995510
test_precision     0.651645
train_precision    0.995778
test_recall        0.497047
train_recall       0.995244
dtype: float64

In [44]:
# Nested cross-validation: for every outer fold of `cv`, an inner 5-fold grid
# search picks a max_depth, and the tuned model is then scored on that fold.
param_grid = {
    "max_depth": list(range(5, 16))  # candidate depths 5..15 inclusive
}

# random_state is fixed so that the depth selected in each fold (and the
# resulting scores) are reproducible between runs.
# NOTE: no `scoring` is passed to GridSearchCV, so candidates are ranked with
# the estimator's default `score` method rather than the metrics in `scoring`.
estimator = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=5
)
results = cross_validate(
    estimator=estimator, X=X, y=y, scoring=scoring, cv=cv,
    return_train_score=True, return_estimator=True, n_jobs=-1)
results = pd.DataFrame(results)

# return_estimator=True adds an object-dtype "estimator" column holding the
# fitted searches; restrict the mean to numeric columns so the reduction does
# not warn (older pandas) or raise TypeError (pandas >= 2.0).
results.mean(numeric_only=True)

  results.mean()


fit_time           38.441954
score_time          0.061893
test_roc_auc        0.843443
train_roc_auc       0.920114
test_accuracy       0.800907
train_accuracy      0.847697
test_f1             0.569405
train_f1            0.675514
test_precision      0.670722
train_precision     0.775028
test_recall         0.495440
train_recall        0.599309
dtype: float64

In [52]:
# Take the fitted search stored under row label 5 of the "estimator" column
# and display the model it selected.
fitted_search = results["estimator"][5]
fitted_search.best_estimator_


Exercises:
1. Which `max_depth` value was selected in each fold? Which value was selected most often?
2. Can you find a better model by tuning other parameters?