In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score

import pandas as pd
import numpy as np
import itertools
import pickle
from datetime import datetime

In [2]:
df = pd.read_csv("../data/processed/churn_enum.csv")
df.head()

Unnamed: 0,customerID,tenure,MonthlyCharges,TotalCharges,InternetService,StreamingTV,SeniorCitizen,PaymentMethod,Dependents,TechSupport,...,StreamingMovies,OnlineSecurity,gender,Churn,Contract,MultipleLines,PhoneService,Partner,DeviceProtection,PaperlessBilling
0,7590-VHVEG,0.013889,0.115423,0.003437,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5575-GNVDE,0.472222,0.385075,0.217564,0,0,0,1,0,0,...,0,1,1,0,1,1,1,1,1,1
2,3668-QPYBK,0.027778,0.354229,0.012453,0,0,0,1,0,0,...,0,1,1,1,0,1,1,1,0,0
3,7795-CFOCW,0.625,0.239303,0.211951,0,0,0,2,0,1,...,0,1,1,0,1,0,0,1,1,1
4,9237-HQITU,0.027778,0.521891,0.017462,1,0,0,0,0,0,...,0,0,0,1,0,1,1,1,0,0


In [3]:
y = df["Churn"]
x = df[["tenure", "Contract", "InternetService", "MonthlyCharges"]]

In [4]:
x_train, x_test, y_train, y_test = train_test_split(x, y)

In [5]:
parameters = {"min_samples_split": [2, 4, 8, 12],
              "min_samples_leaf": [1, 2, 4, 8, 12],
              "max_features": ["auto", "log2", None],
              "n_estimators": [100, 150, 200]}

In [6]:
grid_search = list(itertools.product(*parameters.values()))

In [7]:
roc_auc_scores = []
acc_scores = []

for params in grid_search:
    rf = RandomForestClassifier(min_samples_split=params[0], 
                                min_samples_leaf=params[1],
                                max_features=params[2],
                                n_estimators = params[3])
    rf.fit(x_train, y_train)
    pred = rf.predict(x_test)
    roc_auc_scores.append(roc_auc_score(y_test, pred))
    acc_scores.append(accuracy_score(y_test, pred))

In [8]:
acc_scores[np.argmax(roc_auc_scores)]

0.8001135718341851

In [9]:
roc_auc_scores[np.argmax(roc_auc_scores)]

0.6962880257714976

In [10]:
best_params = grid_search[np.argmax(roc_auc_scores)]
best_model = RandomForestClassifier(min_samples_split=best_params[0], 
                                min_samples_leaf=best_params[1],
                                max_features=best_params[2])
best_model.fit(x_train, y_train)

RandomForestClassifier(min_samples_leaf=12, min_samples_split=8)

In [11]:
best_params

(8, 12, 'auto', 150)

In [None]:
filename = datetime.now().strftime("%Y_%m_%d_%H_%M")

with open(f"../models/random_forest/{filename}.pkl", "wb") as f:
    pickle.dump(best_model, f)