In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import RandomizedSearchCV

In [2]:
df = pd.read_csv("/starbucks.csv")

In [3]:
int_cols = ['age', 'income', 'gender__F', 'gender__M', 'gender__O', 'days_as_member',
     'number_of_transactions', 'reward', 'difficulty', 'duration', 'web_channel',
     'email_channel', 'mobile_channel', 'social_channel', 'offer_type__bogo', 
     'offer_type__discount', 'offer_type__informational']

In [7]:
X = df.loc[:, df.columns[3:]]
X[int_cols] = X[int_cols].astype(int)
y = df.loc[:, ['customer_responded']].astype('category')

In [8]:
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 0, shuffle=True)

# **Random forest model**

In [9]:
clf = RandomForestClassifier()
clf.fit(train_X, train_y)

  


RandomForestClassifier()

In [20]:
clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [10]:
pred_rf = clf.predict(val_X)
pred_rf

array([1., 0., 1., ..., 1., 1., 1.])

# **Evaluate random forest model**

In [39]:
accuracy_rf = accuracy_score(val_y, pred_rf)
print("RF model accuracy: %0.2f" % (accuracy_rf*100), "%")

RF model accuracy: 93.07 %


In [16]:
print(classification_report(val_y,pred_rf))

              precision    recall  f1-score   support

         0.0       0.94      0.88      0.91      6666
         1.0       0.92      0.96      0.94      9926

    accuracy                           0.93     16592
   macro avg       0.93      0.92      0.93     16592
weighted avg       0.93      0.93      0.93     16592



# **Tuned model**

In [21]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 50, stop = 1000, num = 8)]

# Number of features to consider at every split
max_features = ['auto', 'sqrt']

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 200, num = 8)]
max_depth.append(None)

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Method of selecting samples for training each tree
bootstrap = [True, False]# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

print(random_grid)

{'n_estimators': [50, 185, 321, 457, 592, 728, 864, 1000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 37, 64, 91, 118, 145, 172, 200, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [22]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf_t = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_tuned = RandomizedSearchCV(estimator = rf_t, param_distributions = random_grid, n_iter = 10, cv = 3, verbose=10, random_state=42, n_jobs = -1)
# Fit the random search model
rf_tuned.fit(train_X, train_y)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


  self.best_estimator_.fit(X, y, **fit_params)


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 37, 64, 91, 118, 145,
                                                      172, 200, None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [50, 185, 321, 457, 592,
                                                         728, 864, 1000]},
                   random_state=42, verbose=10)

In [23]:
pred_rf_tuned = rf_tuned.predict(val_X)
pred_rf_tuned

array([1., 0., 1., ..., 1., 1., 1.])

In [38]:
accuracy_tuned = accuracy_score(val_y, pred_rf_tuned)
print("Tuned model accuracy: %0.2f" % (accuracy_tuned*100), "%")

Tuned model accuracy: 93.12 %


In [40]:
print("RF model accuracy: %0.2f" % (accuracy_rf*100), "%")
print("Tuned model accuracy: %0.2f" % (accuracy_tuned*100), "%")

RF model accuracy: 93.07 %
Tuned model accuracy: 93.12 %


In [34]:
rf_tuned.best_params_

{'bootstrap': True,
 'max_depth': None,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 5,
 'n_estimators': 864}