In [1]:
# Add needed libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random

# VIF for multi-collinearity detection
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Models and modeling tools
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
import statsmodels.api as sm
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# Change inline plots default size
plt.rcParams['figure.figsize'] = [12, 8]

## Summary: ##



## Final Model: ##

In [18]:
# Read in the rawData
columns = ['Happy', 'OnTime', 'OrderCorrect', 'FoundEverything', 'GoodPrice', 'DeliverySatisfaction', 'EasyToUse']
rawData = pd.read_csv('ACME-HappinessSurvey2020.csv', header=0, names=columns)

# Separate data in independent (features) and dependent (target) datasets
# Exclude 'OrderCorrect' and 'DeliverySatisfaction' from features due to poor performance
x_final = rawData[['OnTime', 'FoundEverything', 'GoodPrice', 'EasyToUse']]
y_final = rawData[['Happy']]

clf = RandomForestClassifier(max_depth=11, min_samples_split=6, min_samples_leaf=1)
scores = cross_val_score(clf, x_final, np.ravel(y_final), cv=5)
print (scores.mean(), scores.std())

clf = clf.fit(x_final, np.ravel(y_final))

#### Add test set here ####
## testData = pd.read_csv()
## x_test = testData[['OnTime', 'FoundEverything', 'GoodPrice', 'EasyToUse']]
## y_test = testData[['Happy']]
## clf.predict(x_test)

0.7230769230769231 0.14289649500258453


## Work: ##

In [3]:
## Num Estimators Param Search ##
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 19)]
for nest in n_estimators:
    clf = RandomForestClassifier(n_estimators=nest)
    scores = cross_val_score(clf, x_final, np.ravel(y_final), cv=5)
    print (nest, scores.mean(), scores.std())

200 0.6993846153846154 0.1344217508463259
300 0.6993846153846154 0.1344217508463259
400 0.6913846153846154 0.1341467888051949
500 0.6913846153846154 0.1341467888051949
600 0.6993846153846154 0.1344217508463259
700 0.6993846153846154 0.1344217508463259
800 0.699076923076923 0.1283397856929642
900 0.6913846153846154 0.1341467888051949
1000 0.707076923076923 0.12814784361231057
1100 0.6913846153846154 0.1341467888051949
1200 0.6913846153846154 0.1341467888051949
1300 0.6993846153846154 0.1344217508463259
1400 0.6993846153846154 0.1344217508463259
1500 0.6993846153846154 0.1344217508463259
1600 0.6993846153846154 0.1344217508463259
1700 0.6913846153846154 0.1341467888051949
1800 0.6913846153846154 0.1341467888051949
1900 0.6993846153846154 0.1344217508463259
2000 0.6993846153846154 0.1344217508463259


In [4]:
## Max Depth Samples Leaf Param Search ##
max_depth = [int(x) for x in np.linspace(1, 75, num = 38)]
for md in max_depth:
    clf = RandomForestClassifier(max_depth=md)
    scores = cross_val_score(clf, x_final, np.ravel(y_final), cv=5)
    print (md, scores.mean(), scores.std())

1 0.5793846153846153 0.0894511866489171
3 0.62 0.1475127113166862
5 0.6516923076923077 0.10735595826570564
7 0.6596923076923077 0.11346464141780474
9 0.6913846153846154 0.1341467888051949
11 0.6993846153846154 0.12453596709622372
13 0.6993846153846154 0.1344217508463259
15 0.6913846153846154 0.1341467888051949
17 0.707076923076923 0.12814784361231057
19 0.707076923076923 0.12814784361231057
21 0.6913846153846154 0.1341467888051949
23 0.6993846153846154 0.1344217508463259
25 0.7073846153846154 0.12431835445515627
27 0.707076923076923 0.12814784361231057
29 0.6993846153846154 0.1344217508463259
31 0.707076923076923 0.11773644220242603
33 0.6993846153846154 0.1344217508463259
35 0.6993846153846154 0.1344217508463259
37 0.6913846153846154 0.1341467888051949
39 0.707076923076923 0.12814784361231057
41 0.707076923076923 0.14234419490265562
43 0.6913846153846154 0.1341467888051949
45 0.691076923076923 0.12293222263079255
47 0.699076923076923 0.1283397856929642
49 0.707076923076923 0.128147843

In [5]:
## Min Samples Split Param Search ##
min_samples_split = [2, 3, 4, 5, 6, 7, 8, 9, 10]
for mss in min_samples_split:
    clf = RandomForestClassifier(max_depth=11, min_samples_split=mss)
    scores = cross_val_score(clf, x_final, np.ravel(y_final), cv=5)
    print (mss, scores.mean(), scores.std())

2 0.699076923076923 0.1283397856929642
3 0.6993846153846154 0.1344217508463259
4 0.6913846153846154 0.1341467888051949
5 0.6836923076923077 0.14139658441009187
6 0.7153846153846153 0.14054500847872195
7 0.707076923076923 0.1354321594839468
8 0.699076923076923 0.13795325509648537
9 0.6753846153846154 0.12970608558989535
10 0.6513846153846153 0.13276343690005465


In [10]:
## Min Samples Leaf Param Search ##
min_samples_leaf = [1, 2, 3, 4, 5, 6]
for msl in min_samples_leaf:
    clf = RandomForestClassifier(max_depth=11, min_samples_split=6, min_samples_leaf=msl)
    scores = cross_val_score(clf, x_final, np.ravel(y_final), cv=5)
    print (msl, scores.mean(), scores.std())

1 0.7073846153846153 0.14344006851099128
2 0.699076923076923 0.14693910504598826
3 0.6753846153846153 0.16451039067199755
4 0.6030769230769231 0.1266407115523488
5 0.5870769230769232 0.11541373991404467
6 0.5633846153846154 0.10452665920810394
