In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [2]:
np.random.seed(1001)
df = pd.read_csv("bench.csv")
df.dropna(inplace=True)
df.head(5)

Unnamed: 0,A,B,C,D,E,F,G,H,I,J,K,L,M,N,O
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


In [3]:
df.shape

(31321, 15)

In [4]:
df = pd.get_dummies(df)
df.shape

(31321, 108)

In [5]:
df = df.sample(frac=1)
df_copy = df.copy(deep=True)
df.head()

Unnamed: 0,A,C,E,K,L,M,O,B_ ?,B_ Federal-gov,B_ Local-gov,...,N_ Portugal,N_ Puerto-Rico,N_ Scotland,N_ South,N_ Taiwan,N_ Thailand,N_ Trinadad&Tobago,N_ United-States,N_ Vietnam,N_ Yugoslavia
12439,44,181762,10,0,0,50,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1814,49,188330,9,0,0,40,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
12333,18,256967,6,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
18331,59,59469,13,0,0,50,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
27343,41,223548,9,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
target = np.array(df["O"])
df = df.drop("O", axis=1)
df_list = list(df.columns)
df = StandardScaler().fit_transform(df)
x_train, x_test, y_train, y_test = train_test_split(df, target, test_size=0.3, random_state=101)

In [7]:
print("Orginal")
model = RandomForestClassifier(random_state=101).fit(x_train, y_train)
pred = model.predict(x_test)
print(confusion_matrix(y_test, pred))
print(accuracy_score(y_test, pred))

Orginal
[[6656  511]
 [ 855 1375]]
0.8546344578056827


In [8]:
# Manual Search
model = RandomForestClassifier(n_estimators=10, random_state=101).fit(x_train, y_train)
pred = model.predict(x_test)
print(confusion_matrix(y_test, pred))
print(accuracy_score(y_test, pred))
# Performance decreased

[[6647  520]
 [ 932 1298]]
0.8454826008300521


In [9]:
# Random Search
# I will set the parameter set
random_search = {"criterion": ["entropy", "gini"],
                "max_depth": list(np.linspace(10, 50, 5, dtype=int)),
                "max_features": ["auto", "sqrt", "log2", None],
                "min_samples_leaf": [4, 6, 8, 12],
                "min_samples_split": [5, 7, 10, 14],
                "n_estimators": list(np.linspace(20, 100, 5, dtype=int))}
clsfr = RandomForestClassifier()
# Total combination: 2 * 5 * 4 * 4 * 4 * 5 = 3200
model = RandomizedSearchCV(estimator=clsfr, param_distributions=random_search, n_iter=20,
                           cv=4, random_state=101, n_jobs=-1)
model.fit(x_train, y_train)
print("Random")
predfor = model.best_estimator_.predict(x_test)
print(confusion_matrix(y_test, predfor))
print(accuracy_score(y_test, predfor))
acc1 = accuracy_score(y_test, predfor)

Random
[[6788  379]
 [ 909 1321]]
0.8629349792486964


In [10]:
# Grid Search
grid_search = {
    "criterion": [model.best_params_["criterion"]],
    "max_depth": [model.best_params_["max_depth"]],
    "max_features": [model.best_params_["max_features"]],
    "min_samples_leaf": [model.best_params_["min_samples_leaf"] - 1, 
    model.best_params_["min_samples_leaf"],
    model.best_params_["min_samples_leaf"] + 1],
    "min_samples_split": [model.best_params_["min_samples_split"] - 1,
                          model.best_params_["min_samples_split"],
                          model.best_params_["min_samples_split"] +1],
    "n_estimators": [model.best_params_["n_estimators"] - 10,
                    model.best_params_["n_estimators"],
                    model.best_params_["n_estimators"] + 10] 
}
clsfr = RandomForestClassifier()
model = GridSearchCV(estimator=clsfr, param_grid=grid_search, cv=4, n_jobs=-1)
model.fit(x_train, y_train)
print("Grid")
predfor = model.best_estimator_.predict(x_test)
print(confusion_matrix(y_test, predfor))
print(accuracy_score(y_test, predfor))
acc2 = accuracy_score(y_test, predfor)

Grid
[[6784  383]
 [ 885 1345]]
0.8650633180802384


In [12]:
from sklearn.model_selection import cross_val_score

In [13]:
# Bayesian Optimization
space = {
    "criterion": hp.choice("criterion",  ["entropy", "gini"]),
    "max_depth": hp.quniform("max_depth", 10, 50, 5),
    "max_features": hp.choice("max_features", ["auto", "sqrt", "log2", None]),
    "min_samples_leaf": hp.choice("min_samples_leaf", [4, 6, 8, 12]),
    "min_samples_split": hp.choice("min_samples_split", [5, 7, 10, 14]),
    "n_estimators": hp.choice("n_estimators", [20, 40 , 60, 80, 100])
}

def objective(space):
    model = RandomForestClassifier(criterion = space["criterion"],
                                  max_depth = space["max_depth"],
                                  max_features = space["max_features"],
                                  min_samples_leaf = space["min_samples_leaf"],
                                  min_samples_split = space["min_samples_split"],
                                  n_estimators = space["n_estimators"]  
                                  )
    accuracy = cross_val_score(model, x_train, y_train, cv=4).mean()
    return {"loss": -accuracy, "status": STATUS_OK}

trials = Trials()
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=20, trials=trials)
best
crit = {0: "entropy", 1: "gini"}
feat = {0: "auto", 1: "sqrt", 2: "log2", 3: None}
leaf = {0:4, 1:6, 2:8, 3:12}
split = {0:5, 1:7, 2:10, 3:14}
est = {0:20, 1:40, 2:60, 3:80, 4:100}

trainfor  = RandomForestClassifier(criterion=crit[best["criterion"]],
                                  max_depth= best["max_depth"],
                                  max_features=feat[best["max_features"]],
                                  min_samples_leaf=leaf[best["min_samples_leaf"]],
                                  min_samples_split=split[best["min_samples_split"]],
                                  n_estimators= est[best["n_estimators"]]).fit(x_train, y_train)

predfor = trainfor.predict(x_test)
print(confusion_matrix(y_test, predfor))
print(accuracy_score(y_test, predfor))
acc3 = accuracy_score(y_test, predfor)

100%|██████████| 20/20 [02:10<00:00,  6.50s/trial, best loss: -0.8595602992154716]
[[6794  373]
 [ 909 1321]]
0.863573480898159
