# Search for best parameters for Random Forest classifier

## Read data

In [None]:
# Load the feature table, clean missing values, and split it into a numeric
# feature matrix (`features`) and a 0/1 label vector (`labels`).
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Suffix of the feature file to load
time = '80_100'

# Read in data as a dataframe
features = pd.read_csv('../features/features_training1/features_{}.csv'.format(time))

# Raw snapshot of the table as a numpy array (kept under its original name)
features_num = features.to_numpy()

# Replace NaN / +-inf in the numeric columns only.
# The 'quality' column is mapped from string labels further down, so calling
# np.nan_to_num on the whole table (an object array) leaves it unchanged:
# nan_to_num only rewrites float/complex arrays.
numeric_columns = features.select_dtypes(include='number').columns
if len(numeric_columns) > 0:
    features[numeric_columns] = np.nan_to_num(
        features[numeric_columns].to_numpy(dtype=float)
    )

# Extract features and labels and print feature names
labels = features['quality']
features = features.drop('quality', axis=1)

names = features.columns
print(names)

# Encode the class label: native -> 1, non-native -> 0
y = labels.map({'native': 1, "non-native": 0})
if y.isnull().any():
    # Series.map yields NaN for any value missing from the dict; fail here
    # with a clear message instead of passing NaN labels to the classifier.
    unexpected = sorted(labels[y.isnull()].astype(str).unique())
    raise ValueError(
        "Unexpected or missing values in 'quality' column: {}".format(unexpected)
    )
x = features.values

# Convert to numpy arrays
features = np.array(x)
labels = np.array(y)

## Specify training and test sets

In [None]:
# Hold out a quarter of the samples for testing; the fixed seed keeps the
# partition reproducible between runs.
from sklearn.model_selection import train_test_split

split = train_test_split(features, labels, test_size=0.25, random_state=42)
train_features, test_features, train_labels, test_labels = split



## Set a base model with RF classifier 

In [None]:
from pprint import pprint

from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier

# Small, untuned forest that serves as the reference point for the
# comparisons made later in the notebook.
base_model = RandomForestClassifier(random_state=42, n_estimators=10)

# Show the hyperparameters the reference forest is using
print('Parameters currently in use:\n')
pprint(base_model.get_params())

# Train on the training split, then score on the held-out split
base_model.fit(train_features, train_labels)
pred_labels = base_model.predict(test_features)
base_accuracy = metrics.accuracy_score(test_labels, pred_labels)
print("Base model Accuracy:", base_accuracy)


## Random Search with Cross Validation

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
n_estimators = [int(n) for n in np.linspace(start=200, stop=2000, num=10)]
# Number of features to consider at every split.
# 'auto' is not used: for classifiers it was an alias of 'sqrt' (so the old
# list held a duplicate), and it is rejected by scikit-learn >= 1.3.
max_features = ['sqrt', 'log2']
# Maximum number of levels in tree (None = grow until leaves are pure)
max_depth = [int(d) for d in np.linspace(10, 110, num=11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

pprint(random_grid)

# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier(random_state=42)
# Random search of parameters, using 3 fold cross validation,
# search across 100 different combinations, and use all available cores.
# Candidates are ranked by classification accuracy; on 0/1 labels this orders
# them exactly as the regression metric 'neg_mean_absolute_error' did
# (MAE = 1 - accuracy), but the reported scores are now readable accuracies.
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid,
                               n_iter=100, scoring='accuracy',
                               cv=3, verbose=2, random_state=42, n_jobs=-1,
                               return_train_score=True)

# Fit the random search model
rf_random.fit(train_features, train_labels)

rf_random.best_params_

### Evaluate the Best Random Search Model

In [None]:
# The search refits the winning configuration on the full training split
# (refit=True is the default), so best_estimator_ is ready to predict and
# does not need a second, expensive .fit() call.
best_random = rf_random.best_estimator_

# Score the tuned forest on the held-out split
pred_labels = best_random.predict(test_features)
random_accuracy = metrics.accuracy_score(test_labels, pred_labels)
print("Best random model Accuracy:", random_accuracy)

# Relative change in accuracy with respect to the untuned base model
print('Improvement of {:0.2f}%.'.format(100 * (random_accuracy - base_accuracy) / base_accuracy))

## Grid Search 

We can now run a grid search that builds on the result of the random search.
We will test a range of hyperparameter values around the best ones returned by the random search.

In [None]:
from sklearn.model_selection import GridSearchCV

# Create the parameter grid based on the results of random search.
# - 'auto' is not listed under max_features: for classifiers it was an alias
#   of 'sqrt' and it is rejected by scikit-learn >= 1.3.
# - warm_start is not searched: it only matters when the same estimator is
#   fitted repeatedly, so on the fresh clone fitted for each candidate it
#   would double the number of fits without changing any score.
param_grid = {
    'bootstrap': [True],
    'max_depth': [5, 10, 50, 110],
    'min_samples_leaf': [1, 3, 5],
    'min_samples_split': [2, 8, 12],
    'n_estimators': [100, 300, 1000, 1500],
    'max_features': ['sqrt', 'log2'],
    'oob_score': [True],  # requires bootstrap=True; exposes oob_score_ after fitting
}

# Create a base model
rf = RandomForestClassifier(random_state=42)

# Instantiate the grid search model (5-fold CV, all available cores)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                           cv=5, n_jobs=-1, verbose=2, return_train_score=True)

# Fit the grid search to the data
grid_search.fit(train_features, train_labels)
grid_search.best_params_



### Test RF classifier with the best parameters

In [None]:
# Forest built with explicitly chosen hyperparameters.
# max_features='sqrt' is what 'auto' meant for classifiers; 'auto' itself is
# rejected by scikit-learn >= 1.3.
rf_param = RandomForestClassifier(bootstrap=True, max_depth=50, max_features='sqrt',
                                  min_samples_leaf=1, min_samples_split=2,
                                  n_estimators=1000, oob_score=True,
                                  random_state=42)

rf_param.fit(train_features, train_labels)

# Score this model on the held-out split. The accuracy is computed from this
# model's own predictions, not from predictions left over from an earlier cell.
pred_labels_best = rf_param.predict(test_features)
best_accuracy = metrics.accuracy_score(test_labels, pred_labels_best)
grid_accuracy = best_accuracy
print("Best Grid model Accuracy:", grid_accuracy)

#### Evaluate the Best Model from Grid Search

In [None]:
# Accuracy and out-of-bag score of the forest built with hand-set parameters
pred_labels_best = rf_param.predict(test_features)
best_accuracy = metrics.accuracy_score(test_labels, pred_labels_best)
print("Best parameter model Accuracy:", best_accuracy)
print(rf_param.oob_score_)

# Accuracy and out-of-bag score of the estimator selected by the grid search.
# best_grid is assigned before it is used, and the accuracy comes from its
# own predictions on the held-out split.
best_grid = grid_search.best_estimator_
pred_labels_grid = best_grid.predict(test_features)
grid_accuracy = metrics.accuracy_score(test_labels, pred_labels_grid)
print("Best Grid model Accuracy:", grid_accuracy)
print(best_grid.oob_score_)