In [1]:
import pandas as pd
import numpy as np
from scipy.optimize import minimize
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.metrics import classification_report, mean_squared_error as mse, r2_score as r2
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
from joblib import dump, load


# Preprocessing

In [2]:
x_df = pd.read_csv('../datasets/winequality_red_x_train.csv')
y_df = pd.read_csv('../datasets/winequality_red_y_train.csv')

In [3]:
def reverse_ohe(row):
    if row['3'] == 1:
        return 3
    elif row['4'] == 1:
        return 4
    elif row['5'] == 1:
        return 5
    elif row['6'] == 1:
        return 6
    elif row['7'] == 1:
        return 7
    elif row['8'] == 1:
        return 8
y_df['label'] = y_df.apply(reverse_ohe, axis=1)
y_df

Unnamed: 0,3,4,5,6,7,8,label
0,0,0,0,1,0,0,6
1,0,0,0,1,0,0,6
2,0,0,1,0,0,0,5
3,0,0,0,0,1,0,7
4,0,0,1,0,0,0,5
...,...,...,...,...,...,...,...
1066,0,0,0,1,0,0,6
1067,0,0,0,1,0,0,6
1068,0,0,1,0,0,0,5
1069,0,0,0,0,1,0,7


In [None]:
#split training and testing set 
x_train, x_test, y_train, y_test = train_test_split(x_df, y_df['label'], test_size=0.2, random_state=0)

In [5]:
y_train.value_counts()

5    360
6    349
7    101
4     28
8     11
3      7
Name: label, dtype: int64

### SMOTE
Model with smote performed worse.

In [None]:
#smote
sm = SMOTE(random_state=0)
x_train_res, y_train_res = sm.fit_resample(x_train, y_train.ravel())

In [None]:
y_train_res = pd.DataFrame(y_train_res)
y_train_res.value_counts()

# Model Selection

In [6]:
models = [
    LogisticRegression(), 
    MLPClassifier(), 
    KNeighborsClassifier(), 
    SVC(),
    GaussianProcessClassifier(), 
    QuadraticDiscriminantAnalysis(), 
    DecisionTreeClassifier(), 
    RandomForestClassifier(), 
    AdaBoostClassifier(), 
    GaussianNB()
] 

In [7]:
for model in models:
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    print('\n\n\n' + str(model))
    print(classification_report(y_test, y_pred))
    




LogisticRegression()
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         6
           5       0.73      0.75      0.74       104
           6       0.51      0.68      0.58        76
           7       0.50      0.11      0.18        28

    accuracy                           0.62       215
   macro avg       0.35      0.31      0.30       215
weighted avg       0.60      0.62      0.59       215



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))





MLPClassifier()
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         6
           5       0.75      0.73      0.74       104
           6       0.54      0.72      0.62        76
           7       0.67      0.29      0.40        28

    accuracy                           0.65       215
   macro avg       0.39      0.35      0.35       215
weighted avg       0.64      0.65      0.63       215




KNeighborsClassifier()
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         6
           5       0.58      0.68      0.63       104
           6       0.49      0.51      0.50        76
           7       0.62      0.29      0.39        28

    accuracy                           0.55       215
   macro avg       0.34      0.30      0.30       215
weighted avg       0.53      0.5

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))





RandomForestClassifier()
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         6
           5       0.70      0.71      0.71       104
           6       0.54      0.68      0.60        76
           7       0.93      0.46      0.62        28

    accuracy                           0.65       215
   macro avg       0.43      0.37      0.39       215
weighted avg       0.65      0.65      0.64       215




AdaBoostClassifier()
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         6
           5       0.63      0.73      0.68       104
           6       0.43      0.54      0.48        76
           7       0.00      0.00      0.00        28

    accuracy                           0.54       215
   macro avg       0.21      0.25      0.23       215
weighted avg       0.46  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Hyperparameter Tuning

### RandomForest hyperparameter tuning

In [10]:
model = RandomForestClassifier()
param_grid = [{   
    # 'n_estimators' : [25, 50, 75, 100, 125, 150, 175, 200],
    'criterion': ['gini', 'entropy', 'log_loss'],
    # 'max_depth': [20, 40, 60, 80, 100, None],
    'bootstrap': [True, False],
    # 'min_samples_split' : [2, 3, 4, 5],
    # 'min_samples_leaf' : [1, 2, 3, 4, 5],
    'max_features' : ['sqrt', 'log2', 'auto', None],
    }
]


clf = GridSearchCV(model, param_grid = param_grid, refit=True, verbose=3)
best_clf = clf.fit(x_train, y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV 1/5] END bootstrap=True, criterion=gini, max_features=sqrt;, score=0.651 total time=   0.2s
[CV 2/5] END bootstrap=True, criterion=gini, max_features=sqrt;, score=0.667 total time=   0.2s
[CV 3/5] END bootstrap=True, criterion=gini, max_features=sqrt;, score=0.702 total time=   0.2s
[CV 4/5] END bootstrap=True, criterion=gini, max_features=sqrt;, score=0.649 total time=   0.2s
[CV 5/5] END bootstrap=True, criterion=gini, max_features=sqrt;, score=0.719 total time=   0.2s
[CV 1/5] END bootstrap=True, criterion=gini, max_features=log2;, score=0.657 total time=   0.2s
[CV 2/5] END bootstrap=True, criterion=gini, max_features=log2;, score=0.690 total time=   0.2s
[CV 3/5] END bootstrap=True, criterion=gini, max_features=log2;, score=0.708 total time=   0.2s
[CV 4/5] END bootstrap=True, criterion=gini, max_features=log2;, score=0.632 total time=   0.2s
[CV 5/5] END bootstrap=True, criterion=gini, max_features=log2;, score=0.6

  warn(


[CV 1/5] END bootstrap=True, criterion=gini, max_features=auto;, score=0.640 total time=   0.2s


  warn(


[CV 2/5] END bootstrap=True, criterion=gini, max_features=auto;, score=0.661 total time=   0.1s


  warn(


[CV 3/5] END bootstrap=True, criterion=gini, max_features=auto;, score=0.702 total time=   0.2s


  warn(


[CV 4/5] END bootstrap=True, criterion=gini, max_features=auto;, score=0.643 total time=   0.1s


  warn(


[CV 5/5] END bootstrap=True, criterion=gini, max_features=auto;, score=0.702 total time=   0.2s
[CV 1/5] END bootstrap=True, criterion=gini, max_features=None;, score=0.605 total time=   0.4s
[CV 2/5] END bootstrap=True, criterion=gini, max_features=None;, score=0.649 total time=   0.4s
[CV 3/5] END bootstrap=True, criterion=gini, max_features=None;, score=0.708 total time=   0.4s
[CV 4/5] END bootstrap=True, criterion=gini, max_features=None;, score=0.620 total time=   0.4s
[CV 5/5] END bootstrap=True, criterion=gini, max_features=None;, score=0.678 total time=   0.4s
[CV 1/5] END bootstrap=True, criterion=entropy, max_features=sqrt;, score=0.622 total time=   0.2s
[CV 2/5] END bootstrap=True, criterion=entropy, max_features=sqrt;, score=0.643 total time=   0.2s
[CV 3/5] END bootstrap=True, criterion=entropy, max_features=sqrt;, score=0.702 total time=   0.2s
[CV 4/5] END bootstrap=True, criterion=entropy, max_features=sqrt;, score=0.649 total time=   0.2s
[CV 5/5] END bootstrap=True,

  warn(


[CV 1/5] END bootstrap=True, criterion=entropy, max_features=auto;, score=0.628 total time=   0.2s


  warn(


[CV 2/5] END bootstrap=True, criterion=entropy, max_features=auto;, score=0.649 total time=   0.2s


  warn(


[CV 3/5] END bootstrap=True, criterion=entropy, max_features=auto;, score=0.673 total time=   0.2s


  warn(


[CV 4/5] END bootstrap=True, criterion=entropy, max_features=auto;, score=0.637 total time=   0.2s


  warn(


[CV 5/5] END bootstrap=True, criterion=entropy, max_features=auto;, score=0.673 total time=   0.2s
[CV 1/5] END bootstrap=True, criterion=entropy, max_features=None;, score=0.634 total time=   0.5s
[CV 2/5] END bootstrap=True, criterion=entropy, max_features=None;, score=0.661 total time=   0.5s
[CV 3/5] END bootstrap=True, criterion=entropy, max_features=None;, score=0.690 total time=   0.5s
[CV 4/5] END bootstrap=True, criterion=entropy, max_features=None;, score=0.661 total time=   0.5s
[CV 5/5] END bootstrap=True, criterion=entropy, max_features=None;, score=0.696 total time=   0.5s
[CV 1/5] END bootstrap=True, criterion=log_loss, max_features=sqrt;, score=0.605 total time=   0.2s
[CV 2/5] END bootstrap=True, criterion=log_loss, max_features=sqrt;, score=0.667 total time=   0.2s
[CV 3/5] END bootstrap=True, criterion=log_loss, max_features=sqrt;, score=0.696 total time=   0.2s
[CV 4/5] END bootstrap=True, criterion=log_loss, max_features=sqrt;, score=0.655 total time=   0.2s
[CV 5/

  warn(


[CV 1/5] END bootstrap=True, criterion=log_loss, max_features=auto;, score=0.640 total time=   0.2s


  warn(


[CV 2/5] END bootstrap=True, criterion=log_loss, max_features=auto;, score=0.667 total time=   0.2s


  warn(


[CV 3/5] END bootstrap=True, criterion=log_loss, max_features=auto;, score=0.713 total time=   0.2s


  warn(


[CV 4/5] END bootstrap=True, criterion=log_loss, max_features=auto;, score=0.643 total time=   0.2s


  warn(


[CV 5/5] END bootstrap=True, criterion=log_loss, max_features=auto;, score=0.678 total time=   0.2s
[CV 1/5] END bootstrap=True, criterion=log_loss, max_features=None;, score=0.610 total time=   0.5s
[CV 2/5] END bootstrap=True, criterion=log_loss, max_features=None;, score=0.620 total time=   0.5s
[CV 3/5] END bootstrap=True, criterion=log_loss, max_features=None;, score=0.667 total time=   0.5s
[CV 4/5] END bootstrap=True, criterion=log_loss, max_features=None;, score=0.614 total time=   0.5s
[CV 5/5] END bootstrap=True, criterion=log_loss, max_features=None;, score=0.690 total time=   0.5s
[CV 1/5] END bootstrap=False, criterion=gini, max_features=sqrt;, score=0.628 total time=   0.1s
[CV 2/5] END bootstrap=False, criterion=gini, max_features=sqrt;, score=0.678 total time=   0.2s
[CV 3/5] END bootstrap=False, criterion=gini, max_features=sqrt;, score=0.655 total time=   0.1s
[CV 4/5] END bootstrap=False, criterion=gini, max_features=sqrt;, score=0.637 total time=   0.1s
[CV 5/5] END

  warn(


[CV 1/5] END bootstrap=False, criterion=gini, max_features=auto;, score=0.651 total time=   0.1s


  warn(


[CV 2/5] END bootstrap=False, criterion=gini, max_features=auto;, score=0.649 total time=   0.1s


  warn(


[CV 3/5] END bootstrap=False, criterion=gini, max_features=auto;, score=0.702 total time=   0.1s


  warn(


[CV 4/5] END bootstrap=False, criterion=gini, max_features=auto;, score=0.637 total time=   0.1s


  warn(


[CV 5/5] END bootstrap=False, criterion=gini, max_features=auto;, score=0.678 total time=   0.1s
[CV 1/5] END bootstrap=False, criterion=gini, max_features=None;, score=0.547 total time=   0.4s
[CV 2/5] END bootstrap=False, criterion=gini, max_features=None;, score=0.515 total time=   0.4s
[CV 3/5] END bootstrap=False, criterion=gini, max_features=None;, score=0.620 total time=   0.4s
[CV 4/5] END bootstrap=False, criterion=gini, max_features=None;, score=0.520 total time=   0.4s
[CV 5/5] END bootstrap=False, criterion=gini, max_features=None;, score=0.544 total time=   0.4s
[CV 1/5] END bootstrap=False, criterion=entropy, max_features=sqrt;, score=0.669 total time=   0.2s
[CV 2/5] END bootstrap=False, criterion=entropy, max_features=sqrt;, score=0.637 total time=   0.2s
[CV 3/5] END bootstrap=False, criterion=entropy, max_features=sqrt;, score=0.667 total time=   0.2s
[CV 4/5] END bootstrap=False, criterion=entropy, max_features=sqrt;, score=0.649 total time=   0.2s
[CV 5/5] END boots

  warn(


[CV 1/5] END bootstrap=False, criterion=entropy, max_features=auto;, score=0.622 total time=   0.2s


  warn(


[CV 2/5] END bootstrap=False, criterion=entropy, max_features=auto;, score=0.643 total time=   0.2s


  warn(


[CV 3/5] END bootstrap=False, criterion=entropy, max_features=auto;, score=0.690 total time=   0.2s


  warn(


[CV 4/5] END bootstrap=False, criterion=entropy, max_features=auto;, score=0.632 total time=   0.2s


  warn(


[CV 5/5] END bootstrap=False, criterion=entropy, max_features=auto;, score=0.637 total time=   0.3s
[CV 1/5] END bootstrap=False, criterion=entropy, max_features=None;, score=0.517 total time=   0.7s
[CV 2/5] END bootstrap=False, criterion=entropy, max_features=None;, score=0.608 total time=   0.6s
[CV 3/5] END bootstrap=False, criterion=entropy, max_features=None;, score=0.550 total time=   0.6s
[CV 4/5] END bootstrap=False, criterion=entropy, max_features=None;, score=0.544 total time=   0.6s
[CV 5/5] END bootstrap=False, criterion=entropy, max_features=None;, score=0.561 total time=   0.6s
[CV 1/5] END bootstrap=False, criterion=log_loss, max_features=sqrt;, score=0.622 total time=   0.2s
[CV 2/5] END bootstrap=False, criterion=log_loss, max_features=sqrt;, score=0.643 total time=   0.2s
[CV 3/5] END bootstrap=False, criterion=log_loss, max_features=sqrt;, score=0.702 total time=   0.2s
[CV 4/5] END bootstrap=False, criterion=log_loss, max_features=sqrt;, score=0.614 total time=   0

  warn(


[CV 1/5] END bootstrap=False, criterion=log_loss, max_features=auto;, score=0.663 total time=   0.3s


  warn(


[CV 2/5] END bootstrap=False, criterion=log_loss, max_features=auto;, score=0.649 total time=   0.2s


  warn(


[CV 3/5] END bootstrap=False, criterion=log_loss, max_features=auto;, score=0.673 total time=   0.2s


  warn(


[CV 4/5] END bootstrap=False, criterion=log_loss, max_features=auto;, score=0.620 total time=   0.2s


  warn(


[CV 5/5] END bootstrap=False, criterion=log_loss, max_features=auto;, score=0.649 total time=   0.2s
[CV 1/5] END bootstrap=False, criterion=log_loss, max_features=None;, score=0.500 total time=   0.6s
[CV 2/5] END bootstrap=False, criterion=log_loss, max_features=None;, score=0.579 total time=   0.6s
[CV 3/5] END bootstrap=False, criterion=log_loss, max_features=None;, score=0.532 total time=   0.6s
[CV 4/5] END bootstrap=False, criterion=log_loss, max_features=None;, score=0.532 total time=   0.6s
[CV 5/5] END bootstrap=False, criterion=log_loss, max_features=None;, score=0.550 total time=   0.6s


In [13]:
print(best_clf.best_params_)

{'bootstrap': True, 'criterion': 'gini', 'max_features': 'sqrt'}


In [14]:
y_pred = best_clf.predict(x_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         6
           5       0.71      0.74      0.72       104
           6       0.59      0.71      0.65        76
           7       0.93      0.50      0.65        28

    accuracy                           0.67       215
   macro avg       0.45      0.39      0.40       215
weighted avg       0.67      0.67      0.66       215



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [15]:
model = RandomForestClassifier()
param_grid = [{   
    'n_estimators' : [25, 50, 75, 100, 125, 150, 175, 200],
    'max_depth': [20, 40, 60, 80, 100, None],
    'min_samples_split' : [2, 3, 4, 5],
    'min_samples_leaf' : [1, 2, 3, 4, 5],
    }
]


clf = GridSearchCV(model, param_grid = param_grid, refit=True, verbose=3)
best_clf = clf.fit(x_train, y_train)

Fitting 5 folds for each of 960 candidates, totalling 4800 fits
[CV 1/5] END max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=25;, score=0.628 total time=   0.0s
[CV 2/5] END max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=25;, score=0.649 total time=   0.0s
[CV 3/5] END max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=25;, score=0.661 total time=   0.0s
[CV 4/5] END max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=25;, score=0.637 total time=   0.0s
[CV 5/5] END max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=25;, score=0.684 total time=   0.0s
[CV 1/5] END max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=0.645 total time=   0.0s
[CV 2/5] END max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=0.667 total time=   0.1s
[CV 3/5] END max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=0.719 total time=   0.0s


In [16]:
print(best_clf.best_params_)

{'max_depth': 60, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 150}


In [21]:
y_pred = best_clf.predict(x_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         6
           5       0.71      0.76      0.73       104
           6       0.59      0.70      0.64        76
           7       0.93      0.46      0.62        28

    accuracy                           0.67       215
   macro avg       0.45      0.38      0.40       215
weighted avg       0.67      0.67      0.66       215



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Test for multiple columns in y_df

In [3]:
#split training and testing set 
x_train, x_test, y_train, y_test = train_test_split(x_df, y_df, test_size=0.2, random_state=0)

In [21]:
model = RandomForestClassifier()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

print('\n\n\n' + str(model))
print(classification_report(y_test, y_pred))




RandomForestClassifier()
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.00      0.00      0.00         6
           2       0.74      0.64      0.69       104
           3       0.71      0.58      0.64        76
           4       0.91      0.36      0.51        28
           5       0.00      0.00      0.00         0

   micro avg       0.74      0.56      0.64       215
   macro avg       0.39      0.26      0.31       215
weighted avg       0.73      0.56      0.63       215
 samples avg       0.56      0.56      0.56       215



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
#with results from hyperparameter tuning
model = RandomForestClassifier(max_depth=60, min_samples_leaf=1, min_samples_split=2, n_estimators=150)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

print('\n\n\n' + str(model))
print(classification_report(y_test, y_pred))




RandomForestClassifier(max_depth=60, n_estimators=150)
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.00      0.00      0.00         6
           2       0.73      0.65      0.69       104
           3       0.65      0.54      0.59        76
           4       0.92      0.39      0.55        28
           5       0.00      0.00      0.00         0

   micro avg       0.71      0.56      0.63       215
   macro avg       0.38      0.26      0.31       215
weighted avg       0.70      0.56      0.61       215
 samples avg       0.56      0.56      0.56       215



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [22]:
dump(model, '../models/clf_red_wine.joblib')

['../models/clf_red_wine.joblib']

# from aldrin

In [None]:
from hyperopt import fmin, tpe, hp, Trials
from sklearn.ensemble import RandomForestClassifier

#define RandomForestClassifier Hyperparameters
space = {'max_depth': hp.choice('max_depth', range(1,100)),
         'criterion': hp.choice('criterion', ["gini","entropy","log_loss"]),
         'n_estimators': hp.choice('n_estimators', range(1,100))
        }

#define target function
def objective(params):

#create model instance with params
    model = RandomForestClassifier(**params)

#train model
    model.fit(X_train, y_train)

#evaluate and return model score

    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='f1_macro')
    
    # Extract the best score
    best_score = max(scores)

    # Loss must be minimized
    loss = 1 - best_score

    # Dictionary with information for evaluation
    return {'loss': loss, 'params': params, 'status': STATUS_OK}

#minimize target function with hyperopt
best = fmin(objective,
            space,
            algo=tpe.suggest,
            max_evals=200)

best