In [None]:
#standard libraries
import os, re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy
%pylab inline
# %pylab star-imports numpy.random, which rebinds the bare name `random`.
# Import the stdlib module *after* it so that `random.seed(...)` resolves to the module.
import random

#ML libraries
# train_test_split lives in sklearn.model_selection; the old sklearn.cross_validation
# module was removed in scikit-learn 0.20.
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_extraction.text import *
from sklearn.metrics import *
from sklearn.utils import shuffle
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier


#BO libraries
import GPy
import GPyOpt
# Seed used to initialise the random number generators in the data-preparation cell.
rs = 1234

In [None]:
#data preparation and loading
np.random.seed(rs)
random.seed(rs)

DATA_ROOT = os.path.join("Data", "aclImdb")
N_TRAIN_FIXED = 1000  # size of the fixed training set taken from the shuffled pool


def _read_reviews(split, label):
    """Return the text of every file in DATA_ROOT/<split>/<label>.

    Newlines inside a file are replaced by single spaces so each review is one line.
    Files are read in sorted name order: os.listdir() makes no ordering guarantee,
    and the seeded shuffle further down is only reproducible if the input order is.
    """
    folder = os.path.join(DATA_ROOT, split, label)
    reviews = []
    for fname in sorted(os.listdir(folder)):
        # `with` closes each handle promptly instead of leaking one per review file
        with open(os.path.join(folder, fname), "r", encoding="utf8") as handle:
            reviews.append(handle.read().replace("\n", " "))
    return reviews


text = []
clas = []
classname = ["pos", "neg"]
# Both on-disk splits are pooled into one labelled collection ("test" folder first,
# then "train"); the split actually used for modelling is drawn again below.
for split in ("test", "train"):
    for item in classname:
        reviews = _read_reviews(split, item)
        text.extend(reviews)
        clas.extend([item] * len(reviews))

#store in dataframe
dataframe = pd.DataFrame(clas, columns=['class'])
dataframe["text"] = text
# Both calls below are given no explicit random_state, so they are expected to draw
# from the global numpy RNG seeded at the top of this cell. The second shuffle is
# redundant but kept so the resulting permutation is unchanged.
dataframe = shuffle(dataframe)
dataframe = dataframe.sample(frac=1).reset_index(drop=True)
print("We have "+str(len(text))+" classified examples")

#the first N_TRAIN_FIXED shuffled examples form the fixed training set
#all remaining examples are the test set / stand in for the population
Y = dataframe["class"].tolist()
X = dataframe["text"].tolist()
X_train_fixed = X[:N_TRAIN_FIXED]
Y_train_fixed = Y[:N_TRAIN_FIXED]
X_pop = X[N_TRAIN_FIXED:]
Y_pop = Y[N_TRAIN_FIXED:]

#prepare for training
# Vocabulary and IDF weights are fitted on the fixed training set only and then
# applied unchanged to the population texts.
count_vect = CountVectorizer(
    min_df=10,
    ngram_range=(1, 3),
    stop_words="english",
    analyzer='word',
    max_features=500,
)
X_train_counts = count_vect.fit_transform(X_train_fixed)
tfidf_transformer = TfidfTransformer()
X_train_fixed_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_pop_counts = count_vect.transform(X_pop)
X_pop_tfidf = tfidf_transformer.transform(X_pop_counts)

In [None]:
# Search space for the optimiser, one entry per hyperparameter.
#   - 'continuous' entries give a (lower, upper) range
#   - 'discrete' entries give a tuple of the allowed values
domain = [
    dict(name='max_features', type='continuous', domain=(0.00001, 1)),
    dict(name='max_depth', type='discrete', domain=(2, 3, 4, 5)),
    dict(name='min_samples_split', type='continuous', domain=(0.00000001, 0.5)),
    dict(name='min_samples_leaf', type='continuous', domain=(0.00000001, 0.5)),
]

In [None]:
#set up the function to optimize: The performance of our ML model
def fit_RF(x):
    """Objective function: negated cross-validated score of a random forest.

    Parameters
    ----------
    x : array-like
        One hyperparameter setting per row (a single 1-D setting is promoted to
        one row). Columns are read positionally as
        ``[max_features, max_depth, min_samples_split, min_samples_leaf]``;
        ``max_depth`` is truncated to ``int``.

    Returns
    -------
    numpy.ndarray of shape (n_rows, 1)
        For each row, minus the mean ``test_score`` of a 5-fold cross-validation
        on ``X_train_fixed_tfidf`` / ``Y_train_fixed``. The sign is flipped so
        that minimising this function maximises the score.
    """
    x = np.atleast_2d(x)
    n_settings = x.shape[0]
    fs = np.zeros((n_settings, 1))
    for i in range(n_settings):
        setting = x[i]
        clf = RandomForestClassifier(
            random_state=1234,
            n_estimators=1000,
            max_features=setting[0],
            max_depth=int(setting[1]),  # the optimiser hands values over as floats
            min_samples_split=setting[2],
            min_samples_leaf=setting[3],
            n_jobs=4,
        )
        cv_result = cross_validate(clf, X_train_fixed_tfidf, Y_train_fixed, cv=5, n_jobs=5)
        fs[i, 0] = -np.mean(cv_result['test_score'])
    return fs


In [None]:
#repeat many times
N_REPEATS = 50          # independent optimisation runs
N_INITIAL_POINTS = 15   # random evaluations used to initialise each run
N_BO_ITERATIONS = 66    # optimisation steps after the initial design

# many_scores2[r][k] is the best score seen in run r after k+1 evaluations.
many_scores2 = []
for j in range(N_REPEATS):
    print(j)
    #set up and initialize BO model (fresh model and kernel for every repetition)
    opt = GPyOpt.methods.BayesianOptimization(
        f=fit_RF,                                   # function to optimize
        domain=domain,                              # box-constraints of the problem
        acquisition_type='LCB',
        initial_design_type="random",
        initial_design_numdata=N_INITIAL_POINTS,
        kernel=GPy.kern.Matern52(len(domain)),      # one input dimension per hyperparameter
    )
    opt.run_optimization(max_iter=N_BO_ITERATIONS)
    # fit_RF returns negated scores, so flip the sign back to get the real ones.
    evaluated = -opt.Y.ravel()
    # Running maximum over ALL evaluations. The previous prefix loop stopped at
    # range(1, len(scores)) and therefore never included the final evaluation.
    scores = list(np.maximum.accumulate(evaluated))
    many_scores2.append(scores)