In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
import nltk
from nltk.corpus import stopwords
import string
from nltk import word_tokenize, FreqDist
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

In [2]:
df = pd.read_csv('../Data/data.csv', encoding='latin-1')
df.rename(columns={'emotion_in_tweet_is_directed_at':'object', 'is_there_an_emotion_directed_at_a_brand_or_product':'sentiment'}, inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9093 entries, 0 to 9092
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   tweet_text  9092 non-null   object
 1   object      3291 non-null   object
 2   sentiment   9093 non-null   object
dtypes: object(3)
memory usage: 213.2+ KB


Remove row with missing text data.

In [3]:
df.dropna(thresh=2, inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9092 entries, 0 to 9092
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   tweet_text  9092 non-null   object
 1   object      3291 non-null   object
 2   sentiment   9092 non-null   object
dtypes: object(3)
memory usage: 284.1+ KB


In [4]:
df.head()

Unnamed: 0,tweet_text,object,sentiment
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [5]:
df['sentiment'].value_counts()

No emotion toward brand or product    5388
Positive emotion                      2978
Negative emotion                       570
I can't tell                           156
Name: sentiment, dtype: int64

In [6]:
df['sentiment'].replace("I can't tell", "Unknown", inplace=True)
df['sentiment'].replace("No emotion toward brand or product", "None", inplace=True)
df['sentiment'].replace(" emotion", "", inplace=True)
df['sentiment'].value_counts()

None                5388
Positive emotion    2978
Negative emotion     570
Unknown              156
Name: sentiment, dtype: int64

In [7]:
df.head()

Unnamed: 0,tweet_text,object,sentiment
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [8]:
X = df[['tweet_text', 'object']]
y = df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=18, stratify=y)

In [9]:
X_train.reset_index(inplace=True)
X_train.drop('index', axis=1, inplace=True)
X_test.reset_index(inplace=True)
X_test.drop('index', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


## Remove Stopwords

In [10]:
#Create List of stopwords & punctuation
stopwords_list = stopwords.words('english') + list(string.punctuation)
stopwords_list += ["''", '""', '...', '``']

The '@' and '#' symbols carry special meaning in text on twitter and other social media platforms.  To include this meaning in the analysis, we'll remove these characters from the stopwords list.  The dataset also includes '{link}' in place of any actual url links.  We'll leave '{' and '}' in the text to reflect this.

In [11]:
stopwords_list.remove('#')
stopwords_list.remove('@')
stopwords_list.remove('{')
stopwords_list.remove('}')

In [12]:
def remove_stopwords(text):
    tokens = nltk.word_tokenize(text)
    stopwords_removed = [token.lower() for token in tokens if token.lower() not in stopwords_list]
    return stopwords_removed    

In [13]:
processed = list(map(remove_stopwords, X_train['tweet_text']))

In [14]:
joined_text_list = []
for word_list in processed:
    joined_text = ' '.join(word_list)
    joined_text_list.append(joined_text)
joined_text_list[:5]

['ipad2 3 weeks w ipad since gave @ mention # sxsw withdrawal',
 'rt @ mention first shots w/ipad 2 # sxsw { link }',
 "rt @ mention ning amp mobile roadie thrilled offer unofficial # sxsw insider 's guide iphone fun austin { link }",
 "rt @ mention bounced catch google 's marissa mayer speak always admired intelligent classy successful # sxsw",
 'part journalsim support democracy yes informed populous yes ipad focus support # newsapps # sxsw']

In [15]:
X_train['processed_text'] = joined_text_list

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [16]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6819 entries, 0 to 6818
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   tweet_text      6819 non-null   object
 1   object          2467 non-null   object
 2   processed_text  6819 non-null   object
dtypes: object(3)
memory usage: 159.9+ KB


In [17]:
X_train.head()

Unnamed: 0,tweet_text,object,processed_text
0,No ipad2 for me. Now I have 3 weeks w no iPad ...,,ipad2 3 weeks w ipad since gave @ mention # sx...
1,RT @mention First shots w/iPad 2 from #sxsw {l...,,rt @ mention first shots w/ipad 2 # sxsw { link }
2,RT @mention Ning &amp; Mobile Roadie are thril...,,rt @ mention ning amp mobile roadie thrilled o...
3,RT @mention Bounced over to catch Google's Mar...,,rt @ mention bounced catch google 's marissa m...
4,Part of Journalsim is the support of democracy...,iPad,part journalsim support democracy yes informed...


## TF-IDF Vectorization

In [18]:
vectorizer = TfidfVectorizer()
tf_idf_data_train = vectorizer.fit_transform(X_train['processed_text'])
tf_idf_data_train.shape

(6819, 8530)

6.8k tweets with 8.5k unique words in the vocabulary.

## RF Classifier

In [19]:
#Create pipeline
pipe_forest = Pipeline([('forest', RandomForestClassifier(random_state=70, n_jobs=-1, bootstrap=True))])

In [22]:
# Create the grid parameter
grid_forest = [{'forest__n_estimators': [200],
             'forest__max_depth': [1, 5, 25, 50],
             'forest__min_samples_split': [2, 5, 25], 
             'forest__min_samples_leaf': [1, 5, 25], 
             'forest__criterion': ['gini', 'entropy'],
             'forest__max_features': ['auto', 'sqrt', 'log2'],
             'forest__max_samples': [None, .2, .5, .8]
             }]

# Create the grid, with "pipe" as the estimator
gridsearch_forest = GridSearchCV(estimator=pipe_forest, 
                          param_grid=grid_forest,
                          scoring='accuracy',
                          return_train_score=True, #Include training results in cv_results
                          cv=3, #Use 3 folds in CV process
                          n_jobs=-1, #Use paralell computing
                          verbose=4) #Give updates on progress during fitting

In [23]:
gridsearch_forest.fit(tf_idf_data_train, y_train)

Fitting 3 folds for each of 864 candidates, totalling 2592 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed:   19.9s
[Parallel(n_jobs=-1)]: Done 205 tasks      | elapsed:   43.6s
[Parallel(n_jobs=-1)]: Done 376 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 597 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 866 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 1185 tasks      | elapsed:  5.9min
[Parallel(n_jobs=-1)]: Done 1552 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-1)]: Done 1969 tasks      | elapsed:  8.9min
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed: 12.1min
[Parallel(n_jobs=-1)]: Done 2592 out of 2592 | elapsed: 13.0min finished


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('forest',
                                        RandomForestClassifier(n_jobs=-1,
                                                               random_state=70))]),
             n_jobs=-1,
             param_grid=[{'forest__criterion': ['gini', 'entropy'],
                          'forest__max_depth': [1, 5, 25, 50],
                          'forest__max_features': ['auto', 'sqrt', 'log2'],
                          'forest__max_samples': [None, 0.2, 0.5, 0.8],
                          'forest__min_samples_leaf': [1, 5, 25],
                          'forest__min_samples_split': [2, 5, 25],
                          'forest__n_estimators': [200]}],
             return_train_score=True, scoring='accuracy', verbose=4)

In [24]:
gridsearch_forest.best_params_

{'forest__criterion': 'gini',
 'forest__max_depth': 50,
 'forest__max_features': 'auto',
 'forest__max_samples': None,
 'forest__min_samples_leaf': 1,
 'forest__min_samples_split': 5,
 'forest__n_estimators': 200}

In [27]:
gridsearch_forest_df = pd.DataFrame.from_dict(gridsearch_forest.cv_results_)
gridsearch_forest_df.sort_values(by=['mean_test_score'], ascending=False).head(20)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_forest__criterion,param_forest__max_depth,param_forest__max_features,param_forest__max_samples,param_forest__min_samples_leaf,param_forest__min_samples_split,...,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
361,7.114472,0.189051,0.178533,0.046561,gini,50,sqrt,,1,5,...,0.641443,0.640123,0.64115,0.000748,1,0.758689,0.756049,0.761989,0.758909,0.00243
325,7.917075,0.074119,0.143317,0.046349,gini,50,auto,,1,5,...,0.641443,0.640123,0.64115,0.000748,1,0.758689,0.756049,0.761989,0.758909,0.00243
360,7.832189,0.144521,0.143255,0.047762,gini,50,sqrt,,1,2,...,0.642763,0.640123,0.639977,0.002337,3,0.782446,0.778047,0.783326,0.781273,0.002309
324,8.283978,0.224978,0.142927,0.047153,gini,50,auto,,1,2,...,0.642763,0.640123,0.639977,0.002337,3,0.782446,0.778047,0.783326,0.781273,0.002309
362,5.887313,0.576168,0.178125,0.048393,gini,50,sqrt,,1,25,...,0.638803,0.635284,0.637337,0.001496,5,0.723053,0.719754,0.724373,0.722393,0.001943
326,6.987861,0.671505,0.176646,0.048612,gini,50,auto,,1,25,...,0.638803,0.635284,0.637337,0.001496,5,0.723053,0.719754,0.724373,0.722393,0.001943
352,7.261401,0.291828,0.178529,0.049368,gini,50,auto,0.8,1,5,...,0.637923,0.631324,0.63543,0.002926,7,0.745491,0.742411,0.743731,0.743877,0.001262
388,7.006231,0.117164,0.142067,0.047635,gini,50,sqrt,0.8,1,5,...,0.637923,0.631324,0.63543,0.002926,7,0.745491,0.742411,0.743731,0.743877,0.001262
387,7.104897,0.363592,0.143401,0.047068,gini,50,sqrt,0.8,1,2,...,0.635724,0.636164,0.63499,0.00136,9,0.765728,0.760449,0.763528,0.763235,0.002165
351,7.262914,0.406968,0.212097,0.002764,gini,50,auto,0.8,1,2,...,0.635724,0.636164,0.63499,0.00136,9,0.765728,0.760449,0.763528,0.763235,0.002165


In [None]:
best_models = gridsearch_forest_df.loc[gridsearch_forest_df['rank_test_score'] < 6]
best_models

In [33]:
#Create pipeline - Lock in parameters based on previous model

#Remove entropy based on previous results
#auto & sqrt are the same according to docs
pipe_forest2 = Pipeline([('forest', RandomForestClassifier(random_state=70, n_jobs=-1, bootstrap=True, criterion='gini', max_features='sqrt'))])

In [34]:
# Create the grid parameter
grid_forest2 = [{'forest__n_estimators': [200],
             'forest__max_depth': [50, 75, 100],
             'forest__min_samples_split': [2, 5, 25], 
             'forest__min_samples_leaf': [1, 5, 25], 
             'forest__max_samples': [.2, .5, .8] #Remove option to use all samples in bootstrp.  To combat overfitting.
             }]

# Create the grid, with "pipe" as the estimator
gridsearch_forest2 = GridSearchCV(estimator=pipe_forest2, 
                          param_grid=grid_forest2,
                          scoring='accuracy',
                          return_train_score=True, #Include training results in cv_results
                          cv=3, #Use 3 folds in CV process
                          n_jobs=-1, #Use paralell computing
                          verbose=4) #Give updates on progress during fitting

In [35]:
gridsearch_forest2.fit(tf_idf_data_train, y_train)

Fitting 3 folds for each of 81 candidates, totalling 243 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    5.1s
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed:   39.2s
[Parallel(n_jobs=-1)]: Done 205 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 243 out of 243 | elapsed:  2.1min finished


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('forest',
                                        RandomForestClassifier(max_features='sqrt',
                                                               n_jobs=-1,
                                                               random_state=70))]),
             n_jobs=-1,
             param_grid=[{'forest__max_depth': [50, 75, 100],
                          'forest__max_samples': [0.2, 0.5, 0.8],
                          'forest__min_samples_leaf': [1, 5, 25],
                          'forest__min_samples_split': [2, 5, 25],
                          'forest__n_estimators': [200]}],
             return_train_score=True, scoring='accuracy', verbose=4)

In [36]:
gridsearch_forest2.best_params_

{'forest__max_depth': 100,
 'forest__max_samples': 0.8,
 'forest__min_samples_leaf': 1,
 'forest__min_samples_split': 2,
 'forest__n_estimators': 200}

In [37]:
gridsearch_forest_df2 = pd.DataFrame.from_dict(gridsearch_forest2.cv_results_)
gridsearch_forest_df2.sort_values(by=['mean_test_score'], ascending=False).head(20)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_forest__max_depth,param_forest__max_samples,param_forest__min_samples_leaf,param_forest__min_samples_split,param_forest__n_estimators,params,...,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
72,11.390336,0.408708,0.143235,0.049102,100,0.8,1,2,200,"{'forest__max_depth': 100, 'forest__max_sample...",...,0.649362,0.647602,0.649362,0.001437,1,0.91597,0.907611,0.921249,0.914944,0.005615
74,9.036112,1.21951,0.177156,0.047008,100,0.8,1,25,200,"{'forest__max_depth': 100, 'forest__max_sample...",...,0.651562,0.641003,0.647456,0.004619,2,0.823141,0.822481,0.826221,0.823948,0.00163
73,10.733454,0.204149,0.180419,0.048814,100,0.8,1,5,200,"{'forest__max_depth': 100, 'forest__max_sample...",...,0.641003,0.644963,0.645696,0.004163,3,0.883854,0.875935,0.888033,0.882607,0.005017
46,7.740358,0.0881,0.111053,0.001586,75,0.8,1,5,200,"{'forest__max_depth': 75, 'forest__max_samples...",...,0.643643,0.645842,0.644523,0.00095,4,0.821161,0.820282,0.824681,0.822041,0.001901
45,8.464817,0.181826,0.14209,0.046208,75,0.8,1,2,200,"{'forest__max_depth': 75, 'forest__max_samples...",...,0.645403,0.641883,0.643936,0.001496,5,0.857237,0.846018,0.860757,0.854671,0.006285
47,7.299459,0.290567,0.178249,0.048391,75,0.8,1,25,200,"{'forest__max_depth': 75, 'forest__max_samples...",...,0.646722,0.641003,0.643496,0.002392,6,0.777167,0.775627,0.780246,0.77768,0.00192
64,8.980183,0.121701,0.211513,0.001281,100,0.5,1,5,200,"{'forest__max_depth': 100, 'forest__max_sample...",...,0.644523,0.638803,0.64203,0.002392,7,0.812802,0.815662,0.820722,0.816395,0.003274
63,9.259192,0.394029,0.211585,0.001459,100,0.5,1,2,200,"{'forest__max_depth': 100, 'forest__max_sample...",...,0.642323,0.639683,0.640856,0.001097,8,0.849098,0.842719,0.849758,0.847192,0.003174
65,7.163476,1.523602,0.211788,0.00374,100,0.5,1,25,200,"{'forest__max_depth': 100, 'forest__max_sample...",...,0.644083,0.634844,0.63851,0.004005,9,0.768148,0.769908,0.779806,0.772621,0.005132
37,6.856656,0.10325,0.177202,0.04855,75,0.5,1,5,200,"{'forest__max_depth': 75, 'forest__max_samples...",...,0.638363,0.638363,0.637337,0.001452,10,0.767268,0.774307,0.776507,0.772694,0.00394


In [66]:
gridsearch_forest_df2['train_test_diff'] = gridsearch_forest_df2['mean_train_score'] - gridsearch_forest_df2['mean_test_score']

#sort by difference btwn train vs test scores
min_overfitting = gridsearch_forest_df2.sort_values(by=['train_test_diff'], ascending=True)

#Remove unneeded columns
min_overfitting = min_overfitting[['train_test_diff', 'mean_test_score', 'mean_train_score', 'param_forest__max_depth', 'param_forest__max_samples', 'param_forest__min_samples_leaf', 'param_forest__min_samples_split']]

In [68]:
min_overfitting = min_overfitting.loc[min_overfitting['train_test_diff'] < 0.05]
min_overfitting.sort_values(by=['mean_test_score'], ascending=False).head(25)

Unnamed: 0,train_test_diff,mean_test_score,mean_train_score,param_forest__max_depth,param_forest__max_samples,param_forest__min_samples_leaf,param_forest__min_samples_split
77,0.03153,0.625458,0.656988,100,0.8,5,25
75,0.033363,0.624138,0.657501,100,0.8,5,2
76,0.033363,0.624138,0.657501,100,0.8,5,5
49,0.031163,0.624138,0.655301,75,0.8,5,5
48,0.031163,0.624138,0.655301,75,0.8,5,2
50,0.030796,0.623845,0.654641,75,0.8,5,25
29,0.042748,0.621059,0.663807,75,0.2,1,25
23,0.024124,0.620326,0.644449,50,0.8,5,25
21,0.025444,0.619886,0.645329,50,0.8,5,2
22,0.025444,0.619886,0.645329,50,0.8,5,5


* max_depth in 75 to >100 range
* max_samples around 0.8
* min_samples_leaf >5, <20
* min_samples_split >2, <25

In [69]:
#Create pipeline - Lock in parameters based on previous model

#Remove entropy based on previous results
#auto & sqrt are the same according to docs
pipe_forest3 = Pipeline([('forest', RandomForestClassifier(random_state=70, n_jobs=-1, bootstrap=True, criterion='gini', max_features='sqrt'))])

In [70]:
# Create the grid parameter
grid_forest3 = [{'forest__n_estimators': [200],
             'forest__max_depth': [75, 85, 110, 150],
             'forest__min_samples_split': [4, 7, 10, 15], 
             'forest__min_samples_leaf': [7, 10, 12, 15], 
             'forest__max_samples': [.7, .8, .9]
             }]

# Create the grid, with "pipe" as the estimator
gridsearch_forest3 = GridSearchCV(estimator=pipe_forest3, 
                          param_grid=grid_forest3,
                          scoring='accuracy',
                          return_train_score=True, #Include training results in cv_results
                          cv=3, #Use 3 folds in CV process
                          n_jobs=-1, #Use paralell computing
                          verbose=4) #Give updates on progress during fitting

In [71]:
gridsearch_forest3.fit(tf_idf_data_train, y_train)

Fitting 3 folds for each of 192 candidates, totalling 576 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    8.0s
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed:   34.5s
[Parallel(n_jobs=-1)]: Done 205 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 376 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 576 out of 576 | elapsed:  3.9min finished


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('forest',
                                        RandomForestClassifier(max_features='sqrt',
                                                               n_jobs=-1,
                                                               random_state=70))]),
             n_jobs=-1,
             param_grid=[{'forest__max_depth': [75, 85, 110, 150],
                          'forest__max_samples': [0.7, 0.8, 0.9],
                          'forest__min_samples_leaf': [7, 10, 12, 15],
                          'forest__min_samples_split': [4, 7, 10, 15],
                          'forest__n_estimators': [200]}],
             return_train_score=True, scoring='accuracy', verbose=4)

In [72]:
gridsearch_forest3.best_params_

{'forest__max_depth': 75,
 'forest__max_samples': 0.9,
 'forest__min_samples_leaf': 7,
 'forest__min_samples_split': 15,
 'forest__n_estimators': 200}

In [73]:
gridsearch_forest_df3 = pd.DataFrame.from_dict(gridsearch_forest3.cv_results_)
gridsearch_forest_df3.sort_values(by=['mean_test_score'], ascending=False).head(20)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_forest__max_depth,param_forest__max_samples,param_forest__min_samples_leaf,param_forest__min_samples_split,param_forest__n_estimators,params,...,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
35,4.391086,0.073613,0.143997,0.049403,75,0.9,7,15,200,"{'forest__max_depth': 75, 'forest__max_samples...",...,0.614166,0.615926,0.613873,0.001808,1,0.628465,0.634844,0.630004,0.631104,0.002718
83,4.343822,0.042848,0.108909,0.001129,85,0.9,7,15,200,"{'forest__max_depth': 85, 'forest__max_samples...",...,0.614166,0.615926,0.613726,0.002,2,0.628245,0.635064,0.629784,0.631031,0.00292
179,4.449345,0.098457,0.143732,0.046444,150,0.9,7,15,200,"{'forest__max_depth': 150, 'forest__max_sample...",...,0.614166,0.615926,0.613726,0.002,2,0.628245,0.635064,0.629784,0.631031,0.00292
131,4.338524,0.18684,0.109974,0.002477,110,0.9,7,15,200,"{'forest__max_depth': 110, 'forest__max_sample...",...,0.614166,0.615926,0.613726,0.002,2,0.628245,0.635064,0.629784,0.631031,0.00292
128,3.856939,0.289372,0.144278,0.049895,110,0.9,7,4,200,"{'forest__max_depth': 110, 'forest__max_sample...",...,0.614166,0.615926,0.613286,0.00259,5,0.630884,0.635284,0.630004,0.632057,0.002309
81,4.319267,0.186667,0.143671,0.049532,85,0.9,7,7,200,"{'forest__max_depth': 85, 'forest__max_samples...",...,0.614166,0.615926,0.613286,0.00259,5,0.630884,0.635064,0.630004,0.631984,0.002207
32,4.031406,0.087684,0.144441,0.049655,75,0.9,7,4,200,"{'forest__max_depth': 75, 'forest__max_samples...",...,0.614166,0.615926,0.613286,0.00259,5,0.630664,0.634844,0.629784,0.631764,0.002207
33,4.321893,0.10697,0.109104,0.001041,75,0.9,7,7,200,"{'forest__max_depth': 75, 'forest__max_samples...",...,0.614166,0.615926,0.613286,0.00259,5,0.630664,0.634844,0.629784,0.631764,0.002207
80,4.004467,0.074456,0.110058,0.001762,85,0.9,7,4,200,"{'forest__max_depth': 85, 'forest__max_samples...",...,0.614166,0.615926,0.613286,0.00259,5,0.630884,0.635064,0.630004,0.631984,0.002207
129,4.326383,0.083914,0.111555,0.001731,110,0.9,7,7,200,"{'forest__max_depth': 110, 'forest__max_sample...",...,0.614166,0.615926,0.613286,0.00259,5,0.630884,0.635284,0.630004,0.632057,0.002309


In [None]:
# Create the grid parameter
grid_forest = [{'forest__n_estimators': [200],
             'forest__max_depth': [1, 5, 15, 25, 50],
             'forest__min_samples_split': [2, 5, 10, 25, 50], 
             'forest__min_samples_leaf': [1, 3, 5, 10, 25], 
             'forest__criterion': ['gini', 'entropy'],
             'forest__max_features': ['auto', 'sqrt', 'log2'],
             'forest__max_samples': [None, .2, .5, .8]
             }]

# Create the grid, with "pipe" as the estimator
gridsearch_forest = RandomizedSearchCV(estimator=pipe_forest, 
                          param_distributions=grid_forest, 
                          return_train_score=True, #Include training results in cv_results
                          cv=5, #Use 5 folds in CV process
                          n_iter=500, #Try 500 hyperparameter combinations
                          n_jobs=-1, #Use paralell computing
                          verbose=8) #Give updates on progress during fitting