# Random Forest Hyper-parametrization

In [1]:
import numpy as np
import pandas as pd
import sys

from data_processor import sentence_to_words, extract_BoW_features
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from tqdm import tqdm

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/Life/jsaldiva/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/Life/jsaldiva/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Load data

In [2]:
data = pd.read_csv('./data/train_test_data.csv')

### Separate labels from data

In [3]:
X, y = data['sentence'], data['label']

## Prepare data

### Normalize sentences

In [4]:
%%time
X_clean = [sentence_to_words(sentence) for sentence in X]
X_clean_with_steeming = [sentence_to_words(sentence, steeming=True) for sentence in X]
X_clean_with_lemmatization = [sentence_to_words(sentence, lemmatization=True) for sentence in X]
X_clean_with_steeming_lemmatization = [sentence_to_words(sentence, True, True) for sentence in X]

CPU times: user 15.2 s, sys: 1.17 s, total: 16.4 s
Wall time: 16.6 s


In [5]:
preprocess_data = {
    'clean': {'data': X_clean, 'labels': y},
    'clean_steeming': {'data': X_clean_with_steeming, 'labels': y},
    'clean_lemmatization': {'data': X_clean_with_lemmatization, 'labels': y},
    'clean_steeming_lemmatization': {'data': X_clean_with_steeming_lemmatization, 'labels': y}
}

### Transform sentences

In [6]:
transformations = ['tc', 'tf', 'tfidf']
max_features=[100, 200, 400, 500, 800]
ngram_ranges=[(1,1),(1,2),(1,3)]
total_cycles = len(transformations)*len(max_features)*len(ngram_ranges)*len(preprocess_data)

In [7]:
transformed_txts = []

In [8]:
with tqdm(total=total_cycles, file=sys.stdout) as pbar:
    for transformation in transformations:        
        for max_feature in max_features:
            for ngram_range in ngram_ranges:
                for data_type, data_dict in preprocess_data.items():
                    pbar.update(1)
                    data = data_dict['data']
                    transformed_data, _ = extract_BoW_features(data, max_feature, transformation, ngram_range)
                    transformed_txts.append(
                        {
                            'transformation': transformation,
                            'max_features': max_feature,
                            'ngram_range': ngram_range,
                            'data': transformed_data,
                            'data_type': data_type,
                            'labels': data_dict['labels']
                        }
                    )

100%|██████████| 180/180 [00:17<00:00, 10.57it/s]


## Set parameters

In [9]:
n_estimators = [int(x) for x in np.linspace(start=100, stop=1000, num=10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 100, num = 10)]
max_depth.append(None)
min_samples_split = [2, 5, 7, 10]
min_samples_leaf = [1, 2, 3]
bootstrap = [True, False]

### Set parameters grid 

In [10]:
param_grid = {'n_estimators': n_estimators,
              'max_features': max_features,
              'max_depth': max_depth,
              'min_samples_split': min_samples_split,
              'min_samples_leaf': min_samples_leaf,
              'bootstrap': bootstrap}

## Run hyper-parametrization

In [17]:
outputs = []
with tqdm(total=len(transformed_txts), file=sys.stdout) as pbar:
    for transformed_txt in transformed_txts:
        pbar.update(1)
        transformed_X = transformed_txt['data']
        y = transformed_txt['labels']
        X_train, _, y_train, _ = train_test_split(transformed_X, y, test_size=0.20, random_state=42)
        rf_classifier = RandomForestClassifier()
        rf_random = RandomizedSearchCV(estimator = rf_classifier, param_distributions = param_grid, n_iter = 50, cv = 5, verbose=2, random_state=42, n_jobs = -1)
        rf_random.fit(X_train, y_train)
        loop_dict = {
            'transformation': transformed_txt['transformation'],
            'max_features': transformed_txt['max_features'],
            'ngram_range': transformed_txt['ngram_range'],
            
        }
        loop_dict.update(rf_random.best_params_)
        outputs.append(loop_dict)

  0%|          | 0/180 [00:00<?, ?it/s]Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   16.6s
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:   25.9s finished


  1%|          | 2/180 [00:26<38:47, 13.08s/it]Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.4s


  1%|          | 2/180 [00:42<1:03:45, 21.49s/it]


KeyboardInterrupt: 

In [16]:
len(outputs)

48

## Print out best parameters

In [42]:
rf_random.best_params_

{'n_estimators': 500,
 'min_samples_split': 7,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': 20,
 'bootstrap': False}