# Imports

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
os.chdir(
print(os.getcwd())

In [1]:
import pandas as pd
import numpy as np
import time

# sklearn
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split,RandomizedSearchCV
from sklearn.pipeline import Pipeline
from commons.metrics import *
from sklearn.metrics import classification_report

In [4]:
df = pd.read_csv('datasets\processed_data.csv')

In [7]:
X_train, X_test, y_train, y_test = train_test_split(df['Description'], df['JobTitle'], test_size=0.3,
                                                    random_state=42, stratify=df['JobTitle'])

## RandomForestClassifier

In [7]:
pipline = Pipeline(steps=[
    ('bow', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('model', RandomForestClassifier())
])

In [8]:

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 1000, num = 5)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(2, 15, num = 5)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

On each iteration, the algorithm will choose a difference combination of the features. Altogether, there are 5 * 2 * 6 * 3 * 3 * 2 = 1080 settings! However, the benefit of a random search is that we are not trying every combination, but selecting at random to sample a wide range of values.

In [9]:
# hyperparameter tuning
def grid_search():
    param_grid = {
        'model__n_estimators': n_estimators,
        'model__max_features': max_features,
        'model__max_depth': max_depth,
        'model__min_samples_split': min_samples_split,
        'model__min_samples_leaf': min_samples_leaf,
        'model__bootstrap': bootstrap
    }

    search = RandomizedSearchCV(pipline, param_grid, cv=5, n_jobs=-1, verbose=3)
    search.fit(X_train,y_train)

    print("Best parameter (CV score=%0.3f):" % search.best_score_)
    print(search.best_params_)

In [10]:
start = time.time()

grid_search()

end = time.time()
print('execution time in minutes: ', (end - start)/60) 

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed: 15.3min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 30.8min finished


Best parameter (CV score=0.516):
{'model__n_estimators': 200, 'model__min_samples_split': 2, 'model__min_samples_leaf': 2, 'model__max_features': 'auto', 'model__max_depth': 15, 'model__bootstrap': True}
execution time in minutes:  31.473552453517915


In [8]:
# adding best hyperparameter

rf = RandomForestClassifier(min_samples_split = 2, max_depth=15, n_estimators = 200, min_samples_leaf = 2,
                            max_features = 'auto',bootstrap = True)

pipline = Pipeline(steps=[
    ('bow', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('model', rf)
])

In [9]:
pipline.fit(X_train,y_train)

Pipeline(steps=[('bow', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('model',
                 RandomForestClassifier(max_depth=15, min_samples_leaf=2,
                                        n_estimators=200))])

In [10]:
predictions = pipline.predict(X_test)

In [11]:
print(classification_report(y_test, predictions))

                                                       precision    recall  f1-score   support

                                      .NET Programmer       0.00      0.00      0.00         1
                                 ASIC Design Engineer       0.00      0.00      0.00         1
                                           Accountant       0.00      0.00      0.00         9
                             Accountant - Multifamily       0.00      0.00      0.00         1
                                   Accounting Analyst       0.00      0.00      0.00         1
                                 Accounting Assistant       0.00      0.00      0.00         1
                                Accounting Consultant       0.00      0.00      0.00         1
                Accounts Payable / Receivable Analyst       0.00      0.00      0.00         1
                Accounts Payable / Receivable Manager       0.00      0.00      0.00         2
            Accounts Payable or Receivable Specia

  _warn_prf(average, modifier, msg_start, len(result))


In [12]:
# model evaluation

evaluate(y_test, predictions)

  _warn_prf(average, modifier, msg_start, len(result))


{'precision': '0.58', 'recall': '0.52', 'f1': '0.50', 'accuracy': '0.52'}