In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_recall_fscore_support, f1_score, precision_score, recall_score

In [3]:
import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt
pd.options.display.max_colwidth = 5000
# import matplotlib
# import nltk
#import swifter
#import collections
#from wordcloud import WordCloud
#from sklearn.model_selection import train_test_split
import logging.handlers
#from prettytable import PrettyTable
from datetime import date
# import pickle


In [4]:
# number of prototypical words to keep.
k = 200
log_to_file = True


try:
    log.info("*******************************")
    log.info("Log is already initiated.")
except:
    if log_to_file:
        logging.basicConfig(filename=f"../data/log/log_CV training_Kaggle_.log",
                            filemode='a',
                            format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s',
                            datefmt='%H:%M:%S',
                            level=logging.DEBUG)
        log = logging.getLogger("Bot")
        log.addHandler(logging.StreamHandler())
        log.info("###########################################################")

    else:
        log = logging.getLogger("Bot")
        log.setLevel(logging.DEBUG)
        log.addHandler(logging.StreamHandler())

###########################################################


In [5]:
# Read data file (2019&2020)
data = pd.read_feather('../data/feather_files/KaggleDataset_cleaned.feather')
data.head(3)

Unnamed: 0,X,Y
0,trey radel slam,0
1,video full high cost break,0
2,please join today remember fall hero honor woman currently military service,1


In [6]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data.X)
Y = data.Y

In [7]:
test_size = .2
X_train, X_valid, Y_train, Y_valid = train_test_split(X, Y, test_size=test_size, random_state=100,shuffle=True, stratify=data.Y)
log.info(f"Taking {round(.2*100,2)}% test subset. The resulting train shape is {X_train.shape} and test shape is {Y_valid.shape}" )


Taking 20.0% test subset. The resulting train shape is (3947, 7600) and test shape is (987,)


In [8]:
# best parameter
best_fit = RandomForestClassifier(bootstrap=False, class_weight='balanced_subsample',
                       max_depth=80, min_samples_leaf=2, min_samples_split=5, n_estimators=1000, n_jobs=4, verbose=1)

best_fit.fit(X_train, Y_train)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    1.8s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    4.2s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    7.7s
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:    9.6s finished


RandomForestClassifier(bootstrap=False, class_weight='balanced_subsample',
                       max_depth=80, min_samples_leaf=2, min_samples_split=5,
                       n_estimators=1000, n_jobs=4, verbose=1)

In [None]:
class_weight = ['balanced', 'balanced_subsample']
n_estimators = [50, 100, 150, 500, 1000]
max_features = ['auto', 'sqrt']
max_depth = [10, 50, 80, 100, 120]
min_samples_split = [2, 5, 6, 7, 10]
min_samples_leaf = [1, 2, 4, 6]
bootstrap = [True, False]
param_grid = {'class_weight' : class_weight,
               'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
               
rf = RandomForestClassifier(verbose=10)
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = param_grid, 
                               n_iter = 10, cv = 5, random_state=6, n_jobs = -1,verbose = 600)
                               
rf_random.fit(X_train,Y_train)
optimized_rf = rf_random.best_estimator_

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [None]:
def SCORER(y_true, y_pred):
    scr = pd.Series(dtype=np.float32)
    scr["f1_score"] = f1_score(y_true, y_pred)
    scr["Accuracy"] = accuracy_score(y_true, y_pred)
    scr['Precision'] = precision_score(y_true, y_pred)
    scr['Recall'] = recall_score(y_true, y_pred)
    return scr

pred_train = best_fit.predict(X_train)
pred_valid = best_fit.predict(X_valid)
scores = pd.DataFrame()
scores[f'Train'] = SCORER(Y_train, pred_train)
scores[f'Valid'] = SCORER(Y_valid, pred_valid)
log.info(tabulate(scores, headers='keys', tablefmt='psql'))
log.info(tabulate(scores, headers='keys', tablefmt='latex_raw'))

In [None]:
log.info("Best parameters")
log.info(optimized_rf)
log.info("Best score")
log.info(rf_random.best_score_)
preds = rf_random.predict(X_valid)
log.info(f"Confusion Matrix:")
log.info(confusion_matrix(Y_valid, preds))
log.info("Accuracy")
log.info(accuracy_score(Y_valid,preds))
log.info("Precision Recall Fscore Support")
log.info(precision_recall_fscore_support(Y_valid, preds))

---------
## Run best fit

In [None]:
model_outfile = "../data/models/randomforest_26Nov.pickle"
log.info(f"Saving trained model to {model_outfile}")
pickle.dump(best_fit, open(model_outfile, 'wb'))

In [None]:

preds = best_fit.predict(X_valid)
log.info(f"Confusion Matrix:")
log.info(confusion_matrix(Y_valid, preds))
log.info("Accuracy")
log.info(accuracy_score(Y_valid,preds))
log.info("Precision Recall Fscore Support")
log.info(precision_recall_fscore_support(Y_valid, preds))