# fastText

In [1]:
# warnings
from warnings import simplefilter
simplefilter("ignore", category=FutureWarning)
simplefilter("ignore", category=DeprecationWarning)

# custom built functions
from logs.get_logs import logger
from dataPrep.get_data_fold import data_read
from utils.utils import set_seed

# import libraries
# Seed every random number generator used below so runs are repeatable.
seed=42
import os
# NOTE(review): PYTHONHASHSEED is read when an interpreter starts, so setting it
# here presumably only affects processes launched from this kernel, not the
# kernel itself — confirm whether that is the intent.
os.environ['PYTHONHASHSEED'] = str(seed)
import random
random.seed(seed)
import numpy as np
np.random.seed(seed)
import time
import pandas as pd

# NLP Preprocessing
from gensim.utils import simple_preprocess

import csv
import fasttext

from sklearn.metrics import classification_report, confusion_matrix, f1_score
import seaborn as sns
import matplotlib.pyplot as plt

# Data

In [2]:
## Inputs: model choice, stage switches and output locations for this run
# Choose model; the name is used to build the results folder and the log file name.
model_select = "fastText" # Options: SVM, RoBERTa, LSTM, Longformer, OpenAIGPT2, fastText

# Stage switches — presumably 1 enables a stage and 0 disables it (TODO confirm).
model_tokenize=0
TokenizeCombine=0
model_train = 1
model_predict = 0

# Output locations
task = "_Train_Test" # Train Test
taskName = model_select + task
root_dir = '/home/ravi/PROJECTS_DATA/DataModelsResults'
model_folder = root_dir + "/Results/" + model_select + "/"
# Later cells write train.txt, test.txt, model.bin and the result files into this
# folder, so make sure it exists before anything tries to open a file in it.
os.makedirs(model_folder, exist_ok=True)
log_dir_fname = model_folder + taskName +".log"
print("log_dir_fname: {}".format(log_dir_fname))

# logger
# The factory is imported under an alias here: binding its result to the name
# `logger` would otherwise overwrite the factory itself, and re-running this cell
# would then call the returned logger object instead of the factory.
from logs.get_logs import logger as make_logger
logger = make_logger(log_dir_fname=log_dir_fname)

log_dir_fname: /home/ravi/PROJECTS_DATA/DataModelsResults/Results/fastText/fastText_Train_Test.log


In [3]:
## Load the data splits; data_read logs the shape of each split it returns.
# NOTE(review): the fold handling (80% train / 20% test) presumably lives inside
# data_read — confirm in dataPrep.get_data_fold.
# Only the full training set and the test set are used in this notebook, so the
# separate train/validation splits are dropped right away.
all_train_data, _train_split, _val_split, test_data = data_read(logger, root_dir)
del _train_split, _val_split

# # For Trial run
# frac = 0.01
# # for id-ying the threshold of compute to run models
# all_train_data = all_train_data.sample(frac=frac, replace=True, random_state=42)
# test_data = test_data.sample(frac=frac, replace=True, random_state=42)
# logger.info("all_train_data {}, all_train_data.shape {}".format(frac, all_train_data.shape))
# logger.info("frac {}, test_data.shape {}".format(frac, test_data.shape))

2023-04-30 17:45:25,487 | get_data_fold.py: 11: data_read() |  INFO: all_train_data.shape (301715, 14)
2023-04-30 17:45:25,489 | get_data_fold.py: 12: data_read() |  INFO: train_data.shape (241372, 14)
2023-04-30 17:45:25,490 | get_data_fold.py: 13: data_read() |  INFO: dev_data.shape (60343, 14)
2023-04-30 17:45:25,491 | get_data_fold.py: 14: data_read() |  INFO: test_data.shape (75429, 14)


In [4]:
# Keep only the columns fastText needs: the class label and the article text.
# .copy() makes each result an independent frame, so the label rewrite in a later
# cell modifies these frames directly and does not raise SettingWithCopyWarning.
all_train_data = all_train_data[["label", "article"]].copy()
test_data = test_data[["label", "article"]].copy()

In [5]:
# Preview the first rows of the training frame (label and article columns).
all_train_data.head()

Unnamed: 0,label,article
0,2,"Discord, a Slack-like chat app originally desi..."
1,2,"It all started in May, when a paper titled ""Th..."
2,0,"Campaigning in Kentucky over the weekend, Bern..."
3,1,"Two nights ago, I watched a video that made me..."
4,2,0 Jim Grant has always offered a good balance ...


In [6]:
# fastText's supervised format marks class labels with the '__label__' prefix.
LABEL_PREFIX = '__label__'


def add_label_prefix(labels):
    """Return `labels` as strings that carry LABEL_PREFIX exactly once.

    Parameters
    ----------
    labels : pandas.Series
        Raw class ids (e.g. 0, 1, 2) or labels that were already prefixed.

    Returns
    -------
    pandas.Series
        String labels such as '__label__2'. Values that already start with the
        prefix are returned unchanged, so re-running this cell does not produce
        '__label____label__2'.
    """
    as_text = labels.astype(str)
    already_prefixed = as_text.str.startswith(LABEL_PREFIX)
    return as_text.where(already_prefixed, LABEL_PREFIX + as_text)


# Copy of the test set that keeps integer class ids; the metrics cells compare
# these with the integer predictions. Any prefix left by an earlier run of this
# cell is stripped first so the copy is the same on every run.
test_data_yTrue_yPred = test_data.copy()
test_data_yTrue_yPred["label"] = (
    test_data_yTrue_yPred["label"]
    .astype(str)
    .str.replace(LABEL_PREFIX, "", regex=False)
    .astype(int)
)

# Build new frames with the prefixed label column instead of writing strings into
# the existing integer column through .iloc, which is an incompatible-dtype set.
all_train_data = all_train_data.assign(label=add_label_prefix(all_train_data["label"]))
test_data = test_data.assign(label=add_label_prefix(test_data["label"]))

In [7]:
# Preview the training frame again to check the '__label__' prefix on the labels.
all_train_data.head()

Unnamed: 0,label,article
0,__label__2,"Discord, a Slack-like chat app originally desi..."
1,__label__2,"It all started in May, when a paper titled ""Th..."
2,__label__0,"Campaigning in Kentucky over the weekend, Bern..."
3,__label__1,"Two nights ago, I watched a video that made me..."
4,__label__2,0 Jim Grant has always offered a good balance ...


In [8]:
# Number of rows in the training frame and in the test frame.
print(all_train_data.shape[0], test_data.shape[0])

301715 75429


In [9]:
# Write the training set in fastText's supervised format: one example per line,
# '<label> <article text>'.
# Line breaks inside an article are replaced by spaces, otherwise the rest of the
# article would land on a new line that carries no label. Lines are written
# directly rather than via DataFrame.to_csv with QUOTE_NONE, which needs an escape
# character and inserts it in front of every space in the text.
with open(model_folder+'train.txt', 'w', encoding='utf-8', newline='\n') as train_file:
    # fillna('') writes a missing article as empty text.
    for label, article in zip(all_train_data['label'], all_train_data['article'].fillna('')):
        one_line = str(article).replace('\r', ' ').replace('\n', ' ')
        train_file.write(f"{label} {one_line}\n")

In [10]:
# Write the test set in fastText's supervised format: one example per line,
# '<label> <article text>'.
# Line breaks inside an article are replaced by spaces, otherwise the rest of the
# article would land on a new line that carries no label. Lines are written
# directly rather than via DataFrame.to_csv with QUOTE_NONE, which needs an escape
# character and inserts it in front of every space in the text.
with open(model_folder+'test.txt', 'w', encoding='utf-8', newline='\n') as test_file:
    # fillna('') writes a missing article as empty text.
    for label, article in zip(test_data['label'], test_data['article'].fillna('')):
        one_line = str(article).replace('\r', ' ').replace('\n', ' ')
        test_file.write(f"{label} {one_line}\n")

In [None]:
# Training the fastText classifier
# Hyper-parameters passed to fasttext.train_supervised.
train_params = dict(lr=0.1, epoch=1000, wordNgrams=2, bucket=200000, dim=50, loss='hs')
if model_train:
    model = fasttext.train_supervised(model_folder+'train.txt', **train_params)
else:
    # Training is switched off: reuse the model file written by the save cell of
    # an earlier run instead of repeating the multi-hour training.
    model = fasttext.load_model(model_folder+'model.bin')

Read 647M words
Number of words:  11904981
Number of labels: 3
Progress:  26.6% words/sec/thread:  949364 lr:  0.073377 avg.loss:  0.119560 ETA:   2h57m26s41s 16.0% words/sec/thread:  952359 lr:  0.084010 avg.loss:  0.152571 ETA:   3h22m31s 18.8% words/sec/thread:  951345 lr:  0.081167 avg.loss:  0.141427 ETA:   3h15m52s 19.4% words/sec/thread:  951876 lr:  0.080645 avg.loss:  0.139897 ETA:   3h14m30s avg.loss:  0.138084 ETA:   3h13m45s ETA:   3h11m52s 20.6% words/sec/thread:  950409 lr:  0.079412 avg.loss:  0.135228 ETA:   3h11m49s 20.8% words/sec/thread:  950431 lr:  0.079200 avg.loss:  0.134641 ETA:   3h11m18s 20.8% words/sec/thread:  950049 lr:  0.079165 avg.loss:  0.134571 ETA:   3h11m18s 21.1% words/sec/thread:  951119 lr:  0.078947 avg.loss:  0.134029 ETA:   3h10m34s 21.2% words/sec/thread:  950965 lr:  0.078831 avg.loss:  0.133972 ETA:   3h10m19s 23.8% words/sec/thread:  950568 lr:  0.076162 avg.loss:  0.126165 ETA:   3h 3m57s 25.1% words/sec/thread:  950544 lr:  0.074881 avg.l

In [None]:
# Score the model on the whole test file; the return value is the cell output.
test_path = f"{model_folder}test.txt"
model.test(test_path)

In [None]:
# Sanity check: classify a single test article (the one at row position 2).
sample_article = test_data["article"].iloc[2]
model.predict(sample_article)

In [None]:
# Persist the trained classifier in the model folder as model.bin.
model_path = f"{model_folder}model.bin"
model.save_model(model_path)

# Predict on Test data

In [None]:
# Predict one class id per test article.
# fastText's predict() refuses text that contains '\n', so line breaks are turned
# into spaces first. Passing a list classifies all articles in one call instead
# of making one Python-level call per article.
label_prefix_len = len('__label__')
test_texts = [
    article.replace('\r', ' ').replace('\n', ' ')
    for article in test_data['article']
]
predicted_labels, _predicted_probs = model.predict(test_texts)
# Each entry holds the top-1 label, e.g. '__label__2'; keep the integer class id.
predictions = [int(labels[0][label_prefix_len:]) for labels in predicted_labels]

In [None]:
# Store the predicted class ids in a new column next to the true labels.
test_data_yTrue_yPred["y_pred"] = np.asarray(predictions)

In [None]:
# Show the first row to check that the y_pred column was added.
test_data_yTrue_yPred.head(1)

In [None]:
# Save the true labels and predictions as a JSON list of records.
# model_folder already ends with '/', so os.path.join is used to avoid the doubled
# separator that model_folder + '/...' produces.
results_path = os.path.join(model_folder, 'test_data_yTrue_yPred.json')
test_data_yTrue_yPred.to_json(results_path, orient='records')

In [None]:
# metrics: classification report, weighted F1 and confusion matrix on the test set
# Class ids and their display names, in matching order (0 -> 'Liberal',
# 1 -> 'Conservative', 2 -> 'Restricted').
class_ids = [0, 1, 2]
target_names = ['Liberal', 'Conservative', 'Restricted']
display_labels = target_names
y_true = test_data_yTrue_yPred.label
y_pred = test_data_yTrue_yPred.y_pred

# labels=class_ids pins the report rows and the matrix to all three classes, so
# the names still line up if a class is missing from the labels or predictions.
classi_report = classification_report(y_true, y_pred, labels=class_ids, target_names=target_names, digits=4)
logger.info("classi_report:\n{}".format(classi_report))
logger.info("Testing f1_weighted score: {}".format(f1_score(y_true, y_pred, average='weighted')))

logger.info("Plot ConfusionMatrix")
cm = confusion_matrix(y_true, y_pred, labels=class_ids)
fig, ax = plt.subplots(figsize=(3,3))
confusion_heatmap = sns.heatmap(cm, annot=True, xticklabels=display_labels, yticklabels=display_labels, cmap='Blues', ax=ax, fmt='d')
# confusion_matrix puts true classes on the rows and predicted classes on the columns.
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
plt.yticks(va="center")
plt.xticks(va="center")
# The file name is built from model_select, which fixes the earlier 'fasText' typo.
fig.savefig(model_folder + model_select + '_ConfusionMatrix.png', format='png', dpi=1200, bbox_inches='tight')