# Bi-directional LSTM Model

In [153]:
# import os
# os.environ['KMP_DUPLICATE_LIB_OK']='True'

import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix

#Data preprocessing
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler

#PyTorch LSTM
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

#Tokenization for LSTM
from collections import Counter
from gensim.models import Word2Vec

#Seed for reproducibility
import random

seed_value=42
random.seed(seed_value)
np.random.seed(seed_value)

## Read data

In [154]:
df = pd.read_csv("../data/processed/cleaned_data_1.csv", on_bad_lines='skip')
num_nan = df.isna().sum()
print(num_nan)
df = df.dropna()

Tweets        0
AuthorID      0
CreatedAt     0
text_clean    1
text_len      0
dtype: int64


In [155]:
possible_labels = df['AuthorID'].unique()
#convert labels into numeric values
label_dict = {}
for index, possible_label in enumerate(possible_labels):
    label_dict[possible_label] = index

df['label'] = df.AuthorID.replace(label_dict)
print(label_dict)

{23083404: 0, 267425142: 1, 35936474: 2, 42562446: 3, 460116210: 4, 448562247: 5, 2279776304: 6, 50811932: 7, 416814339: 8}


In [156]:
labels = list(label_dict.values())
max_len = np.max(df['text_len'])
print(max_len)

46


## Data split

In [157]:
X = df['text_clean']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=seed_value)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.1, stratify=y_train, random_state=seed_value)

In [159]:
# Oversample training data
ros = RandomOverSampler()
X_train, y_train = ros.fit_resample(np.array(X_train).reshape(-1, 1), np.array(y_train).reshape(-1, 1))
train_os = pd.DataFrame(list(zip([x[0] for x in X_train], y_train)), columns = ['text_clean', 'label'])
X_train = train_os['text_clean'].values
y_train = train_os['label'].values

## Tokenize data

In [160]:
# Function to get vocabulary of words from our data
def get_vocab(column):
    corpus = [word for text in column for word in text.split()]
    count_words = Counter(corpus)
    sorted_words = count_words.most_common()
    return sorted_words

In [161]:
# Function to tokenize the sentences by converting them to lists of numbers with padding
def Tokenize(column, seq_len, vocab):
    vocab_to_int = {w:i+1 for i, (w,c) in enumerate(vocab)}

    # Tokenize the columns text 
    text_int = []
    for text in column:
        r = [vocab_to_int[word] for word in text.split()]
        text_int.append(r)
    
    # Add padding
    features = np.zeros((len(text_int), seq_len), dtype = int)
    for i, review in enumerate(text_int):
        if len(review) <= seq_len:
            zeros = list(np.zeros(seq_len - len(review)))
            new = zeros + review
        else:
            new = review[: seq_len]
        features[i, :] = np.array(new)

    return features

In [135]:
vocabulary = get_vocab(df['text_clean'])

In [162]:
# Create list of words from training data
Word2vec_train_data = list(map(lambda x: x.split(), X_train))

# Set dimensions for ther number of features for each transformed word
EMBEDDING_DIM = 200

# Initialize Word2Vec model with the training words
word2vec_model = Word2Vec(Word2vec_train_data, vector_size=EMBEDDING_DIM)
word2vec_model.save('../models/word2vec.model')

In [137]:
print(f"Vocabulary size: {len(vocabulary) + 1}")
VOCAB_SIZE = len(vocabulary) + 1 #+1 for the padding

Vocabulary size: 22606


In [138]:
# Define empty embedding matrix
embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))
    
# Fill embedding matrix with the pre trained values from word2vec corresponding to 
# word (string), token (number associated to the word)
for word, token in vocabulary:
    if word2vec_model.wv.__contains__(word):
        embedding_matrix[token] = word2vec_model.wv.__getitem__(word)

print("Embedding Matrix Shape:", embedding_matrix.shape)

Embedding Matrix Shape: (22606, 200)


In [163]:
# Tokenize our split datasets

X_train = Tokenize(X_train, max_len, vocabulary)
X_test = Tokenize(X_test, max_len, vocabulary)
X_valid = Tokenize(X_valid, max_len, vocabulary)

In [141]:
print(X_train.shape)

(58788, 46)


## Datasets and Dataloaders

In [164]:
# Transform the datasets into tensor datasets and dataloaders,
# enabling data extraction in batches.

train_data = TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train))
test_data = TensorDataset(torch.from_numpy(X_test), torch.from_numpy(y_test.values))
valid_data = TensorDataset(torch.from_numpy(X_valid), torch.from_numpy(y_valid.values))

BATCH_SIZE = 32

train_loader = DataLoader(train_data, shuffle=True, batch_size=BATCH_SIZE, drop_last=True) 
valid_loader = DataLoader(valid_data, shuffle=True, batch_size=BATCH_SIZE, drop_last=True)
test_loader = DataLoader(test_data, shuffle=True, batch_size=BATCH_SIZE, drop_last=True)

## Bi-LSTM model

In [165]:
class BiLSTM_Authorship_Attribution(nn.Module):

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes, lstm_layers, bidirectional,batch_size, dropout):
        super(BiLSTM_Authorship_Attribution,self).__init__()
        
        self.lstm_layers = lstm_layers
        self.num_directions = 2 if bidirectional else 1
        self.hidden_dim = hidden_dim
        self.num_classes = num_classes
        self.batch_size = batch_size
        

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        self.lstm = nn.LSTM(embedding_dim,
                            hidden_dim,
                            num_layers=lstm_layers,
                            dropout=dropout,
                            bidirectional=bidirectional,
                            batch_first=True)

        self.fc = nn.Linear(hidden_dim*self.num_directions, num_classes)
        self.softmax = nn.LogSoftmax(dim=1)
        
    def forward(self, x, hidden):
        self.batch_size = x.size(0)
        # embedding layer
        embedded = self.embedding(x)
        # lstm layers
        out, hidden = self.lstm(embedded, hidden)
        # Extract only the hidden state from the last LSTM cell
        out = out[:,-1,:]
        # fully connected layers
        out = self.fc(out)
        out = self.softmax(out)

        return out, hidden

    def init_hidden(self, batch_size):
        # Initialize lstm hidden and cell states
        h0 = torch.zeros((self.lstm_layers*self.num_directions, batch_size, self.hidden_dim)).detach().to(DEVICE)
        c0 = torch.zeros((self.lstm_layers*self.num_directions, batch_size, self.hidden_dim)).detach().to(DEVICE)
        hidden = (h0, c0)
        return hidden

In [68]:
def initialize_model(vocab_size, embedding_dim,
                    hidden_dim, num_classes, lstm_layers,
                    bidirectional, batch_size, dropout, LR, epochs=4):
    
    bi_lstm_classifier = BiLSTM_Authorship_Attribution(vocab_size, embedding_dim, 
                                          hidden_dim, num_classes, 
                                          lstm_layers, bidirectional, 
                                          batch_size, dropout)
    
    optimizer = torch.optim.AdamW(bi_lstm_classifier.parameters(), lr=LR, weight_decay = 5e-6)
    
    criterion = nn.NLLLoss()
    
    return bi_lstm_classifier, optimizer, criterion 

In [166]:
# Set hyperparameters

NUM_CLASSES = 9 # multiclass classification of 9 classes
HIDDEN_DIM = 100 # number of neurons of the internal state (internal neural network in the LSTM)
LSTM_LAYERS = 1 # number of stacked LSTM layers

LR = 3e-4 # learning rate
DROPOUT = 0.5 # LSTM Dropout
BIDIRECTIONAL = True # boolean mask to choose if to use a bidirectional LSTM or not
EPOCHS = 5 # number of training epoch
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

In [167]:
# Define model

model, optimizer, criterion = initialize_model(VOCAB_SIZE, EMBEDDING_DIM,
                                              HIDDEN_DIM, NUM_CLASSES,
                                              LSTM_LAYERS, BIDIRECTIONAL, 
                                              BATCH_SIZE, DROPOUT, LR, EPOCHS)
model = model.to(DEVICE)

# initialize embedding layer with the embedding matrix
model.embedding.weight.data.copy_(torch.from_numpy(embedding_matrix))
# allow the embedding matrix to be fined tuned to better adapt to out dataset and get higher accuracy
model.embedding.weight.requires_grad=True

print(model)

BiLSTM_Authorship_Attribution(
  (embedding): Embedding(22606, 200)
  (lstm): LSTM(200, 100, batch_first=True, dropout=0.5, bidirectional=True)
  (fc): Linear(in_features=200, out_features=9, bias=True)
  (softmax): LogSoftmax(dim=1)
)




### Training 

In [25]:
total_step = len(train_loader)
total_step_val = len(valid_loader)

early_stopping_patience = 4
early_stopping_counter = 0

valid_acc_max = 0 # set initial best accuracy top 0

for e in range(EPOCHS):

    # lists to store the train and validation losses of every batch for each epoch
    train_loss, valid_loss  = [], []
    # lists to store the train and validation accuracy of every batch for each epoch
    train_acc, valid_acc  = [], []

    # lists to host the train and validation predictions of every batch for each epoch
    y_train_list, y_val_list = [], []

    # initalize number of total and correctly classified texts during training and validation
    correct, correct_val = 0, 0
    total, total_val = 0, 0
    running_loss, running_loss_val = 0, 0


    ####TRAINING LOOP####

    model.train()

    for inputs, labels in train_loader:
        inputs, labels = inputs.to(DEVICE), labels.to(DEVICE) # load features and targets in device

        h = model.init_hidden(labels.size(0))

        model.zero_grad() # reset gradients 

        output, h = model(inputs,h) # get output and hidden states from LSTM network
        
        loss = criterion(output, labels)
        loss.backward()
        
        running_loss += loss.item()
        
        optimizer.step()

        y_pred_train = torch.argmax(output, dim=1) # get tensor of predicted values on the training set
        y_train_list.extend(y_pred_train.squeeze().tolist()) # transform tensor to list and the values to the list
        
        correct += torch.sum(y_pred_train==labels).item() # count correctly classified texts per batch
        total += labels.size(0) # count total texts per batch

    train_loss.append(running_loss / total_step)
    train_acc.append(100 * correct / total)

    ####VALIDATION LOOP####
    
    with torch.no_grad():
        
        model.eval()
        
        for inputs, labels in valid_loader:
            inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)

            val_h = model.init_hidden(labels.size(0))

            output, val_h = model(inputs, val_h)

            val_loss = criterion(output, labels)
            running_loss_val += val_loss.item()

            y_pred_val = torch.argmax(output, dim=1)
            y_val_list.extend(y_pred_val.squeeze().tolist())

            correct_val += torch.sum(y_pred_val==labels).item()
            total_val += labels.size(0)

        valid_loss.append(running_loss_val / total_step_val)
        valid_acc.append(100 * correct_val / total_val)

    # save model if validation accuracy increases
    if np.mean(valid_acc) >= valid_acc_max:
        torch.save(model.state_dict(), '../models/lstm_model.pt')
        print(f'Epoch {e+1}:Validation accuracy increased ({valid_acc_max:.6f} --> {np.mean(valid_acc):.6f}).  Saving model ...')
        valid_acc_max = np.mean(valid_acc)
        early_stopping_counter=0 # reset counter if validation accuracy increases
    else:
        print(f'Epoch {e+1}:Validation accuracy did not increase')
        early_stopping_counter+=1 # increase counter if validation accuracy does not increase
        
    if early_stopping_counter > early_stopping_patience:
        print('Early stopped at epoch :', e+1)
        break
    
    print(f'\tTrain_loss : {np.mean(train_loss):.4f} Val_loss : {np.mean(valid_loss):.4f}')
    print(f'\tTrain_acc : {np.mean(train_acc):.3f}% Val_acc : {np.mean(valid_acc):.3f}%')

Epoch 1:Validation accuracy increased (0.000000 --> 42.494420).  Saving model ...
	Train_loss : 1.6878 Val_loss : 1.5758
	Train_acc : 38.948% Val_acc : 42.494%
Epoch 2:Validation accuracy increased (42.494420 --> 51.450893).  Saving model ...
	Train_loss : 0.9422 Val_loss : 1.4308
	Train_acc : 68.524% Val_acc : 51.451%
Epoch 3:Validation accuracy increased (51.450893 --> 53.794643).  Saving model ...
	Train_loss : 0.6394 Val_loss : 1.4628
	Train_acc : 79.181% Val_acc : 53.795%
Epoch 4:Validation accuracy increased (53.794643 --> 54.631696).  Saving model ...
	Train_loss : 0.4874 Val_loss : 1.5244
	Train_acc : 84.445% Val_acc : 54.632%
Epoch 5:Validation accuracy did not increase
	Train_loss : 0.3976 Val_loss : 1.6612
	Train_acc : 87.144% Val_acc : 53.292%


### Test

In [30]:
# Load the best model

model.load_state_dict(torch.load('../models/lstm_model.pt'))

<All keys matched successfully>

In [168]:
model.eval()
y_pred_list = []
y_test_list = []
for inputs, labels in test_loader:
    inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)
    test_h = model.init_hidden(labels.size(0))

    output, val_h = model(inputs, test_h)
    y_pred_test = torch.argmax(output, dim=1)
    y_pred_list.extend(y_pred_test.squeeze().tolist())
    y_test_list.extend(labels.squeeze().tolist())

In [37]:
labels = list(label_dict.values())
labels_string = map(str, labels)
labels_string = list(labels_string)

# Print classification report for model performance against the test dataset
print('Classification Report for Bi-LSTM :\n', classification_report(y_test_list, y_pred_list, target_names=labels_string))

['0', '1', '2', '3', '4', '5', '6', '7', '8']
Classification Report for Bi-LSTM :
               precision    recall  f1-score   support

           0       0.55      0.56      0.56      1224
           1       0.55      0.62      0.58      1611
           2       0.56      0.51      0.54      1811
           3       0.47      0.50      0.49      1020
           4       0.80      0.66      0.72       280
           5       0.56      0.58      0.57      1236
           6       0.27      0.24      0.25       180
           7       0.59      0.55      0.57      1217
           8       0.44      0.41      0.43       381

    accuracy                           0.55      8960
   macro avg       0.53      0.52      0.52      8960
weighted avg       0.55      0.55      0.55      8960

