In [2]:
import torch
from torch import nn
import torch.nn.functional as F

import torchtext
from torchtext import datasets
from torchtext import data
from torchtext.datasets import IMDB
from torchtext.vocab import Vocab
from torchtext.data import Example, Dataset

import mlxtend, torchmetrics

import matplotlib.pyplot as plt
import numpy as np
from transformers import BertTokenizer, BertModel
from tqdm.auto import tqdm
from torchmetrics import ConfusionMatrix
from mlxtend.plotting import plot_confusion_matrix
from sklearn.metrics import roc_curve, auc, precision_recall_curve
import spacy
import math
import random
from collections import defaultdict

# Models

## BiLSTM with Attention Head

In [None]:
class Model_1(nn.Module):
    """Two-layer bidirectional LSTM classifier with a single-head self-attention pool.

    Four per-sequence summaries, each of width ``hidden_size * 2``, are
    concatenated and passed through an MLP that produces one score per sequence:
    a final hidden state, a mean pool, a max pool and an attention pool.

    Args:
        emb_matrix: 2-D float tensor ``(vocab_size, emb_dim)`` of pretrained
            embeddings. The table is frozen.
        hidden_size: Number of hidden units per LSTM direction.
    """

    def __init__(self, emb_matrix, hidden_size):
        super().__init__()
        # Frozen lookup table; row 1 is declared as the padding row.
        self.embedding = nn.Embedding.from_pretrained(embeddings=emb_matrix, freeze=True, padding_idx=1)

        # The LSTM input width follows the embedding table, so the model is not
        # tied to 300-dimensional vectors.
        self.forward_and_backward_LSTM = nn.LSTM(input_size=emb_matrix.size(1),
                                    hidden_size=hidden_size, #hyperparameter
                                    num_layers=2,
                                    bidirectional=True,
                                    batch_first=False,
                                    dropout=.5
                                    )

        self.attention_head = nn.MultiheadAttention(embed_dim=hidden_size*2,
                                                    num_heads=1,
                                                    batch_first=False)

        # Four pooled summaries of width hidden_size*2 are concatenated in forward().
        feature_dim = hidden_size * 2 * 4
        self.classifier = nn.Sequential(
                                        nn.Linear(in_features=feature_dim, out_features=feature_dim),
                                        nn.ReLU(),
                                        nn.Dropout(),
                                        nn.Linear(in_features=feature_dim, out_features=1)
                                        )

    def forward(self, input_indices, input_lengths):
        """Score a padded, sequence-first batch of token indices.

        Args:
            input_indices: Long tensor ``(seq_len, batch)`` of token indices.
            input_lengths: Tensor ``(batch,)`` holding the unpadded length of
                every sequence (each must be >= 1, as required by
                ``pack_padded_sequence``).

        Returns:
            Tensor ``(batch,)`` with one raw score per sequence.
        """
        word_embedding = self.embedding(input_indices)
        # Packing lets the LSTM skip padded steps; lengths must live on the CPU.
        packed_embedding = nn.utils.rnn.pack_padded_sequence(input=word_embedding,
                                                             lengths=input_lengths.cpu(),
                                                             enforce_sorted=False)

        packed_output, (hn, _) = self.forward_and_backward_LSTM(packed_embedding)
        unpacked_output, lengths = nn.utils.rnn.pad_packed_sequence(packed_output) #(max_len, batch, hidden_size*2)

        # Boolean map of padded time steps, True where step >= sequence length.
        lengths = lengths.to(unpacked_output.device)
        positions = torch.arange(unpacked_output.size(0), device=unpacked_output.device)
        is_padding = positions.unsqueeze(1) >= lengths.unsqueeze(0)      # (max_len, batch)
        pad_positions = is_padding.unsqueeze(-1)                          # broadcast over features
        token_counts = lengths.unsqueeze(1).to(unpacked_output.dtype)    # (batch, 1)

        # Padded steps must not act as attention keys, otherwise a sequence's
        # result depends on how much padding its batch happens to carry.
        attention_output, _ = self.attention_head(unpacked_output, unpacked_output, unpacked_output,
                                                  key_padding_mask=is_padding.transpose(0, 1))

        # NOTE(review): per the nn.LSTM docs, hn is laid out layer by layer, so
        # hn[0]/hn[1] are the forward/backward final states of the FIRST layer;
        # hn[-2]/hn[-1] would be the top layer. Kept as-is because switching would
        # change the features for every input of an already trained classifier.
        h_final = torch.cat((hn[0], hn[1]), dim=1) # (batch_size, hidden_size*2)

        # Pool over real tokens only: average by true length, and keep padded
        # zeros from winning the max when all real activations are negative.
        mean_pool = unpacked_output.masked_fill(pad_positions, 0.0).sum(dim=0) / token_counts
        max_pool, _ = unpacked_output.masked_fill(pad_positions, float("-inf")).max(dim=0)
        attention_pool = attention_output.masked_fill(pad_positions, 0.0).sum(dim=0) / token_counts

        #combine
        features_pre_classification = torch.cat([h_final, mean_pool, max_pool, attention_pool], dim=1) #(batch_size, hidden_size*2*4)

        return self.classifier(features_pre_classification).squeeze(dim=1)

## 4 Layer Transformer

In [None]:
class Model_2_4(nn.Module):
    """Four-layer Transformer encoder classifier over frozen pretrained embeddings.

    The hidden state at sequence position 0 is passed through dropout and a
    linear layer to produce one score per sequence.

    Args:
        emb_matrix: 2-D float tensor ``(vocab_size, emb_dim)`` of pretrained
            embeddings. ``emb_dim`` becomes the encoder width and must be
            divisible by the 12 attention heads (768 in the original setup).
    """

    def __init__(self, emb_matrix):
        super().__init__()
        # The encoder width follows the embedding table instead of a literal 768.
        emb_dim = emb_matrix.size(1)

        # Frozen lookup table; row 0 is declared as the padding row.
        self.embedding = nn.Embedding.from_pretrained(embeddings=emb_matrix, freeze=True, padding_idx=0)

        # Kept as a registered attribute so state_dict keys stay unchanged for
        # saved checkpoints; nn.TransformerEncoder works on its own copies.
        self.transformer_encoder_layer = nn.TransformerEncoderLayer(
            d_model=emb_dim,        # model width
            nhead=12,               # attention heads (768/12 = 64 per head at width 768)
            dim_feedforward=1536,   # feed-forward width (2x the 768 model width)
            activation="gelu",
            batch_first=True,
        )

        self.transformer = nn.TransformerEncoder(self.transformer_encoder_layer, num_layers=4)

        #classification head
        self.dropout = nn.Dropout()
        self.fc = nn.Linear(emb_dim, 1)

    def forward(self, input_indices, input_lengths):
        """Score a padded, batch-first batch of token indices.

        Args:
            input_indices: Long tensor ``(batch, seq_len)`` of token indices,
                padded with the embedding's padding index.
            input_lengths: Unused; accepted so this model can be called the
                same way as the other models in this notebook.

        Returns:
            Tensor ``(batch,)`` with one raw score per sequence.
        """
        word_embedding = self.embedding(input_indices)

        # Hide padded positions from self-attention so the result does not
        # depend on how far a sequence was padded.
        padding_mask = input_indices == self.embedding.padding_idx
        # Position 0 is the one read out below; keeping it visible also avoids
        # an all-masked row (which would yield NaN) for degenerate inputs.
        padding_mask[:, 0] = False

        hidden_states = self.transformer(word_embedding, src_key_padding_mask=padding_mask)

        # Summary vector: hidden state at the first position of every sequence.
        cls_embedding = hidden_states[:, 0, :]

        return self.fc(self.dropout(cls_embedding)).squeeze(1)

## 6 Layer Transformer

In [None]:
class Model_2_6(nn.Module):
    """Six-layer Transformer encoder classifier over frozen pretrained embeddings.

    The hidden state at sequence position 0 is passed through dropout and a
    linear layer to produce one score per sequence.

    Args:
        emb_matrix: 2-D float tensor ``(vocab_size, emb_dim)`` of pretrained
            embeddings. ``emb_dim`` becomes the encoder width and must be
            divisible by the 12 attention heads (768 in the original setup).
    """

    def __init__(self, emb_matrix):
        super().__init__()
        # The encoder width follows the embedding table instead of a literal 768.
        emb_dim = emb_matrix.size(1)

        # Frozen lookup table; row 0 is declared as the padding row.
        self.embedding = nn.Embedding.from_pretrained(embeddings=emb_matrix, freeze=True, padding_idx=0)

        # Kept as a registered attribute so state_dict keys stay unchanged for
        # saved checkpoints; nn.TransformerEncoder works on its own copies.
        self.transformer_encoder_layer = nn.TransformerEncoderLayer(
            d_model=emb_dim,        # model width
            nhead=12,               # attention heads (768/12 = 64 per head at width 768)
            dim_feedforward=1536,   # feed-forward width (2x the 768 model width)
            activation="gelu",
            batch_first=True,
        )

        self.transformer = nn.TransformerEncoder(self.transformer_encoder_layer, num_layers=6)

        #classification head
        self.dropout = nn.Dropout()
        self.fc = nn.Linear(emb_dim, 1)

    def forward(self, input_indices, input_lengths):
        """Score a padded, batch-first batch of token indices.

        Args:
            input_indices: Long tensor ``(batch, seq_len)`` of token indices,
                padded with the embedding's padding index.
            input_lengths: Unused; accepted so this model can be called the
                same way as the other models in this notebook.

        Returns:
            Tensor ``(batch,)`` with one raw score per sequence.
        """
        word_embedding = self.embedding(input_indices)

        # Hide padded positions from self-attention so the result does not
        # depend on how far a sequence was padded.
        padding_mask = input_indices == self.embedding.padding_idx
        # Position 0 is the one read out below; keeping it visible also avoids
        # an all-masked row (which would yield NaN) for degenerate inputs.
        padding_mask[:, 0] = False

        hidden_states = self.transformer(word_embedding, src_key_padding_mask=padding_mask)

        # Summary vector: hidden state at the first position of every sequence.
        cls_embedding = hidden_states[:, 0, :]

        return self.fc(self.dropout(cls_embedding)).squeeze(1)

## 8 Layer Transformer

In [None]:
class Model_2_8(nn.Module):
    """Eight-layer, pre-norm Transformer encoder classifier over frozen embeddings.

    The hidden state at sequence position 0 is passed through dropout and a
    linear layer to produce one score per sequence. Unlike the 4- and 6-layer
    variants in this notebook, the encoder layers use ``norm_first=True``.

    Args:
        emb_matrix: 2-D float tensor ``(vocab_size, emb_dim)`` of pretrained
            embeddings. ``emb_dim`` becomes the encoder width and must be
            divisible by the 12 attention heads (768 in the original setup).
    """

    def __init__(self, emb_matrix):
        super().__init__()
        # The encoder width follows the embedding table instead of a literal 768.
        emb_dim = emb_matrix.size(1)

        # Frozen lookup table; row 0 is declared as the padding row.
        self.embedding = nn.Embedding.from_pretrained(embeddings=emb_matrix, freeze=True, padding_idx=0)

        # Kept as a registered attribute so state_dict keys stay unchanged for
        # saved checkpoints; nn.TransformerEncoder works on its own copies.
        self.transformer_encoder_layer = nn.TransformerEncoderLayer(
            d_model=emb_dim,        # model width
            nhead=12,               # attention heads (768/12 = 64 per head at width 768)
            dim_feedforward=1536,   # feed-forward width (2x the 768 model width)
            activation="gelu",
            batch_first=True,
            norm_first=True,        # layer norm before attention / feed-forward
        )

        self.transformer = nn.TransformerEncoder(self.transformer_encoder_layer, num_layers=8)

        #classification head
        self.dropout = nn.Dropout()
        self.fc = nn.Linear(emb_dim, 1)

    def forward(self, input_indices, input_lengths):
        """Score a padded, batch-first batch of token indices.

        Args:
            input_indices: Long tensor ``(batch, seq_len)`` of token indices,
                padded with the embedding's padding index.
            input_lengths: Unused; accepted so this model can be called the
                same way as the other models in this notebook.

        Returns:
            Tensor ``(batch,)`` with one raw score per sequence.
        """
        word_embedding = self.embedding(input_indices)

        # Hide padded positions from self-attention so the result does not
        # depend on how far a sequence was padded.
        padding_mask = input_indices == self.embedding.padding_idx
        # Position 0 is the one read out below; keeping it visible also avoids
        # an all-masked row (which would yield NaN) for degenerate inputs.
        padding_mask[:, 0] = False

        hidden_states = self.transformer(word_embedding, src_key_padding_mask=padding_mask)

        # Summary vector: hidden state at the first position of every sequence.
        cls_embedding = hidden_states[:, 0, :]

        return self.fc(self.dropout(cls_embedding)).squeeze(1)

## Combining BiLSTM and Transformer

In [None]:
class Model_3(nn.Module):
    """Joint classifier over the pooled features of Model_1 and Model_2_4.

    Both sub-models are built from the same embedding matrix, their own output
    heads are replaced by ``nn.Identity`` so they act as feature extractors,
    and a new MLP scores the concatenated features.

    Args:
        emb_matrix: 2-D float tensor ``(vocab_size, emb_dim)`` of pretrained
            embeddings shared by both sub-models. Its width has to be
            accepted by both of them (Model_2_4 is built around width 768).
            NOTE(review): confirm Model_1's LSTM input width matches as well.
        hidden_size: Hidden units per LSTM direction in Model_1.
        batch_first: Layout of ``input_indices`` given to ``forward``:
            ``False`` (default) means ``(seq_len, batch)``, ``True`` means
            ``(batch, seq_len)``.
    """

    def __init__(self, emb_matrix, hidden_size=256, batch_first=False):
        super().__init__()
        self.model1 = Model_1(emb_matrix, hidden_size)
        self.model2 = Model_2_4(emb_matrix)
        self.batch_first = batch_first

        # Read the feature widths from the heads that are about to be removed,
        # so the joint classifier always matches what the extractors emit.
        lstm_feature_dim = self.model1.classifier[0].in_features
        transformer_feature_dim = self.model2.fc.in_features

        # Replace their final classifiers with "feature extractors"
        self.model1.classifier = nn.Identity()
        self.model2.fc = nn.Identity()

        # Joint classifier
        self.classifier = nn.Sequential(
            nn.Linear(lstm_feature_dim + transformer_feature_dim, 512),  # concat Model1 feats + Model2 feats
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(512, 1)
        )

    def forward(self, input_indices, input_lengths):
        """Score a padded batch with both feature extractors.

        Model_1 consumes sequence-first input (its LSTM and packing use
        ``batch_first=False``) while Model_2_4 consumes batch-first input
        (its encoder uses ``batch_first=True`` and reads ``[:, 0, :]``), so
        each sub-model receives the layout it was built for.

        Args:
            input_indices: Long tensor of token indices, laid out according
                to the ``batch_first`` flag given at construction.
            input_lengths: Tensor ``(batch,)`` with the unpadded length of
                every sequence.

        Returns:
            Tensor ``(batch,)`` with one raw score per sequence.
        """
        transposed = input_indices.transpose(0, 1).contiguous()
        if self.batch_first:
            seq_first_indices, batch_first_indices = transposed, input_indices
        else:
            seq_first_indices, batch_first_indices = input_indices, transposed

        feats1 = self.model1(seq_first_indices, input_lengths)    # [batch, hidden*2*4]
        feats2 = self.model2(batch_first_indices, input_lengths)  # [batch, transformer width]

        combined_feats = torch.cat([feats1, feats2], dim=1)
        return self.classifier(combined_feats).squeeze(1)

# Loading Models In