<a href="https://colab.research.google.com/github/janakurrek/mnli/blob/master/BiLSTM%2BAttention.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [50]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
Downloading dataset...


In [69]:
import json
import torch
import wget
import os
from tqdm.notebook import tqdm
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer

print('Downloading dataset...')

# The URL for the dataset file.
url = 'https://raw.githubusercontent.com/networkdynamics/slur-corpus/main/kurrek.2020.slur-corpus.json'
datapath = './kurrek.2020.slur-corpus.json'

# Download the file (if we haven't already).
if not os.path.exists(datapath):
  wget.download(url, datapath)

corpus = []
with open(datapath) as f:
    for line in f:
        corpus.append(json.loads(line))

corpus = pd.DataFrame(corpus)

# remove CMP classes
corpus.drop(corpus[corpus.gold_label == 'CMP'].index, inplace=True)
corpus['binary_label'] = corpus['gold_label']
corpus['binary_label'] = corpus['binary_label'].replace(['APR', 'HOM'], 'NDG')
corpus['clean_labels'] = pd.factorize(corpus.binary_label)[0]

clean_datapath = '/content/drive/My Drive/Colab Notebooks/kurrek.2020.slur-corpus-clean.tsv'
corpus.to_csv(clean_datapath, sep='\t')

clean_corpus = pd.read_csv(clean_datapath, sep='\t')
clean_corpus[['body','clean_labels']].head()

Downloading dataset...


Unnamed: 0,body,clean_labels
0,Fuck that I dont wanna watch tranny porn.,0.0
1,Opie just wanted to have a good time. The same...,0.0
2,Fuck that faggot Fallon. Fucking sissy boy act...,0.0
3,BbBB...b.b..b.bb but OP's a faggot,0.0
4,Who even uses the word tranny except for trans...,1.0


**Pay "Attention" to Your Context**

https://www.aclweb.org/anthology/W19-3508.pdf

https://github.com/tuhinjubcse/ALW3-ACL2019

https://www.aclweb.org/anthology/N16-1174/

https://github.com/uvipen/Hierarchical-attention-networks-pytorch/blob/master/src/utils.py


*   Stacked Bi-LSTM models outperformed the simple Bi-LSTM models
*   Best performing model on abusive language datasets is the stacked Bi-LSTM with contextual attention



In [93]:
from torchtext import data
from torchtext.vocab import GloVe
import random

max_input_len = 320

# tokenizer settings
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

unk_token_idx = tokenizer.unk_token_id
pad_token_idx = tokenizer.pad_token_id

def tokenize_and_cut(sentence, max_input_length = 320):
    tokens = tokenizer.tokenize(sentence)
    tokens = tokens[:max_input_length - 2]
    return tokens

# load text inputs and labels
TEXT = data.Field(batch_first=True, use_vocab=False, tokenize=tokenize_and_cut,
                  preprocessing=tokenizer.convert_tokens_to_ids,
                  pad_token=pad_token_idx, unk_token=unk_token_idx)

LABELS = data.Field(
    sequential=False,
    dtype=torch.float
)

# split into training, testing, validation set
dataset = data.TabularDataset(
    path=datapath, format='json',
    fields={'body': ('text', TEXT),
            'gold_label': ('labels', LABELS)}
)

D_trn, D_val, D_tst = dataset.split(split_ratio=[0.7, 0.1, 0.2], random_state=random.getstate())

# build vocabulary and count tokens
TEXT.build_vocab(D_trn)
LABELS.build_vocab(D_trn)

In [94]:
print(f"Unique tokens in text vocabulary: {len(TEXT.vocab)}")

Unique tokens in text vocabulary: 18654


In [104]:
import torch.nn as nn
import math

"""Hyperparameters"""
# training
num_epochs = 4
learning_rate = 0.001
batch_size = 128

# model hyperparameters
load_model = False
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
input_size = len(TEXT.vocab)
output_size = 2 #  binary classification
embed_size = 50
hidden_size = 1024 # 2014 benchmark; slightly small
num_layers = 2 # benchmark did 4
dropout = 0.5
n_cells = input_size # placeholder

"""Keon Attention Class"""
class Attention(nn.Module):
    def __init__(self, hidden_size):
        super(Attention, self).__init__()
        
        self.hidden_size = hidden_size
        self.attn = nn.Linear(self.hidden_size * 4, hidden_size)
        self.v = nn.Parameter(torch.rand(hidden_size))
        stdv = 1. / math.sqrt(self.v.size(0))
        self.v.data.uniform_(-stdv, stdv)

    def forward(self, hidden, encoder_outputs):
        hidden = hidden.reshape((1, hidden.shape[1], hidden.shape[2] * 2))
        timestep = encoder_outputs.size(0)
        h = hidden.repeat(timestep, 1, 1).transpose(0, 1)
        encoder_outputs = encoder_outputs.transpose(0, 1)  # [B*T*H]
        attn_energies = self.score(h, encoder_outputs)
        return F.softmax(attn_energies, dim=1).unsqueeze(1)

    def score(self, hidden, encoder_outputs):
        # [B*T*2H]->[B*T*H]
        catted = torch.cat([hidden, encoder_outputs], 2)
        energy = F.relu(self.attn(catted))
        energy = energy.transpose(1, 2)  # [B*H*T]
        v = self.v.repeat(encoder_outputs.size(0), 1).unsqueeze(1)  # [B*1*H]
        energy = torch.bmm(v, energy)  # [B*1*T]
        return energy.squeeze(1)  # [B*T]

"""Classifier"""
class Classifier(nn.Module):
    def __init__(self, input_size, output_size, embed_size, device,
                 hidden_size, batch_size, dropout, n_layers, n_cells):
        
        super(Classifier, self).__init__()
        
        self.device = device
        self.batch_size = batch_size
        self.hidden_size = hidden_size
        self.n_cells = n_cells
        
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        self.embed = nn.Embedding(input_size, embed_size)
        self.attention = Attention(hidden_size)
        self.lstm = nn.LSTM(embed_size, hidden_size,
                            num_layers=n_layers, dropout=dropout, 
                            bidirectional=True)
        
        self.fc_hidden = nn.Linear(hidden_size * 2, hidden_size, bias=False)
        self.fc_output = nn.Linear(hidden_size,  output_size, bias=False)
    
    def encode(self, embed):
        # pass embedding input through lstm
        state_shape = self.n_cells, self.batch_size, self.hidden_size
        h0 = c0 = embed.new_zeros(state_shape)
        outputs, (ht, ct) = self.lstm(embed, (h0, c0))

        # pass outcomes through attention layer
        weights = self.attention(ht[-2:], outputs)
        context = weights.bmm(outputs.transpose(0, 1))
        context = context.transpose(0, 1)
        context = context.squeeze(0)
        return context
        
    def forward(self, comment):
        # seq_length, batch_size, embed_size
        comment_embed = self.dropout(self.embed(comment))
        comment_contx = self.encode(comment)
        # seq_len, hidden_size * 2
        comment_contx = self.relu(self.fc_hidden(comment_contx))
        # hidden_size * 2, output_size
        comment_contx = self.relu(self.fc_output(comment_contx))
        return pair_output

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (D_trn, D_val, D_tst), 
     batch_size = batch_size,
     sort_within_batch = True,
     sort_key = lambda x: len(x.text), # minimize padding
     device = device)

model = Classifier(
    input_size, output_size, embed_size, device,
    hidden_size, batch_size, dropout, num_layers, n_cells
).to(device)

pad_idx_in_vocab = TEXT.vocab.stoi["<pad"]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx_in_vocab)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)