# Imports

In [1]:
!pip install transformers==3.0.0
!pip install emoji
import gc
import os
import emoji as emoji
import re
import string
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from transformers import AutoModel
from transformers import BertModel, BertTokenizer

import warnings
warnings.filterwarnings('ignore')

Collecting transformers==3.0.0
  Downloading transformers-3.0.0-py3-none-any.whl (754 kB)
[?25l[K     |▍                               | 10 kB 20.6 MB/s eta 0:00:01[K     |▉                               | 20 kB 15.5 MB/s eta 0:00:01[K     |█▎                              | 30 kB 10.7 MB/s eta 0:00:01[K     |█▊                              | 40 kB 9.3 MB/s eta 0:00:01[K     |██▏                             | 51 kB 5.2 MB/s eta 0:00:01[K     |██▋                             | 61 kB 5.4 MB/s eta 0:00:01[K     |███                             | 71 kB 5.6 MB/s eta 0:00:01[K     |███▌                            | 81 kB 6.3 MB/s eta 0:00:01[K     |████                            | 92 kB 4.9 MB/s eta 0:00:01[K     |████▍                           | 102 kB 5.3 MB/s eta 0:00:01[K     |████▊                           | 112 kB 5.3 MB/s eta 0:00:01[K     |█████▏                          | 122 kB 5.3 MB/s eta 0:00:01[K     |█████▋                          | 133 kB 5.3 MB/

In [2]:
!git clone https://github.com/hafezgh/Hate-Speech-Detection-in-Social-Media

Cloning into 'Hate-Speech-Detection-in-Social-Media'...
remote: Enumerating objects: 314, done.[K
remote: Counting objects: 100% (306/306), done.[K
remote: Compressing objects: 100% (205/205), done.[K
remote: Total 314 (delta 155), reused 208 (delta 90), pack-reused 8[K
Receiving objects: 100% (314/314), 2.83 MiB | 19.30 MiB/s, done.
Resolving deltas: 100% (156/156), done.


# Read and prepare data


In [3]:
import tensorflow as tf
URL = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
dataset = tf.keras.utils.get_file(fname="aclImdb_v1.tar.gz", 
                                  origin=URL,
                                  untar=True,
                                  cache_dir='.',
                                  cache_subdir='')

Downloading data from https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


In [4]:
# The shutil module offers a number of high-level 
# operations on files and collections of files.
import os
import shutil
# Create main directory path ("/aclImdb")
main_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')
# Create sub directory path ("/aclImdb/train")
train_dir = os.path.join(main_dir, 'train')
# Remove unsup folder since this is a supervised learning task
remove_dir = os.path.join(train_dir, 'unsup')
shutil.rmtree(remove_dir)
# View the final train folder
print(os.listdir(train_dir))

['labeledBow.feat', 'neg', 'unsupBow.feat', 'urls_unsup.txt', 'urls_neg.txt', 'pos', 'urls_pos.txt']


In [5]:
# We create a training dataset and a validation 
# dataset from our "aclImdb/train" directory with a 80/20 split.
train = tf.keras.preprocessing.text_dataset_from_directory(
    'aclImdb/train', batch_size=30000, validation_split=0.2, 
    subset='training', seed=2045)
val = tf.keras.preprocessing.text_dataset_from_directory(
    'aclImdb/train', batch_size=30000, validation_split=0.2, 
    subset='validation', seed=2045)
test = tf.keras.preprocessing.text_dataset_from_directory(
    'aclImdb/test', batch_size=32, seed=2045)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.
Found 25000 files belonging to 2 classes.


In [6]:
for i in train.take(1):
  train_feat = i[0].numpy()
  train_lab = i[1].numpy()

for i in val.take(1):
  val_feat = i[0].numpy()
  val_lab = i[1].numpy()

for i in test.take(1):
  test_feat = i[0].numpy()
  test_lab = i[1].numpy()

train = pd.DataFrame([train_feat, train_lab]).T
train.columns = ['DATA_COLUMN', 'LABEL_COLUMN']
train['DATA_COLUMN'] = train['DATA_COLUMN'].str.decode("utf-8")

val = pd.DataFrame([val_feat, val_lab]).T
val.columns = ['DATA_COLUMN', 'LABEL_COLUMN']
val['DATA_COLUMN'] = val['DATA_COLUMN'].str.decode("utf-8")

test = pd.DataFrame([test_feat, test_lab]).T
test.columns = ['DATA_COLUMN', 'LABEL_COLUMN']
test['DATA_COLUMN'] = test['DATA_COLUMN'].str.decode("utf-8")


train.head()

Unnamed: 0,DATA_COLUMN,LABEL_COLUMN
0,The real story (took place in Kansas in 1959) ...,1
1,Effect(s) without cause is generally not possi...,0
2,I'm a big fan of Lucio Fulci; many of his Gial...,0
3,"OK, I kinda like the idea of this movie. I'm i...",0
4,"As far as the Muppet line goes, however, this ...",1


In [7]:
train_set = train['DATA_COLUMN'].to_list()
train_labels = train['LABEL_COLUMN'].to_list()

val_set = val['DATA_COLUMN'].to_list()
val_labels = val['LABEL_COLUMN'].to_list()

test_set = test['DATA_COLUMN'].to_list()
test_labels = test['LABEL_COLUMN'].to_list()

# Utility functions

In [8]:

def pre_process_dataset(values):
    new_values = list()
    # Emoticons
    emoticons = [':-)', ':)', '(:', '(-:', ':))', '((:', ':-D', ':D', 'X-D', 'XD', 'xD', 'xD', '<3', '</3', ':\*',
                 ';-)',
                 ';)', ';-D', ';D', '(;', '(-;', ':-(', ':(', '(:', '(-:', ':,(', ':\'(', ':"(', ':((', ':D', '=D',
                 '=)',
                 '(=', '=(', ')=', '=-O', 'O-=', ':o', 'o:', 'O:', 'O:', ':-o', 'o-:', ':P', ':p', ':S', ':s', ':@',
                 ':>',
                 ':<', '^_^', '^.^', '>.>', 'T_T', 'T-T', '-.-', '*.*', '~.~', ':*', ':-*', 'xP', 'XP', 'XP', 'Xp',
                 ':-|',
                 ':->', ':-<', '$_$', '8-)', ':-P', ':-p', '=P', '=p', ':*)', '*-*', 'B-)', 'O.o', 'X-(', ')-X']

    for value in values:
        # Remove dots
        text = value.replace(".", "").lower()
        text = re.sub(r"[^a-zA-Z?.!,¿]+", " ", text)
        users = re.findall("[@]\w+", text)
        for user in users:
            text = text.replace(user, "<user>")
        urls = re.findall(r'(https?://[^\s]+)', text)
        if len(urls) != 0:
            for url in urls:
                text = text.replace(url, "<url >")
        for emo in text:
            if emo in emoji.UNICODE_EMOJI:
                text = text.replace(emo, "<emoticon >")
        for emo in emoticons:
            text = text.replace(emo, "<emoticon >")
        numbers = re.findall('[0-9]+', text)
        for number in numbers:
            text = text.replace(number, "<number >")
        text = text.replace('#', "<hashtag >")
        text = re.sub(r"([?.!,¿])", r" ", text)
        text = "".join(l for l in text if l not in string.punctuation)
        text = re.sub(r'[" "]+', " ", text)
        new_values.append(text)
    return new_values


def data_process(data, labels):
    input_ids = []
    attention_masks = []
    bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    for sentence in data:
        bert_inp = bert_tokenizer.__call__(sentence, max_length=128,
                                           padding='max_length', pad_to_max_length=True,
                                           truncation=True, return_token_type_ids=False)

        input_ids.append(bert_inp['input_ids'])
        attention_masks.append(bert_inp['attention_mask'])
    #del bert_tokenizer
    #gc.collect()
    #torch.cuda.empty_cache()
    input_ids = np.asarray(input_ids)
    attention_masks = np.array(attention_masks)
    labels = np.array(labels)
    return input_ids, attention_masks, labels



# Model

In [9]:
class BERT_Arch(nn.Module):

    def __init__(self, bert, n_classes, mode='cnn'):
        super(BERT_Arch, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.n_classes = n_classes
        self.mode = mode

        if mode == 'cnn':
            # CNN
            self.conv = nn.Conv2d(in_channels=13, out_channels=13, kernel_size=(3, 768), padding='valid')
            self.relu = nn.ReLU()
            # change the kernel size either to (3,1), e.g. 1D max pooling
            # or remove it altogether
            self.pool = nn.MaxPool2d(kernel_size=(3, 1), stride=1)
            self.dropout = nn.Dropout(0.1)
            # be careful here, this needs to be changed according to your max pooling
            # without pooling: 443, with 3x1 pooling: 416
            # FC
            self.fc = nn.Linear(13 * (128 - 4), self.n_classes)
            self.flat = nn.Flatten()
            
        elif mode == 'rnn':
            ### RNN
            self.lstm = nn.LSTM(768, 256, batch_first=True, bidirectional=True)
            ## FC
            self.fc = nn.Linear(256*2, self.n_classes)
        elif mode == 'shallow_fc':
            self.fc = nn.Linear(768, self.n_classes)
        elif mode == 'deep_fc':
            self.leaky_relu = nn.LeakyReLU()
            self.fc1 = nn.Linear(768, 768)
            self.fc2 = nn.Linear(768, 768)
            self.fc3 = nn.Linear(768, self.n_classes)
        else:
            raise NotImplementedError("Unsupported extension!")

        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, sent_id, mask):
        sequence_output, _, all_layers = self.bert(sent_id, attention_mask=mask, output_hidden_states=True)
        if self.mode == 'cnn':
            x = torch.transpose(torch.cat(tuple([t.unsqueeze(0) for t in all_layers]), 0), 0, 1)
            x = self.pool(self.dropout(self.relu(self.conv(self.dropout(x)))))
            x = self.fc(self.dropout(self.flat(self.dropout(x))))
        elif self.mode == 'rnn':
            lstm_output, (h,c) = self.lstm(sequence_output)
            hidden = torch.cat((lstm_output[:,-1, :256],lstm_output[:,0, 256:]),dim=-1)
            x  = self.fc(hidden.view(-1,256*2))
        elif self.mode == 'shallow_fc':
            x = self.fc(sequence_output[:,0,:])
        elif self.mode == 'deep_fc':
            x = self.fc1(sequence_output[:,0,:])
            x = self.leaky_relu(x)
            x = self.fc2(x)
            x = self.leaky_relu(x)
            x = self.fc3(x)
        else:
            raise NotImplementedError("Unsupported extension!")
        gc.collect()
        torch.cuda.empty_cache()
        del all_layers
        c = self.softmax(x)
        return c


# Train

In [10]:

# function to train the model
def train():
    model.train()

    total_loss, total_accuracy = 0, 0

    # empty list to save model predictions
    total_preds = []

    # iterate over batches
    total = len(train_dataloader)
    for i, batch in enumerate(train_dataloader):

        step = i+1
        percent = "{0:.2f}".format(100 * (step / float(total)))
        lossp = "{0:.2f}".format(total_loss/(total*batch_size))
        filledLength = int(100 * step // total)
        bar = '█' * filledLength + '>'  *(filledLength < 100) + '.' * (99 - filledLength)
        print(f'\rBatch {step}/{total} |{bar}| {percent}% complete, loss={lossp}, accuracy={total_accuracy}', end='')

        # push the batch to gpu
        batch = [r.to(device) for r in batch]
        sent_id, mask, labels = batch
        del batch
        gc.collect()
        torch.cuda.empty_cache()
        # clear previously calculated gradients
        model.zero_grad()

        # get model predictions for the current batch
        #sent_id = torch.tensor(sent_id).to(device).long()
        preds = model(sent_id, mask)

        # compute the loss between actual and predicted values
        loss = cross_entropy(preds, labels)

        # add on to the total loss
        total_loss += float(loss.item())

        # backward pass to calculate the gradients
        loss.backward()

        # clip the the gradients to 1.0. It helps in preventing the exploding gradient problem
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # update parameters
        optimizer.step()

        # model predictions are stored on GPU. So, push it to CPU
        #preds = preds.detach().cpu().numpy()

        # append the model predictions
        #total_preds.append(preds)
        total_preds.append(preds.detach().cpu().numpy())

    gc.collect()
    torch.cuda.empty_cache()

    # compute the training loss of the epoch
    avg_loss = total_loss / (len(train_dataloader)*batch_size)

    # predictions are in the form of (no. of batches, size of batch, no. of classes).
    # reshape the predictions in form of (number of samples, no. of classes)
    total_preds = np.concatenate(total_preds, axis=0)

    # returns the loss and predictions
    return avg_loss, total_preds


# function for evaluating the model
def evaluate():
    print("\n\nEvaluating...")

    # deactivate dropout layers
    model.eval()

    total_loss, total_accuracy = 0, 0

    # empty list to save the model predictions
    total_preds = []

    # iterate over batches
    total = len(val_dataloader)
    for i, batch in enumerate(val_dataloader):
        
        step = i+1
        percent = "{0:.2f}".format(100 * (step / float(total)))
        lossp = "{0:.2f}".format(total_loss/(total*batch_size))
        filledLength = int(100 * step // total)
        bar = '█' * filledLength + '>' * (filledLength < 100) + '.' * (99 - filledLength)
        print(f'\rBatch {step}/{total} |{bar}| {percent}% complete, loss={lossp}, accuracy={total_accuracy}', end='')

        # push the batch to gpu
        batch = [t.to(device) for t in batch]

        sent_id, mask, labels = batch
        del batch
        gc.collect()
        torch.cuda.empty_cache()
        # deactivate autograd
        with torch.no_grad():

            # model predictions
            preds = model(sent_id, mask)

            # compute the validation loss between actual and predicted values
            loss = cross_entropy(preds, labels)

            total_loss += float(loss.item())
            #preds = preds.detach().cpu().numpy()

            #total_preds.append(preds)
            total_preds.append(preds.detach().cpu().numpy())

    gc.collect()
    torch.cuda.empty_cache()

    # compute the validation loss of the epoch
    avg_loss = total_loss / (len(val_dataloader)*batch_size)

    # reshape the predictions in form of (number of samples, no. of classes)
    total_preds = np.concatenate(total_preds, axis=0)

    return avg_loss, total_preds


In [11]:
### Extension mode
MODE = 'shallow_fc'

In [12]:
pre_pro_train_data = pre_process_dataset(train_set)
pre_pro_val_data = pre_process_dataset(val_set)
pre_pro_test_data = pre_process_dataset(test_set)

train_input_ids, train_attention_masks, train_labels = data_process(pre_pro_train_data,train_labels)
val_input_ids, val_attention_masks, val_labels = data_process(pre_pro_val_data,val_labels)
test_input_ids, test_attention_masks, test_labels = data_process(pre_pro_test_data,test_labels)


Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [13]:
# Specify the GPU
# Setting up the device for GPU usage
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

train_count = len(train_labels)
test_count = len(test_labels)
val_count = len(val_labels)

# ~~~~~~~~~~~~~~~~~~~~~ Import BERT Model and BERT Tokenizer ~~~~~~~~~~~~~~~~~~~~~#
# import BERT-base pretrained model
bert = AutoModel.from_pretrained('bert-base-uncased')
# bert = AutoModel.from_pretrained('bert-base-uncased')
# Load the BERT tokenizer
#tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#


cuda


Downloading:   0%|          | 0.00/433 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [14]:

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Tokenization ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
# for train set
train_seq = torch.tensor(train_input_ids.tolist())
train_mask = torch.tensor(train_attention_masks.tolist())
train_y = torch.tensor(train_labels.tolist())

# for validation set
val_seq = torch.tensor(val_input_ids.tolist())
val_mask = torch.tensor(val_attention_masks.tolist())
val_y = torch.tensor(val_labels.tolist())

# for test set
test_seq = torch.tensor(test_input_ids.tolist())
test_mask = torch.tensor(test_attention_masks.tolist())
test_y = torch.tensor(test_labels.tolist())
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Create DataLoaders ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# define a batch size
batch_size = 32

# wrap tensors
train_data = TensorDataset(train_seq, train_mask, train_y)

# sampler for sampling the data during training
train_sampler = RandomSampler(train_data)

# dataLoader for train set
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# wrap tensors
val_data = TensorDataset(val_seq, val_mask, val_y)

# sampler for sampling the data during training
val_sampler = SequentialSampler(val_data)

# dataLoader for validation set
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Freeze BERT Parameters ~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
# freeze all the parameters
for param in bert.parameters():
    param.requires_grad = False
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#

# pass the pre-trained BERT to our define architecture
model = BERT_Arch(bert, n_classes=2, mode=MODE)
# push the model to GPU
model = model.to(device)

# optimizer from hugging face transformers
from transformers import AdamW

# define the optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

#from sklearn.utils.class_weight import compute_class_weight

# compute the class weights
#class_wts = compute_class_weight('balanced', np.unique(train_labels), train_labels)

#print(class_wts)

# convert class weights to tensor
#weights = torch.tensor(class_wts, dtype=torch.float)
#weights = weights.to(device)

# loss function
#cross_entropy = nn.NLLLoss(weight=weights)
cross_entropy = nn.NLLLoss()

# set initial loss to infinite
best_valid_loss = float('inf')

# empty lists to store training and validation loss of each epoch
#train_losses = []
#valid_losses = []

#if os.path.isfile("/content/drive/MyDrive/saved_weights.pth") == False:
#if os.path.isfile("saved_weights.pth") == False:
    # number of training epochs
epochs = 3
current = 1
# for each epoch
while current <= epochs:

    print(f'\nEpoch {current} / {epochs}:')

    # train model
    train_loss, _ = train()

    # evaluate model
    valid_loss, _ = evaluate()

    # save the best model
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        #torch.save(model.state_dict(), 'saved_weights.pth')

    # append training and validation loss
    #train_losses.append(train_loss)
    #valid_losses.append(valid_loss)

    print(f'\n\nTraining Loss: {train_loss:.3f}')
    print(f'Validation Loss: {valid_loss:.3f}')

    current = current + 1
#else:
    #print("Got weights!")
    # load weights of best model
    #model.load_state_dict(torch.load("saved_weights.pth"))
    #model.load_state_dict(torch.load("/content/drive/MyDrive/saved_weights.pth"), strict=False)




Epoch 1 / 3:
Batch 625/625 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.00% complete, loss=0.01, accuracy=0

Evaluating...
Batch 157/157 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.00% complete, loss=0.01, accuracy=0

Training Loss: 0.011
Validation Loss: 0.009

Epoch 2 / 3:
Batch 625/625 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.00% complete, loss=0.01, accuracy=0

Evaluating...
Batch 157/157 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.00% complete, loss=0.01, accuracy=0

Training Loss: 0.006
Validation Loss: 0.011

Epoch 3 / 3:
Batch 625/625 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.00% complete, loss=0.00, accuracy=0

Evaluating...
Batch 157/157 |███████████████████████

# Test

In [15]:
# get predictions for test data
gc.collect()
torch.cuda.empty_cache()

with torch.no_grad():
    preds = model(test_seq.to(device), test_mask.to(device))
    #preds = model(test_seq, test_mask)
    preds = preds.detach().cpu().numpy()


print("Performance:")
# model's performance
preds = np.argmax(preds, axis=1)
print('Classification Report')
print(classification_report(test_y, preds))

print("Accuracy: " + str(accuracy_score(test_y, preds)))

Performance:
Classification Report
              precision    recall  f1-score   support

           0       1.00      0.93      0.96        14
           1       0.95      1.00      0.97        18

    accuracy                           0.97        32
   macro avg       0.97      0.96      0.97        32
weighted avg       0.97      0.97      0.97        32

Accuracy: 0.96875
