In [142]:
#Imports
import torch
from torchtext.legacy import data
import pandas as pd
import re
import string
import random
import os
import torch.nn as nn


In [2]:
#dataset upload
from google.colab import files
uploaded = files.upload()

Saving banglastopword.txt to banglastopword.txt
Saving bengali_hatespeech.csv to bengali_hatespeech.csv
Saving hindi_hatespeech.tsv to hindi_hatespeech.tsv
Saving stopword.txt to stopword.txt


Hindi Dataset

In [143]:
# Read Hinid dataset and some preprocessing
# Read Hindi Dataset
hindi_data = pd.read_csv('hindi_hatespeech.tsv', sep='\t')
hindi_data = hindi_data.loc[:,"text":"task_1"]
hindi_data.dropna(subset = ["text"], inplace=True)
hindi_data.dropna(subset = ["task_1"], inplace=True)
hindi_data['task_1'] = hindi_data['task_1'].map({'HOF': 1, 'NOT': 0})
hindi_data['text'] = hindi_data['text'].str.replace('[{}]'.format(string.punctuation), ' ')
hindi_data['text'] = hindi_data['text'].str.replace('[{}]'.format('।'), '')
hindi_stop_word_list = pd.read_csv('stopword.txt', sep='\s+', header=None)
hindi_stop_word_list = hindi_stop_word_list[0].tolist()
hindi_data['text'] = hindi_data['text'].apply(lambda x: ' '.join([item for item in x.split() if item not in (hindi_stop_word_list)]))
hindi_training_data = hindi_data.sample(frac = 0.8)
hindi_test_data = hindi_data.drop(hindi_training_data.index)
# write a dataframe to tsv file
hindi_training_data.to_csv("hindi_training_data.tsv", sep="\t", index=False)
# write a dataframe to tsv file
hindi_test_data.to_csv("hindi_test_data.tsv", sep="\t", index=False)

In [141]:
len(hindi_data)

4665

In [144]:
#Hindi Dataset Cleanup
def hindi_data_cleanup(hindi_data):
  cleaned_text = []
  for text in hindi_data:
      # remove multiple spaces
      text = re.sub(r' +', ' ', text)
      # remove newline
      text = re.sub(r'\n', ' ', text)
      cleaned_text.append(text)
  return cleaned_text

In [145]:
#For Creating Train and Validation data
char_based = False
if char_based:
    tokenizer = lambda s: list(s) # char-based
else:
    tokenizer = lambda s: s.split() # word-based

Text = data.Field(preprocessing=hindi_data_cleanup, tokenize=tokenizer, batch_first=True, include_lengths=True, fix_length=100)
Label = data.Field(sequential=False, use_vocab=False, pad_token=None, unk_token=None)

fields = [('text', Text), ('labels', Label)]
train_data_hindi, test_data_hindi = data.TabularDataset.splits(
    path = "/content",
    train = "hindi_training_data.tsv",
    test = "hindi_test_data.tsv",
    format='tsv',
    fields=fields,
    skip_header=True
)

seed = 42
train_data_hindi, valid_data_hindi = train_data_hindi.split(split_ratio=0.8, random_state=random.seed(seed))

In [146]:
def create_iterator(train_data, valid_data, test_data, batch_size, device):    
  train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits((train_data, valid_data, test_data),
          batch_size = batch_size,
          sort_key = lambda x: len(x.text), 
          sort_within_batch = True,
          device = device)
  return train_iterator, valid_iterator, test_iterator

def accuracy(probs, target):
  predictions = probs.argmax(dim=1)
  corrects = (predictions == target)
  accuracy = corrects.sum().float() / float(target.size(0))
  return accuracy

def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0
    for batch in iterator:
        optimizer.zero_grad()
        text, text_lengths = batch.text
        text, text_lengths = text.to(device), text_lengths.to(device)
        predictions = model(text, text_lengths)
        loss = criterion(predictions, batch.labels.squeeze())
        acc = accuracy(predictions, batch.labels)        
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0
    model.eval()
    with torch.no_grad():
        for batch in iterator:
            text, text_lengths = batch.text
            predictions = model(text, text_lengths).squeeze(1)
            loss = criterion(predictions, batch.labels)
            acc = accuracy(predictions, batch.labels)
            epoch_loss += loss.item()
            epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def run_train(epochs, model, train_iterator, valid_iterator, optimizer, criterion, model_type):
    best_valid_loss = float('inf')

    for epoch in range(epochs):

        # train the model
        train_loss, train_acc = train(model, train_iterator, optimizer, criterion)

        # evaluate the model
        valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

        # save the best model
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), 'saved_weights'+'_'+model_type+'.pt')

        print(f'Epoch: {epoch+1} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc * 100:.2f}%')
        print(f'Epoch: {epoch+1} | Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc * 100:.2f}%')

In [147]:
# Hyperparameters
lr = 1e-4
batch_size = 50
dropout_keep_prob = 0.5
embedding_size = 300
max_document_length = 100  # each sentence has until 100 words
dev_size = 0.8 # split percentage to train\validation data
max_size = 5000 # maximum vocabulary size
seed = 1
num_classes = 2
num_hidden_nodes = 93
hidden_dim2 = 128
num_layers = 2  # LSTM layers
bi_directional = False 
num_epochs = 7
num_hidden_nodes = 93

In [148]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class LSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim1, hidden_dim2, output_dim, n_layers,
                 bidirectional, dropout, pad_index):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_index)
        self.lstm = nn.LSTM(embedding_dim,
                            hidden_dim1,
                            num_layers=n_layers,
                            bidirectional=bidirectional,
                            batch_first=True)
        self.fc1 = nn.Linear(hidden_dim1 * 2, hidden_dim2)
        self.fc2 = nn.Linear(hidden_dim2, output_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths):
        embedded = self.embedding(text)
        packed_embedded = pack_padded_sequence(embedded, text_lengths.cpu(), batch_first=True) 

        packed_output, (hidden, cell) = self.lstm(packed_embedded)
        cat = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        rel = self.relu(cat)
        dense1 = self.fc1(rel)
        drop = self.dropout(dense1)
        preds = self.fc2(drop)
        return preds

In [149]:
# Main Function

if __name__ == "__main__":
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 
  path = '/content'
  path_data = os.path.join(path, "data")
  # parameters
  model_type = "LSTM"
  data_type = "token" # or: "morph"

  char_based = True
  if char_based:
      tokenizer = lambda s: list(s) # char-based
  else:
      tokenizer = lambda s: s.split() # word-based

  Text.build_vocab(train_data_hindi, max_size=max_size)
  Label.build_vocab(train_data_hindi)
  vocab_size = len(Text.vocab)
  to_train = True
  pad_index = Text.vocab.stoi[Text.pad_token]
  # pad_index = pad_index.type(torch.int64)


  train_iterator, valid_iterator, test_iterator = create_iterator(train_data_hindi, valid_data_hindi, test_data_hindi, batch_size, device)

  # loss function
  loss_func = nn.CrossEntropyLoss()
  loss_func = loss_func.to(device)
  lstm_model = LSTM(vocab_size, embedding_size, num_hidden_nodes, hidden_dim2 , num_classes, num_layers, bi_directional, dropout_keep_prob, pad_index)
  if torch.cuda.is_available():
    lstm_model.cuda()

  # optimization algorithm
  optimizer = torch.optim.Adam(lstm_model.parameters(), lr=lr)
  # train and evaluation
  # if (to_train):
  #     # train and evaluation
  torch.backends.cudnn.enabled = False
  run_train(num_epochs, lstm_model, train_iterator, valid_iterator, optimizer, loss_func, model_type)

  # load weights
  # lstm_model.load_state_dict(torch.load(os.path.join(path, "saved_weights_LSTM.pt")))
  # predict
  test_loss, test_acc = evaluate(lstm_model, test_iterator, loss_func)
  print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc * 100:.2f}%')

Epoch: 1 | Train Loss: 0.694 | Train Acc: 48.83%
Epoch: 1 | Val. Loss: 0.687 |  Val. Acc: 58.18%
Epoch: 2 | Train Loss: 0.685 | Train Acc: 57.92%
Epoch: 2 | Val. Loss: 0.679 |  Val. Acc: 61.62%
Epoch: 3 | Train Loss: 0.674 | Train Acc: 63.41%
Epoch: 3 | Val. Loss: 0.662 |  Val. Acc: 64.19%
Epoch: 4 | Train Loss: 0.651 | Train Acc: 64.46%
Epoch: 4 | Val. Loss: 0.628 |  Val. Acc: 68.50%
Epoch: 5 | Train Loss: 0.585 | Train Acc: 71.43%
Epoch: 5 | Val. Loss: 0.525 |  Val. Acc: 74.52%
Epoch: 6 | Train Loss: 0.493 | Train Acc: 76.83%
Epoch: 6 | Val. Loss: 0.482 |  Val. Acc: 76.09%
Epoch: 7 | Train Loss: 0.417 | Train Acc: 81.68%
Epoch: 7 | Val. Loss: 0.480 |  Val. Acc: 77.72%
Test Loss: 0.553 | Test Acc: 74.93%


Bangla Dataset

In [150]:
# Read Bangla Dataset
bengali_data = pd.read_csv('bengali_hatespeech.csv', sep=',')
bengali_data = bengali_data.loc[:,"sentence":"hate"]
bengali_data['sentence'] = bengali_data['sentence'].str.replace('[{}]'.format(string.punctuation), ' ')
bengali_data['sentence'] = bengali_data['sentence'].str.replace('[{}]'.format('।'), '')
bengali_stop_word_list = pd.read_csv('banglastopword.txt', sep='\s+', header=None)
bengali_stop_word_list = bengali_stop_word_list[0].tolist()
bengali_data['sentence'] = bengali_data['sentence'].apply(lambda x: ' '.join([item for item in x.split() if item not in (bengali_stop_word_list)]))
bengali_data = bengali_data[bengali_data.sentence != ""]
bengali_data.dropna(subset = ["sentence"], inplace=True)
bengali_data.dropna(subset = ["hate"], inplace=True)
bengali_training_data = bengali_data.sample(frac = 0.8)
bengali_test_data = bengali_data.drop(bengali_training_data.index)
# write a dataframe to tsv file
bengali_training_data.to_csv("bengali_training_data.tsv", sep="\t", index=False)
# write a dataframe to tsv file
bengali_test_data.to_csv("bengali_test_data.tsv", sep="\t", index=False)
# len(bengali_data)

29944

In [151]:
bengali_data = bengali_data.sample(n=len(hindi_data))

In [None]:
bengali_data

In [153]:
#Bengali Dataset Cleanup
def bengali_data_cleanup(bengali_data):
  cleaned_text = []
  for text in bengali_data:
      # remove multiple spaces
      text = re.sub(r' +', ' ', text)
      # remove newline
      text = re.sub(r'\n', ' ', text)
      cleaned_text.append(text)
  return cleaned_text

In [154]:
Text = data.Field(preprocessing=bengali_data_cleanup, tokenize=tokenizer, batch_first=True, include_lengths=True, fix_length=100)
Label = data.Field(sequential=False, use_vocab=False, pad_token=None, unk_token=None)

fields = [('text', Text), ('labels', Label)]
train_data_bengali, test_data_bengali = data.TabularDataset.splits(
    path = "/content",
    train = "bengali_training_data.tsv",
    test = "bengali_test_data.tsv",
    format='tsv',
    fields=fields,
    skip_header=True
)

seed = 42
train_data_bengali, valid_data_bengali = train_data_bengali.split(split_ratio=0.8, random_state=random.seed(seed))

In [155]:
# Main Function

if __name__ == "__main__":
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 
  path = '/content'
  path_data = os.path.join(path, "data")
  # parameters
  model_type = "LSTM"
  data_type = "token" # or: "morph"

  char_based = True
  if char_based:
      tokenizer = lambda s: list(s) # char-based
  else:
      tokenizer = lambda s: s.split() # word-based

  Text.build_vocab(train_data_bengali, max_size=max_size)
  Label.build_vocab(train_data_bengali)
  vocab_size = len(Text.vocab)
  to_train = True
  pad_index = Text.vocab.stoi[Text.pad_token]
  # pad_index = pad_index.type(torch.int64)


  train_iterator, valid_iterator, test_iterator = create_iterator(train_data_bengali, valid_data_bengali, test_data_bengali, batch_size, device)

  # loss function
  loss_func = nn.CrossEntropyLoss()
  loss_func = loss_func.to(device)
  lstm_model = LSTM(vocab_size, embedding_size, num_hidden_nodes, hidden_dim2 , num_classes, num_layers, bi_directional, dropout_keep_prob, pad_index)
  if torch.cuda.is_available():
    lstm_model.cuda()

  # optimization algorithm
  optimizer = torch.optim.Adam(lstm_model.parameters(), lr=lr)
  # train and evaluation
  # if (to_train):
  #     # train and evaluation
  torch.backends.cudnn.enabled = False
  run_train(num_epochs, lstm_model, train_iterator, valid_iterator, optimizer, loss_func, model_type)

  # load weights
  lstm_model.load_state_dict(torch.load(os.path.join(path, "saved_weights_LSTM.pt")))
  # predict
  test_loss, test_acc = evaluate(lstm_model, test_iterator, loss_func)
  print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc * 100:.2f}%')

Epoch: 1 | Train Loss: 0.622 | Train Acc: 67.31%
Epoch: 1 | Val. Loss: 0.566 |  Val. Acc: 72.01%
Epoch: 2 | Train Loss: 0.512 | Train Acc: 75.27%
Epoch: 2 | Val. Loss: 0.469 |  Val. Acc: 79.91%
Epoch: 3 | Train Loss: 0.438 | Train Acc: 81.57%
Epoch: 3 | Val. Loss: 0.429 |  Val. Acc: 82.15%
Epoch: 4 | Train Loss: 0.405 | Train Acc: 83.45%
Epoch: 4 | Val. Loss: 0.392 |  Val. Acc: 83.46%
Epoch: 5 | Train Loss: 0.390 | Train Acc: 84.33%
Epoch: 5 | Val. Loss: 0.374 |  Val. Acc: 85.05%
Epoch: 6 | Train Loss: 0.383 | Train Acc: 84.87%
Epoch: 6 | Val. Loss: 0.375 |  Val. Acc: 85.13%
Epoch: 7 | Train Loss: 0.438 | Train Acc: 80.17%
Epoch: 7 | Val. Loss: 0.447 |  Val. Acc: 77.04%
Test Loss: 0.378 | Test Acc: 85.25%
