# Imports

In [1]:
import pandas as pd
train_set = pd.read_csv("csv_data/train_set_2.csv")
test_set = pd.read_csv("csv_data/test_set_2.csv")
validate_set = pd.read_csv("csv_data/test_set_1.csv")

In [2]:
import json
with open("txt_files/stopwords-bn.json", encoding="utf8") as datafile:
    stop_words_bangla = json.load(datafile)
        
# for i in stop_words_bangla:
#     if "এই" in i:
#         print(i)

In [3]:
texts = train_set.text.tolist()
labels = train_set.label.tolist()
new_texts = []
for text in texts:
    temp_texts = text.split(" ")
    new_temp_text = []
    for temp_text in temp_texts:
        i=0
        for stop in stop_words_bangla:
            if stop==temp_text:
                i=1
                break
        if i==0:
            new_temp_text.append(temp_text)
            
    x=" ".join(new_temp_text)
    new_texts.append(x)
# print(new_texts)

In [4]:
new_train_set = pd.DataFrame(list(zip(new_texts, labels)),columns =['text', 'label'])
print(new_train_set.head())
train_set = new_train_set

                                                text  label
0  মামা ভয়ে ভয়ে লুকিং গ্লাসের কাপড় সরিয়ে বাচ্...      4
1  বয়সে নাতি নাতনিদের আনন্দে মেতে থাকার কথা বুড়ো ...      3
2                          আপু আপনাকে সুন্দর লাগছে        0
3     শহরের কোণে মায়া জমায়  এখনো ভাবি তোমাকে ফেরানো       1
4  মাথা পুরোটাই নষ্ট  পশ্চিম বঙ্গবাসী ভোট দেয়   ছ...      2


In [5]:
import time
from transformers import AutoModel, BertTokenizerFast
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from tqdm.notebook import tqdm
from transformers import BertModel, BertTokenizer, BertForSequenceClassification

# Optoimizing dataset

In [6]:
class makeDataset(Dataset):
    def __init__(self, data, max_length=100):
        self.data = data
        
        self.config = {
            "max_length": max_length,
            "padding": "max_length",
            "return_tensors": "pt",
            "truncation": True,
            "add_special_tokens": True
        }

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        value = self.data.iloc[idx]
        return value['text'], value['label']

# Training data 2 testing data 2 final testing data 1

In [7]:
training_data = makeDataset(train_set)
train_dataloader = DataLoader(training_data, batch_size=16, shuffle=True)

In [8]:
test_data = makeDataset(test_set)
test_dataloader = DataLoader(test_data, batch_size=16, shuffle=True)

In [9]:
final_test_data = makeDataset(validate_set)
final_test_dataloader = DataLoader(final_test_data, batch_size=16, shuffle=True)

In [10]:
class newEmotionBert(nn.Module):

    def __init__(self, bert):
        super(newEmotionBert, self).__init__()

        self.bert = bert

        # dropout layer
        self.dropout = nn.Dropout(0.8)

        # relu activation function
        self.relu = nn.ReLU()

        # dense layer 1
        self.fc1 = nn.Linear(768, 384)
        
        self.fc2 = nn.Linear(384, 96)
        self.fc3 = nn.Linear(96, 48)
        self.fc4 = nn.Linear(48, 24)

        # dense layer 2 (Output layer)
        self.fc5 = nn.Linear(24, 5)  
        
        self.softmax = nn.LogSoftmax(dim=1)

    # define the forward pass
    def forward(self, input_ids, token_type_ids, attention_mask):
        # pass the inputs to the model
        out = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
#         print(out)
        x = self.fc1(out[0])
#         x = self.relu(x)

        # output layer
        x = self.fc2(self.dropout(x))
        x = self.fc3(self.dropout(x))
        x = self.fc4(self.dropout(x))
        x = self.fc5(self.dropout(x))
        x = self.softmax(x)
        
        return x

In [11]:
bert_model_name = "sagorsarker/bangla-bert-base"
bert = BertForSequenceClassification.from_pretrained(bert_model_name, num_labels = 5)
tokenizer = BertTokenizer.from_pretrained(bert_model_name)

Some weights of the model checkpoint at sagorsarker/bangla-bert-base were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = newEmotionBert(bert)
model.to(device);

In [13]:
from torch.optim.lr_scheduler import StepLR

optimizer = AdamW(model.parameters(), lr=1e-5, weight_decay=1e-5)
criterion = nn.NLLLoss()
scheduler = StepLR(optimizer, step_size=2, gamma=0.1)

In [14]:
from time import sleep
def train(model, dataloader, optimizer, criterion, config):
    model.train()  # prep model for training
    train_loss = 0
    for batch in tqdm(dataloader):
        text, labels = batch

        model.zero_grad()

        inputs = tokenizer.batch_encode_plus(
            text, **config
        )
        input_ids = inputs['input_ids'].to(device)
        token_type_ids = inputs['token_type_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)
        labels = labels.to(device)

        # move things to model
        logs = model(token_type_ids=token_type_ids, input_ids=input_ids, attention_mask=attention_mask)

        loss = criterion(logs, labels)
        train_loss += loss.item() * input_ids.size(0)
        loss.backward()

        # clip the the gradients to 1.0. It helps in preventing the exploding gradient problem
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

    return train_loss

In [15]:
def evaluate(model, dataloader, criterion, config):
    total = 0
    correct = 0
    valid_loss = 0.0

    model.eval()  # prep model for evaluation
    for batch in tqdm(dataloader):
        text, labels = batch
        inputs = tokenizer.batch_encode_plus(
            text, **config
        )
        input_ids = inputs['input_ids'].to(device)
        token_type_ids = inputs['token_type_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)
        labels = labels.to(device)

        # move things to model
        output = model(token_type_ids=token_type_ids, input_ids=input_ids, attention_mask=attention_mask)

        loss_p = criterion(output, labels)
        # update running validation loss
        valid_loss += loss_p.item() * input_ids.size(0)
        # calculate accuracy
        proba = torch.exp(output)
        top_p, top_class = proba.topk(1, dim=1)
        equals = top_class == labels.view(*top_class.shape)
        # accuracy += torch.mean(equals.type(torch.FloatTensor)).item()

        _, predicted = torch.max(output.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    return total, correct, valid_loss

In [16]:
epochs = 5
tokenizer_config = {
    "max_length": 100,
    "padding": "max_length",
    "return_tensors": "pt",
    "truncation": True,
    "add_special_tokens": True
}

In [17]:
train_loss_data, valid_loss_data = [], []
valid_loss_min = np.Inf
since = time.time()
best_loss = np.inf

for epoch in range(epochs):
    print("Epoch: {}/{}".format(epoch + 1, epochs))
    # monitor training loss
    train_loss = 0.0
    valid_loss = 0.0
    total = 0
    correct = 0
    e_since = time.time()

    # Train Model
    train_loss += train(model, train_dataloader, optimizer, criterion, tokenizer_config)

    # Now Evaluate
    out = evaluate(model, test_dataloader, criterion, tokenizer_config)

    total += out[0]
    correct += out[1]
    valid_loss += out[2]

    scheduler.step()

    # print training/validation statistics
    # calculate average loss over an epoch
    train_loss = train_loss / len(train_dataloader.dataset)
    valid_loss = valid_loss / len(test_dataloader.dataset)

    # calculate train loss and running loss
    train_loss_data.append(train_loss * 100)
    valid_loss_data.append(valid_loss * 100)
    
    if valid_loss < best_loss:
        best_loss = valid_loss
        torch.save(model.state_dict(), "new_emotion_model.pth")

    print("\tTrain loss:{:.6f}..".format(train_loss),
          "\tValid Loss:{:.6f}..".format(valid_loss),
          "\tAccuracy: {:.4f}".format(correct / total * 100))

time_elapsed = time.time() - since
print('Training completed in {:.0f}m {:.0f}s'.format(
    time_elapsed // 60, time_elapsed % 60))

Epoch: 1/5


  0%|          | 0/233 [00:00<?, ?it/s]

RuntimeError: mat1 and mat2 shapes cannot be multiplied (16x5 and 768x384)

In [None]:
model.load_state_dict(torch.load("./new_emotion_model.pth", map_location = device))

In [None]:
all_preds = []
all_labels = []

for batch in final_test_dataloader:
    text, labels = batch
    inputs = tokenizer.batch_encode_plus(
        text, **tokenizer_config
    )
    input_ids = inputs['input_ids'].to(device)
    token_type_ids = inputs['token_type_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)
    labels = labels.to(device)

    # move things to model
    output = model(token_type_ids=token_type_ids, input_ids=input_ids, attention_mask=attention_mask)
    preds = output.detach().cpu().numpy()
    preds = np.argmax(preds, axis = 1)
    all_preds.extend(preds)
    all_labels.extend(labels.cpu().numpy())

In [None]:
correct = 0
for i in range(len(all_labels)):
    if all_labels[i]==all_preds[i]:
        correct+=1
print(correct)

In [None]:
print(f"accuracy: {correct/len(all_labels)*100}")
print(f"total: {len(all_labels)}")
print(f"correct: {correct}")

In [None]:
in_correct = []
for i in range(len(all_labels)):
    if all_labels[i]!=all_preds[i]:
        in_correct.append(train_set["text"][i])
print(in_correct)

In [None]:
len(in_correct)

In [None]:
position = []
for i in range(len(all_labels)):
    if all_labels[i]!=all_preds[i]:
        position.append([i])
print(position)

In [None]:
position = []
for i in range(len(all_labels)):
    if all_labels[i]!=all_preds[i]:
        print(f"{all_labels[i]} and {all_preds[i]}")