# Imports

In [1]:
import pandas as pd
train_set = pd.read_csv("csv_data/train_set_2.csv")
test_set = pd.read_csv("csv_data/test_set_2.csv")
validate_set = pd.read_csv("csv_data/test_set_1.csv")

In [2]:
import time
from transformers import AutoModel, BertTokenizerFast
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from tqdm.notebook import tqdm
from transformers import BertModel, BertTokenizer, BertForSequenceClassification

# Optoimizing dataset

In [3]:
class makeDataset(Dataset):
    def __init__(self, data, max_length=100):
        self.data = data
        
        self.config = {
            "max_length": max_length,
            "padding": "max_length",
            "return_tensors": "pt",
            "truncation": True,
            "add_special_tokens": True
        }

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        value = self.data.iloc[idx]
        return value['text'], value['label']

# Training data 2 testing data 2 final testing data 1

In [4]:
training_data = makeDataset(train_set)
train_dataloader = DataLoader(training_data, batch_size=16, shuffle=True)

In [5]:
test_data = makeDataset(test_set)
test_dataloader = DataLoader(test_data, batch_size=16, shuffle=True)

In [6]:
final_test_data = makeDataset(validate_set)
final_test_dataloader = DataLoader(final_test_data, batch_size=16, shuffle=True)

In [7]:
class newEmotionBert(nn.Module):

    def __init__(self, bert):
        super(newEmotionBert, self).__init__()

        self.bert = bert

        # dropout layer
        self.dropout = nn.Dropout(0.8)

        # relu activation function
        self.relu = nn.ReLU()

        # dense layer 1
        self.fc1 = nn.Linear(768, 328)

        # dense layer 2 (Output layer)
        self.fc2 = nn.Linear(328, 5)  

    # define the forward pass
    def forward(self, input_ids, token_type_ids, attention_mask):
        # pass the inputs to the model
        out = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

        x = self.fc1(out[1])
        x = self.relu(x)

        # output layer
        x = self.fc2(self.dropout(x))
        
        return x

In [8]:
bert_model_name = "sagorsarker/bangla-bert-base"
bert = BertModel.from_pretrained(bert_model_name)
tokenizer = BertTokenizer.from_pretrained(bert_model_name)

Some weights of the model checkpoint at sagorsarker/bangla-bert-base were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = newEmotionBert(bert)
model.to(device);

In [10]:
from torch.optim.lr_scheduler import StepLR

optimizer = AdamW(model.parameters(), lr=1e-5)
criterion = nn.CrossEntropyLoss()
scheduler = StepLR(optimizer, step_size=2, gamma=0.1)

In [11]:
from time import sleep
def train(model, dataloader, optimizer, criterion, config):
    model.train()  # prep model for training
    train_loss = 0
    for batch in dataloader:
        text, labels = batch

        model.zero_grad()

        inputs = tokenizer.batch_encode_plus(
            text, **config
        )
        input_ids = inputs['input_ids'].to(device)
        token_type_ids = inputs['token_type_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)
        labels = labels.to(device)

        # move things to model
        logs = model(token_type_ids=token_type_ids, input_ids=input_ids, attention_mask=attention_mask)

        loss = criterion(logs, labels)
        train_loss += loss.item() * input_ids.size(0)
        loss.backward()

        # clip the the gradients to 1.0. It helps in preventing the exploding gradient problem
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

    return train_loss

In [12]:
def evaluate(model, dataloader, criterion, config):
    total = 0
    correct = 0
    valid_loss = 0.0

    model.eval()  # prep model for evaluation
    for batch in dataloader:
        text, labels = batch
        inputs = tokenizer.batch_encode_plus(
            text, **config
        )
        input_ids = inputs['input_ids'].to(device)
        token_type_ids = inputs['token_type_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)
        labels = labels.to(device)

        # move things to model
        output = model(token_type_ids=token_type_ids, input_ids=input_ids, attention_mask=attention_mask)

        loss_p = criterion(output, labels)
        # update running validation loss
        valid_loss += loss_p.item() * input_ids.size(0)
        # calculate accuracy
        proba = torch.exp(output)
        top_p, top_class = proba.topk(1, dim=1)
        equals = top_class == labels.view(*top_class.shape)
        # accuracy += torch.mean(equals.type(torch.FloatTensor)).item()

        _, predicted = torch.max(output.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    return total, correct, valid_loss

In [13]:
epochs = 5
tokenizer_config = {
    "max_length": 100,
    "padding": "max_length",
    "return_tensors": "pt",
    "truncation": True,
    "add_special_tokens": True
}

In [14]:
train_loss_data, valid_loss_data = [], []
valid_loss_min = np.Inf
since = time.time()
best_loss = np.inf

for epoch in range(epochs):
    print("Epoch: {}/{}".format(epoch + 1, epochs))
    # monitor training loss
    train_loss = 0.0
    valid_loss = 0.0
    total = 0
    correct = 0
    e_since = time.time()

    # Train Model
    train_loss += train(model, train_dataloader, optimizer, criterion, tokenizer_config)

    # Now Evaluate
    out = evaluate(model, test_dataloader, criterion, tokenizer_config)

    total += out[0]
    correct += out[1]
    valid_loss += out[2]

    scheduler.step()

    # print training/validation statistics
    # calculate average loss over an epoch
    train_loss = train_loss / len(train_dataloader.dataset)
    valid_loss = valid_loss / len(test_dataloader.dataset)

    # calculate train loss and running loss
    train_loss_data.append(train_loss * 100)
    valid_loss_data.append(valid_loss * 100)
    
    if valid_loss < best_loss:
        best_loss = valid_loss
        torch.save(model.state_dict(), "new_emotion_model.pth")

    print("\tTrain loss:{:.6f}..".format(train_loss),
          "\tValid Loss:{:.6f}..".format(valid_loss),
          "\tAccuracy: {:.4f}".format(correct / total * 100))

time_elapsed = time.time() - since
print('Training completed in {:.0f}m {:.0f}s'.format(
    time_elapsed // 60, time_elapsed % 60))

Epoch: 1/5
	Train loss:1.538021.. 	Valid Loss:1.344273.. 	Accuracy: 52.0949
Epoch: 2/5
	Train loss:1.296083.. 	Valid Loss:1.106052.. 	Accuracy: 60.6324
Epoch: 3/5
	Train loss:1.124677.. 	Valid Loss:1.057775.. 	Accuracy: 62.3715
Epoch: 4/5
	Train loss:1.081334.. 	Valid Loss:1.028418.. 	Accuracy: 63.3992
Epoch: 5/5
	Train loss:1.054043.. 	Valid Loss:1.026204.. 	Accuracy: 63.6364
Training completed in 8m 32s


In [15]:
model.load_state_dict(torch.load("./new_emotion_model.pth", map_location = device))

<All keys matched successfully>

In [16]:
all_preds = []
all_labels = []

for batch in final_test_dataloader:
    text, labels = batch
    inputs = tokenizer.batch_encode_plus(
        text, **tokenizer_config
    )
    input_ids = inputs['input_ids'].to(device)
    token_type_ids = inputs['token_type_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)
    labels = labels.to(device)

    # move things to model
    output = model(token_type_ids=token_type_ids, input_ids=input_ids, attention_mask=attention_mask)
    preds = output.detach().cpu().numpy()
    preds = np.argmax(preds, axis = 1)
    all_preds.extend(preds)
    all_labels.extend(labels.cpu().numpy())

In [17]:
correct = 0
for i in range(len(all_labels)):
    if all_labels[i]==all_preds[i]:
        correct+=1
print(correct)

354


In [18]:
print(f"accuracy: {correct/len(all_labels)*100}")
print(f"total: {len(all_labels)}")
print(f"correct: {correct}")

accuracy: 34.30232558139535
total: 1032
correct: 354


In [19]:
in_correct = []
for i in range(len(all_labels)):
    if all_labels[i]!=all_preds[i]:
        in_correct.append(train_set["text"][i])
print(in_correct)

['মামা ভয়ে ভয়ে লুকিং গ্লাসের কাপড় সরিয়ে দিয়ে দেখে বাচ্চাটি একটা রক্তাক্ত পা চিবাই চিবাই খাচ্ছে  ', 'আপু আপনাকে খুব সুন্দর লাগছে  ', 'আমি গত সপ্তাহের কতটা মনে পড়ছি তা দেখে আমি চমকে উঠতে শুরু করি আমি নিশ্চিত অসুস্থ কিছুটা নার্ভাস বোধ করি', ' শেষ সময়ে এসে কিছু তিক্ত ঘটনা ঘটে গেলো কিন্তু এর জের ধরে নিজ নিজ ইগো নিয়ে যদি সবাই বসে থাকি তাহলে আজ থেকে অনেক বছর পর অবশ্যই পোড়াবে ', 'রবি আজিয়াটা লিমিটেডের দুই ব্র্যান্ড রবি ও এয়ারটেলের আয়োজনে বাংলাদেশে প্রথমবারের মতো অনুষ্ঠিত হলো আন্তর্জাতিক অনলাইন গেমিং প্রতিযোগিতা  আজিয়াটা গেম হিরো   প্রতি দলে  জন করে  দলের  জন প্রতিযোগী বাংলাদেশ পর্বের চূড়ান্ত পর্বে অংশগ্রহণ করেন  অংশগ্রহণকারী টি দল প্রায়  লাখ টাকার পুরস্কার জিতেছে ', 'একটা সেকেন্ডও স্কিপ করিনি পুরাই আগুন ছিলো লাভ ফ্রম বাংলাদেশ ', 'জাহালমের আটক থাকার ঘটনাকে ন্যাক্কারজনক বলে সমালোচনা করে বিএনপির সিনিয়র যুগ্ম মহাসচিব রুহুল কবির রিজভী বলেছেন  তার  জাহালম  দীর্ঘদিন কারাভোগের ঘটনায় দুদকের ভূমিকা নজীরবিহীনভাবে ন্যাক্কারজনক ', 'স্কুলে ঠিক সময়ে পৌঁছতে না পারার ভয়  এইরকম কতশত ভয় নিয়েই তো আমরা ব

In [20]:
len(in_correct)

678

In [21]:
position = []
for i in range(len(all_labels)):
    if all_labels[i]!=all_preds[i]:
        position.append([i])
print(position)

[[0], [2], [5], [6], [7], [8], [9], [10], [11], [12], [13], [14], [15], [16], [17], [19], [20], [22], [23], [24], [25], [27], [28], [29], [30], [31], [32], [33], [34], [36], [37], [38], [39], [41], [43], [45], [47], [49], [50], [52], [53], [54], [55], [57], [58], [60], [61], [62], [64], [65], [66], [67], [68], [71], [72], [73], [74], [77], [78], [79], [81], [82], [83], [84], [85], [86], [87], [89], [91], [92], [93], [94], [96], [97], [98], [99], [102], [103], [104], [105], [106], [107], [108], [110], [113], [115], [116], [117], [118], [120], [122], [123], [125], [126], [127], [131], [132], [133], [134], [135], [138], [141], [142], [143], [144], [148], [150], [154], [155], [158], [159], [160], [161], [163], [164], [165], [167], [169], [171], [172], [174], [175], [177], [179], [180], [181], [184], [186], [187], [189], [190], [193], [194], [195], [197], [198], [200], [201], [202], [203], [206], [207], [210], [212], [213], [214], [215], [216], [217], [219], [220], [221], [222], [223], [224

In [22]:
position = []
for i in range(len(all_labels)):
    if all_labels[i]!=all_preds[i]:
        print(f"{all_labels[i]} and {all_preds[i]}")

1 and 3
2 and 0
1 and 3
0 and 3
1 and 0
1 and 3
2 and 3
2 and 3
0 and 3
1 and 3
1 and 3
1 and 3
2 and 3
1 and 3
2 and 0
1 and 3
1 and 3
2 and 3
2 and 3
1 and 0
1 and 0
2 and 3
1 and 3
2 and 3
1 and 4
1 and 3
2 and 0
1 and 3
0 and 3
1 and 0
2 and 3
2 and 3
1 and 0
2 and 3
2 and 3
2 and 3
0 and 3
2 and 3
1 and 3
1 and 3
1 and 3
1 and 3
1 and 3
0 and 3
1 and 0
4 and 3
1 and 3
2 and 3
2 and 3
1 and 0
0 and 1
0 and 3
4 and 1
1 and 3
2 and 3
0 and 3
0 and 4
2 and 3
2 and 3
2 and 3
1 and 3
2 and 3
1 and 3
2 and 3
0 and 3
0 and 3
3 and 1
1 and 3
1 and 3
2 and 3
1 and 0
3 and 4
1 and 3
0 and 3
2 and 3
3 and 0
4 and 3
4 and 3
0 and 2
0 and 3
2 and 0
2 and 0
2 and 3
3 and 0
2 and 3
2 and 3
3 and 0
1 and 0
2 and 3
2 and 1
1 and 3
0 and 3
1 and 3
4 and 3
2 and 3
4 and 1
1 and 3
0 and 3
0 and 3
1 and 3
2 and 3
2 and 3
3 and 0
2 and 0
2 and 3
2 and 3
1 and 3
2 and 3
1 and 3
1 and 0
2 and 3
1 and 3
1 and 3
2 and 3
4 and 3
0 and 3
2 and 3
1 and 3
2 and 3
1 and 3
2 and 3
0 and 3
2 and 3
3 and 1
0 and 3
