# Imports

# Without jaccard

Training: `train.csv` / per-epoch evaluation (`test_set`): `test.csv` / final evaluation (`validate_set`): `test_set_2.csv`

In [1]:
import pandas as pd
# Data selection, "without Jaccard" variant. Run only ONE of the data-selection
# cells: each of them rebinds train_set / test_set / validate_set, so the last
# one executed decides what the rest of the notebook uses.
train_set = pd.read_csv("csv_data/train.csv")  # data the model is fitted on
test_set = pd.read_csv("csv_data/test.csv")  # scored after every epoch (feeds test_dataloader)
validate_set = pd.read_csv("csv_data/test_set_2.csv")  # final evaluation (feeds final_test_dataloader)

Training: `train_set_2.csv` / per-epoch evaluation (`test_set`): `test_set_2.csv` / final evaluation (`validate_set`): `test.csv`

In [None]:
import pandas as pd
# Data selection, "without Jaccard" variant with the two evaluation files
# swapped. Run only ONE of the data-selection cells: each of them rebinds
# train_set / test_set / validate_set, so the last one executed wins.
train_set = pd.read_csv("csv_data/train_set_2.csv")  # data the model is fitted on
test_set = pd.read_csv("csv_data/test_set_2.csv")  # scored after every epoch (feeds test_dataloader)
validate_set = pd.read_csv("csv_data/test.csv")  # final evaluation (feeds final_test_dataloader)

# With Jaccard

Training: `jaccard_train.csv` / per-epoch evaluation (`test_set`): `test.csv` / final evaluation (`validate_set`): `test_set_2.csv`

In [None]:
import pandas as pd
# Data selection, "with Jaccard" variant. Run only ONE of the data-selection
# cells: each of them rebinds train_set / test_set / validate_set, so the last
# one executed decides what the rest of the notebook uses.
train_set = pd.read_csv("csv_data/jaccard_train.csv")  # data the model is fitted on
test_set = pd.read_csv("csv_data/test.csv")  # scored after every epoch (feeds test_dataloader)
validate_set = pd.read_csv("csv_data/test_set_2.csv")  # final evaluation (feeds final_test_dataloader)

Training: `jaccard_train_set_2.csv` / per-epoch evaluation (`test_set`): `test_set_2.csv` / final evaluation (`validate_set`): `test.csv`

In [None]:
import pandas as pd
# Data selection, "with Jaccard" variant with the two evaluation files swapped.
# Run only ONE of the data-selection cells: each of them rebinds
# train_set / test_set / validate_set, so the last one executed wins.
train_set = pd.read_csv("csv_data/jaccard_train_set_2.csv")  # data the model is fitted on
test_set = pd.read_csv("csv_data/test_set_2.csv")  # scored after every epoch (feeds test_dataloader)
validate_set = pd.read_csv("csv_data/test.csv")  # final evaluation (feeds final_test_dataloader)

# Run Below

In [2]:
import time
from transformers import AutoModel, BertTokenizerFast
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from tqdm.notebook import tqdm
from transformers import BertModel, BertTokenizer, BertForSequenceClassification
from simpletransformers.classification import ClassificationModel

# Optimizing dataset

In [3]:
class makeDataset(Dataset):
    """Map-style dataset serving ``(text, label)`` pairs from a table.

    Args:
        data: Table that supports ``len()`` and positional ``.iloc`` access and
            exposes ``text`` and ``label`` fields on each row.
        max_length: Value stored under ``"max_length"`` in ``self.config``.
    """

    def __init__(self, data, max_length=100):
        self.data = data

        # Tokenizer-style keyword arguments kept on the instance; no method of
        # this class reads them.
        self.config = dict(
            max_length=max_length,
            padding="max_length",
            return_tensors="pt",
            truncation=True,
            add_special_tokens=True,
        )

    def __len__(self):
        """Return the number of rows in the wrapped table."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``text`` and ``label`` fields of the row at position ``idx``."""
        row = self.data.iloc[idx]
        text, label = row['text'], row['label']
        return text, label

# Training data 2, testing data 2, final testing data 1

In [4]:
# Wrap the training frame and serve it in reshuffled mini-batches of 16.
training_data = makeDataset(data=train_set)
train_dataloader = DataLoader(dataset=training_data, shuffle=True, batch_size=16)

In [5]:
# Per-epoch evaluation data. Batches are served in row order: shuffling an
# evaluation loader does not change the aggregated loss/accuracy and only adds
# run-to-run variation.
test_data = makeDataset(test_set)
test_dataloader = DataLoader(test_data, batch_size=16, shuffle=False)

In [6]:
# Final evaluation data. shuffle=False keeps the batches in row order, so
# predictions collected from this loader line up with the rows of validate_set.
final_test_data = makeDataset(validate_set)
final_test_dataloader = DataLoader(final_test_data, batch_size=16, shuffle=False)

In [7]:
class newEmotionBert(nn.Module):
    """Encoder followed by a two-layer linear head over 5 classes.

    The forward pass returns log-probabilities (``LogSoftmax`` over dim 1).

    Args:
        bert: Encoder module. It is called with ``input_ids`` plus the
            ``attention_mask`` and ``token_type_ids`` keyword arguments, and
            the element at index 1 of its output must have 768 features.
    """

    def __init__(self, bert):
        super().__init__()

        self.bert = bert
        self.dropout = nn.Dropout(0.2)

        # Classification head: 768 -> 328 -> 5. No activation is applied
        # between the two linear layers.
        self.fc1 = nn.Linear(768, 328)
        self.fc2 = nn.Linear(328, 5)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input_ids, token_type_ids, attention_mask):
        """Return per-class log-probabilities for a batch of encoded inputs."""
        encoded = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )

        # Index 1 of the encoder output feeds the head (for a Hugging Face
        # BertModel this is presumably the pooled output - confirm).
        hidden = self.fc1(encoded[1])

        # Dropout is applied between the two linear layers.
        hidden = self.dropout(hidden)
        scores = self.fc2(hidden)
        return self.softmax(scores)

In [8]:
# Checkpoint identifier shared by the encoder and its tokenizer.
bert_model_name = "sagorsarker/bangla-bert-base"
# Bare BERT encoder; it is wrapped by newEmotionBert in a later cell.
bert = BertModel.from_pretrained(bert_model_name)
# Module-level tokenizer: train() and evaluate() read this global directly.
tokenizer = BertTokenizer.from_pretrained(bert_model_name)

Some weights of the model checkpoint at sagorsarker/bangla-bert-base were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Build the classifier around the pretrained encoder and place it on that device.
model = newEmotionBert(bert).to(device)

In [10]:
from torch.optim.lr_scheduler import StepLR

# AdamW over every parameter of the model (encoder and head alike).
optimizer = AdamW(params=model.parameters(), lr=1e-5)
# Negative log-likelihood loss; the model's forward pass ends in LogSoftmax.
criterion = nn.NLLLoss()
# Multiply the learning rate by 0.1 after every 2 calls to scheduler.step().
scheduler = StepLR(optimizer=optimizer, gamma=0.1, step_size=2)

In [11]:
from time import sleep
def train(model, dataloader, optimizer, criterion, config):
    """Run one training pass over ``dataloader`` and return the summed loss.

    Each batch of raw texts is tokenized with the module-level ``tokenizer``
    (using ``config`` as keyword arguments), moved to the module-level
    ``device``, and used for one optimizer step.

    Args:
        model: Network called with ``input_ids``, ``token_type_ids`` and
            ``attention_mask`` keyword arguments.
        dataloader: Iterable yielding ``(text, labels)`` batches.
        optimizer: Optimizer whose ``step()`` is called once per batch.
        criterion: Loss callable taking ``(model_output, labels)``.
        config: Keyword arguments forwarded to ``tokenizer.batch_encode_plus``.

    Returns:
        Sum over all batches of ``loss.item() * batch_size``.
    """
    model.train()  # switch layers such as dropout to training behaviour
    running_loss = 0

    for texts, targets in tqdm(dataloader):
        # Clear gradients left over from the previous step.
        model.zero_grad()

        encoded = tokenizer.batch_encode_plus(texts, **config)
        batch_ids = encoded['input_ids'].to(device)
        segment_ids = encoded['token_type_ids'].to(device)
        mask = encoded['attention_mask'].to(device)
        targets = targets.to(device)

        log_probs = model(
            token_type_ids=segment_ids,
            input_ids=batch_ids,
            attention_mask=mask,
        )

        batch_loss = criterion(log_probs, targets)
        # Weight the batch loss by its size so a short last batch counts less.
        running_loss += batch_loss.item() * batch_ids.size(0)
        batch_loss.backward()

        # Cap the gradient norm at 1.0 to guard against exploding gradients.
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

    return running_loss

In [12]:
def evaluate(model, dataloader, criterion, config):
    """Score ``model`` on ``dataloader`` without updating any weights.

    Each batch of raw texts is tokenized with the module-level ``tokenizer``
    (using ``config`` as keyword arguments) and moved to the module-level
    ``device``. The forward passes run under ``torch.no_grad()`` so that no
    autograd graph is built during evaluation.

    Args:
        model: Network called with ``input_ids``, ``token_type_ids`` and
            ``attention_mask`` keyword arguments; its output is indexed as
            ``(batch, classes)``.
        dataloader: Iterable yielding ``(text, labels)`` batches.
        criterion: Loss callable taking ``(model_output, labels)``.
        config: Keyword arguments forwarded to ``tokenizer.batch_encode_plus``.

    Returns:
        Tuple ``(total, correct, valid_loss)``: number of samples seen, number
        of samples whose highest-scoring class equals the label, and the sum
        over batches of ``loss.item() * batch_size``.
    """
    total = 0
    correct = 0
    valid_loss = 0.0

    model.eval()  # prep model for evaluation (disables dropout)
    # Gradients are never used here, so skip tracking them altogether.
    with torch.no_grad():
        for text, labels in tqdm(dataloader):
            inputs = tokenizer.batch_encode_plus(
                text, **config
            )
            input_ids = inputs['input_ids'].to(device)
            token_type_ids = inputs['token_type_ids'].to(device)
            attention_mask = inputs['attention_mask'].to(device)
            labels = labels.to(device)

            output = model(token_type_ids=token_type_ids, input_ids=input_ids, attention_mask=attention_mask)

            loss_p = criterion(output, labels)
            # update running validation loss, weighted by the batch size
            valid_loss += loss_p.item() * input_ids.size(0)

            # the predicted class is the index of the largest score in each row
            _, predicted = torch.max(output, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    return total, correct, valid_loss

In [13]:
# Number of passes over the training data.
epochs = 10

# Keyword arguments handed to the tokenizer for every batch.
tokenizer_config = dict(
    max_length=100,
    padding="max_length",
    return_tensors="pt",
    truncation=True,
    add_special_tokens=True,
)

In [14]:
# Per-epoch loss history (values are stored multiplied by 100).
train_loss_data, valid_loss_data = [], []
# np.inf, not np.Inf: the capitalised alias was removed in NumPy 2.0.
valid_loss_min = np.inf
since = time.time()
best_loss = np.inf

for epoch in range(epochs):
    # From the sixth epoch on (epoch index > 4) the encoder is frozen, so only
    # the classification head keeps receiving gradients.
    if epoch > 4:
        for param in model.bert.parameters():
            param.requires_grad = False
    print("Epoch: {}/{}".format(epoch + 1, epochs))
    e_since = time.time()

    # Train Model: returns the batch-size-weighted sum of the batch losses
    train_loss = 0.0 + train(model, train_dataloader, optimizer, criterion, tokenizer_config)

    # Now Evaluate
    total, correct, valid_loss = evaluate(model, test_dataloader, criterion, tokenizer_config)

    # One scheduler step per epoch.
    scheduler.step()

    # calculate average loss over an epoch
    train_loss = train_loss / len(train_dataloader.dataset)
    valid_loss = valid_loss / len(test_dataloader.dataset)

    # record the per-epoch averages
    train_loss_data.append(train_loss * 100)
    valid_loss_data.append(valid_loss * 100)

    # Checkpoint whenever the evaluation loss improves on the best seen so far.
    if valid_loss < best_loss:
        best_loss = valid_loss
        torch.save(model.state_dict(), "new_emotion_model.pth")

    # print training/validation statistics
    print("\tTrain loss:{:.6f}..".format(train_loss),
          "\tValid Loss:{:.6f}..".format(valid_loss),
          "\tAccuracy: {:.4f}".format(correct / total * 100))

time_elapsed = time.time() - since
print('Training completed in {:.0f}m {:.0f}s'.format(
    time_elapsed // 60, time_elapsed % 60))

Epoch: 1/10


  0%|          | 0/276 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

	Train loss:1.290096.. 	Valid Loss:1.108219.. 	Accuracy: 55.0725
Epoch: 2/10


  0%|          | 0/276 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

	Train loss:0.924828.. 	Valid Loss:1.013365.. 	Accuracy: 60.6884
Epoch: 3/10


  0%|          | 0/276 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

	Train loss:0.697830.. 	Valid Loss:1.018541.. 	Accuracy: 61.5942
Epoch: 4/10


  0%|          | 0/276 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

	Train loss:0.660409.. 	Valid Loss:1.025709.. 	Accuracy: 60.8696
Epoch: 5/10


  0%|          | 0/276 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

	Train loss:0.626259.. 	Valid Loss:1.027354.. 	Accuracy: 61.4130
Epoch: 6/10


  0%|          | 0/276 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

	Train loss:0.623392.. 	Valid Loss:1.029124.. 	Accuracy: 61.2319
Epoch: 7/10


  0%|          | 0/276 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

	Train loss:0.626866.. 	Valid Loss:1.029227.. 	Accuracy: 61.2319
Epoch: 8/10


  0%|          | 0/276 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

	Train loss:0.619839.. 	Valid Loss:1.029317.. 	Accuracy: 61.2319
Epoch: 9/10


  0%|          | 0/276 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

	Train loss:0.621500.. 	Valid Loss:1.029320.. 	Accuracy: 61.2319
Epoch: 10/10


  0%|          | 0/276 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

	Train loss:0.623319.. 	Valid Loss:1.029323.. 	Accuracy: 61.2319
Training completed in 15m 46s


In [15]:
# Restore the checkpoint written by the training loop (saved whenever the
# evaluation loss improved); map_location places the tensors on `device`.
model.load_state_dict(torch.load("./new_emotion_model.pth", map_location = device))

<All keys matched successfully>

In [16]:
all_preds = []
all_labels = []

# Set evaluation mode explicitly so dropout is disabled here, instead of
# relying on the last evaluate() call having left the model in that mode.
model.eval()

# Pure inference: no gradients are needed, so do not track them.
with torch.no_grad():
    for batch in final_test_dataloader:
        text, labels = batch
        inputs = tokenizer.batch_encode_plus(
            text, **tokenizer_config
        )
        input_ids = inputs['input_ids'].to(device)
        token_type_ids = inputs['token_type_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)
        labels = labels.to(device)

        output = model(token_type_ids=token_type_ids, input_ids=input_ids, attention_mask=attention_mask)

        # The predicted class is the index of the highest score in each row.
        preds = output.cpu().numpy()
        preds = np.argmax(preds, axis = 1)
        all_preds.extend(preds)
        all_labels.extend(labels.cpu().numpy())

In [17]:
# Count the positions where the predicted class equals the true label.
correct = sum(
    1 for idx, label in enumerate(all_labels) if label == all_preds[idx]
)
print(correct)

812


In [18]:
# Accuracy in percent on the final evaluation data, followed by the raw counts.
print(
    f"accuracy: {correct/len(all_labels)*100}",
    f"total: {len(all_labels)}",
    f"correct: {correct}",
    sep="\n",
)

accuracy: 64.1897233201581
total: 1265
correct: 812


In [19]:
from sklearn.metrics import precision_recall_fscore_support
y_true = np.array(all_labels)
y_pred = np.array(all_preds)

# Print precision / recall / F-score / support once per averaging mode.
for averaging in ('macro', 'micro', 'weighted'):
    print(precision_recall_fscore_support(y_true, y_pred, average=averaging))

(0.6580885903556287, 0.6232771998303991, 0.5989634919236948, None)
(0.641897233201581, 0.641897233201581, 0.641897233201581, None)
(0.6612807753026061, 0.641897233201581, 0.6136254328863237, None)


In [20]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# Per-class precision / recall / F1 / support for the final evaluation data.
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.73      0.80       311
           1       0.64      0.64      0.64       275
           2       0.51      0.09      0.15       208
           3       0.48      0.83      0.61       289
           4       0.78      0.83      0.80       182

    accuracy                           0.64      1265
   macro avg       0.66      0.62      0.60      1265
weighted avg       0.66      0.64      0.61      1265

