In [2]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import shutil
import sys
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel

In [3]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

In [4]:
train_df = pd.read_table('/content/train.txt', delimiter = ';', header = None)
train_df.columns = ['comment', 'sentiment']
train_df = train_df.reset_index(drop=True)
train_df['sentiment'].value_counts()

joy         5362
sadness     4666
anger       2159
fear        1937
love        1304
surprise     572
Name: sentiment, dtype: int64

In [5]:
val_df = pd.read_table('/content/val.txt', delimiter = ';', header = None)
val_df.columns = ['comment', 'sentiment']
val_df = val_df.reset_index(drop=True)

test_df = pd.read_table('/content/test.txt', delimiter = ';', header = None)
test_df.columns = ['comment', 'sentiment']
test_df = test_df.reset_index(drop=True)
test_df.head(5)

Unnamed: 0,comment,sentiment
0,im feeling rather rotten so im not very ambiti...,sadness
1,im updating my blog because i feel shitty,sadness
2,i never make her separate from me because i do...,sadness
3,i left with my bouquet of red and yellow tulip...,joy
4,i was feeling a little vain when i did this one,sadness


In [6]:
target_list = {'joy': [1, 0, 0, 0, 0, 0], 
               'sadness': [0, 1, 0, 0, 0, 0], 
               'anger': [0, 0, 1, 0, 0, 0], 
               'fear': [0, 0, 0, 1, 0, 0], 
               'love': [0, 0, 0, 0, 1, 0], 
               'surprise': [0, 0, 0, 0, 0, 1]}

In [29]:
target_index = {0: 'joy', 
                1: 'sadness', 
                2: 'anger', 
                3: 'fear', 
                4: 'love', 
                5: 'surprise'}

In [8]:
class CFG:
    # hyperparameters
    MAX_LEN = 256
    TRAIN_BATCH_SIZE = 32
    VALID_BATCH_SIZE = 32
    EPOCHS = 1
    LEARNING_RATE = 1e-05

In [9]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [10]:
class CustomDataset(Dataset):

    def __init__(self, df, tokenizer, max_len=CFG.MAX_LEN):
        self.tokenizer = tokenizer
        self.df = df
        self.comment = self.df['comment']
        self.targets = self.df['sentiment']
        self.target_id = [target_list[i] for i in self.targets]
        self.max_len = max_len

    def __len__(self):
        return len(self.comment)

    def __getitem__(self, index):
        comment = str(self.comment[index])
        comment = " ".join(comment.split())

        inputs = self.tokenizer.encode_plus(
            comment,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'token_type_ids': inputs["token_type_ids"].flatten(),
            'targets': torch.FloatTensor(self.target_id[index])
        }

In [11]:
train_dataset = CustomDataset(train_df, tokenizer, CFG.MAX_LEN)
valid_dataset = CustomDataset(val_df, tokenizer, CFG.MAX_LEN)

In [12]:
train_data_loader = torch.utils.data.DataLoader(train_dataset, 
    batch_size = CFG.TRAIN_BATCH_SIZE,
    shuffle=True,
    num_workers=0
)

val_data_loader = torch.utils.data.DataLoader(valid_dataset, 
    batch_size = CFG.VALID_BATCH_SIZE,
    shuffle=False,
    num_workers=0
)

In [13]:
class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.bert_model = BertModel.from_pretrained('bert-base-uncased', return_dict=True)
        self.dropout = torch.nn.Dropout(0.3)
        self.linear = torch.nn.Linear(768, 6)
    
    def forward(self, input_ids, attn_mask, token_type_ids):
        output = self.bert_model(
            input_ids, 
            attention_mask=attn_mask, 
            token_type_ids=token_type_ids
        )
        output_dropout = self.dropout(output.pooler_output)
        output = self.linear(output_dropout)
        return output

model = BERTClass()
model.to(device)

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

BERTClass(
  (bert_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [14]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

optimizer = torch.optim.Adam(params =  model.parameters(), lr=CFG.LEARNING_RATE)

In [15]:
def load_ckp(checkpoint_fpath, model, optimizer):
    checkpoint = torch.load(checkpoint_fpath)
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    valid_loss_min = checkpoint['valid_loss_min']
    return model, optimizer, checkpoint['epoch'], valid_loss_min.item()

def save_ckp(state, is_best, checkpoint_path, best_model_path):

    f_path = checkpoint_path
    torch.save(state, f_path)
    if is_best:
        best_fpath = best_model_path
        shutil.copyfile(f_path, best_fpath)

In [16]:
val_targets=[]
val_outputs=[]

In [20]:
def train_model(n_epochs, training_loader, validation_loader, model, 
                optimizer, checkpoint_path, best_model_path):

    valid_loss_min = np.Inf
    
    for epoch in range(1, n_epochs+1):
        train_loss = 0
        valid_loss = 0

        model.train()
        print('Epoch {}: Training Start'.format(epoch))
        for batch_idx, data in enumerate(training_loader):
            ids = data['input_ids'].to(device, dtype = torch.long)
            mask = data['attention_mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)

            outputs = model(ids, mask, token_type_ids)

            optimizer.zero_grad()
            loss = loss_fn(outputs, targets)
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss = train_loss + ((1 / (batch_idx + 1)) * (loss.item() - train_loss))
        print('Epoch {}: Training End'.format(epoch))
        
        print('Epoch {}: Validation Start'.format(epoch))
    
        model.eval()
    
        with torch.no_grad():
            for batch_idx, data in enumerate(validation_loader):
                ids = data['input_ids'].to(device, dtype = torch.long)
                mask = data['attention_mask'].to(device, dtype = torch.long)
                token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
                targets = data['targets'].to(device, dtype = torch.float)
                outputs = model(ids, mask, token_type_ids)

                loss = loss_fn(outputs, targets)
                valid_loss = valid_loss + ((1 / (batch_idx + 1)) * (loss.item() - valid_loss))
                val_targets.extend(targets.cpu().detach().numpy().tolist())
                val_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())

        print('Epoch {}: Validation End'.format(epoch))
        train_loss = train_loss/len(training_loader)
        valid_loss = valid_loss/len(validation_loader)
        print('Epoch: {} \tAvgerage Training Loss: {:.6f} \tAverage Validation Loss: {:.6f}'.format(
                epoch, 
                train_loss,
                valid_loss
                ))
        
        checkpoint = {
                'epoch': epoch + 1,
                'valid_loss_min': valid_loss,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict()
        }
            
        save_ckp(checkpoint, False, checkpoint_path, best_model_path)
            
        if valid_loss <= valid_loss_min:
            print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_loss_min,valid_loss))
            save_ckp(checkpoint, True, checkpoint_path, best_model_path)
            valid_loss_min = valid_loss

        print('Epoch {}  Done\n'.format(epoch))

    return model

In [21]:
ckpt_path = "/content/curr_ckpt"
best_model_path = "/content/best_model.pt"

In [22]:
trained_model = train_model(CFG.EPOCHS, train_data_loader, val_data_loader, model, optimizer, ckpt_path, best_model_path)

Epoch 1: Training Start
Epoch 1: Training End
Epoch 1: Validation Start
Epoch 1: Validation End
Epoch: 1 	Avgerage Training Loss: 0.000174 	Average Validation Loss: 0.001030
Validation loss decreased (inf --> 0.001030).  Saving model ...
Epoch 1  Done

Epoch 2: Training Start
Epoch 2: Training End
Epoch 2: Validation Start
Epoch 2: Validation End
Epoch: 2 	Avgerage Training Loss: 0.000109 	Average Validation Loss: 0.000905
Validation loss decreased (0.001030 --> 0.000905).  Saving model ...
Epoch 2  Done

Epoch 3: Training Start
Epoch 3: Training End
Epoch 3: Validation Start
Epoch 3: Validation End
Epoch: 3 	Avgerage Training Loss: 0.000084 	Average Validation Loss: 0.000802
Validation loss decreased (0.000905 --> 0.000802).  Saving model ...
Epoch 3  Done

Epoch 4: Training Start
Epoch 4: Training End
Epoch 4: Validation Start
Epoch 4: Validation End
Epoch: 4 	Avgerage Training Loss: 0.000069 	Average Validation Loss: 0.000818
Epoch 4  Done

Epoch 5: Training Start
Epoch 5: Training 

In [34]:
# testing
accuracy = 0
for idx, example in enumerate(list(test_df['comment'])):
    encodings = tokenizer.encode_plus(
        example,
        None,
        add_special_tokens=True,
        max_length=CFG.MAX_LEN,
        padding='max_length',
        return_token_type_ids=True,
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )
    model.eval()
    with torch.no_grad():
        input_ids = encodings['input_ids'].to(device, dtype=torch.long)
        attention_mask = encodings['attention_mask'].to(device, dtype=torch.long)
        token_type_ids = encodings['token_type_ids'].to(device, dtype=torch.long)
        output = model(input_ids, attention_mask, token_type_ids)
        final_output = torch.sigmoid(output).cpu().detach().numpy().tolist()
        accuracy += target_index[np.argmax(final_output)] == list(test_df['sentiment'])[idx]
        #print(target_index[np.argmax(final_output)], list(test_df['sentiment'])[idx])
accuracy = accuracy / len(list(test_df['sentiment'])) * 100
accuracy

93.35