In [1]:
!pip install datasets



In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from transformers import AutoTokenizer

from sklearn.model_selection import train_test_split
from tqdm import tqdm
import math
import numpy as np

from sklearn.metrics import f1_score, accuracy_score

In [3]:
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-large")

if tokenizer.bos_token is None:
    tokenizer.bos_token = '<bos>'
    tokenizer.bos_token_id = tokenizer.convert_tokens_to_ids('<bos>')

if tokenizer.eos_token is None:
    tokenizer.eos_token = '<eos>'
    tokenizer.eos_token_id = tokenizer.convert_tokens_to_ids('<eos>')

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id



config.json:   0%|          | 0.00/558 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/895k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.13M [00:00<?, ?B/s]

In [4]:
from datasets import load_dataset
dataset = load_dataset("nyu-mll/glue", "sst2")

train = dataset["train"]

Downloading readme:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.11M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/72.8k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/148k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

In [5]:
train_text = train['sentence']
train_label = train['label']


In [6]:
train_text, test_text, train_label, test_label = train_test_split(train_text, train_label, test_size=0.1, random_state=42)

In [7]:
train_text, val_text, train_label, val_label = train_test_split(train_text, train_label, test_size=0.1, random_state=42)

In [8]:
print(len(train_text))
print(len(train_label))

print(len(test_text))
print(len(test_label))

print(len(val_text))
print(len(val_label))

54552
54552
6735
6735
6062
6062


In [9]:
class SentimentDataset(Dataset): 
  def __init__(self, text, label, tokenizer, max_length=64): 
    self.text = text
    self.label = label 
    self.tokenizer = tokenizer 
    self.max_length = max_length

  def __len__(self): 
    return len(self.label) 

  def __getitem__(self, idx): 
    text_embedding = self.tokenizer.encode_plus(self.text[idx], return_tensors='pt', padding='max_length', max_length=self.max_length, truncation=True)
    label = torch.tensor(self.label[idx], dtype=torch.long) 
    return text_embedding['input_ids'].squeeze(), label
  

train_dataset = SentimentDataset(train_text, train_label, tokenizer)
test_dataset = SentimentDataset(test_text, test_label, tokenizer)
val_dataset = SentimentDataset(val_text, val_label, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

In [10]:
class Encoder(nn.Module): 
  def __init__(self, vocab_size, max_seq_length, embedding_dim, hidden_dim, dropout): 
    super(Encoder, self).__init__()

    self.vocab_size = vocab_size
    self.max_seq_length = max_seq_length 
    self.embedding_dim = embedding_dim 
    self.hidden_dim = hidden_dim 
    self.dropout = dropout 

    self.embedding = nn.Embedding(vocab_size, embedding_dim)
    self.gru = nn.GRU(embedding_dim, hidden_dim, batch_first=True)

    self.dropout = nn.Dropout(dropout) 

    self.fc = nn.Sequential(
        nn.Linear(max_seq_length * hidden_dim, hidden_dim),
        nn.ReLU(), 
        nn.Linear(hidden_dim, 2)
    )

  def forward(self, x): 
    x = self.dropout(self.embedding(x))
    x, _ = self.gru(x) 
    x = x.reshape(x.size(0), -1)
    x = self.fc(x) 
    return x 


x = torch.randint(size=(128, 64), low=0, high=100)

net = Encoder(vocab_size=64000, max_seq_length=64, embedding_dim=128, hidden_dim=256, dropout=0.1)
out = net(x)
print(out.shape)




torch.Size([128, 2])


In [11]:
vocab_size = tokenizer.vocab_size
max_seq_length = 64
embedding_dim = 256
hidden_dim = 512
dropout = 0.1

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = Encoder(vocab_size=vocab_size, max_seq_length=max_seq_length, embedding_dim=embedding_dim, hidden_dim=hidden_dim, dropout=dropout).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)
epochs = 30

train_loss = []
val_loss = []

for epoch in range(epochs):
  model.train()
  total_train_loss = 0
  for inputs, labels in tqdm(train_loader):

    inputs = inputs.to(device)
    labels = labels.to(device)

    optimizer.zero_grad()
    outputs = model(inputs)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()

    total_train_loss += loss.item()
    train_loss.append(loss.item())

  print(f'Epoch {epoch + 1}/{epochs}, Train Loss: {total_train_loss / len(train_loader)}')

  model.eval()
  total_val_loss = 0
  f_1 = 0
  accuracy = 0
  for inputs, labels in tqdm(val_loader):
    inputs = inputs.to(device)
    labels = labels.to(device)

    with torch.no_grad():
      outputs = model(inputs)
      loss = criterion(outputs, labels)
      total_val_loss += loss.item()

      val_loss.append(loss.item())

      preds = torch.argmax(outputs, dim=-1)
      f_1 += f1_score(labels.cpu(), preds.cpu())
      accuracy += accuracy_score(labels.cpu(), preds.cpu())

  print(f'Epoch {epoch + 1}/{epochs}, Val Loss: {total_val_loss / len(val_loader)}')
  print(f'F1 Score: {f_1 / len(val_loader)}')
  print(f'Accuracy: {accuracy / len(val_loader)}')

torch.save(model.state_dict(), '/kaggle/working/rnn_sen.pth')

100%|██████████| 853/853 [00:36<00:00, 23.38it/s]


Epoch 1/30, Train Loss: 0.5874617492467828


100%|██████████| 95/95 [00:02<00:00, 34.90it/s]


Epoch 1/30, Val Loss: 0.47814940182786236
F1 Score: 0.7882128933121518
Accuracy: 0.77162471395881


100%|██████████| 853/853 [00:35<00:00, 24.30it/s]


Epoch 2/30, Train Loss: 0.41560122637997476


100%|██████████| 95/95 [00:02<00:00, 35.61it/s]


Epoch 2/30, Val Loss: 0.37570860809401463
F1 Score: 0.8482349523056097
Accuracy: 0.8376716247139588


100%|██████████| 853/853 [00:35<00:00, 24.19it/s]


Epoch 3/30, Train Loss: 0.3126860644061289


100%|██████████| 95/95 [00:02<00:00, 35.87it/s]


Epoch 3/30, Val Loss: 0.32241153089623703
F1 Score: 0.8752207482462611
Accuracy: 0.8672768878718536


100%|██████████| 853/853 [00:35<00:00, 24.29it/s]


Epoch 4/30, Train Loss: 0.24748070780864773


100%|██████████| 95/95 [00:02<00:00, 39.17it/s]


Epoch 4/30, Val Loss: 0.31444427543564846
F1 Score: 0.8831955904428112
Accuracy: 0.8712242562929062


100%|██████████| 853/853 [00:35<00:00, 24.29it/s]


Epoch 5/30, Train Loss: 0.19609566773577283


100%|██████████| 95/95 [00:02<00:00, 36.64it/s]


Epoch 5/30, Val Loss: 0.3066514813586285
F1 Score: 0.8917210620736867
Accuracy: 0.8838529748283753


100%|██████████| 853/853 [00:35<00:00, 24.33it/s]


Epoch 6/30, Train Loss: 0.1609850146863413


100%|██████████| 95/95 [00:02<00:00, 35.75it/s]


Epoch 6/30, Val Loss: 0.3055036172270775
F1 Score: 0.8964261613255975
Accuracy: 0.8900028604118994


100%|██████████| 853/853 [00:35<00:00, 24.32it/s]


Epoch 7/30, Train Loss: 0.13072832329646364


100%|██████████| 95/95 [00:02<00:00, 36.55it/s]


Epoch 7/30, Val Loss: 0.3187079591186423
F1 Score: 0.8932630392686626
Accuracy: 0.8878289473684211


100%|██████████| 853/853 [00:35<00:00, 24.29it/s]


Epoch 8/30, Train Loss: 0.11061096619618106


100%|██████████| 95/95 [00:02<00:00, 36.53it/s]


Epoch 8/30, Val Loss: 0.3115415227256323
F1 Score: 0.9049528354123023
Accuracy: 0.8973040617848971


100%|██████████| 853/853 [00:35<00:00, 24.32it/s]


Epoch 9/30, Train Loss: 0.09362657917805392


100%|██████████| 95/95 [00:02<00:00, 36.43it/s]


Epoch 9/30, Val Loss: 0.33208240184344745
F1 Score: 0.9045762495995512
Accuracy: 0.8987199656750572


100%|██████████| 853/853 [00:34<00:00, 24.42it/s]


Epoch 10/30, Train Loss: 0.07931985160562997


100%|██████████| 95/95 [00:02<00:00, 34.16it/s]


Epoch 10/30, Val Loss: 0.3472421573965173
F1 Score: 0.9063172196325722
Accuracy: 0.8998712814645309


100%|██████████| 853/853 [00:34<00:00, 24.38it/s]


Epoch 11/30, Train Loss: 0.06637568624825561


100%|██████████| 95/95 [00:02<00:00, 35.81it/s]


Epoch 11/30, Val Loss: 0.3831772815240057
F1 Score: 0.9031783256383712
Accuracy: 0.8950371853546911


100%|██████████| 853/853 [00:35<00:00, 24.19it/s]


Epoch 12/30, Train Loss: 0.05896921464349039


100%|██████████| 95/95 [00:02<00:00, 34.37it/s]


Epoch 12/30, Val Loss: 0.368775605449551
F1 Score: 0.9021039145871395
Accuracy: 0.8967176773455379


100%|██████████| 853/853 [00:35<00:00, 24.24it/s]


Epoch 13/30, Train Loss: 0.049730171294716935


100%|██████████| 95/95 [00:02<00:00, 36.16it/s]


Epoch 13/30, Val Loss: 0.3753871948311203
F1 Score: 0.9070520290290602
Accuracy: 0.8991490274599543


100%|██████████| 853/853 [00:35<00:00, 24.26it/s]


Epoch 14/30, Train Loss: 0.04375218887756615


100%|██████████| 95/95 [00:02<00:00, 34.78it/s]


Epoch 14/30, Val Loss: 0.3873464073789747
F1 Score: 0.9058842006946426
Accuracy: 0.8998069221967964


100%|██████████| 853/853 [00:34<00:00, 24.46it/s]


Epoch 15/30, Train Loss: 0.039016794926074204


100%|██████████| 95/95 [00:02<00:00, 35.33it/s]


Epoch 15/30, Val Loss: 0.4133265134535338
F1 Score: 0.9112776337721861
Accuracy: 0.9042477116704806


100%|██████████| 853/853 [00:35<00:00, 24.30it/s]


Epoch 16/30, Train Loss: 0.03408850787987044


100%|██████████| 95/95 [00:02<00:00, 35.75it/s]


Epoch 16/30, Val Loss: 0.38445440558226485
F1 Score: 0.905755179702454
Accuracy: 0.8999070366132723


100%|██████████| 853/853 [00:35<00:00, 24.26it/s]


Epoch 17/30, Train Loss: 0.030072882389150242


100%|██████████| 95/95 [00:02<00:00, 35.83it/s]


Epoch 17/30, Val Loss: 0.45469655351419197
F1 Score: 0.9082960757583852
Accuracy: 0.9001644736842105


100%|██████████| 853/853 [00:35<00:00, 24.27it/s]


Epoch 18/30, Train Loss: 0.027995401213766084


100%|██████████| 95/95 [00:02<00:00, 36.46it/s]


Epoch 18/30, Val Loss: 0.44503892279769247
F1 Score: 0.9066716718766221
Accuracy: 0.9008938787185355


100%|██████████| 853/853 [00:35<00:00, 24.19it/s]


Epoch 19/30, Train Loss: 0.026615764314400474


100%|██████████| 95/95 [00:02<00:00, 35.89it/s]


Epoch 19/30, Val Loss: 0.4604512872272416
F1 Score: 0.9114359886789395
Accuracy: 0.9034253432494279


100%|██████████| 853/853 [00:34<00:00, 24.47it/s]


Epoch 20/30, Train Loss: 0.023195873962431123


100%|██████████| 95/95 [00:02<00:00, 39.57it/s]


Epoch 20/30, Val Loss: 0.4531304738239238
F1 Score: 0.9078339986815408
Accuracy: 0.8998069221967964


100%|██████████| 853/853 [00:34<00:00, 24.90it/s]


Epoch 21/30, Train Loss: 0.021966676827294127


100%|██████████| 95/95 [00:02<00:00, 40.43it/s]


Epoch 21/30, Val Loss: 0.4352561124061283
F1 Score: 0.9057566508037297
Accuracy: 0.9000357551487415


100%|██████████| 853/853 [00:34<00:00, 24.99it/s]


Epoch 22/30, Train Loss: 0.020045524066931525


100%|██████████| 95/95 [00:02<00:00, 38.52it/s]


Epoch 22/30, Val Loss: 0.5215891289867853
F1 Score: 0.9103613049055342
Accuracy: 0.9011584668192221


100%|██████████| 853/853 [00:34<00:00, 24.96it/s]


Epoch 23/30, Train Loss: 0.016945086208519887


100%|██████████| 95/95 [00:02<00:00, 39.13it/s]


Epoch 23/30, Val Loss: 0.47575744547341997
F1 Score: 0.9076724926733993
Accuracy: 0.9029319221967964


100%|██████████| 853/853 [00:34<00:00, 24.95it/s]


Epoch 24/30, Train Loss: 0.02009663286751681


100%|██████████| 95/95 [00:02<00:00, 39.37it/s]


Epoch 24/30, Val Loss: 0.4676432846408141
F1 Score: 0.9064768030424031
Accuracy: 0.8988844393592678


100%|██████████| 853/853 [00:34<00:00, 25.00it/s]


Epoch 25/30, Train Loss: 0.01620707370850977


100%|██████████| 95/95 [00:02<00:00, 39.11it/s]


Epoch 25/30, Val Loss: 0.5056837751285026
F1 Score: 0.9073691050731617
Accuracy: 0.9007937643020595


100%|██████████| 853/853 [00:34<00:00, 25.04it/s]


Epoch 26/30, Train Loss: 0.017699108506805495


100%|██████████| 95/95 [00:02<00:00, 39.65it/s]


Epoch 26/30, Val Loss: 0.5069135145921456
F1 Score: 0.9044089955154194
Accuracy: 0.8977974828375286


100%|██████████| 853/853 [00:34<00:00, 24.98it/s]


Epoch 27/30, Train Loss: 0.015491542638562552


100%|██████████| 95/95 [00:02<00:00, 39.52it/s]


Epoch 27/30, Val Loss: 0.5329574229685884
F1 Score: 0.9090121674741453
Accuracy: 0.9008938787185355


100%|██████████| 853/853 [00:34<00:00, 24.88it/s]


Epoch 28/30, Train Loss: 0.012592506987746744


100%|██████████| 95/95 [00:02<00:00, 39.20it/s]


Epoch 28/30, Val Loss: 0.5880258244119192
F1 Score: 0.9095153747520868
Accuracy: 0.9012228260869566


100%|██████████| 853/853 [00:34<00:00, 24.86it/s]


Epoch 29/30, Train Loss: 0.012777391388188149


100%|██████████| 95/95 [00:02<00:00, 37.41it/s]


Epoch 29/30, Val Loss: 0.5553673773219711
F1 Score: 0.9096938096666619
Accuracy: 0.9044765446224257


100%|██████████| 853/853 [00:34<00:00, 24.94it/s]


Epoch 30/30, Train Loss: 0.012443516425639846


100%|██████████| 95/95 [00:02<00:00, 40.48it/s]


Epoch 30/30, Val Loss: 0.5995453168687067
F1 Score: 0.9091647201417161
Accuracy: 0.9014159038901602


In [12]:
model.eval()
with torch.no_grad(): 
    f_1 = 0
    accuracy = 0
    for inputs, labels in tqdm(test_loader):
        inputs = inputs.to(device)
        labels = labels.to(device)
        
        outputs = model(inputs)
        preds = torch.argmax(outputs, dim=-1) 
        f_1 += f1_score(labels.cpu(), preds.cpu())
        accuracy += accuracy_score(labels.cpu(), preds.cpu())
        
    print(f'F1 score on test set: {f_1/len(test_loader)}')
    print(f'Accuracy on test set: {accuracy/len(test_loader)}')
        

100%|██████████| 106/106 [00:03<00:00, 34.29it/s]

F1 score on test set: 0.9139370502144327
Accuracy on test set: 0.9031544811320755



