In [1]:
!pip install datasets



In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from transformers import AutoTokenizer

from sklearn.model_selection import train_test_split
from tqdm import tqdm
import math
import numpy as np

from sklearn.metrics import f1_score, accuracy_score

In [3]:
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-large")

if tokenizer.bos_token is None:
    tokenizer.bos_token = '<bos>'
    tokenizer.bos_token_id = tokenizer.convert_tokens_to_ids('<bos>')

if tokenizer.eos_token is None:
    tokenizer.eos_token = '<eos>'
    tokenizer.eos_token_id = tokenizer.convert_tokens_to_ids('<eos>')

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id



config.json:   0%|          | 0.00/558 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/895k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.13M [00:00<?, ?B/s]

In [4]:
from datasets import load_dataset
dataset = load_dataset("nyu-mll/glue", "sst2")

train = dataset["train"]
val = dataset["validation"]

Downloading readme:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.11M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/72.8k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/148k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

In [5]:
train_text = train['sentence']
train_label = train['label']


In [6]:
train_text, test_text, train_label, test_label = train_test_split(train_text, train_label, test_size=0.1, random_state=42)
train_text, val_text, train_label, val_label = train_test_split(train_text, train_label, test_size=0.1, random_state=42)

In [7]:
print(len(train_text))
print(len(train_label))

print(len(test_text))
print(len(test_label))

print(len(val_text))
print(len(val_label))

54552
54552
6735
6735
6062
6062


In [8]:
class SentimentDataset(Dataset): 
  def __init__(self, text, label, tokenizer, max_length=64): 
    self.text = text
    self.label = label 
    self.tokenizer = tokenizer 
    self.max_length = max_length

  def __len__(self): 
    return len(self.label) 

  def __getitem__(self, idx): 
    text_embedding = self.tokenizer.encode_plus(self.text[idx], return_tensors='pt', padding='max_length', max_length=self.max_length, truncation=True)
    label = torch.tensor(self.label[idx], dtype=torch.long) 
    return text_embedding['input_ids'].squeeze(), label
  

train_dataset = SentimentDataset(train_text, train_label, tokenizer)
test_dataset = SentimentDataset(test_text, test_label, tokenizer)
val_dataset = SentimentDataset(val_text, val_label, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

In [9]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
        
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)
        
    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
        
        if mask is not None:
            attn_scores = attn_scores.masked_fill(mask == 0, -1e9)
        
        attn_probs = torch.softmax(attn_scores, dim=-1)
        output = torch.matmul(attn_probs, V)
        return output
        
    def split_heads(self, x):
        batch_size, seq_length, d_model = x.size()
        return x.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2)
        
    def combine_heads(self, x):
        batch_size, _, seq_length, d_k = x.size()
        return x.transpose(1, 2).contiguous().view(batch_size, seq_length, self.d_model)
        
    def forward(self, Q, K, V, mask=None):
        Q = self.split_heads(self.W_q(Q))
        K = self.split_heads(self.W_k(K))
        V = self.split_heads(self.W_v(V))
        
        attn_output = self.scaled_dot_product_attention(Q, K, V, mask)
        
        output = self.W_o(self.combine_heads(attn_output))
        return output

class PositionWiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super(PositionWiseFeedForward, self).__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        return self.fc2(self.relu(self.fc1(x)))

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_seq_length):
        super(PositionalEncoding, self).__init__()
        
        pe = torch.zeros(max_seq_length, d_model)
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
        
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        
        self.register_buffer('pe', pe.unsqueeze(0))
        
    def forward(self, x):
        return x + self.pe[:, :x.size(1)]

class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(EncoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        attn_output = self.self_attn(x, x, x, mask)
        x = self.norm1(x + self.dropout(attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm2(x + self.dropout(ff_output))
        return x

In [10]:
class SentimentTransformer(nn.Module): 
  def __init__(self, vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout): 
    super(SentimentTransformer, self).__init__() 
    self.vocab_size = vocab_size 
    self.d_model = d_model 
    self.num_heads = num_heads
    self.num_layers = num_layers 
    self.d_ff = d_ff 
    self.max_seq_length = max_seq_length 
    self.dropout = nn.Dropout(dropout) 

    self.embedding = nn.Embedding(vocab_size, d_model) 
    self.encoder = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])

    self.positional_embedding = PositionalEncoding(d_model, max_seq_length) 

    self.fc = nn.Sequential(
        nn.Linear(max_seq_length * d_model, d_model), 
        nn.ReLU(), 
        nn.Linear(d_model, 128), 
        nn.ReLU(), 
        nn.Linear(128, 2)
    )

  def generate_mask(self, x): 
    x_mask = (x != tokenizer.pad_token_id).unsqueeze(1).unsqueeze(2).to(x.device) 
    return x_mask 

  def forward(self, x): 
    x_mask = self.generate_mask(x)

    x = self.embedding(x) * math.sqrt(self.d_model)
    x = self.dropout(self.positional_embedding(x)) 

    for layer in self.encoder: 
      x = layer(x, x_mask) 
    
    x = x.reshape(x.shape[0], -1)
    x = self.fc(x) 

    return x
  
  
x = torch.randint(size=(128, 64), low=0, high=1000) 

net = SentimentTransformer(vocab_size=1000, d_model=512, num_heads=8, num_layers=4, d_ff=2048, max_seq_length=64, dropout=0.1) 

print(net(x).shape)


torch.Size([128, 2])


In [11]:
vocab_size = tokenizer.vocab_size 
d_model = 512 
num_heads = 8 
num_layers = 4 
d_ff = 2048 
max_seq_length = 64 
dropout = 0.1 

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 

model = SentimentTransformer(vocab_size=vocab_size, d_model=d_model, num_heads=num_heads, num_layers=num_layers, d_ff=d_ff, max_seq_length=max_seq_length, dropout=dropout).to(device)
criterion = nn.CrossEntropyLoss() 
optimizer = optim.Adam(model.parameters(), lr=1e-4) 

epochs = 30

train_loss = []
val_loss = []

for epoch in range(epochs):
  model.train()
  total_train_loss = 0
  for inputs, labels in tqdm(train_loader):

    inputs = inputs.to(device)
    labels = labels.to(device)

    optimizer.zero_grad()
    outputs = model(inputs)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()

    total_train_loss += loss.item()
    train_loss.append(loss.item())

  print(f'Epoch {epoch + 1}/{epochs}, Train Loss: {total_train_loss / len(train_loader)}')

  model.eval()
  total_val_loss = 0
  f_1 = 0
  accuracy = 0
  for inputs, labels in tqdm(val_loader):
    inputs = inputs.to(device)
    labels = labels.to(device)

    with torch.no_grad():
      outputs = model(inputs)
      loss = criterion(outputs, labels)
      total_val_loss += loss.item()

      val_loss.append(loss.item())

      preds = torch.argmax(outputs, dim=-1)
      f_1 += f1_score(labels.cpu(), preds.cpu())
      accuracy += accuracy_score(labels.cpu(), preds.cpu())

  print(f'Epoch {epoch + 1}/{epochs}, Val Loss: {total_val_loss / len(val_loader)}')
  print(f'F1 Score: {f_1 / len(val_loader)}')
  print(f'Accuracy: {accuracy / len(val_loader)}')

torch.save(model.state_dict(), '/kaggle/working/trans_sen.pth')

100%|██████████| 853/853 [01:45<00:00,  8.11it/s]


Epoch 1/30, Train Loss: 0.5328607682536104


100%|██████████| 95/95 [00:04<00:00, 20.50it/s]


Epoch 1/30, Val Loss: 0.4255008703783939
F1 Score: 0.8234147193518097
Accuracy: 0.807079519450801


100%|██████████| 853/853 [01:46<00:00,  8.01it/s]


Epoch 2/30, Train Loss: 0.3888571216241137


100%|██████████| 95/95 [00:04<00:00, 20.93it/s]


Epoch 2/30, Val Loss: 0.3592242043269308
F1 Score: 0.848187192180916
Accuracy: 0.8394808352402746


100%|██████████| 853/853 [01:46<00:00,  8.04it/s]


Epoch 3/30, Train Loss: 0.29972599027951324


100%|██████████| 95/95 [00:04<00:00, 21.06it/s]


Epoch 3/30, Val Loss: 0.3530123156936545
F1 Score: 0.8683066083474701
Accuracy: 0.8607980549199084


100%|██████████| 853/853 [01:45<00:00,  8.05it/s]


Epoch 4/30, Train Loss: 0.2289699506612903


100%|██████████| 95/95 [00:04<00:00, 21.08it/s]


Epoch 4/30, Val Loss: 0.31090038183488344
F1 Score: 0.8918560979040009
Accuracy: 0.8820437643020596


100%|██████████| 853/853 [01:46<00:00,  8.04it/s]


Epoch 5/30, Train Loss: 0.1684364954016203


100%|██████████| 95/95 [00:04<00:00, 21.18it/s]


Epoch 5/30, Val Loss: 0.32819195923052336
F1 Score: 0.8953561029582808
Accuracy: 0.8890517734553777


100%|██████████| 853/853 [01:45<00:00,  8.06it/s]


Epoch 6/30, Train Loss: 0.1275802036804111


100%|██████████| 95/95 [00:04<00:00, 20.95it/s]


Epoch 6/30, Val Loss: 0.2934601815907579
F1 Score: 0.9052137625591083
Accuracy: 0.8986842105263158


100%|██████████| 853/853 [01:46<00:00,  8.05it/s]


Epoch 7/30, Train Loss: 0.10058439503219699


100%|██████████| 95/95 [00:04<00:00, 21.16it/s]


Epoch 7/30, Val Loss: 0.3629246126664312
F1 Score: 0.9027704751713334
Accuracy: 0.8945080091533181


100%|██████████| 853/853 [01:45<00:00,  8.06it/s]


Epoch 8/30, Train Loss: 0.08767662328479053


100%|██████████| 95/95 [00:04<00:00, 21.11it/s]


Epoch 8/30, Val Loss: 0.35644027053525573
F1 Score: 0.9048777096178404
Accuracy: 0.896975114416476


100%|██████████| 853/853 [01:45<00:00,  8.06it/s]


Epoch 9/30, Train Loss: 0.07027770842972528


100%|██████████| 95/95 [00:04<00:00, 21.15it/s]


Epoch 9/30, Val Loss: 0.3741419283967269
F1 Score: 0.9064745392945452
Accuracy: 0.8993135011441648


100%|██████████| 853/853 [01:45<00:00,  8.07it/s]


Epoch 10/30, Train Loss: 0.060449154221028055


100%|██████████| 95/95 [00:04<00:00, 21.21it/s]


Epoch 10/30, Val Loss: 0.3883041509672215
F1 Score: 0.9093409601985188
Accuracy: 0.903983123569794


100%|██████████| 853/853 [01:45<00:00,  8.07it/s]


Epoch 11/30, Train Loss: 0.059815649852944935


100%|██████████| 95/95 [00:04<00:00, 21.24it/s]


Epoch 11/30, Val Loss: 0.4191584992565607
F1 Score: 0.8973275376297185
Accuracy: 0.8921410183066362


100%|██████████| 853/853 [01:45<00:00,  8.08it/s]


Epoch 12/30, Train Loss: 0.05103839938294094


100%|██████████| 95/95 [00:04<00:00, 20.80it/s]


Epoch 12/30, Val Loss: 0.43766663043122545
F1 Score: 0.9088626134985301
Accuracy: 0.9016804919908467


100%|██████████| 853/853 [01:45<00:00,  8.07it/s]


Epoch 13/30, Train Loss: 0.04874825495096515


100%|██████████| 95/95 [00:04<00:00, 21.15it/s]


Epoch 13/30, Val Loss: 0.41843156006775406
F1 Score: 0.9059789624989506
Accuracy: 0.8993778604118994


100%|██████████| 853/853 [01:45<00:00,  8.07it/s]


Epoch 14/30, Train Loss: 0.04423649659544636


100%|██████████| 95/95 [00:04<00:00, 20.92it/s]


Epoch 14/30, Val Loss: 0.4615880777961329
F1 Score: 0.9106813959499461
Accuracy: 0.9019450800915332


100%|██████████| 853/853 [01:45<00:00,  8.08it/s]


Epoch 15/30, Train Loss: 0.04079833829552721


100%|██████████| 95/95 [00:04<00:00, 21.23it/s]


Epoch 15/30, Val Loss: 0.3494859980125176
F1 Score: 0.9111808847067453
Accuracy: 0.9032608695652175


100%|██████████| 853/853 [01:45<00:00,  8.08it/s]


Epoch 16/30, Train Loss: 0.03715730146210147


100%|██████████| 95/95 [00:04<00:00, 20.96it/s]


Epoch 16/30, Val Loss: 0.45477123409509657
F1 Score: 0.9079405146452322
Accuracy: 0.8980978260869565


100%|██████████| 853/853 [01:45<00:00,  8.08it/s]


Epoch 17/30, Train Loss: 0.03532663046428497


100%|██████████| 95/95 [00:04<00:00, 21.30it/s]


Epoch 17/30, Val Loss: 0.49426543798885847
F1 Score: 0.9104645053565983
Accuracy: 0.9024027459954234


100%|██████████| 853/853 [01:45<00:00,  8.08it/s]


Epoch 18/30, Train Loss: 0.03346743291366194


100%|██████████| 95/95 [00:04<00:00, 20.92it/s]


Epoch 18/30, Val Loss: 0.4905975321405812
F1 Score: 0.9060666920456941
Accuracy: 0.8985554919908467


100%|██████████| 853/853 [01:45<00:00,  8.08it/s]


Epoch 19/30, Train Loss: 0.03382156903084132


100%|██████████| 95/95 [00:04<00:00, 20.97it/s]


Epoch 19/30, Val Loss: 0.40061454780791933
F1 Score: 0.9113233200898332
Accuracy: 0.9048698512585812


100%|██████████| 853/853 [01:45<00:00,  8.09it/s]


Epoch 20/30, Train Loss: 0.03220881726083718


100%|██████████| 95/95 [00:04<00:00, 20.95it/s]


Epoch 20/30, Val Loss: 0.4649682568484231
F1 Score: 0.9100136916389095
Accuracy: 0.9039473684210526


100%|██████████| 853/853 [01:45<00:00,  8.09it/s]


Epoch 21/30, Train Loss: 0.029067080785829633


100%|██████████| 95/95 [00:04<00:00, 21.25it/s]


Epoch 21/30, Val Loss: 0.47387590455381495
F1 Score: 0.9114027642841617
Accuracy: 0.9054276315789473


100%|██████████| 853/853 [01:45<00:00,  8.08it/s]


Epoch 22/30, Train Loss: 0.026154608260562786


100%|██████████| 95/95 [00:04<00:00, 21.09it/s]


Epoch 22/30, Val Loss: 0.5047960391562236
F1 Score: 0.9143072691209402
Accuracy: 0.9075014302059496


100%|██████████| 853/853 [01:45<00:00,  8.09it/s]


Epoch 23/30, Train Loss: 0.025450740994095087


100%|██████████| 95/95 [00:04<00:00, 21.21it/s]


Epoch 23/30, Val Loss: 0.5511685527076847
F1 Score: 0.9102973450562714
Accuracy: 0.9018092105263158


100%|██████████| 853/853 [01:45<00:00,  8.09it/s]


Epoch 24/30, Train Loss: 0.027582643342091526


100%|██████████| 95/95 [00:04<00:00, 21.03it/s]


Epoch 24/30, Val Loss: 0.44269589460209796
F1 Score: 0.9099058107573348
Accuracy: 0.903854405034325


100%|██████████| 853/853 [01:45<00:00,  8.08it/s]


Epoch 25/30, Train Loss: 0.02405894566770203


100%|██████████| 95/95 [00:04<00:00, 21.23it/s]


Epoch 25/30, Val Loss: 0.461241626739502
F1 Score: 0.9125397454766027
Accuracy: 0.9058924485125859


100%|██████████| 853/853 [01:45<00:00,  8.08it/s]


Epoch 26/30, Train Loss: 0.024756302184949095


100%|██████████| 95/95 [00:04<00:00, 21.18it/s]


Epoch 26/30, Val Loss: 0.4802042990922928
F1 Score: 0.9103876469150554
Accuracy: 0.9029319221967964


100%|██████████| 853/853 [01:45<00:00,  8.09it/s]


Epoch 27/30, Train Loss: 0.02245221776055398


100%|██████████| 95/95 [00:04<00:00, 21.19it/s]


Epoch 27/30, Val Loss: 0.5889129248888869
F1 Score: 0.9108428888384666
Accuracy: 0.9035540617848971


100%|██████████| 853/853 [01:45<00:00,  8.09it/s]


Epoch 28/30, Train Loss: 0.022854512333990717


100%|██████████| 95/95 [00:04<00:00, 21.15it/s]


Epoch 28/30, Val Loss: 0.5641910691010324
F1 Score: 0.910649683029494
Accuracy: 0.9034897025171625


100%|██████████| 853/853 [01:45<00:00,  8.09it/s]


Epoch 29/30, Train Loss: 0.023007016552553227


100%|██████████| 95/95 [00:04<00:00, 21.10it/s]


Epoch 29/30, Val Loss: 0.5775966953290136
F1 Score: 0.9060202174732647
Accuracy: 0.9001358695652174


100%|██████████| 853/853 [01:45<00:00,  8.09it/s]


Epoch 30/30, Train Loss: 0.021727376544953447


100%|██████████| 95/95 [00:04<00:00, 21.12it/s]


Epoch 30/30, Val Loss: 0.5406996479944179
F1 Score: 0.9086369740420516
Accuracy: 0.9020809496567505


In [12]:
model.eval()
with torch.no_grad(): 
    f_1 = 0
    accuracy = 0
    for inputs, labels in tqdm(test_loader):
        inputs = inputs.to(device)
        labels = labels.to(device)
        
        outputs = model(inputs)
        preds = torch.argmax(outputs, dim=-1) 
        f_1 += f1_score(labels.cpu(), preds.cpu())
        accuracy += accuracy_score(labels.cpu(), preds.cpu())
        
    print(f'F1 score on test set: {f_1/len(test_loader)}')
    print(f'Accuracy on test set: {accuracy/len(test_loader)}')
        

100%|██████████| 106/106 [00:05<00:00, 20.89it/s]

F1 score on test set: 0.9129823645282286
Accuracy on test set: 0.9029677672955975



