In [1]:
!pip install transformers
!pip install -q emoji pythainlp==2.2.4 sefr_cut tinydb seqeval sentencepiece pydantic jsonlines
!pip install --no-deps thai2transformers==0.1.2
!pip install lightning
!pip install madgrad
!pip install wandb -qU

Collecting transformers
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m20.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m41.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m50.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m54.7 MB/s[0m eta [36m0:00:00[0m
Col

In [None]:
import wandb
wandb.login()

In [1]:
import madgrad
import torch

In [2]:
optimizer_list = {
    'Madgrad':madgrad.MADGRAD,
    'Adagrad':torch.optim.Adagrad,
    'Adam':torch.optim.Adam,
    'SDG':torch.optim.SGD
}

In [3]:
model_config = {'num_cls': 3}
training_config = {'lookahead':{'la_steps':5, 'la_alpha':0.8},
                   'optimizer':optimizer_list['SDG'],
                   'lr':1e-4,
                   'epochs':10,
                   'training_batch': 32,
                   'val_batch': 8,
                   'gradaccumulate':8,
                   'label_smoothness':0.1,
                   'validate_every_n_epoch':1,
                   'use_wab':False
                   }

In [None]:
!wget https://github.com/PyThaiNLP/wisesight-sentiment/archive/master.zip
!unzip master.zip
!mkdir wisesight_data; ls
!cd wisesight-sentiment-master/kaggle-competition; ls

# data preparation

In [4]:
import numpy as np
import pandas as pd
from transformers import AutoTokenizer
from  torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.utils.data import random_split
import torch

In [5]:
with open("wisesight-sentiment-master/kaggle-competition/train.txt") as f:
    texts = [line.strip() for line in f.readlines()]

with open("wisesight-sentiment-master/kaggle-competition/train_label.txt") as f:
    categories = [line.strip() for line in f.readlines()]

all_df = pd.DataFrame({"category":categories, "texts":texts})
all_df.to_csv('all_df.csv',index=False)
all_df.shape

(24063, 2)

In [6]:
all_df.category.value_counts() / all_df.shape[0]

neu    0.544612
neg    0.255164
pos    0.178698
q      0.021527
Name: category, dtype: float64

In [7]:
all_df = all_df[all_df['category'] != 'q']
all_df.shape

(23545, 2)

In [8]:
all_df.category.value_counts() / all_df.shape[0]

neu    0.556594
neg    0.260777
pos    0.182629
Name: category, dtype: float64

In [9]:
classes = {'neu':0, 'neg':1, 'pos':2}

In [10]:
all_df['class'] = all_df['category'].map(classes)

In [11]:
all_df

Unnamed: 0,category,texts,class
0,neu,ประเทศเราผลิตและส่งออกยาสูบเยอะสุดในโลกจิงป่าวคับ,0
1,neu,คะ,0
2,neg,อิเหี้ยออมทำกูอยากกินเอ็มเค,1
3,neu,😅😅😅,0
4,neu,สวัสดีวันพุธ แนน อะไรนะ,0
...,...,...,...
24058,neg,แม่งควายล้วนนน,1
24059,neg,ดอยสุเทพน้องง ไปหมดแล้วววว #pm25,1
24060,neg,ค่าชุดอาจจะแพงกว่าส่วนลด,1
24061,neu,รัฐต้องการแค่ภาษีครับ,0


In [12]:
all_df = all_df.reset_index(drop=True)

In [13]:
class CustomImageDataset(Dataset):
    def __init__(self, data_dict):
        self.text = data_dict['texts']
        self.labels = data_dict['class']
        self.cls_num = 3
        self.tokenizer = AutoTokenizer.from_pretrained("airesearch/wangchanberta-base-att-spm-uncased",  model_max_length=416)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        text = self.tokenizer(self.text[idx], return_tensors="pt",  max_length = 416, truncation=True, pad_to_max_length=True,)
        label = self.class_to_tensor(self.labels[idx], self.cls_num)
        #sent_id, attention_mask=mask
        return text['input_ids'][0],text['attention_mask'][0], label

    def class_to_tensor(self, class_number, num_classes):
      # Create a tensor with zeros of length num_classes
      tensor = torch.tensor(np.zeros(num_classes, dtype=float))

      # Set the element corresponding to the class_number to 1
      tensor[class_number] = 1.

      return tensor

In [14]:
big_data = CustomImageDataset(all_df)

In [15]:
big_data[0]



(tensor([    5,    10,   136,    88,   932,    13,  3789, 12525,   485,  7773,
          3774,  2916,   801,     6,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,  

In [16]:
generator = torch.Generator().manual_seed(100)
train, val= random_split(big_data, [0.8, 0.2], generator=generator)

In [17]:
trains_dataloader = DataLoader(train, batch_size=training_config['training_batch'], shuffle=True)
vals_dataloader = DataLoader(val, batch_size=training_config['val_batch'], shuffle=False)


In [18]:
val[0][0].shape

torch.Size([416])

# Model

In [19]:
import lightning.pytorch as pl
import torch
from torch import nn
import torch.nn.functional as F
from transformers import AutoModelForMaskedLM

In [20]:
if training_config['use_wab']:
    wandb.init(
      # Set the project where this run will be logged
      project="basic-bert",
      # We pass a run name (otherwise it’ll be randomly assigned, like sunshine-lollypop-10)
      name=f"experiment",
      # Track hyperparameters and run metadata
      config={
        model_config,
        training_config
      })

### lookahead

In [21]:
from collections import defaultdict

import torch
from torch.optim.optimizer import Optimizer


class Lookahead(Optimizer):
    r"""PyTorch implementation of the lookahead wrapper.

    Lookahead Optimizer: https://arxiv.org/abs/1907.08610
    """

    def __init__(self, optimizer, la_steps=5, la_alpha=0.8, pullback_momentum="none"):
        """optimizer: inner optimizer
        la_steps (int): number of lookahead steps
        la_alpha (float): linear interpolation factor. 1.0 recovers the inner optimizer.
        pullback_momentum (str): change to inner optimizer momentum on interpolation update
        """
        self.optimizer = optimizer
        self._la_step = 0  # counter for inner optimizer
        self.la_alpha = la_alpha
        self._total_la_steps = la_steps
        pullback_momentum = pullback_momentum.lower()
        assert pullback_momentum in ["reset", "pullback", "none"]
        self.pullback_momentum = pullback_momentum

        self.state = defaultdict(dict)

        # Cache the current optimizer parameters
        for group in optimizer.param_groups:
            for p in group['params']:
                param_state = self.state[p]
                param_state['cached_params'] = torch.zeros_like(p.data)
                param_state['cached_params'].copy_(p.data)
                if self.pullback_momentum == "pullback":
                    param_state['cached_mom'] = torch.zeros_like(p.data)

    def __getstate__(self):
        return {
            'state': self.state,
            'optimizer': self.optimizer,
            'la_alpha': self.la_alpha,
            '_la_step': self._la_step,
            '_total_la_steps': self._total_la_steps,
            'pullback_momentum': self.pullback_momentum
        }

    def zero_grad(self):
        self.optimizer.zero_grad()

    def get_la_step(self):
        return self._la_step

    def state_dict(self):
        return self.optimizer.state_dict()

    def load_state_dict(self, state_dict):
        self.optimizer.load_state_dict(state_dict)

    def _backup_and_load_cache(self):
        """Useful for performing evaluation on the slow weights (which typically generalize better)
        """
        for group in self.optimizer.param_groups:
            for p in group['params']:
                param_state = self.state[p]
                param_state['backup_params'] = torch.zeros_like(p.data)
                param_state['backup_params'].copy_(p.data)
                p.data.copy_(param_state['cached_params'])

    def _clear_and_load_backup(self):
        for group in self.optimizer.param_groups:
            for p in group['params']:
                param_state = self.state[p]
                p.data.copy_(param_state['backup_params'])
                del param_state['backup_params']

    @property
    def param_groups(self):
        return self.optimizer.param_groups

    def step(self, closure=None):
        """Performs a single Lookahead optimization step.
        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = self.optimizer.step(closure)
        self._la_step += 1

        if self._la_step >= self._total_la_steps:
            self._la_step = 0
            # Lookahead and cache the current optimizer parameters
            for group in self.optimizer.param_groups:
                for p in group['params']:
                    param_state = self.state[p]
                    p.data.mul_(self.la_alpha).add_(param_state['cached_params'], alpha=1.0 - self.la_alpha)  # crucial line
                    param_state['cached_params'].copy_(p.data)
                    if self.pullback_momentum == "pullback":
                        internal_momentum = self.optimizer.state[p]["momentum_buffer"]
                        self.optimizer.state[p]["momentum_buffer"] = internal_momentum.mul_(self.la_alpha).add_(
                            1.0 - self.la_alpha, param_state["cached_mom"])
                        param_state["cached_mom"] = self.optimizer.state[p]["momentum_buffer"]
                    elif self.pullback_momentum == "reset":
                        self.optimizer.state[p]["momentum_buffer"] = torch.zeros_like(p.data)

        return loss

### model config

In [22]:
#class Clsfication_head(torch.nn.Module):
#    def __init__(self, num_target_classes):
#      super().__init__()
#      self.dense = nn.Linear(768, 768)
#      self.act1 = nn.LeakyReLU()
#      self.dropout = nn.Dropout()
#      self.classifier = nn.Linear(768, num_target_classes)
#      self.act2 = nn.Softmax(dim=1)
#    def forward(self,x):
#      x = self.dense(x)
#      x = self.act1(x)
#      x = self.dropout(x)
#      x = self.classifier(x)
#      x = self.act2(x)
#      return x

class SICModel(nn.Module):
    def __init__(self, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size

        self.W_1 = nn.Linear(hidden_size, hidden_size)
        self.W_2 = nn.Linear(hidden_size, hidden_size)
        self.W_3 = nn.Linear(hidden_size, hidden_size)
        self.W_4 = nn.Linear(hidden_size, hidden_size)

    def forward(self, hidden_states, start_indexs, end_indexs):
        W1_h = self.W_1(hidden_states)  # (bs, length, hidden_size)
        W2_h = self.W_2(hidden_states)
        W3_h = self.W_3(hidden_states)
        W4_h = self.W_4(hidden_states)

        W1_hi_emb = torch.index_select(W1_h, 1, start_indexs)  # (bs, span_num, hidden_size)
        W2_hj_emb = torch.index_select(W2_h, 1, end_indexs)
        W3_hi_start_emb = torch.index_select(W3_h, 1, start_indexs)
        W3_hi_end_emb = torch.index_select(W3_h, 1, end_indexs)
        W4_hj_start_emb = torch.index_select(W4_h, 1, start_indexs)
        W4_hj_end_emb = torch.index_select(W4_h, 1, end_indexs)

        # [w1*hi, w2*hj, w3(hi-hj), w4(hi⊗hj)]
        span = W1_hi_emb + W2_hj_emb + (W3_hi_start_emb - W3_hi_end_emb) + torch.mul(W4_hj_start_emb, W4_hj_end_emb)
        h_ij = torch.tanh(span)
        return h_ij


class InterpretationModel(nn.Module):
    def __init__(self, hidden_size):
        super().__init__()
        self.h_t = nn.Linear(hidden_size, 1)

    def forward(self, h_ij, span_masks):
        o_ij = self.h_t(h_ij).squeeze(-1)  # (ba, span_num)
        # mask illegal span
        o_ij = o_ij - span_masks
        # normalize all a_ij, a_ij sum = 1
        a_ij = nn.functional.softmax(o_ij, dim=1)
        # weight average span representation to get H
        H = (a_ij.unsqueeze(-1) * h_ij).sum(dim=1)  # (bs, hidden_size)
        return H, a_ij

class BertCls(pl.LightningModule):
    def __init__(self, num_target_classes):
      super().__init__()
      encoder = AutoModelForMaskedLM.from_pretrained("airesearch/wangchanberta-base-att-spm-uncased")
      self.encoder = encoder.roberta
      self.classification_head = Clsfication_head(num_target_classes)
      self.span_info_collect = SICModel(768)#bert vec
      self.interpretation = InterpretationModel(768)
      self.output = nn.Linear(768, num_target_classes)

    def forward(self, sent_id, mask):
      x = self.encoder(input_ids=sent_id, attention_mask=mask)
      h_ij = self.span_info_collect(hidden_states, start_indexs, end_indexs)
      H, a_ij = self.interpretation(h_ij, span_masks)
      out = self.output(H)
      #x = self.classification_head(x)
      return out, a_ij


In [23]:
model = BertCls(model_config['num_cls'])

# training

In [24]:
from lightning.fabric import Fabric
import torch.nn.functional as F
import torch
import lightning as L
import numpy as np
from sklearn.metrics import f1_score

In [25]:
fabric = Fabric(accelerator="cuda", devices="auto", strategy="auto", precision="16-mixed") #need callback
fabric.launch()

INFO: Using 16-bit Automatic Mixed Precision (AMP)
INFO:lightning.pytorch.utilities.rank_zero:Using 16-bit Automatic Mixed Precision (AMP)


In [26]:
#optimizer = Lookahead(training_config['optimizer'](model.parameters(),lr=training_config['lr']))
optimizer = training_config['optimizer'](model.parameters(),lr=training_config['lr'])

model, optimizer = fabric.setup(model, optimizer)
train_dataloader = fabric.setup_dataloaders(trains_dataloader)
val_dataloader = fabric.setup_dataloaders(vals_dataloader)

### trainer

In [27]:
from tqdm import tqdm

In [28]:
def train_fn(fabric, model, optimizer, train_dataloader, val_dataloader):
  model.train()
  for epoch in range(training_config['epochs']):
    for batch_idx,batch in tqdm(enumerate(train_dataloader)):
      input_ids, mask, target = batch
      out, a_ij =  model(input_ids,mask)
      loss = F.cross_entropy(output, target, label_smoothing=training_config['label_smoothness'])
      reg_loss =  0.8 * a_ij.pow(2).sum(dim=1).mean()
      total_loss = loss - reg_loss
      fabric.backward(loss)

      if (batch_idx + 1) % training_config['gradaccumulate'] == 0:
          optimizer.step()
          optimizer.zero_grad()
      if training_config['use_wab']:
        wandb.log({"train_loss": loss.item()})

    if epoch % training_config['validate_every_n_epoch'] == 0:
          validate_fn(fabric, model, val_dataloader)

def validate_fn(fabric, model, dataloader):
    # Validation loop
    model.eval()
    total_preds = []
    total_labels = []
    total_loss = 0.0
    for i, batch in enumerate(dataloader):
      input_ids, mask, target = batch
      output =  model(input_ids,mask)
      loss = F.cross_entropy(output, target, label_smoothing=training_config['label_smoothness'])
      total_loss = total_loss + loss.item()

      preds = output.detach().cpu().numpy()
      preds = np.argmax(preds, axis=1)
      total_preds+=list(preds)
      total_labels+=target.tolist()

    f1 = f1_score(total_labels, total_preds, average='weighted')
    avg_loss = total_loss / len(val_dataloader)
    print(avg_loss)
    if training_config['use_wab']:
      wandb.log({"f1": f1, 'avg_val_loss':avg_loss})

# train 🚉

In [29]:
train_fn(fabric, model, optimizer, train_dataloader, val_dataloader)

589it [07:37,  1.29it/s]


OutOfMemoryError: ignored