### Intro

In [1]:
!nvidia-smi

Thu Jan 26 16:05:29 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.86.01    Driver Version: 515.86.01    CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA RTX A4500    On   | 00000000:C2:00.0 Off |                  Off |
| 30%   21C    P8    39W / 200W |      1MiB / 20470MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
!wget https://msadraeij.ir/data.tar.gz
!tar -xzf ./data.tar.gz

--2023-01-26 15:22:54--  https://msadraeij.ir/data.tar.gz
Resolving msadraeij.ir (msadraeij.ir)... 91.107.163.12
Connecting to msadraeij.ir (msadraeij.ir)|91.107.163.12|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 601526 (587K) [application/octet-stream]
Saving to: ‘data.tar.gz’


2023-01-26 15:22:54 (2.95 MB/s) - ‘data.tar.gz’ saved [601526/601526]



In [1]:
# Install dependencies with the %pip magic so they land in the environment of
# the running kernel (a plain `!pip` may resolve to a different interpreter).
%pip install -U transformers
# Quoted so the shell cannot glob-expand the `*` in the version specifier.
%pip install "protobuf==3.20.*"
%pip install sentencepiece
%pip install scikit-learn
%pip install pandas
%pip install torchsampler

[0m

In [2]:
import json

import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt

# import seaborn as sns
# from tqdm import tqdm_notebook, tnrange
# from tqdm.auto import tqdm
# from nltk.corpus import stopwords
# from nltk.stem import WordNetLemmatizer 
from tqdm.notebook import trange, tqdm

import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.autograd import Variable
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torchsampler import ImbalancedDatasetSampler

from transformers import AutoTokenizer
from transformers import AutoModel

### Constants and Datasets

In [3]:
# Run on the first CUDA device when one is available, otherwise on the CPU.
DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# Hugging Face checkpoint name; used for both the tokenizer and the encoder.
MODEL_NAME = "bert-base-uncased"
# Batch size of the training loader.
BATCH_SIZE = 64

# Number of value classes the notebook assumes (also the stride the sampler
# uses to find all rows of one class in the flattened training dataset).
NUMBER_OF_CLASSES = 20
# One evaluation batch holds NUMBER_OF_CLASSES consecutive dataset rows; the
# metric code treats each batch as one row of a (batches x classes) matrix.
EVAL_BATCH_SIZE = NUMBER_OF_CLASSES

# Separators used by join_data between Conclusion / Premise / Stance.
N_SEPRATOR1 = ' [SEP] '
N_SEPRATOR2 = ' [SEP] '

# Separators used by class_desc_to_text:
#   1: between the class name and its description,
#   2: between the entries of one class,
#   3: between an entry key and its values,
#   4: between the values of one entry.
CD_SEPRATOR1 = ' [SEP] '
CD_SEPRATOR2 = ' or '
CD_SEPRATOR3 = ' : '
CD_SEPRATOR4 = ' , '

# NOTE(review): OLD_MODEL_FILE_NAME is not referenced in the visible notebook.
OLD_MODEL_FILE_NAME = None
# Prefix of the checkpoint files written by the training loop.
NEW_MODEL_FILE_NAME = 'bert_adv_attention'

In [4]:
def load_3df(name):
    """Read the three TSV tables of one data split.

    Args:
        name: split suffix used in the file names (e.g. "training").

    Returns:
        A list ``[arguments, labels, level1_labels]`` of DataFrames, read in
        that order from the ``Data/`` directory.
    """
    paths = (
        f'Data/arguments-{name}.tsv',
        f'Data/labels-{name}.tsv',
        f'Data/level1-labels-{name}.tsv',
    )
    return list(map(pd.read_table, paths))


train_data = load_3df("training")
eval_data = load_3df("validation")
eval_zh_data = load_3df("validation-zhihu")

# JSON description of the value categories; consumed by class_desc_to_text.
with open('Data/value-categories.json') as categories_file:
    classes_desc = json.load(categories_file)

In [5]:
# TODO: a text normalizer may help here.

def join_data(row):
    """Build the model input text for one argument row.

    Concatenates the row's 'Conclusion', 'Premise' and 'Stance' fields, in
    that order, separated by N_SEPRATOR1 and N_SEPRATOR2.
    """
    pieces = [
        row['Conclusion'],
        N_SEPRATOR1,
        row['Premise'],
        N_SEPRATOR2,
        row['Stance'],
    ]
    return ''.join(pieces)

def class_desc_to_text(class_info):
    """Flatten the nested class description into one text per class.

    Args:
        class_info: mapping ``class name -> {entry key -> iterable of str}``
            (presumably value category -> sub-values -> example phrases;
            verify against Data/value-categories.json).

    Returns:
        Dict mapping each class name to
        ``"<name><SEP1><key><SEP3><v1><SEP4><v2>...<SEP2><key>..."`` built
        with the CD_SEPRATOR* constants.
    """
    texts = {}
    for name, items in class_info.items():
        entry_texts = []
        for entry_key, entry_values in items.items():
            entry_texts.append(f'{entry_key}{CD_SEPRATOR3}{CD_SEPRATOR4.join(entry_values)}')
        items_str = CD_SEPRATOR2.join(entry_texts)
        texts[name] = f"{name}{CD_SEPRATOR1}{items_str}"
    return texts

In [6]:
class ValueEvalDataset(Dataset):
    """Flattened (argument, class) dataset for binary value classification.

    Every argument contributes one item per label column, in column order, so
    item ``k`` belongs to label column ``k % number_of_label_columns``.
    """

    def __init__(self, data_3df, classes_desc, limit=None):
        """Build the flattened item list.

        Args:
            data_3df: ``[arguments, labels, level1_labels]`` DataFrames as
                returned by load_3df; the level-1 labels are not used.
            classes_desc: nested class description passed to
                class_desc_to_text.
            limit: if given, only the first ``limit`` rows of each table are
                used.
        """
        self.data = []
        self.classes_text = class_desc_to_text(classes_desc)

        args, labels, lv1_labels = data_3df
        # Every column after the first ('Argument ID') is a class label.
        label_names = list(labels.columns[1:])
        self.classes_name_to_id = dict(zip(label_names, range(len(label_names))))

        if limit is not None:
            args, labels, lv1_labels = (
                frame[:limit] for frame in (args, labels, lv1_labels)
            )

        paired_rows = zip(args.iterrows(), labels.iterrows())
        for (_, args_row), (_, labels_row) in paired_rows:
            # Both tables must list the arguments in the same order.
            assert args_row['Argument ID'] == labels_row['Argument ID']

            args_str = join_data(args_row)
            self.data.extend(
                (args_str, key, val)
                for key, val in labels_row.items()
                if key != 'Argument ID'
            )

    def __len__(self):
        """Number of (argument, class) items."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(argument text, (class id, class description text), label)``."""
        text, class_name, label = self.data[idx]
        class_info = (self.classes_name_to_id[class_name], self.classes_text[class_name])
        return text, class_info, label

# One flattened dataset per split; all share the same class descriptions.
train_dataset = ValueEvalDataset(train_data, classes_desc)
eval_dataset = ValueEvalDataset(eval_data, classes_desc)
eval_zh_dataset = ValueEvalDataset(eval_zh_data, classes_desc)

In [7]:
class sadra_imbalanced_sampler:
    """Class-balanced index sampler for a flattened (argument, class) dataset.

    For every batch, ``batch_class_count`` distinct classes are drawn, and for
    each of them ``bacth_size // batch_class_count`` items are sampled (with
    replacement) so that positives and negatives of that class are equally
    likely in expectation.

    The dataset is assumed to be laid out so that item ``k`` belongs to class
    ``k % number_of_classes`` (this is how the strided weight slice in
    ``__iter__`` selects the items of one class).
    """

    def __init__(self, dataset, callback, bacth_size, batch_class_count=4, number_of_classes=None):
        """
        Args:
            dataset: sized dataset to sample indices from.
            callback: function mapping ``dataset`` to a sequence of
                ``(class_id, label)`` pairs, one per item, in dataset order.
            bacth_size: batch size of the consuming loader (parameter name kept
                as-is for backward compatibility).
            batch_class_count: number of distinct classes drawn per batch.
            number_of_classes: class stride of the dataset layout; defaults to
                the module-level NUMBER_OF_CLASSES.
        """
        # Evaluate the callback once: it walks the whole dataset.
        class_and_label = list(callback(dataset))
        labels_by_class = pd.DataFrame(class_and_label, columns=['class', 'label']).groupby('class')['label']

        # Positives per class, and the item count of the first class (every
        # class is assumed to have the same number of items).
        self.each_class_pos_count = labels_by_class.sum().to_dict()
        per_class_count = labels_by_class.count()
        self.class_count = int(per_class_count.iloc[0]) if len(per_class_count) else 0
        self.weights = torch.tensor(
            [self.calc_weight(class_id, label) for class_id, label in class_and_label],
            dtype=torch.double,
        )

        self.number_of_classes = NUMBER_OF_CLASSES if number_of_classes is None else number_of_classes
        self.bacth_size = bacth_size
        self.batch_class_count = batch_class_count
        self.batch_per_epoch = len(dataset) // bacth_size
        self.samples_per_class = bacth_size // batch_class_count

        # Exactly the number of indices __iter__ yields, also when bacth_size
        # is not a multiple of batch_class_count.
        self.len = self.batch_per_epoch * batch_class_count * self.samples_per_class

    def calc_weight(self, class_id, label):
        """Sampling weight of one item: inverse frequency of its label within its class."""
        count = self.each_class_pos_count[class_id]
        if label == 1:
            return 1 / count
        return 1 / (self.class_count - count)

    def __iter__(self):
        """Yield dataset indices (plain ints), grouped by class within each batch."""
        stride = self.number_of_classes
        for _ in range(self.batch_per_epoch):
            classes = np.random.choice(stride, self.batch_class_count, replace=False)
            for c in classes.tolist():
                # weights[c::stride] are the weights of all items of class c;
                # a sampled position p maps back to dataset index c + p * stride.
                positions = torch.multinomial(self.weights[c::stride], self.samples_per_class, replacement=True)
                for position in positions.tolist():
                    yield c + position * stride

    def __len__(self):
        """Number of indices produced per epoch."""
        return self.len

In [8]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def custom_collate_fn(_dataset):
    """Collate a list of dataset items into model-ready tensors.

    Args:
        _dataset: list of ``(argument text, (class id, class text), label)``
            items as produced by ValueEvalDataset.

    Returns:
        ``(tokenized arguments, class-id tensor, label tensor)``. The class
        description texts are dropped; the model looks classes up by id.
    """
    args, class_descs_and_id, labels = zip(*_dataset)
    class_id, _class_texts = zip(*class_descs_and_id)
    # truncation=True caps over-long arguments at the tokenizer's maximum
    # length instead of letting them overflow the encoder's position limit.
    _args = tokenizer(list(args), return_tensors="pt", padding=True, truncation=True)
    _class_id = torch.tensor(class_id)
    _labels = torch.tensor(labels)
    return _args, _class_id, _labels

def _class_and_label_pairs(dataset):
    """Return ``(class id, label)`` for every item of ``dataset``, in order."""
    return [(item[1][0], item[2]) for item in dataset]


# Training batches are drawn by the class-balanced sampler, so no shuffle
# flag is passed to the loader.
train_sampler = sadra_imbalanced_sampler(train_dataset, _class_and_label_pairs, BATCH_SIZE)
train_loader = DataLoader(
    train_dataset,
    sampler=train_sampler,
    batch_size=BATCH_SIZE,
    collate_fn=custom_collate_fn,
)

# Evaluation loaders iterate sequentially in batches of EVAL_BATCH_SIZE.
eval_loader, eval_zh_loader = (
    DataLoader(split, batch_size=EVAL_BATCH_SIZE, collate_fn=custom_collate_fn)
    for split in (eval_dataset, eval_zh_dataset)
)

### Model

In [13]:
class FeedForward(nn.Module):
    """Two-layer feed-forward block: Linear -> GELU -> Linear -> Dropout.

    Args:
        emb_size: size of the input and output features.
        hidden_size: size of the intermediate projection.
        droput: dropout probability applied to the output (parameter name
            kept as spelled in the original signature).
    """

    def __init__(self, emb_size, hidden_size, droput):
        super().__init__()
        self.linear_1 = nn.Linear(in_features=emb_size, out_features=hidden_size)
        self.linear_2 = nn.Linear(in_features=hidden_size, out_features=emb_size)
        self.gelu = nn.GELU()
        self.dropout = nn.Dropout(p=droput)

    def forward(self, x):
        """Apply the block to ``x``; the last dimension must be ``emb_size``."""
        expanded = self.gelu(self.linear_1(x))
        return self.dropout(self.linear_2(expanded))

class TransformerDecoderLayer(nn.Module):
    """Single cross-attention layer where a class embedding queries the tokens.

    The class embedding attends over the layer-normalised token states; the
    result is added to the first token's state and passed through a
    pre-norm feed-forward block with a residual connection.
    """

    def __init__(self, emb_size, dropout):
        super().__init__()
        self.layer_norm_1 = nn.LayerNorm(emb_size)
        self.layer_norm_2 = nn.LayerNorm(emb_size)
        self.attention = nn.MultiheadAttention(
            embed_dim=emb_size,
            num_heads=12,
            batch_first=True,
            dropout=dropout,
        )
        self.feed_forward = FeedForward(emb_size, 4 * emb_size, dropout)

    def forward(self, input_embeds, class_embed, attention_mask):
        """
        Args:
            input_embeds: token states, indexed as (batch, token, feature).
            class_embed: query with a singleton dimension 1 (squeezed away
                after attention).
            attention_mask: key padding mask passed to the attention module
                (the caller passes True at padded positions).

        Returns:
            One vector per batch element.
        """
        normed_tokens = self.layer_norm_1(input_embeds)
        attended, _ = self.attention(
            query=class_embed,
            key=normed_tokens,
            value=normed_tokens,
            key_padding_mask=attention_mask,
            need_weights=False,
        )
        # Residual connection from the first token's (un-normalised) state.
        first_token_state = input_embeds[:, 0, :]
        out = first_token_state + attended.squeeze(1)
        return out + self.feed_forward(self.layer_norm_2(out))
     
    
class BertAdvancedAttentionModelReduced(nn.Module):
    """BERT encoder plus a learned per-class query that attends over the tokens.

    For each (argument, class id) pair the model returns two logits
    (binary decision for that class).

    Args:
        emb_size: feature size of the encoder output and class embeddings.
        dropout: dropout probability of the attention layer and classifier.
        num_classes: number of class embeddings; defaults to the module-level
            NUMBER_OF_CLASSES instead of a hard-coded literal.
    """

    def __init__(self, emb_size=768, dropout=0.5, num_classes=None):
        super().__init__()
        if num_classes is None:
            num_classes = NUMBER_OF_CLASSES
        self.class_desc_emb = nn.Embedding(num_classes, emb_size)
        self.input_bert = AutoModel.from_pretrained(MODEL_NAME)

        self.attention = TransformerDecoderLayer(emb_size, dropout)

        self.classifier = nn.Sequential(
            nn.Dropout(p=dropout),
            nn.Linear(emb_size, 2)
        )

    def forward(self, input_tokens, class_desc_id):
        """
        Args:
            input_tokens: tokenizer output (must contain 'attention_mask');
                it is unpacked into the encoder call.
            class_desc_id: tensor of class ids, one per batch element.

        Returns:
            Logits with two entries per batch element.
        """
        token_states = self.input_bert(**input_tokens).last_hidden_state
        # Insert a length-1 query dimension: (batch, emb) -> (batch, 1, emb).
        class_query = self.class_desc_emb(class_desc_id)[:, None, :]
        # The attention layer expects True where a position is padding.
        padding_mask = input_tokens['attention_mask'] == 0

        attended_embed = self.attention(token_states, class_query, padding_mask)
        return self.classifier(attended_embed)

In [14]:
# Build the model with its default sizes and move it to the selected device.
model = BertAdvancedAttentionModelReduced().to(DEVICE)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [18]:
import re

BASE_LR = 1e-5
LR_LAYER_DECAY = 0.5

MAX_SCORE = 13


def _layer_score(name):
    """Depth score of a parameter; a higher score means a higher learning rate.

    Encoder parameters inside ``layer.<k>`` get ``k + 1``, all other encoder
    parameters get 0, the class embedding gets ``MAX_SCORE - 2`` and every
    remaining parameter gets ``MAX_SCORE``.
    """
    if 'input_bert' in name:
        if 'layer' not in name:
            return 0
        return int(re.findall(r'layer\.(\d+)', name)[0]) + 1
    if 'class_desc_emb' in name:
        return MAX_SCORE - 2
    return MAX_SCORE


layer_scores = {name: _layer_score(name) for name, _ in model.named_parameters()}

# One optimizer group per trainable parameter; the learning rate is halved
# (LR_LAYER_DECAY) for every score step below MAX_SCORE.
parameters = []
for name, param in model.named_parameters():
    if not param.requires_grad:
        continue
    steps_below_top = MAX_SCORE - layer_scores[name]
    parameters.append({'params': param, 'lr': BASE_LR * (LR_LAYER_DECAY**steps_below_top)})
optimizer = torch.optim.Adam(parameters)

In [19]:
# NOTE(review): n_epochs and EPOCHS hold the same value and neither is
# referenced by the training loop in this cell, which hard-codes its range.
n_epochs = 100
# Loss over the two logits produced per (argument, class) pair.
criterion = nn.CrossEntropyLoss()

EPOCHS = 100

def train_loop(model, loader, epoch):
    """Run one training epoch and return the mean batch loss.

    Uses the module-level ``optimizer`` and ``criterion``. ``epoch`` is only
    used in the printed summary line.
    """
    model.train()

    losses = []
    for tokens, class_ids, labels in tqdm(loader):
        optimizer.zero_grad()

        logits = model(tokens.to(DEVICE), class_ids.to(DEVICE))
        loss = criterion(logits, labels.to(DEVICE))

        # Record the scalar before the backward pass.
        losses.append(loss.item())
        loss.backward()
        optimizer.step()

    mean_loss = np.mean(losses)
    print("epoch:{:2d} train: loss:{:.3f}".format(epoch, mean_loss))
    return mean_loss

def eval_loop(model, loader, epoch):
    """Compute the mean batch loss over ``loader`` without updating the model.

    Uses the module-level ``criterion``. ``epoch`` is only used in the
    printed summary line.
    """
    model.eval()

    with torch.no_grad():
        losses = [
            criterion(
                model(tokens.to(DEVICE), class_ids.to(DEVICE)),
                labels.to(DEVICE),
            ).item()
            for tokens, class_ids, labels in loader
        ]

    mean_loss = np.mean(losses)
    print("epoch:{:2d} eval: loss:{:.3f}".format(epoch, mean_loss))
    return mean_loss

from sklearn.metrics import f1_score, recall_score, precision_score

def evaluate_and_calc_metrics(model, loader):
    """Predict over ``loader`` and compute one F1 score per batch position.

    Every batch becomes one row of the prediction / target matrices, so
    column ``i`` collects position ``i`` of every batch. The columns line up
    with classes only when each batch holds the rows of one argument in class
    order (the evaluation loaders use a batch size of NUMBER_OF_CLASSES).

    Returns:
        ``(preds, targets, scores)`` where ``preds`` and ``targets`` are
        (batches x batch size) arrays and ``scores`` maps column index to F1.
    """
    model.eval()
    preds = []
    targets = []
    with torch.no_grad():
        for arg, class_desc, y in tqdm(loader):
            out = model(arg.to(DEVICE), class_desc.to(DEVICE))
            preds.append(out.argmax(1).tolist())
            targets.append(y.tolist())

    preds = np.asarray(preds)
    targets = np.asarray(targets)

    # Score every column that is actually present instead of a fixed 20.
    return_value = {
        i: f1_score(y_true=targets[:, i], y_pred=preds[:, i])
        for i in range(preds.shape[1])
    }

    return preds, targets, return_value

def print_scores(model, loader):
    """Evaluate ``model`` on ``loader`` and print two F1 summaries.

    Prints the unweighted mean of the per-column F1 scores, then the F1 score
    over all flattened predictions.
    """
    # Evaluate on the loader that was passed in, not on a global one.
    preds, targets, scores = evaluate_and_calc_metrics(model, loader)
    print(np.asarray(list(scores.values())).mean())
    print(f1_score(y_true=targets.ravel(), y_pred=preds.ravel()))

# Train for EPOCHS epochs; after each one report validation F1 and write a
# checkpoint named '<NEW_MODEL_FILE_NAME>2_<epoch>'.
for epoch in range(EPOCHS):
    train_loop(model, train_loader, epoch)
    # valid_loss = eval_loop(model, eval_loader, epoch)
    print_scores(model, eval_loader)
    torch.save(model.state_dict(), NEW_MODEL_FILE_NAME + f'2_{epoch}')

  0%|          | 0/1685 [00:00<?, ?it/s]

epoch: 0 train: loss:0.230


  0%|          | 0/1896 [00:00<?, ?it/s]

0.46321046998905724
0.5444817007918956


  0%|          | 0/1685 [00:00<?, ?it/s]

epoch: 1 train: loss:0.206


  0%|          | 0/1896 [00:00<?, ?it/s]

0.46069326594302185
0.5451347358052487


  0%|          | 0/1685 [00:00<?, ?it/s]

epoch: 2 train: loss:0.194


  0%|          | 0/1896 [00:00<?, ?it/s]

0.45749962631324886
0.5457840933807003


  0%|          | 0/1685 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [20]:
!curl --upload-file ./bert_adv_attention2_1 https://transfer.sh/bert_adv_attention2_1

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
https://transfer.sh/nTAxHX/bert_adv_attention2_1

In [17]:
# map_location lets a checkpoint saved on a GPU be restored on whatever
# device this session uses (including CPU-only hosts).
# NOTE(review): this file name ('<NEW_MODEL_FILE_NAME>_2') differs from the
# pattern written by the training loop ('<NEW_MODEL_FILE_NAME>2_<epoch>') — confirm.
model.load_state_dict(torch.load(NEW_MODEL_FILE_NAME + '_2', map_location=DEVICE))

<All keys matched successfully>

In [18]:
from sklearn.metrics import f1_score, recall_score, precision_score

def evaluate_and_calc_metrics(model, loader):
    """Predict over ``loader`` and compute one F1 score per batch position.

    Every batch becomes one row of the prediction / target matrices, so
    column ``i`` collects position ``i`` of every batch. The columns line up
    with classes only when each batch holds the rows of one argument in class
    order (the evaluation loaders use a batch size of NUMBER_OF_CLASSES).

    Returns:
        ``(preds, targets, scores)`` where ``preds`` and ``targets`` are
        (batches x batch size) arrays and ``scores`` maps column index to F1.
    """
    model.eval()
    preds = []
    targets = []
    with torch.no_grad():
        for arg, class_desc, y in tqdm(loader):
            out = model(arg.to(DEVICE), class_desc.to(DEVICE))
            preds.append(out.argmax(1).tolist())
            targets.append(y.tolist())

    preds = np.asarray(preds)
    targets = np.asarray(targets)

    # Score every column that is actually present instead of a fixed 20.
    return_value = {
        i: f1_score(y_true=targets[:, i], y_pred=preds[:, i])
        for i in range(preds.shape[1])
    }

    return preds, targets, return_value

def print_scores(model, loader):
    """Evaluate ``model`` on ``loader`` and print two F1 summaries.

    Prints the unweighted mean of the per-column F1 scores, then the F1 score
    over all flattened predictions.
    """
    # Compute the metrics from the arguments rather than reading whatever
    # `preds` / `targets` / `scores` happen to exist as globals.
    preds, targets, scores = evaluate_and_calc_metrics(model, loader)
    print(np.asarray(list(scores.values())).mean())
    print(f1_score(y_true=targets.ravel(), y_pred=preds.ravel()))

# Evaluate the loaded checkpoint on the validation split; the results are
# kept as globals for the cells below.
preds, targets, scores = evaluate_and_calc_metrics(model, eval_loader)

  0%|          | 0/1896 [00:00<?, ?it/s]

In [31]:
# scibert_best
print(np.asarray(list(scores.values())).mean())
print(f1_score(preds.ravel(), targets.ravel()))

0.44010156641382736
0.5282816229116944


In [19]:
# scibert_PRIM


0.4272184157027185
0.5224344735673034


In [None]:
# Mean of the per-column F1 scores held in the global `scores`.
np.asarray(list(scores.values())).mean()

In [None]:
# NOTE(review): `d` is not defined anywhere in the visible notebook; the cell
# below indexes it by class position, so it is presumably a list/dict of
# class names left over from another session — confirm or define it here.
d

In [None]:
# Ten hand-picked example texts (they look already stop-word-stripped —
# TODO confirm the preprocessing that produced them).
sample = ['everybody different & people like express clothes , wearing uniform allow person individuality favor abandon use school uniform'
 ,'making child work actor deprives childhood experiences interactions . favor ban use child actors'
 ,'- strikes law good keeping criminals streets abolish - strikes laws'
 ,'school uniforms positive effect academic achievement . favor abandon use school uniform'
 ,'abolish - strikes laws stick law , stop reoffending harsh favor abolish - strikes laws'
 ,'prey poor charge astronomic interest . favor payday loans banned'
 ,'olympics positive athletic contest builds pride country promotes cooperation participating countries . goodwill ambassador world remain . abolish olympic games'
 ,'blockade gaza strip reasonable response crimes committed hamas members . blockade gaza strip ended'
 ,'pride parades grown point increasingly difficult police scale events need looked ensure safety participants . favor cancel pride parades'
 ,'adopt atheism believe . adopt atheism']

# Expected labels: one row per sample text, one 0/1 flag per class position.
target = [[1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
 [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0],
 [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1],
 [0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0],
 [0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
# NOTE(review): `labse_model`, `device` and `d` are not defined in the visible
# notebook, and `model(x)` passes a single argument while the model defined
# above takes (input_tokens, class_desc_id) — this cell appears to target a
# different model; confirm before relying on it.

def _active_labels_to_text(rows):
    """Join the names (looked up in `d`) of all positions flagged 1, '/'-terminated."""
    return [
        ''.join(d[j] + '/' for j, flag in enumerate(row) if flag == 1)
        for row in rows
    ]

target2 = _active_labels_to_text(target)
# for x in sample:
sample = labse_model.encode(sample)
# Tensor.to() returns a new tensor; keep the result so the input really
# lives on the target device.
x = torch.Tensor(sample).to(device)
with torch.no_grad():
    out = model(x)
# Bring the thresholded output back to the CPU before converting to numpy.
pred = (out > 0.5).cpu().numpy().astype(float)
# print(pred)
pred2 = _active_labels_to_text(pred)
print(target2)
print(pred2)