<a href="https://colab.research.google.com/github/leejonghyeong/BiMPM/blob/main/BiMPM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Semantic Textual Similarity


## 데이터셋 로딩
`train.csv`와 `test.csv` 파일을 읽고, 각 행을 dictionary 형태로 파싱한 뒤 토큰 id로 변환합니다.

In [None]:
import os, sys
from google.colab import drive
drive.mount('/content/drive')

# Packages are stored in the my_env folder under "Colab Notebooks" on Drive and
# exposed through a symlink that is placed at the front of sys.path.
my_path = '/content/notebooks'
# os.symlink raises FileExistsError when the link already exists, so only
# create it on the first run; this keeps the cell safe to re-execute.
if not os.path.lexists(my_path):
    os.symlink('/content/drive/My Drive/Colab Notebooks/my_env', my_path)
# Avoid stacking duplicate entries on sys.path when the cell is re-run.
if my_path not in sys.path:
    sys.path.insert(0, my_path)

Mounted at /content/drive


In [None]:
# Work from the project folder on Drive; the relative paths used in later cells
# (train.csv, test.csv, model/, submission/) resolve against this directory.
%cd /content/drive/MyDrive/Colab\ Notebooks/BiMPM/

/content/drive/MyDrive/Colab Notebooks/BiMPM


In [None]:
!pip install transformers
from transformers import BertModel, BertTokenizer
bert = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Collecting transformers
  Downloading transformers-4.15.0-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 7.3 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 92.6 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 91.2 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 79.0 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 632 kB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attem

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [None]:
import csv
from typing import List, Dict, Union, Tuple

# One example: (session id, token ids of text 1, token ids of text 2, label).
# The label is -1.0 when the source row has no "is_duplicate" column.
ExampleType = Tuple[str, List[int], List[int], float]

def load_examples(dataset_path: str) -> List[ExampleType]:
    """Read a question-pair CSV and tokenize both questions of every row.

    The file must have ``id``, ``question1`` and ``question2`` columns; an
    ``is_duplicate`` column is optional. Both questions are encoded with the
    module-level ``tokenizer``.

    Args:
        dataset_path: Path of the CSV file to read.

    Returns:
        One ``(id, text1_ids, text2_ids, label)`` tuple per row, in file order.
        ``label`` is ``float(is_duplicate)`` when that column exists and
        ``-1.0`` otherwise.
    """
    examples = []
    # newline="" is what the csv module expects so that quoted fields with
    # embedded line breaks are parsed correctly; the explicit encoding keeps
    # the result independent of the platform default.
    with open(dataset_path, newline="", encoding="utf-8") as f:
        for example in csv.DictReader(f):
            text1_ids = tokenizer.encode(example["question1"])
            text2_ids = tokenizer.encode(example["question2"])
            label = float(example["is_duplicate"]) if "is_duplicate" in example else -1.0
            examples.append((example["id"], text1_ids, text2_ids, label))
    return examples

In [None]:
# Load the labelled pairs once, then keep the first 80% (in file order) for
# training and hold out the remaining 20% for validation.
trainable_examples = load_examples("train.csv")
dev_split_index = int(len(trainable_examples) * 0.8)
train_examples, dev_examples = (
    trainable_examples[:dev_split_index],
    trainable_examples[dev_split_index:],
)

# Test rows go through the same loader as the training rows.
test_examples = load_examples("test.csv")

In [None]:
# Number of labelled examples, then number of test examples (one per line).
print(len(trainable_examples), len(test_examples), sep="\n")

242574
161716


In [None]:
import torch
from torch import nn
from torch.nn import functional as fnn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
from torch.optim import Adadelta
from transformers import get_constant_schedule_with_warmup
from torch.nn.utils.clip_grad import clip_grad_norm_
from tqdm.notebook import tqdm

## 모델 / 데이터셋 정의

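PyTorch의 `nn.Module`과 `Dataset` 클래스를 상속받아 모델과 데이터셋을 구현합니다. 모델 구현은 아래 논문을 참고하였습니다.

Siamese Recurrent Architectures for Learning Sentence Similarity https://www.aaai.org/ocs/index.php/AAAI/AAAI16/paper/viewPaper/12195

아래 Matching Layer의 네 가지 매칭 방식(Full / Maxpooling / Attentive / Max-Attentive)은 BiMPM 논문의 구성을 따릅니다: Bilateral Multi-Perspective Matching for Natural Language Sentences https://arxiv.org/abs/1702.03814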

In [None]:
from torch.nn.utils.rnn import pad_sequence

class STSDataset(Dataset):
    """Map-style dataset over already-tokenized sentence pairs.

    Each stored example is ``(session_id, text1_ids, text2_ids, label)``; the
    session id is dropped when an item is fetched.
    """

    def __init__(self, datas: List[ExampleType]):
        # Only a reference is stored; tensors are built lazily per item.
        self.datas = datas

    def __len__(self) -> int:
        return len(self.datas)

    def __getitem__(self, index: int) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        _session_id, first_ids, second_ids, target = self.datas[index]
        first_tensor = torch.tensor(first_ids)
        second_tensor = torch.tensor(second_ids)
        target_tensor = torch.tensor(float(target))
        return first_tensor, second_tensor, target_tensor

def collate_fn(features: List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor]]):
    """Merge ``(text1, text2, label)`` samples into one padded batch.

    Both token-id sequences are right-padded with 0 up to the longest sequence
    of their own side; labels are stacked into a 1-D tensor.

    Returns:
        ``(text1_batch, text2_batch, labels)`` with the text tensors shaped
        ``(batch, max_len_of_that_side)``.
    """
    first_texts, second_texts, targets = zip(*features)

    def _pad(sequences):
        # batch_first keeps the batch on axis 0; 0 is used as the padding id.
        return pad_sequence(sequences, batch_first=True, padding_value=0)

    return _pad(first_texts), _pad(second_texts), torch.stack(targets)

In [None]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.nn.init import xavier_uniform_

def cosine_match(v1: torch.Tensor, v2: torch.Tensor, W: torch.Tensor, num_perspectives: int):
    """Multi-perspective cosine similarity between two batches of vectors.

    Both vectors are scaled element-wise by every row of ``W`` (one row per
    perspective) and the cosine similarity of the scaled pair is taken over the
    feature axis.

    Args:
        v1: Tensor of shape ``(batch, hidden)``.
        v2: Tensor of shape ``(batch, hidden)``.
        W: Perspective weights of shape ``(num_perspectives, hidden)``.
        num_perspectives: Number of perspectives (rows of ``W``).

    Returns:
        Tensor of shape ``(batch, num_perspectives)``.
    """
    # expand() yields a broadcast view, so nothing is copied per perspective
    # (repeat() allocated a fresh (batch, perspectives, hidden) tensor on each
    # of the many calls made per forward pass). The values are identical.
    weighted_vector1 = W * v1.unsqueeze(1).expand(-1, num_perspectives, -1)
    weighted_vector2 = W * v2.unsqueeze(1).expand(-1, num_perspectives, -1)
    return nn.functional.cosine_similarity(weighted_vector1, weighted_vector2, dim=-1)
    

class MatchingLayer(nn.Module):
    """Multi-perspective matching of one encoded sentence against another.

    Each input is split along its last axis into a "forward" half (the first
    ``hidden_size`` features) and a "backward" half (the remaining features).
    For every position of ``text1`` four matching strategies are applied per
    direction, each with its own ``(num_perspectives, hidden_size)`` weight:

    * full matching        (``weight1`` / ``weight2``)
    * max-pooling matching (``weight3`` / ``weight4``)
    * attentive matching   (``weight5`` / ``weight6``)
    * max-attentive        (``weight7`` / ``weight8``)

    The eight results are concatenated, giving ``8 * num_perspectives``
    features per position of ``text1``.

    NOTE(review): positions are not masked here. If the inputs are zero-padded
    batches, ``text2[:, -1]`` used for forward full matching is the last
    *padded* step of shorter sequences rather than their last real token, and
    the attentive mean also averages over padded steps - confirm whether this
    is intended.
    """

    def __init__(self, hidden_size: int, num_perspectives: int, device):
        super().__init__()
        # weight1..weight8 keep their attribute names (and creation order) so
        # existing state_dict checkpoints and seeded initialisation still match.
        for index in range(1, 9):
            weight = nn.Parameter(torch.empty(num_perspectives, hidden_size))
            xavier_uniform_(weight)  # Xavier initialisation
            setattr(self, f"weight{index}", weight)
        self.cosine = nn.CosineSimilarity(dim=-1)
        self.hidden_size = hidden_size
        self.num_perspectives = num_perspectives
        # Kept for interface compatibility; forward() takes the device from its
        # inputs instead of from this attribute or a notebook global.
        self.device = device

    def forward(self, text1, text2, M, N):
        """Match every position of ``text1`` against ``text2``.

        Args:
            text1: ``(batch, M, 2 * hidden_size)`` encoded first sentence.
            text2: ``(batch, N, 2 * hidden_size)`` encoded second sentence.
            M: Number of positions of ``text1`` to match.
            N: Number of positions of ``text2`` to match against.

        Returns:
            Tensor of shape ``(batch, M, 8 * num_perspectives)``.
        """
        P = self.num_perspectives
        forward_h_1 = text1[:, :, :self.hidden_size]
        backward_h_1 = text1[:, :, self.hidden_size:]
        forward_h_2 = text2[:, :, :self.hidden_size]
        backward_h_2 = text2[:, :, self.hidden_size:]

        # Full-Matching: every position of text1 against the final state of
        # text2 (last step for the forward half, first step for the backward half).
        forward_full_match = torch.stack(
            [cosine_match(forward_h_1[:, i], forward_h_2[:, -1], self.weight1, P) for i in range(M)],
            dim=1)
        backward_full_match = torch.stack(
            [cosine_match(backward_h_1[:, i], backward_h_2[:, 0], self.weight2, P) for i in range(M)],
            dim=1)
        full = torch.cat((forward_full_match, backward_full_match), dim=-1)

        # Maxpooling-Matching: every position of text1 against every position
        # of text2, keeping the per-perspective maximum over text2 positions.
        # Results are collected in a list and stacked once instead of being
        # concatenated onto a growing tensor.
        maxpooling_steps = []
        for i in range(M):
            forward_maxpool_match, _ = torch.stack(
                [cosine_match(forward_h_1[:, i], forward_h_2[:, j], self.weight3, P) for j in range(N)],
                dim=1).max(dim=1)
            backward_maxpool_match, _ = torch.stack(
                [cosine_match(backward_h_1[:, i], backward_h_2[:, j], self.weight4, P) for j in range(N)],
                dim=1).max(dim=1)
            maxpooling_steps.append(torch.cat((forward_maxpool_match, backward_maxpool_match), dim=-1))
        maxpooling = torch.stack(maxpooling_steps, dim=1)

        # Attentive-Matching: plain cosine similarity between all position
        # pairs gives (batch, M, N) attention weights; text2 is weighted by
        # them and averaged over its positions.
        forward_attentive_weight = self.cosine(
            forward_h_1.unsqueeze(2).expand(-1, -1, N, -1),
            forward_h_2.unsqueeze(1).expand(-1, M, -1, -1))
        backward_attentive_weight = self.cosine(
            backward_h_1.unsqueeze(2).expand(-1, -1, N, -1),
            backward_h_2.unsqueeze(1).expand(-1, M, -1, -1))
        forward_weighted_h_2 = forward_attentive_weight.unsqueeze(3) * forward_h_2.unsqueeze(1)
        backward_weighted_h_2 = backward_attentive_weight.unsqueeze(3) * backward_h_2.unsqueeze(1)
        forward_h_mean = torch.mean(forward_weighted_h_2, 2)
        backward_h_mean = torch.mean(backward_weighted_h_2, 2)
        forward_attentive_match = torch.stack(
            [cosine_match(forward_h_1[:, i], forward_h_mean[:, i], self.weight5, P) for i in range(M)],
            dim=1)
        backward_attentive_match = torch.stack(
            [cosine_match(backward_h_1[:, i], backward_h_mean[:, i], self.weight6, P) for i in range(M)],
            dim=1)
        attentive = torch.cat((forward_attentive_match, backward_attentive_match), dim=-1)

        # Max-Attentive-Matching: for each position of text1 pick the text2
        # position with the highest attention weight. One advanced-indexing
        # gather replaces the per-sample index_select loop.
        batch_index = torch.arange(text2.size(0), device=text2.device).unsqueeze(1)
        _, forward_max_indices = forward_attentive_weight.max(dim=2)    # (batch, M)
        _, backward_max_indices = backward_attentive_weight.max(dim=2)  # (batch, M)
        forward_max_h_2 = forward_h_2[batch_index, forward_max_indices]     # (batch, M, hidden)
        backward_max_h_2 = backward_h_2[batch_index, backward_max_indices]  # (batch, M, hidden)
        max_attentive = torch.stack(
            [torch.cat((cosine_match(forward_h_1[:, i], forward_max_h_2[:, i], self.weight7, P),
                        cosine_match(backward_h_1[:, i], backward_max_h_2[:, i], self.weight8, P)),
                       dim=-1)
             for i in range(M)],
            dim=1)

        match = torch.cat((full, maxpooling, attentive, max_attentive), dim=-1)
        return match
        
class PredictionLayer(nn.Module):
    """Two stacked linear maps producing two class scores per input row.

    NOTE(review): there is no activation between ``linear1`` and ``linear2``,
    so the pair is mathematically a single affine map - confirm whether a
    non-linearity was intended.
    """

    def __init__(self, dimension: int):
        super().__init__()
        # dimension -> dimension, then dimension -> 2 logits.
        self.linear1 = nn.Linear(dimension, dimension)
        self.linear2 = nn.Linear(dimension, 2)

    def forward(self, inputs):
        """Return unnormalised scores of shape ``(..., 2)``."""
        return self.linear2(self.linear1(inputs))
    
class STSBaselineModel(nn.Module):
    """Sentence-pair classifier: embed -> encode -> match -> aggregate -> predict.

    Both sentences share every layer. Token ids are looked up in an external
    embedding table, projected, run through a unidirectional LSTM and a
    bidirectional LSTM, matched against each other in both directions with
    ``MatchingLayer``, aggregated with a bidirectional LSTM and classified
    into two scores by ``PredictionLayer``.

    NOTE(review): this class depends on two notebook globals that are not
    passed in: ``emb_size`` (read in ``__init__``) and ``emb`` (called in
    ``forward``). Both must exist before the respective method runs.
    """
    def __init__(self, vocab_size: int, hidden_size: int, num_perspectives: int, device):
        super().__init__()
        # `vocab_size` is only referenced by the disabled nn.Embedding below.
        #self.word_embedding = nn.Embedding(vocab_size, hidden_size, padding_idx=0)
        # Projects 768-d input vectors down to the global `emb_size`.
        self.word_embedding = nn.Linear(768, emb_size)
        self.encoder = nn.LSTM(emb_size, hidden_size, batch_first=True, num_layers=1)
        # Bidirectional, so its output has 2 * hidden_size features per step.
        self.context_embedding = nn.LSTM(hidden_size, hidden_size, batch_first = True, num_layers = 1, bidirectional = True)
        self.match_layer = MatchingLayer(hidden_size, num_perspectives, device)
        # Input width 8 * num_perspectives matches the matching layer's output.
        self.aggregation = nn.LSTM(8 * num_perspectives, 8 * num_perspectives, batch_first = True, num_layers = 1, bidirectional = True)
        # 4 slices of 8 * num_perspectives features are concatenated in forward().
        self.prediction = PredictionLayer(32 * num_perspectives)
        self.num_perspectives = num_perspectives

    
    def forward(self, text1: torch.Tensor, text2: torch.Tensor) -> torch.Tensor:
        """Score a batch of sentence pairs.

        Args:
            text1: Token ids of the first sentences, padded with 0.
            text2: Token ids of the second sentences, padded with 0.

        Returns:
            The output of ``PredictionLayer``: two scores per pair.
        """
        # Sequence length = number of ids greater than 0 (0 is the pad id).
        text1_lengths = text1.gt(0).long().sum(-1)
        text2_lengths = text2.gt(0).long().sum(-1)
        
        # `emb` is a notebook global (not a submodule of this model).
        text1_word_embeds = self.word_embedding(emb(text1))
        text2_word_embeds = self.word_embedding(emb(text2))
        
        # Lengths are moved to the CPU before packing; enforce_sorted=False
        # lets the batch stay in its original order.
        packed_text1 = pack_padded_sequence(text1_word_embeds, text1_lengths.cpu(), batch_first=True, enforce_sorted=False)
        packed_text2 = pack_padded_sequence(text2_word_embeds, text2_lengths.cpu(), batch_first=True, enforce_sorted=False)
        
        #Encoding Layer
        context_vector1, _ = self.encoder(packed_text1)
        context_vector2, _ = self.encoder(packed_text2)
        
        #Context Representation Layer
        context_vector1, _ = self.context_embedding(context_vector1)
        context_vector2, _ = self.context_embedding(context_vector2)
        
        unpacked_context_vector1, _ = pad_packed_sequence(context_vector1, batch_first=True)
        unpacked_context_vector2, _ = pad_packed_sequence(context_vector2, batch_first=True)
        
        #Matching Layer
        # (leftover debugging outputs, disabled)
        #, co1, wt, forward_h 
        #, co2,_,_
        # Match in both directions: text1 against text2 and text2 against text1.
        matching_vector1 = self.match_layer(unpacked_context_vector1, unpacked_context_vector2, text1.size(1), text2.size(1))
        matching_vector2 = self.match_layer(unpacked_context_vector2, unpacked_context_vector1, text2.size(1), text1.size(1))
        
        #Aggregation Layer
        aggregated_vector1, _ = self.aggregation(matching_vector1)
        aggregated_vector2, _ = self.aggregation(matching_vector2)
        new_hidden_size = 8 * self.num_perspectives
        # For each sentence keep the backward half at the first step and the
        # forward half at the last step of the bidirectional output.
        # NOTE(review): step -1 is the last padded position for sequences
        # shorter than the batch maximum - confirm this is intended.
        aggregation_output = torch.cat( (aggregated_vector1[:, 0, new_hidden_size:],
                                         aggregated_vector1[:, -1, :new_hidden_size],
                                         aggregated_vector2[:, 0, new_hidden_size:],
                                         aggregated_vector2[:, -1, :new_hidden_size]) ,dim = -1)
        
        #Prediction Layer
        outputs = self.prediction(aggregation_output)
        
        # (leftover debugging return, disabled)
        #return outputs, aggregation_output[:,:10], matching_vector1[:,:,:10], unpacked_context_vector1[:,:,:10], co1, wt, forward_h
        return outputs

## 학습


학습에 필요한 instance 들을 선언하고, 실제 학습을 수행합니다.

In [None]:
import numpy as np
import os
# Seed PyTorch's and NumPy's global random generators so that weight
# initialisation and DataLoader shuffling start from a fixed state.
torch.manual_seed(0)
np.random.seed(0)

In [None]:
# Passed to STSBaselineModel, which currently ignores it (its nn.Embedding
# line is commented out).
vocab_size = 30007
# Hidden width (per direction) of the encoder and context LSTMs.
hidden_size = 128
# Output width of the Linear projection applied to the 768-d word vectors;
# STSBaselineModel reads this as a global.
emb_size = 32
# Mini-batch size shared by the train / dev / test DataLoaders.
batch_size = 32
# Learning rate handed to the optimizer.
learning_rate = 1e-3
# NOTE(review): the training cell iterates over range(1, 16) and never reads
# this value.
epochs = 3

In [None]:
# Wrap the three example lists as map-style datasets.
train_dataset = STSDataset(train_examples)
dev_dataset = STSDataset(dev_examples)
test_dataset = STSDataset(test_examples)

# Only the training loader shuffles; dev and test keep file order, which the
# submission cell relies on when it zips predictions with test_examples.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
dev_dataloader = DataLoader(dev_dataset, batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=collate_fn)

# Dataset sizes and the number of optimisation steps in one training epoch.
print(f"train_examples: {len(train_dataset)}")
print(f"val_examples: {len(dev_dataset)}")
print(f"test_examples: {len(test_dataset)}")
print(f"train_steps_per_epoch: {len(train_dataloader)}")

train_examples: 194059
val_examples: 48515
test_examples: 161716
train_steps_per_epoch: 6065


In [None]:
import torch
from torch.optim import AdamW
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = STSBaselineModel(vocab_size, hidden_size=hidden_size, num_perspectives = 10, device = device)
model = model.to(device)
optimizer = AdamW(model.parameters(), lr=learning_rate)
# Two output scores per pair are trained as a 2-class classification.
criterion = nn.CrossEntropyLoss()
# NOTE(review): the scheduler is stepped once per epoch in the training cell,
# and T_max=50 is larger than the 15 epochs run there - confirm intended.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=50, eta_min=0)
#scheduler = get_constant_schedule_with_warmup(optimizer, num_warmup_steps = 0)
# BERT's word-embedding table, used as a fixed lookup by STSBaselineModel.forward.
# It is not part of `model`, so the optimizer never updates it; freezing it
# stops autograd from computing and accumulating gradients for the whole table
# on every backward pass. Training results are unchanged.
emb = bert.embeddings.word_embeddings.to(device)
emb.requires_grad_(False)

print(device)

cuda


In [None]:
# Peek at one training batch and decode its third pair back to text as a
# sanity check of tokenisation and padding. next(iter(...)) fetches exactly one
# batch instead of looping until a second one has been loaded.
text1, text2, label = next(iter(train_dataloader))
sentence1 = tokenizer.decode(text1[2])
sentence2 = tokenizer.decode(text2[2])
print(sentence1)
print(sentence2)
print(label[2])

[CLS] what hairstyles did vikings have? [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
[CLS] how did men's hairstyles become so uniform? [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
tensor(0.)


In [None]:
# Scratch comparison of parameter counts for a few candidate layers against
# the current model.
# NOTE(review): get_n_params is defined in a later cell, so this cell raises
# NameError on a fresh Restart & Run All.
gru = nn.GRU(hidden_size, hidden_size//2, batch_first = True, num_layers = 1, bidirectional = True)
print(get_n_params(gru))
linear = nn.Linear(4 * hidden_size, hidden_size)
print(get_n_params(linear))
word_embedding = nn.Embedding(vocab_size, hidden_size, padding_idx=0)
print(get_n_params(word_embedding))
print(get_n_params(model))

74496
65664
3840896
638178


In [None]:
# Show the module tree of whatever `model` is currently bound to.
print(model)

MultiwayAttentionNetworks(
  (encoder_p): GRU(8, 32, batch_first=True, bidirectional=True)
  (encoder_q): GRU(8, 32, batch_first=True, bidirectional=True)
  (W1_c): Linear(in_features=64, out_features=64, bias=False)
  (W2_c): Linear(in_features=64, out_features=64, bias=False)
  (v_c): Linear(in_features=64, out_features=1, bias=False)
  (W_b): Linear(in_features=64, out_features=64, bias=False)
  (W_g): Linear(in_features=128, out_features=128, bias=False)
  (W1): Linear(in_features=64, out_features=64, bias=False)
  (W2): Linear(in_features=64, out_features=64, bias=False)
  (proj_inside): Linear(in_features=128, out_features=64, bias=False)
  (proj_mixed): Linear(in_features=128, out_features=64, bias=False)
  (W1_q): Linear(in_features=64, out_features=64, bias=False)
  (W2_q): Linear(in_features=64, out_features=64, bias=False)
  (W1_p): Linear(in_features=64, out_features=64, bias=False)
  (W2_p): Linear(in_features=64, out_features=64, bias=False)
  (v): Linear(in_features=64, 

In [None]:
def get_n_params(model):
    """Return the total number of scalar parameters of ``model``.

    Args:
        model: Any object exposing ``parameters()`` that yields tensors
            (e.g. an ``nn.Module``).

    Returns:
        Sum of the element counts of all parameters; 0 when there are none.
    """
    # numel() is the product of a tensor's dimensions, which is what the
    # previous hand-written loop computed (while shadowing the `nn` alias).
    return sum(p.numel() for p in model.parameters())

# Parameter count of the current model; the two figures below are the author's
# earlier measurements, kept for comparison.
print(get_n_params(model))
#previous: 1725824
#after: 194560

638178


In [None]:
# Clear any gradients already accumulated on the model's parameters before
# training starts.
model.zero_grad()
from tqdm.notebook import tqdm

In [None]:
# Release cached, unused GPU memory held by PyTorch's allocator, then print the
# GPU status.
torch.cuda.empty_cache()
!nvidia-smi

Wed Dec 29 11:32:40 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   39C    P0    34W / 250W |   1257MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Training / validation loop.
#
# Each epoch: one pass over train_dataloader with gradient clipping, one
# scheduler step, one no-grad pass over dev_dataloader, then the loss/accuracy
# history and a model checkpoint are written to disk.

# NOTE: the optimizer was already built with its own lr; re-assigning this
# variable does not change the optimizer.
learning_rate = 0.001

# Set start_epoch > 1 to resume from the checkpoint of epoch start_epoch - 1.
start_epoch, last_epoch = 1, 15
history_dir = 'model'
# NOTE(review): checkpoints go to the "multiway attention" project folder, not
# to the BiMPM working directory - confirm this is the intended location.
checkpoint_dir = '/content/drive/MyDrive/Colab Notebooks/multiway attention/model'
os.makedirs(history_dir, exist_ok=True)
os.makedirs(checkpoint_dir, exist_ok=True)

if start_epoch > 1:
    # Resume: restore the metric history and the weights of the previous epoch.
    train_losses = np.load(f'{history_dir}/train_loss.npy').tolist()
    dev_losses = np.load(f'{history_dir}/dev_loss.npy').tolist()
    dev_correct = np.load(f'{history_dir}/dev_correct.npy').tolist()
    model.load_state_dict(torch.load(f'{checkpoint_dir}/model_epoch{start_epoch - 1}.pt', map_location=device))
else:
    train_losses, dev_losses, dev_correct = [], [], []

for epoch in range(start_epoch, last_epoch + 1):
    model.train()
    # Running totals replace np.mean over an ever-growing list of 0-d tensors,
    # which made the progress-bar update quadratic in the number of examples.
    train_loss_sum, train_steps, train_hits, train_seen = 0.0, 0, 0, 0
    with tqdm(total = len(train_dataloader), desc = f"Train {epoch}") as pbar:
        for step_id, train_batch in enumerate(train_dataloader):
            text1, text2, label = (tensor.to(device) for tensor in train_batch)
            label = label.type(torch.long)  # CrossEntropyLoss needs class indices
            pred = model(text1, text2)
            loss = criterion(pred, label)

            loss.backward()
            clip_grad_norm_(model.parameters(), max_norm= 1.0)
            optimizer.step()
            optimizer.zero_grad()

            batch_loss = loss.item()
            train_loss_sum += batch_loss
            train_steps += 1
            train_hits += int((pred.argmax(dim=1) == label).sum())
            train_seen += label.size(0)

            pbar.update(1)
            pbar.set_postfix_str(f"Loss: {batch_loss:.3f} ({train_loss_sum / train_steps:.3f})   ACC: {train_hits / train_seen:.3f}")
    # The learning-rate schedule advances once per epoch.
    scheduler.step()

    train_loss = train_loss_sum / max(1, train_steps)
    # The criterion is cross-entropy, so the value is reported as "loss".
    print(f"[TRAIN] EP: {epoch} loss: {train_loss:.4f}")

    model.eval()
    val_loss_sum, val_steps, val_hits, val_seen = 0.0, 0, 0, 0
    with torch.no_grad(), tqdm(total = len(dev_dataloader), desc = f"Valid {epoch}") as pbar:
        for val_batch in dev_dataloader:
            text1, text2, label = (tensor.to(device) for tensor in val_batch)
            label = label.type(torch.long)
            pred = model(text1, text2)
            loss = criterion(pred, label)

            val_loss_sum += loss.item()
            val_steps += 1
            val_hits += int((pred.argmax(dim=1) == label).sum())
            val_seen += label.size(0)

            pbar.update(1)
            pbar.set_postfix_str(f"Acc: ({val_hits / val_seen:.3f})")

    # Variable names are kept for later cells; eval_mse holds the mean
    # cross-entropy over validation batches.
    eval_mse = val_loss_sum / max(1, val_steps)
    eval_acc = val_hits / max(1, val_seen)
    print(f"[VAL] EP:{epoch} loss: {eval_mse:.4f} acc: {eval_acc:.4f}")

    # Persist the full history after every epoch so a crash loses nothing.
    train_losses.append(train_loss)
    dev_losses.append(eval_mse)
    dev_correct.append(eval_acc)
    np.save(f'{history_dir}/train_loss.npy', np.asarray(train_losses))
    np.save(f'{history_dir}/dev_loss.npy', np.asarray(dev_losses))
    np.save(f'{history_dir}/dev_correct.npy', np.asarray(dev_correct))

    #save_model
    torch.save(model.state_dict(), f'{checkpoint_dir}/model_epoch{epoch}.pt')
    #clear gpu cache
    torch.cuda.empty_cache()

Train 1:   0%|          | 0/6065 [00:00<?, ?it/s]

RuntimeError: ignored

## Inference / Submission

학습된 모델을 이용해서 test-set에 대한 예측 값을 추론하고 submission 에 필요한 결과물을 생성합니다.
저장된 submission 파일은 내부적으로 보유하고 있는 test 셋의 성능을 측정하는 용도로 사용됩니다.

In [None]:
# Predict a class (index of the larger score) for every test pair, in
# test_dataloader order.
# Set eval mode explicitly so this cell does not depend on the state the
# training cell happened to leave the model in.
model.eval()
test_preds = []
# The progress label no longer reads the `epoch` variable leaked by the
# training cell, so the cell also works when only a checkpoint was loaded.
with torch.no_grad(), tqdm(total = len(test_dataloader), desc = "Test") as pbar:
    for test_batch in test_dataloader:
        # The last element of a batch is the placeholder label; skip it.
        text1, text2 = (tensor.to(device) for tensor in test_batch[:-1])
        pred = model(text1, text2)
        test_preds.extend(pred.argmax(dim = 1).tolist())

        pbar.update(1)

Valid 69:   0%|          | 0/1250 [00:00<?, ?it/s]

In [None]:
# Reload saved metric histories for inspection.
# NOTE(review): these are read from 'no_aggre/', not from the 'model/' folder
# the training cell writes to - confirm which run is meant.
train_losses = np.load('no_aggre/train_loss.npy')
dev_losses = np.load('no_aggre/dev_loss.npy')
dev_correct = np.load('no_aggre/dev_correct.npy')

In [None]:
# Training loss, validation loss and validation accuracy histories, one array
# per line group.
print(train_losses, dev_losses, dev_correct, sep="\n")

[0.66550725 0.63837952 0.62381835 0.61178183 0.59588806 0.57841013
 0.56218144 0.5525658  0.54052579 0.52787299 0.5157729  0.50355515
 0.48858688 0.47209139 0.45713906 0.43809829 0.42128532 0.40088578
 0.38069204 0.35853255]
[0.60805831 0.54608613 0.52381911 0.53294223 0.51046906 0.47753168
 0.46794153 0.5229     0.47811883 0.46345225 0.46592411 0.4691898
 0.47234011 0.46911839 0.49132976 0.51131137 0.51394913 0.55096054
 0.59312383 0.57084112]
[0.60805831 0.54608613 0.52381911 0.53294223 0.51046906 0.47753168
 0.46794153 0.5229     0.47811883 0.46345225 0.46592411 0.4691898
 0.47234011 0.46911839 0.49132976 0.51131137 0.51394913 0.55096054
 0.59312383 0.768125  ]


In [None]:
# Write one "id,label" row per test example, pairing examples and predictions
# by position.
# zip() would silently drop rows if inference stopped early, producing an
# incomplete submission, so fail loudly instead.
if len(test_preds) != len(test_examples):
    raise ValueError(
        f"expected {len(test_examples)} predictions, got {len(test_preds)}"
    )
os.makedirs("submission", exist_ok=True)
with open("submission/torch-submission_multiway.csv", "w") as f:
    f.write("id,label\n")
    for features, pred in zip(test_examples, test_preds):
        # features[0] is the example id.
        f.write(f"{features[0]},{pred}\n")

In [None]:
!pip install jamo

Collecting jamo
  Downloading jamo-0.4.1-py3-none-any.whl (9.5 kB)
Installing collected packages: jamo
Successfully installed jamo-0.4.1


In [None]:
# NOTE(review): SeqtoSeq is not defined anywhere in the visible notebook and
# `symbols` is imported but not used in this cell - presumably leftovers from a
# different project; verify before running.
# This also rebinds `model`, replacing the trained STS model.
from utils_copy import symbols
model = SeqtoSeq()
print(get_n_params(model))

532130
