In [1]:
import os
import math
import torch
import pathlib
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from transformers import AutoTokenizer
from torch.utils.data import Dataset, DataLoader

In [2]:
def read_data(current_path, data_type, track, language):
    data_path = current_path / 'data' / data_type / track / f'{language}.csv'
    return pd.read_csv(data_path)

In [3]:
def get_labels(data):
    labels = data.columns[2:].tolist()
    labels_to_ids = {label:id for id, label in enumerate(labels)}
    ids_to_labels = {id:label for label, id in labels_to_ids.items()}
    return labels, labels_to_ids, ids_to_labels

In [17]:
current_path = pathlib.Path().resolve()
train_eng = read_data(current_path,'train','track_a','eng')
dev_eng = read_data(current_path,'dev','track_a','eng')


num_of_dummy_data = 100
split_index = len(train_eng) - num_of_dummy_data
train_eng, dev_eng = train_eng.iloc[:split_index], train_eng.iloc[split_index:]

In [18]:
train_eng.head()

Unnamed: 0,id,text,Anger,Fear,Joy,Sadness,Surprise
0,eng_train_track_a_00001,But not very happy.,0,0,1,1,0
1,eng_train_track_a_00002,Well she's not gon na last the whole song like...,0,0,1,0,0
2,eng_train_track_a_00003,She sat at her Papa's recliner sofa only to mo...,0,0,0,0,0
3,eng_train_track_a_00004,"Yes, the Oklahoma city bombing.",1,1,0,1,1
4,eng_train_track_a_00005,They were dancing to Bolero.,0,0,1,0,0


In [19]:
dev_eng.head()

Unnamed: 0,id,text,Anger,Fear,Joy,Sadness,Surprise
2668,eng_train_track_a_02669,The dream ended with me riding my friendly pte...,0,0,1,0,1
2669,eng_train_track_a_02670,Every boarding school has some odd tradition o...,0,0,0,0,1
2670,eng_train_track_a_02671,"People are starting to chant ""Fight, fight fig...",1,1,0,0,1
2671,eng_train_track_a_02672,He didn't swerve.,0,1,0,0,0
2672,eng_train_track_a_02673,I felt a slight heat creep over my cheeks and ...,0,0,1,0,0


In [16]:
labels, labels_to_ids, ids_to_labels = get_labels(dev_eng)
print(f'Labels: {labels}')
print(f'Labels to ids: {labels_to_ids}')
print(f'Ids to labels: {ids_to_labels}')

# math.isnan(dev_eng['Anger'].iloc[0])

Labels: ['Anger', 'Fear', 'Joy', 'Sadness', 'Surprise']
Labels to ids: {'Anger': 0, 'Fear': 1, 'Joy': 2, 'Sadness': 3, 'Surprise': 4}
Ids to labels: {0: 'Anger', 1: 'Fear', 2: 'Joy', 3: 'Sadness', 4: 'Surprise'}


In [20]:
plm = 'google-bert/bert-base-uncased'
max_token = 50
tokenizer = AutoTokenizer.from_pretrained(plm)

In [21]:
class RawData(Dataset):

    def __init__(self, data, labels, tokenizer, max_token):
        self.data = data
        self.labels = labels
        self.max_token = max_token
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        id = self.data['id'].iloc[idx]
        text = self.data['text'].iloc[idx]

        tokenizer_output = self.tokenizer(text, padding='max_length',
                                        truncation=True,max_length=self.max_token, return_tensors='pt')
        
        input_ids = tokenizer_output.input_ids[0]
        attention_mask = tokenizer_output.attention_mask[0]

        labels = torch.Tensor(self.data[self.labels].iloc[idx].values)

        data = {
            'id': id,
            'text': text,
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels,
        }
        return data

In [None]:
trainset_raw = RawData(train_eng, labels, tokenizer, max_token)
devset_raw = RawData(dev_eng, labels, tokenizer, max_token)

trainset_dataloader = DataLoader(trainset_raw, batch_size=64, shuffle=True)
devset_dataloader = DataLoader(devset_raw, batch_size=32, shuffle=False)

In [23]:
for batch_index, data in enumerate(devset_dataloader):
    if batch_index == 1: break
    print(data['input_ids'])

tensor([[ 101, 1996, 3959,  ..., 1012,  102,    0],
        [ 101, 2296, 9405,  ...,    0,    0,    0],
        [ 101, 2111, 2024,  ...,    0,    0,    0],
        ...,
        [ 101, 1045, 2018,  ...,    0,    0,    0],
        [ 101, 1045, 2272,  ...,    0,    0,    0],
        [ 101, 2057, 2018,  ...,    0,    0,    0]])


In [15]:
trainset_dataloader[0]

TypeError: 'DataLoader' object is not subscriptable

In [None]:
test_data[1]

{'id': 'eng_train_track_a_00002',
 'text': "Well she's not gon na last the whole song like that, so since I'm behind her and the audience can't see below my torso pretty much, I use my hand to push down on the lid and support her weight.",
 'labels': [0, 0, 1, 0, 0]}

In [None]:
train_eng[labels].iloc[:2].values

array([[0, 0, 1, 1, 0],
       [0, 0, 1, 0, 0]], dtype=int64)

In [None]:
labels

['Anger', 'Fear', 'Joy', 'Sadness', 'Surprise']

In [None]:
y_true

NameError: name 'y_true' is not defined

In [None]:
from torch import nn, optim

In [None]:
import copy

In [None]:
model_1 = nn.Linear(20, 5) # predict logits for 5 classes
model_2 = copy.deepcopy(model_1)
x = torch.randn(1, 20)
y = torch.tensor([[1., 0., 1., 0., 0.]]) # get classA and classC as active

In [None]:
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.SGD(model_1.parameters(), lr=1e-1)

for epoch in range(20):
    optimizer.zero_grad()
    output = model_1(x)
    loss = criterion(output, y)
    loss.backward()
    optimizer.step()
    print('Loss: {:.3f}'.format(loss.item()))

Loss: 0.827
Loss: 0.693
Loss: 0.585
Loss: 0.499
Loss: 0.431
Loss: 0.376
Loss: 0.332
Loss: 0.296
Loss: 0.267
Loss: 0.242
Loss: 0.221
Loss: 0.203
Loss: 0.188
Loss: 0.175
Loss: 0.163
Loss: 0.153
Loss: 0.144
Loss: 0.136
Loss: 0.129
Loss: 0.122


In [None]:
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.SGD(model_2.parameters(), lr=1e-1)

for epoch in range(20):
    optimizer.zero_grad()
    output = model_2(x)
    loss = criterion(output, y)
    loss.backward()
    optimizer.step()
    print('Loss: {:.3f}'.format(loss.item()))

Loss: 0.827
Loss: 0.693
Loss: 0.585
Loss: 0.499
Loss: 0.431
Loss: 0.376
Loss: 0.332
Loss: 0.296
Loss: 0.267
Loss: 0.242
Loss: 0.221
Loss: 0.203
Loss: 0.188
Loss: 0.175
Loss: 0.163
Loss: 0.153
Loss: 0.144
Loss: 0.136
Loss: 0.129
Loss: 0.122


In [None]:
input = torch.randn(1, 5, requires_grad=True)
# every element in target should have 0 <= value < C
target = torch.tensor([1])

m = nn.LogSoftmax(dim=1)
nll_loss = nn.NLLLoss()
output = nll_loss(m(input), target)
output.backward()

In [None]:
criterion(input, target)

ValueError: Target size (torch.Size([1])) must be the same as input size (torch.Size([1, 5]))

In [None]:
input

tensor([[-1.3253, -0.0722,  1.5362,  1.0491,  0.5770]], requires_grad=True)

In [None]:
target

tensor([1])

In [None]:
output

tensor([[ 0.4831,  0.6032,  0.4338, -0.6756, -0.2784]],
       grad_fn=<AddmmBackward0>)

In [None]:
y

tensor([3])

In [None]:
y

tensor([3])

In [None]:
output

tensor([[ 0.5119, -0.7870, -0.1049,  0.4185,  0.7991]],
       grad_fn=<AddmmBackward0>)

In [None]:
x

tensor([[-2.2381, -0.7510, -0.1611, -0.6983, -0.6827,  0.7043,  1.0685,  0.4725,
         -0.0206,  1.2230,  0.7853, -0.1046, -0.7214, -0.2001, -3.3045,  1.5603,
         -1.5575, -1.1850,  0.6585, -1.1367]])