##### **Settings**

In [1]:
!pip install transformers --quiet # package installer for python

[K     |████████████████████████████████| 675kB 2.7MB/s 
[K     |████████████████████████████████| 1.1MB 13.5MB/s 
[K     |████████████████████████████████| 3.8MB 13.8MB/s 
[K     |████████████████████████████████| 890kB 30.5MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


In [2]:
import torch
from transformers import BertModel, BertTokenizer

In [3]:
pretrained_weights = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
model = BertModel.from_pretrained(pretrained_weights)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




##### **Dataset**

In [4]:
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [5]:
import json

data = {'train': {'speaker': [], 'utterance': [], 'emotion': [], 'annotation': []},
        'dev': {'speaker': [], 'utterance': [], 'emotion': [], 'annotation': []},
        'test': {'speaker': [], 'utterance': [], 'emotion': [], 'annotation': []}}


for dtype in ['train', 'dev', 'test']:
  for dialog in json.loads(open('/content/gdrive/My Drive/test/friends_' + dtype + '.json').read()):
    for line in dialog:
        data[dtype]['speaker'].append(line['speaker'])
        data[dtype]['utterance'].append(line['utterance'])
        data[dtype]['emotion'].append(line['emotion'])
        data[dtype]['annotation'].append(line['annotation'])

In [6]:
e2i_dict = dict((emo, i) for i, emo in enumerate(set(data['train']['emotion'])))
i2e_dict = {i: e for e, i in e2i_dict.items()}
print(e2i_dict)

emo_list = ['neutral', 'joy', 'sadness', 'fear', 'anger', 'surprise', 'disgust', 'non-neutral']

{'anger': 0, 'sadness': 1, 'surprise': 2, 'joy': 3, 'neutral': 4, 'non-neutral': 5, 'fear': 6, 'disgust': 7}


##### **Model**

In [7]:
import torch.nn as nn
from transformers import BertModel, BertTokenizer

class Model(nn.Module):
  def __init__(self):
    super().__init__()
    self.bert_tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
    self.bert_model = BertModel.from_pretrained(pretrained_weights)
    self.linear = torch.nn.Linear(768, len(e2i_dict))

  def forward(self, utterance):
    tokens = self.bert_tokenizer.tokenize(utterance)
    tokens = ['[CLS]'] + tokens + ['[SEP]'] # (len)
    ids = [self.bert_tokenizer.convert_tokens_to_ids(tokens)] # (bat=1, len)
    input_tensor = torch.tensor(ids).cuda()

    hidden_tensor = self.bert_model(input_tensor)[0] # (bat, len, hid)
    hidden_tensor = hidden_tensor[:, 0, :] # (bat, hid)
    logit = self.linear(hidden_tensor)
    return logit

##### **Evaluation Metrics**

In [8]:
from sklearn.metrics import precision_score, recall_score, f1_score

def evaluate(true_list, pred_list):
  precision = precision_score(true_list, pred_list, average=None)
  recall = recall_score(true_list, pred_list, average=None)
  micro_f1 = f1_score(true_list, pred_list, average='micro')
  print('precision:\t', ['%.4f' % v for v in precision])
  print('recall:\t\t', ['%.4f' % v for v in recall])
  print('micro_f1: %.6f' % micro_f1)

##### **Hyper-parameters**

In [9]:
pretrained_weights = 'bert-base-uncased'
learning_rate = 1e-5
n_epoch = 1

##### **Training**

In [13]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
import torch
from tqdm import tqdm_notebook

model = Model()
model.cuda()

# e2i_dict changes for each execution of the pre-processing code
# e2w_dict stores weight which is used in loss calculation
# this loss is calculated by relative ratio of the data
e2w_dict = {'anger': 0.95, 'disgust': 0.97, 'fear': 0.98, 'joy': 0.87, 'neutral': 0.55, 'non-neutral': 0.8, 'sadness': 0.96, 'surprise': 0.88}

# order of e2i_dict changes.. so we should consider this when we make list 'weights'
weights = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]

for emo in emo_list:
    weights[e2i_dict[emo]] = e2w_dict[emo]

class_weights = torch.FloatTensor(weights).cuda()
criterion = nn.CrossEntropyLoss(weight=class_weights)

#criterion = torch.nn.CrossEntropyLoss() # LogSoftmax & NLLLoss
optimizer = torch.optim.Adam(model.parameters(), learning_rate)

for i_epoch in range(n_epoch):
  print('i_epoch:', i_epoch)

  model.train()
  for i_batch in tqdm_notebook(range(len(data['train']['utterance']))):
    logit = model(data['train']['utterance'][i_batch])
    target = torch.tensor([e2i_dict[data['train']['emotion'][i_batch]]]).cuda()
    loss = criterion(logit, target)
      
    if data['train']['emotion'][i_batch] != 'non-neutral':
        tmp_index = emo_list.index(data['train']['emotion'][i_batch])
        ratio = float(data['train']['annotation'][i_batch][tmp_index])
        ratio = ratio - 2
        loss = ratio * loss
           
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
  
  model.eval()
  pred_list, true_list = [], []
  for i_batch in tqdm_notebook(range(len(data['dev']['utterance']))):
    logit = model(data['dev']['utterance'][i_batch])
    _, max_idx = torch.max(logit, dim=-1)
    pred_list += max_idx.tolist()
    true_list += [e2i_dict[data['dev']['emotion'][i_batch]]]
    
  evaluate(pred_list, true_list) # print results

i_epoch: 0


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=10561.0), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=1178.0), HTML(value='')))


precision:	 ['0.4118', '0.3065', '0.5960', '0.5610', '0.9267', '0.0421', '0.0000', '0.1304']
recall:		 ['0.3241', '0.5758', '0.6207', '0.6000', '0.6091', '0.3750', '0.0000', '0.5000']
micro_f1: 0.577250


  _warn_prf(average, modifier, msg_start, len(result))


**print out F1-score which is calculated with test dataset**

In [14]:
model.eval()
pred_list, true_list = [], []
for i_batch in tqdm_notebook(range(len(data['test']['utterance']))):
    logit = model(data['test']['utterance'][i_batch])
    _, max_idx = torch.max(logit, dim=-1)
    pred_list += max_idx.tolist()
    true_list += [e2i_dict[data['test']['emotion'][i_batch]]]

evaluate(pred_list, true_list) # print results

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=0.0, max=2764.0), HTML(value='')))


precision:	 ['0.5280', '0.3176', '0.5629', '0.5987', '0.9246', '0.0573', '0.0000', '0.0735']
recall:		 ['0.3244', '0.4219', '0.5689', '0.5759', '0.6758', '0.5000', '0.0000', '0.3125']
micro_f1: 0.608177


  _warn_prf(average, modifier, msg_start, len(result))


**this is for leader board csv file**

In [15]:
import csv
 
f1 = open('/content/gdrive/My Drive/test/en_data.csv','r',encoding = 'euc-kr')
f2 = open('/content/gdrive/My Drive/test/test_output.csv', 'w',encoding = 'euc-kr', newline='')

rdr = csv.reader(f1)
wr = csv.writer(f2)
wr.writerow(['Id', 'Predicted'])
i = -1
for line in rdr:
    if i != -1:
        logit = model(line[4])
        _, max_idx = torch.max(logit, dim=-1)
        tmp_list = max_idx.tolist()
        wr.writerow([i, i2e_dict[tmp_list[0]]])
    i = i+1

f1.close()
f2.close()