<a href="https://colab.research.google.com/github/jaeryong77/Actto/blob/master/%5BKU%5DBERT%EC%98%88%EC%8B%9C%EC%BD%94%EB%93%9C_HJR%EC%97%B0%EC%8A%B5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Preparation**

- Edit > Notebook settings > Hardward accelerators > GPU > SAVE
- Download the Friends dataset in EmotionLines website:
http://doraemon.iis.sinica.edu.tw/emotionlines/download.html
- Download the unlabeled json file.

# **Tutorials**

In [12]:
from google.colab import drive

drive.mount('/content/gdrive')

Mounted at /content/gdrive


##### **Settings**

In [1]:
!pip install transformers --quiet # package installer for python

[K     |████████████████████████████████| 1.3MB 4.1MB/s 
[K     |████████████████████████████████| 1.1MB 15.9MB/s 
[K     |████████████████████████████████| 2.9MB 20.8MB/s 
[K     |████████████████████████████████| 890kB 39.2MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


In [2]:
import torch
from transformers import BertModel, BertTokenizer

In [3]:
pretrained_weights = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
model = BertModel.from_pretrained(pretrained_weights)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




##### **Tokenization**

In [4]:
sentence = 'All the classes are provided.'

In [5]:
tokens = tokenizer.tokenize(sentence)
tokens = ['[CLS]'] + tokens + ['[SEP]']
print(tokens)

['[CLS]', 'all', 'the', 'classes', 'are', 'provided', '.', '[SEP]']


In [6]:
ids = [tokenizer.convert_tokens_to_ids(tokens)]
print(ids)

[[101, 2035, 1996, 4280, 2024, 3024, 1012, 102]]


In [7]:
input_tensor = torch.tensor(ids)
print(input_tensor.data)

tensor([[ 101, 2035, 1996, 4280, 2024, 3024, 1012,  102]])


##### **Model**

In [8]:
hidden_tensor = model(input_tensor)[0]
print(hidden_tensor.size())

torch.Size([1, 8, 768])


In [9]:
logit = torch.nn.Linear(768, 2)(hidden_tensor)
print(logit.size())
print(logit.data)

torch.Size([1, 8, 2])
tensor([[[ 0.3832, -0.2933],
         [ 0.5125, -0.0720],
         [-0.0985,  0.2200],
         [ 0.1374,  0.3125],
         [ 0.2897, -0.1343],
         [-0.0047, -0.1139],
         [ 0.3663, -0.1305],
         [ 0.2206, -0.0971]]])


In [10]:
prediction = torch.nn.Softmax(dim=-1)(logit)
print(prediction.size())
print(prediction.data)

torch.Size([1, 8, 2])
tensor([[[0.6629, 0.3371],
         [0.6421, 0.3579],
         [0.4210, 0.5790],
         [0.4563, 0.5437],
         [0.6044, 0.3956],
         [0.5273, 0.4727],
         [0.6217, 0.3783],
         [0.5788, 0.4212]]])


# **Emotion Recognition**

##### **Dataset**

In [13]:
import json

data = {'train': {'speaker': [], 'utterance': [], 'emotion': []},
        'dev': {'speaker': [], 'utterance': [], 'emotion': []},
        'test': {'speaker': [], 'utterance': [], 'emotion': []}}

for dtype in ['train', 'dev', 'test']:
  for dialog in json.loads(open('/content/gdrive/My Drive/Colab Notebooks/friend_eng/friends_' + dtype + '.json').read()):
    for line in dialog:
      data[dtype]['speaker'].append(line['speaker'])
      data[dtype]['utterance'].append(line['utterance'])
      data[dtype]['emotion'].append(line['emotion'])

In [14]:
e2i_dict = dict((emo, i) for i, emo in enumerate(set(data['train']['emotion'])))
i2e_dict = {i: e for e, i in e2i_dict.items()}

##### **Model**

In [15]:
import torch.nn as nn
from transformers import BertModel, BertTokenizer

class Model(nn.Module):
  def __init__(self):
    super().__init__()
    self.bert_tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
    self.bert_model = BertModel.from_pretrained(pretrained_weights)
    self.linear = torch.nn.Linear(768, len(e2i_dict))

  def forward(self, utterance):
    tokens = self.bert_tokenizer.tokenize(utterance)
    tokens = ['[CLS]'] + tokens + ['[SEP]'] # (len)
    ids = [tokenizer.convert_tokens_to_ids(tokens)] # (bat=1, len)
    input_tensor = torch.tensor(ids).cuda()

    hidden_tensor = self.bert_model(input_tensor)[0] # (bat, len, hid)
    hidden_tensor = hidden_tensor[:, 0, :] # (bat, hid)
    logit = self.linear(hidden_tensor)
    return logit

##### **Evaluation Metrics**

In [16]:
from sklearn.metrics import precision_score, recall_score, f1_score

def evaluate(true_list, pred_list):
  precision = precision_score(true_list, pred_list, average=None)
  recall = recall_score(true_list, pred_list, average=None)
  micro_f1 = f1_score(true_list, pred_list, average='micro')
  print('precision:\t', ['%.4f' % v for v in precision])
  print('recall:\t\t', ['%.4f' % v for v in recall])
  print('micro_f1: %.6f' % micro_f1)

##### **Hyper-parameters**

In [17]:
pretrained_weights = 'bert-base-uncased'
learning_rate = 1e-5
n_epoch = 3

##### **Training**

In [18]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
import torch
from tqdm import tqdm_notebook

model = Model()
model.cuda()
criterion = torch.nn.CrossEntropyLoss() # LogSoftmax & NLLLoss
optimizer = torch.optim.Adam(model.parameters(), learning_rate)

for i_epoch in range(n_epoch):
  print('i_epoch:', i_epoch)

  model.train()
  for i_batch in tqdm_notebook(range(len(data['train']['utterance']))):
    logit = model(data['train']['utterance'][i_batch])
    target = torch.tensor([e2i_dict[data['train']['emotion'][i_batch]]]).cuda()
    loss = criterion(logit, target)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
  
  model.eval()
  pred_list, true_list = [], []
  for i_batch in tqdm_notebook(range(len(data['dev']['utterance']))):
    logit = model(data['dev']['utterance'][i_batch])
    _, max_idx = torch.max(logit, dim=-1)
    pred_list += max_idx.tolist()
    true_list += [e2i_dict[data['dev']['emotion'][i_batch]]]
  evaluate(pred_list, true_list) # print results

i_epoch: 0


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  from ipykernel import kernelapp as app


HBox(children=(FloatProgress(value=0.0, max=10561.0), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=1178.0), HTML(value='')))


precision:	 ['0.3412', '0.5695', '0.0000', '0.2419', '0.1304', '0.8961', '0.5366', '0.1262']
recall:		 ['0.2959', '0.5931', '0.0000', '0.5000', '0.6000', '0.6331', '0.6168', '0.2755']
micro_f1: 0.565365
i_epoch: 1


  _warn_prf(average, modifier, msg_start, len(result))


HBox(children=(FloatProgress(value=0.0, max=10561.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1178.0), HTML(value='')))


precision:	 ['0.3882', '0.5828', '0.0345', '0.4194', '0.3043', '0.8411', '0.6667', '0.2290']
recall:		 ['0.4648', '0.5986', '0.5000', '0.4815', '0.3043', '0.6976', '0.5655', '0.3403']
micro_f1: 0.593379
i_epoch: 2


HBox(children=(FloatProgress(value=0.0, max=10561.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1178.0), HTML(value='')))


precision:	 ['0.3059', '0.6623', '0.0345', '0.2097', '0.2174', '0.8208', '0.5691', '0.2804']
recall:		 ['0.4906', '0.5319', '1.0000', '0.5200', '0.4167', '0.6819', '0.5691', '0.3243']
micro_f1: 0.575552


##### **Labeling**


In [20]:
from collections import OrderedDict

labeled = []
dialogs = json.loads(open('/content/gdrive/My Drive/Colab Notebooks/friend_eng/unlabeled.json').read())
for dialog in tqdm_notebook(dialogs):
  dialog_list = []
  for line in dialog:
    logit = model(line['utterance'])
    _, max_idx = torch.max(logit, dim=-1)
    pred_emotion = max_idx.tolist()[0]

    line_dict = OrderedDict()
    line_dict['speaker'] = line['speaker']
    line_dict['utterance'] = line['utterance']
    line_dict['emotion'] = i2e_dict[pred_emotion]
    dialog_list.append(line_dict)
  labeled.append(dialog_list)

with open('labeled.json', 'w') as json_file:
  json.dump(labeled, json_file, indent='\t')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """


HBox(children=(FloatProgress(value=0.0, max=240.0), HTML(value='')))




##### **Proposal**

- There is a class imbalance problem. (Use weighted cross-entropy etc.)

- Our model takes a single sentence. (Make it grasp its context as well.)

- Our model does not consider speaker information. (Make it consider the info.)

- Batch size is set as 1. (Increase the batch size.)