<a href="https://colab.research.google.com/github/hotorch/SST2_fine_tuning/blob/master/SST2_data_with_huggingface_transformer(remark_version).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Hugging Face Transformers

In [None]:
!pip install transformers

In [None]:
from transformers import BertModel, BertTokenizer # load language model & tokenizer

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim

BERT 모델과 토크나이저의 자세한 사용법은 Hugging Face 문서(https://huggingface.co/transformers/model_doc/bert.html)를 참조하세요.

In [None]:
# The encoder and the tokenizer are loaded from the same pretrained checkpoint
# name, so it is spelled out once and shared by both calls.
PRETRAINED_NAME = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_NAME)
bert = BertModel.from_pretrained(PRETRAINED_NAME)

In [None]:
# Tokenizer smoke test: run the tokenizer once on a sample sentence and
# inspect what it returns.
text = 'The BERT model was proposed in BERT'
encoded = tokenizer(text)

print(encoded)

# NOTE(review): this is len() of the returned encoding object itself, which
# presumably counts its fields rather than the tokens — confirm that this is
# the number the cell is meant to show.
len(encoded)

In [None]:
# encode -> returns a list of token ids
# To leave the special tokens out of the result, pass add_special_tokens=False to encode()
tokenizer.encode(text)

In [None]:
# decode: turn the ids produced by encode() back into text
tokenizer.decode(tokenizer.encode(text))

#### 조금 더 살펴보기

In [None]:
# Build a single-example batch: token ids -> LongTensor -> add a leading batch
# dimension with unsqueeze(0) -> move to the GPU.
input_ids = tokenizer.encode(text)
input_ids_tensor = torch.tensor(input_ids, dtype=torch.long).unsqueeze(0).cuda()
# Move the already-loaded model to the GPU instead of instantiating the same
# 'bert-base-uncased' checkpoint a second time.
bert = bert.cuda()

In [None]:
# Inference only: no_grad() avoids recording an autograd graph for this
# demonstration forward pass.
with torch.no_grad():
    embeddings = bert(input_ids_tensor)
# The first element of the model output is indexed here for its shape.
print(embeddings[0].shape) # |batch_size, # of tokens, bert dimension|

#### Load Data (SST-2)

https://nlp.stanford.edu/sentiment/index.html



In [None]:
import os
from google.colab import drive
# Mount Google Drive at /content/drive/ so files stored there can be read
# through the regular filesystem API.
drive.mount('/content/drive/')

In [None]:
# Absolute path under the mount point passed to drive.mount('/content/drive/'),
# so the lookup no longer depends on the current working directory.
# The trailing slash is kept because file names are appended with `path + ...`.
path = "/content/drive/My Drive/SST-2/"
os.listdir(path)

In [None]:
# Only a subset of the training data is used. `nrows` stops parsing after the
# first 1000 data rows instead of reading the whole file and slicing afterwards.
train = pd.read_csv(path + 'train.tsv', delimiter='\t', nrows=1000)
# The dev/test splits are not loaded in this notebook:
# dev = pd.read_csv('dev.tsv', delimiter = '\t')
# test = pd.read_csv('test.tsv', delimiter = '\t')

In [None]:
# Sentences encode to different lengths, so they have to be padded with the
# [PAD] token, and a maximum length has to be chosen.
# Only the two rows being displayed are encoded; tokenizing the whole column
# (twice) just to print two entries is unnecessary.
for row_label in (0, 1):
    print(tokenizer.encode(train['sentence'][row_label]))

#### Define Max Length & fix dimension

In [None]:
# Tokenize first: the padding width must be measured in tokens, not in
# characters of the raw string. Measuring characters over-pads every row and
# breaks (negative pad count -> ragged rows) whenever a sentence has more
# tokens than characters.
# NOTE: this overwrites train['sentence'] with lists of token ids, so the cell
# should be run once per freshly loaded DataFrame.
train['sentence'] = train['sentence'].apply(lambda x: tokenizer.encode(x))
MAX_LEN = int(train['sentence'].apply(len).max())
# Right-pad every id list with 0 (the [PAD] id used by this notebook) so that
# all rows have exactly MAX_LEN entries and stack into a 2-D array.
padded_ids = np.array([sentence + [0] * (MAX_LEN - len(sentence))
                       for sentence in train['sentence']])
print(padded_ids.shape)

In [None]:
def _as_long_tensor(values):
    """Copy `values` into a new tensor of dtype torch.long."""
    return torch.tensor(values, dtype=torch.long)


# Attention mask: 0 where the id is the padding value 0, 1 everywhere else,
# so that only the real tokens of each sentence are attended to.
attention_mask = np.where(np.array(padded_ids) == 0, 0, 1)

# Convert everything to tensors.
padded_ids_tensor = _as_long_tensor(padded_ids)
attention_mask_tensor = _as_long_tensor(attention_mask)
# Double brackets select a one-column frame, so the labels keep a trailing
# dimension of size 1.
output_tensor = _as_long_tensor(train[['label']].values)

#### Define Model

In [None]:
class SentimentClassifier(nn.Module):
    """Frozen BERT encoder followed by a trainable linear classification head.

    Args:
        bert_model: Encoder module to wrap. When omitted, the notebook-level
            `bert` object is used, which is what the original no-argument
            constructor did.
        num_labels: Number of output classes (defaults to 2, as before).

    Note:
        The encoder's parameters are frozen in place (requires_grad=False), so
        the module object passed in (or the global `bert`) is modified.
    """

    def __init__(self, bert_model=None, num_labels=2):
        super().__init__()
        # Explicit argument wins; otherwise fall back to the global encoder.
        self.bert = bert if bert_model is None else bert_model
        # Read the encoder width from its config instead of hard-coding 768.
        self.linear = nn.Linear(self.bert.config.hidden_size, num_labels)

        # Leave the BERT weights untouched; only the linear head is trained.
        for parameter in self.bert.parameters():
            parameter.requires_grad = False

    def forward(self, input_ids, attention_mask):
        """Return class logits for a batch.

        Args:
            input_ids: Token ids, indexed as |batch_size, sequence length|.
            attention_mask: Mask forwarded to the encoder as `attention_mask`.

        Returns:
            The linear head applied to the hidden state at position 0 of every
            sequence, i.e. a |batch_size, num_labels| tensor.
        """
        # First element of the encoder output:
        # |batch_size, sequence length, hidden size|
        hidden_states = self.bert(input_ids, attention_mask=attention_mask)[0]
        # Keep only the first position of each sequence: |batch_size, hidden size|
        first_token_state = hidden_states[:, 0, :]
        return self.linear(first_token_state)

#### Define DataLoader

In [None]:
from torch.utils.data import Dataset, DataLoader
# 데이터 길이, 데이터 인덱스 던져줬을 때 어떤 것을 리턴할지만 고려하기

class SST2(Dataset):
    """Index-aligned view over token ids, attention masks and labels.

    A Dataset only has to answer two questions: how many examples there are,
    and what to hand back for a given index.
    """

    def __init__(self, input_ids, attention_mask, output):
        super().__init__()
        self.input_ids, self.attention_mask, self.output = (
            input_ids, attention_mask, output)

    def __len__(self):
        """Number of examples, taken from the length of `input_ids`."""
        return len(self.input_ids)

    def __getitem__(self, index):
        """Return the (input_ids, attention_mask, output) triple at `index`."""
        columns = (self.input_ids, self.attention_mask, self.output)
        return tuple(column[index] for column in columns)

In [None]:
# Training configuration: number of passes over the training split.
EPOCHS = 100
# Cross-entropy over the class logits produced by the classifier.
loss_fn = nn.CrossEntropyLoss().cuda()
# Classifier on the GPU; plain SGD over everything net.parameters() yields.
net = SentimentClassifier().cuda()
optimizer = optim.SGD(net.parameters(), lr=0.001)

In [None]:
# The first TRAIN_SIZE rows are used for training, the remainder for validation.
TRAIN_SIZE = 700
BATCH_SIZE = 8


def _make_split(rows):
    """Build an SST2 dataset from the same row slice of all three tensors."""
    return SST2(padded_ids_tensor[rows],
                attention_mask_tensor[rows],
                output_tensor[rows])


train_dataset = _make_split(slice(None, TRAIN_SIZE))
valid_dataset = _make_split(slice(TRAIN_SIZE, None))

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [None]:
# Original author's note: takes about 25 minutes, accuracy 53%.
# (Re-measure: the accuracy formula below has been corrected.)
import tqdm  # kept so the module name stays bound as before
from tqdm.auto import tqdm as progress_bar  # replaces the deprecated tqdm_notebook

# Send each batch to wherever the model's weights live instead of
# hard-coding .cuda() on every tensor.
device = next(net.parameters()).device

for epoch in progress_bar(range(EPOCHS), desc="epoch"):
    # ---- training pass ----
    for input_ids, attention_mask, output in progress_bar(train_dataloader, desc="train"):
        # reshape(-1) flattens the labels to one entry per example; unlike a
        # bare squeeze() it keeps the batch dimension for a batch of size 1.
        labels = output.to(device).reshape(-1)
        predictions = net(input_ids.to(device), attention_mask.to(device))
        loss = loss_fn(predictions, labels)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    # ---- validation pass (no gradients needed) ----
    num_correct = 0
    with torch.no_grad():
        for input_ids, attention_mask, output in progress_bar(valid_dataloader, desc="valid"):
            labels = output.to(device).reshape(-1)
            predictions = net(input_ids.to(device), attention_mask.to(device))
            # Predicted class = index of the largest logit in each row.
            num_correct += (predictions.argmax(dim=1) == labels).sum().item()

    # Divide by the number of validation examples. The previous expression,
    # num_correct / len(valid_dataloader) * 8, divided by the number of
    # batches and then multiplied by 8, which is not a fraction of correct
    # predictions.
    accuracy = num_correct / len(valid_dataloader.dataset)

    print(accuracy)