# **모델 구현 (ing)**

In [1]:
!pip install mxnet
!pip install gluonnlp pandas tqdm
!pip install sentencepiece
!pip install transformers==3.0.2
!pip install torch
!pip install git+https://git@github.com/SKTBrain/KoBERT.git@master
!pip install 'git+https://github.com/SKTBrain/KoBERT.git#egg=kobert_tokenizer&subdirectory=kobert_hf'
!pip install torchmetrics

Collecting transformers==3.0.2
  Using cached transformers-3.0.2-py3-none-any.whl (769 kB)
Collecting tokenizers==0.8.1.rc1
  Using cached tokenizers-0.8.1rc1-cp37-cp37m-manylinux1_x86_64.whl (3.0 MB)
Installing collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.12.1
    Uninstalling tokenizers-0.12.1:
      Successfully uninstalled tokenizers-0.12.1
  Attempting uninstall: transformers
    Found existing installation: transformers 4.18.0
    Uninstalling transformers-4.18.0:
      Successfully uninstalled transformers-4.18.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
kobert 0.2.3 requires transformers>=4.8.1, but you have transformers 3.0.2 which is incompatible.[0m
Successfully installed tokenizers-0.8.1rc1 transformers-3.0.2
Collecting git+https://****@github.com/SKTBrain/K

In [2]:
import torch
import time
import os
import pandas as pd
import numpy as np
import torch.optim as optim
import torch.nn.functional as F
import gluonnlp as nlp
import matplotlib.pyplot as plt
from tqdm import tqdm
from kobert_tokenizer import KoBERTTokenizer
from transformers import BertTokenizer, BertModel
from kobert import get_pytorch_kobert_model
from sklearn.metrics import f1_score

In [3]:
# Mount Google Drive and make the project folder the working directory
from google.colab import drive

drive.mount('/content/gdrive')

directory = "주분"
# Joining onto the trailing-slash root yields the same path string as plain
# concatenation.
path = os.path.join("/content/gdrive/My Drive/", directory)
os.chdir(path)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


## **1. 데이터로더 준비**

In [4]:
# Load the KoBERT tokenizer and the pretrained KoBERT model/vocabulary

KOBERT_CHECKPOINT = 'skt/kobert-base-v1'

tokenizer = KoBERTTokenizer.from_pretrained(KOBERT_CHECKPOINT)
# NOTE(review): `model` is rebound later when the regressor is instantiated;
# confirm whether this pretrained handle (and `vocab`) is still needed.
model, vocab = get_pytorch_kobert_model()

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLNetTokenizer'. 
The class this function is called from is 'KoBERTTokenizer'.


using cached model. /content/gdrive/My Drive/주분/.cache/kobert_v1.zip
using cached model. /content/gdrive/MyDrive/주분/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


In [5]:
# Custom Dataset 불러오기

from dataloader import *

In [6]:
# Build the custom Dataset object (train set)

dataset_train = KEMDset(
    file='train_aft_aug_kobert.pickle',
    tokenizer=tokenizer,
    balance=False,
    shuffle=True,
)

100%|██████████| 10043/10043 [00:09<00:00, 1108.49it/s]


In [7]:
# Wrap the train set in a DataLoader (fixed order, incomplete last batch dropped)
# NOTE(review): shuffle=False means batches are identical every epoch; presumably
# KEMDset(shuffle=True) shuffles once at construction -- confirm in dataloader.py.

dataloader_train = torch.utils.data.DataLoader(
    dataset=dataset_train,
    batch_size=64,
    shuffle=False,
    drop_last=True,
)

In [8]:
# Build the custom Dataset object (validation set)

dataset_valid = KEMDset(
    file='valid_tokenized.pickle',
    tokenizer=tokenizer,
    balance=False,
    shuffle=True,
)

100%|██████████| 1018/1018 [00:00<00:00, 1153.41it/s]


In [9]:
# Wrap the validation set in a DataLoader.
# drop_last must be False here: with drop_last=True every sample in the final
# incomplete batch is silently excluded from the validation loss on each pass.

dataloader_valid = torch.utils.data.DataLoader(dataset_valid, batch_size=64,
                                               shuffle=False, drop_last=False)

## **2. 모델**

In [10]:
# Prefer the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [11]:
# 모델 정의해놓은 모듈 불러오기

from text_model import *
from EarlyStopping import *

In [12]:
# Re-import text_model so that edits made to text_model.py after the first
# import are picked up without restarting the kernel.
import importlib
import text_model
importlib.reload(text_model)
# Star-import again so the notebook-level names are rebound from the reloaded module.
from text_model import *

In [13]:
# Instantiate the regressor, then move it to the selected device

model = TextRegressor(n_layers=5)
model = model.to(device)

using cached model. /content/gdrive/MyDrive/주분/.cache/kobert_v1.zip
using cached model. /content/gdrive/MyDrive/주분/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


## **3. 학습**

In [14]:
# Hyperparameters: Adam optimizer with an exponentially decaying learning rate

learning_rate = 2e-5
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)
scheduler = optim.lr_scheduler.ExponentialLR(optimizer=optimizer, gamma=0.8)

In [None]:
# Model training: one train pass and one validation pass per epoch, tracking
# the mean per-batch RMSE of each, with early stopping on the validation value.

epochs = 30
train_losses = []
valid_losses = []

# Build the criterion once instead of on every batch; RMSE is sqrt(MSE).
# Referenced through `torch.nn` so the cell does not depend on a star import.
loss_fn = torch.nn.MSELoss(reduction='mean')


def _batch_rmse(batch):
    """Run one forward pass on ``batch`` and return its RMSE as a 0-d tensor.

    Reads the 'input_ids', 'token_type_ids', 'attention_mask' and 'valence'
    entries of ``batch``, moves them to ``device`` and compares the first
    output of ``model`` against the valence targets.
    """
    input_ids = batch['input_ids'].to(device=device, dtype=torch.int32)
    token_type_ids = batch['token_type_ids'].to(device=device, dtype=torch.int32)
    attention_mask = batch['attention_mask'].to(device=device, dtype=torch.int32)
    # unsqueeze(1) adds a trailing axis to the targets
    # (presumably to match the shape of the model scores -- TODO confirm).
    targets = batch['valence'].to(device=device, dtype=torch.float32).unsqueeze(1)

    # The model returns a (scores, features) pair; only the scores feed the loss.
    scores, _ = model(input_ids, token_type_ids, attention_mask)
    return torch.sqrt(loss_fn(scores, targets))


start_time = time.time()
es = EarlyStopping(patience=8, path='text_valence.pt')

for epoch in range(epochs):

    #########
    # Train #
    #########
    model.train()
    train_loss = 0.0

    for batch in dataloader_train:
        loss = _batch_rmse(batch)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # .item() yields a plain float, so the running total does not keep
        # autograd history alive across iterations.
        train_loss += loss.item()

    # Decay the learning rate once per epoch.
    scheduler.step()

    ##############
    # Validation #
    ##############
    model.eval()
    valid_loss = 0.0

    with torch.no_grad():
        for batch in dataloader_valid:
            valid_loss += _batch_rmse(batch).item()

    #######
    # Log #
    #######
    # Mean of the per-batch RMSE values; max(..., 1) avoids dividing by zero
    # when a loader yields no batches.
    train_losses.append(train_loss / max(len(dataloader_train), 1))
    valid_losses.append(valid_loss / max(len(dataloader_valid), 1))

    elapsed_time = time.time() - start_time

    print(f"[{time.strftime('%H:%M:%S', time.gmtime(elapsed_time))}] Epoch {epoch+1:2d} \
    >>> Train RMSE Loss: {train_losses[-1]:6.4f} \
    >>> Valid RMSE Loss: {valid_losses[-1]:6.4f} ")

    # Feed the latest validation loss to the early-stopping tracker.
    es(valid_losses[-1], model)

    if es.early_stop:
        print('Early Stopping Activated!')
        break

# NOTE(review): EarlyStopping receives path='text_valence.pt', so it presumably
# writes the best checkpoint itself -- confirm in EarlyStopping.py. Uncomment to
# save the final weights instead:
# torch.save(model.state_dict(), 'text_valence.pt')

In [None]:
# Visualization: train vs. validation RMSE per epoch

fig, ax = plt.subplots(figsize=(6, 5))

for curve in (train_losses, valid_losses):
    ax.plot(curve)

ax.set_title('RMSE Loss')
ax.legend(['Train', 'Valid'])
ax.grid(True)

fig.tight_layout()
plt.show()

In [17]:
import csv

# Persist both loss curves: first row = train, second row = valid,
# one column per epoch.
with open('[Text]n_layers=5.csv', mode='w', newline='') as csv_file:
    csv.writer(csv_file).writerows([train_losses, valid_losses])