# 미세조정(Fine Tuning)
- 사전 학습된 모델의 가중치를 이용하여, 새로운 문제를 해결하기 위해 최소한의 가중치를 추가해서 모델을 추가로 학습하는 방법

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive so that files
# stored there can be read through ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
import torch
from tqdm.auto import tqdm
import random # 시드 고정을 위해
import os # 시드 고정을 위해

def reset_seeds(seed):
    """Seed every random number generator this notebook relies on.

    Covers Python's ``random``, NumPy, and PyTorch (CPU and all CUDA devices),
    and switches cuDNN to deterministic behaviour so repeated runs with the
    same seed follow the same path.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # NOTE: PYTHONHASHSEED is read at interpreter start-up, so this assignment
    # only affects child processes spawned afterwards, not the running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)  # CPU generator
    # Seed every visible GPU, not just the current one; silently ignored when
    # CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    # Force deterministic cuDNN kernels ...
    torch.backends.cudnn.deterministic = True
    # ... and disable the autotuner, which may otherwise pick a different
    # convolution algorithm from run to run.
    torch.backends.cudnn.benchmark = False

# 데이터 경로를 변경하시오

In [None]:
# Directory prefix for the data files. File names are appended directly to
# this string, so the trailing slash is required.
DATA_PATH = "/content/drive/MyDrive/data/"
# Seed shared by the K-fold splitter and reset_seeds.
SEED = 42
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Bare expression: the notebook displays the selected device.
device

In [None]:
# Load imdb.csv from DATA_PATH into a DataFrame and preview its first rows.
df = pd.read_csv(f"{DATA_PATH}imdb.csv")
df.head()

In [None]:
# IPython shell escape: install the transformers package into the running environment.
!pip install transformers

# 전이학습 실습해보기

In [None]:
# Checkpoint identifier handed to the from_pretrained calls for the tokenizer and the model.
model_name = "bert-base-uncased"

In [None]:
from transformers import AutoTokenizer, AutoModel

# Load the tokenizer that belongs to the checkpoint named by model_name.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# 사전학습모델 토크나이저
- add_special_tokens
    - True: 특수 토큰 포함하겠다.
- max_length
    -  문장의 최대 길이 조절
- padding
    - max_length : max_length 인자로 지정한 길이(지정하지 않으면 모델이 입력받을 수 있는 최대 길이)까지 패딩
    - True : 배치 내에서 가장 긴 문장의 길이에 맞춰 패딩
- truncation
    - True : 문장이 최대 길이를 넘으면 자르겠다.


In [None]:
# Sanity check: tokenize the first review with special tokens, "max_length"
# padding and truncation enabled, then display the result.
token = tokenizer(df["review"][0], add_special_tokens=True,padding="max_length", truncation=True)
token

# 학습데이터와 정답 데이터 생성

In [None]:
# Model inputs: the "review" column as a NumPy array.
train = df["review"].to_numpy()
# Labels: the "sentiment" column reshaped into a single column, i.e. (n_samples, 1).
target = df["sentiment"].to_numpy().reshape(-1,1)
# Display both shapes.
train.shape , target.shape

# 데이터셋

In [None]:
class ReviewDataset(torch.utils.data.Dataset):
    """Dataset that tokenizes one review per item, optionally with its label.

    Args:
        tokenizer: Callable applied to a single text; it must return a mapping
            from field name to a sequence of token-level integers.
        x: Indexable collection of review texts.
        y: Optional indexable collection of labels aligned with ``x``. Omit it
            for inference; items then carry no ``"y"`` entry.
        max_length: Optional length to pad/truncate to. ``None`` (the default)
            forwards ``max_length=None`` to the tokenizer, which for Hugging
            Face tokenizers means the model's maximum input length.
    """

    def __init__(self, tokenizer, x, y=None, max_length=None):
        self.tokenizer = tokenizer
        self.x = x
        self.y = y
        self.max_length = max_length

    def __len__(self):
        # len() works for NumPy arrays as well as plain sequences.
        return len(self.x)

    def __getitem__(self, idx):
        """Return ``{"x": encoded fields}`` plus ``"y"`` when labels exist."""
        item = {"x": self._encode(self.x[idx])}
        if self.y is not None:
            # torch.Tensor(<int>) would allocate an *uninitialized* tensor of
            # that size, so convert explicitly. reshape(-1) gives scalar
            # labels shape (1,), matching the single-logit model output.
            item["y"] = torch.as_tensor(self.y[idx], dtype=torch.float32).reshape(-1)
        return item

    def _encode(self, text):
        """Tokenize ``text`` and convert every returned field to an int64 tensor."""
        inputs = self.tokenizer(
            text,
            add_special_tokens=True,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
        )
        # Update in place so the tokenizer's own container type is preserved.
        for key, value in list(inputs.items()):
            inputs[key] = torch.tensor(value, dtype=torch.long)
        return inputs

In [None]:
# Smoke test: wrap the full data in the dataset, pull the first batch
# (batch size 1, no shuffling) through a DataLoader and display it.
dt = ReviewDataset(tokenizer,train,target)
dl = torch.utils.data.DataLoader(dt, batch_size=1,shuffle=False) 
batch = next(iter(dl))
batch

# 사전학습모델 생성

In [None]:
# Load the pretrained model with AutoModel so its raw outputs can be inspected.
model = AutoModel.from_pretrained(model_name)

In [None]:
# Show which input fields the sample batch contains.
batch["x"].keys()

In [None]:
# Forward the sample batch through the pretrained model (fields passed as
# keyword arguments) and list the names of the returned outputs.
outputs = model(**batch["x"])
outputs.keys()

In [None]:
# Display the shape of the "last_hidden_state" output.
outputs["last_hidden_state"].shape 

In [None]:
# Display the shape of the "pooler_output" output.
outputs["pooler_output"].shape 

In [None]:
# The outputs can also be indexed by position; display the shapes of items 0 and 1.
# Presumably these match the two named outputs inspected by key - confirm by comparing shapes.
outputs[0].shape , outputs[1].shape 

# 모델

In [None]:
class Net(torch.nn.Module):
    """Pretrained encoder topped with a single-output linear layer.

    Args:
        model_name: Checkpoint identifier given to ``AutoModel.from_pretrained``.
    """

    def __init__(self, model_name):
        super().__init__()
        # Attribute names are kept as-is: they prefix the state_dict keys.
        self.model = AutoModel.from_pretrained(model_name)
        hidden_dim = self.model.config.hidden_size
        self.output_layer = torch.nn.Linear(hidden_dim, 1)

    def forward(self, x):
        """Return one raw (pre-sigmoid) score per sample.

        Args:
            x: Mapping of encoder inputs; it is unpacked into keyword arguments.
        """
        encoder_out = self.model(**x)
        # Item 1 of the encoder output feeds the head; presumably this is the
        # pooled representation ("pooler_output") for BERT - confirm before
        # swapping in a different checkpoint.
        pooled = encoder_out[1]
        return self.output_layer(pooled)

In [None]:
# Smoke test: build the classifier (rebinding `model`) and run the sample
# batch through it; the notebook displays the returned tensor.
model = Net(model_name)
model(batch["x"]) 

In [None]:
def train_loop(dataloader,model,loss_fn,optimizer,device):
    """Train ``model`` for one pass over ``dataloader``.

    Args:
        dataloader: Yields batches with an ``"x"`` entry (model inputs) and a
            ``"y"`` entry (targets); both must support ``.to(device)``.
        model: Module to optimize; it is switched to training mode.
        loss_fn: Callable ``loss_fn(prediction, target)`` returning a scalar tensor.
        optimizer: Optimizer holding the parameters of ``model``.
        device: Device the batch is moved to before the forward pass.

    Returns:
        The mean of the per-batch losses.

    Raises:
        ZeroDivisionError: If ``dataloader`` yields no batches.
    """
    model.train()
    running_loss = 0

    for step_batch in tqdm(dataloader):
        # Start every step from clean gradients.
        optimizer.zero_grad()

        inputs = step_batch["x"].to(device)
        labels = step_batch["y"].to(device)

        logits = model(inputs)
        batch_loss = loss_fn(logits, labels)

        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()

    return running_loss / len(dataloader)

In [None]:
@torch.no_grad()
def test_loop(dataloader,model,loss_fn,device):
    """Evaluate ``model`` on ``dataloader`` without tracking gradients.

    Args:
        dataloader: Yields batches with an ``"x"`` entry and, optionally, a
            ``"y"`` entry; batches without ``"y"`` add nothing to the loss.
        model: Module to evaluate; it is switched to evaluation mode.
        loss_fn: Callable ``loss_fn(prediction, target)`` returning a scalar tensor.
        device: Device the batch is moved to before the forward pass.

    Returns:
        Tuple ``(mean_loss, probabilities)``: the summed batch losses divided
        by the number of batches, and a NumPy array holding the
        sigmoid-activated predictions of all batches, concatenated in order.

    Raises:
        ZeroDivisionError: If ``dataloader`` yields no batches.
    """
    model.eval()
    total_loss = 0
    prob_chunks = []

    for step_batch in tqdm(dataloader):
        logits = model(step_batch["x"].to(device))

        # Labels are optional so the same loop also serves pure inference.
        labels = step_batch.get("y")
        if labels is not None:
            total_loss += loss_fn(logits, labels.to(device)).item()

        # Turn raw scores into probabilities and collect them on the host.
        prob_chunks.append(torch.sigmoid(logits).cpu().numpy())

    mean_loss = total_loss / len(dataloader)
    return mean_loss, np.concatenate(prob_chunks)

In [None]:
# Number of cross-validation folds.
n_splits = 5
# Upper bound on training epochs per fold (early stopping may end sooner).
epochs = 20
# Samples per mini-batch for both training and validation loaders.
batch_size = 16 
# Binary cross-entropy that takes raw logits; the sigmoid is applied inside the loss.
loss_fn = torch.nn.BCEWithLogitsLoss()

In [None]:
from sklearn.model_selection import KFold
# Shuffled K-fold splitter; random_state=SEED fixes the shuffle order.
cv = KFold(n_splits=n_splits,shuffle=True, random_state=SEED)

In [None]:
from sklearn.metrics import accuracy_score

# 학습

In [None]:
# Fine-tune with K-fold cross-validation and early stopping on validation accuracy.
is_holdout = True  # True: stop after the first fold (single hold-out split)
# Adam's default learning rate (1e-3) is far too large when every weight of a
# pretrained BERT encoder is being updated; use a value from the range
# commonly recommended for BERT fine-tuning (2e-5 to 5e-5).
learning_rate = 2e-5
max_patience = 3  # consecutive epochs without improvement before stopping
reset_seeds(SEED)
best_score_list = []
for i,(tri,vai) in enumerate(cv.split(train)):

    # Every fold starts from a fresh copy of the pretrained weights.
    model = Net(model_name).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    train_dt = ReviewDataset(tokenizer,train[tri],target[tri])
    valid_dt = ReviewDataset(tokenizer,train[vai],target[vai])
    train_dl = torch.utils.data.DataLoader(train_dt, batch_size=batch_size, shuffle=True)
    # Validation order must stay fixed so predictions line up with target[vai].
    valid_dl = torch.utils.data.DataLoader(valid_dt, batch_size=batch_size,shuffle=False)

    best_score = 0
    patience = 0  # epochs elapsed since the last improvement

    for epoch in range(epochs):

        train_loss = train_loop(train_dl, model, loss_fn,optimizer,device )
        valid_loss , pred = test_loop(valid_dl, model, loss_fn,device  )
        # Threshold the predicted probabilities at 0.5 to get class labels.
        pred = (pred > 0.5).astype(int)

        score = accuracy_score(target[vai],pred )
        patience += 1
        print(train_loss,valid_loss,score,sep="\t")
        if best_score < score:
            # New best validation accuracy: reset the counter and checkpoint.
            patience = 0
            best_score = score
            torch.save(model.state_dict(),f"model_{i}.pth")

        if patience == max_patience:
            break

    best_score_list.append(best_score)

    if is_holdout:
        break