In [1]:
import os
import numpy as np

import pandas as pd
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings(action='ignore') 

In [9]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

## Pretrained BERT 모델 및 토크나이저 로드
- 'bert-base-multilingual-cased' : 다국어 지원 모델

In [4]:
# Load the WordPiece tokenizer that matches the multilingual BERT checkpoint.
# The BERT model itself is not loaded here; JSNet loads it in its constructor.
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path='bert-base-multilingual-cased',
)

## Data Load 및 전처리

In [5]:
# Toy example: one English and one Korean sentence with binary labels,
# used only to illustrate what the tokenizer returns.
sentences = ["This is a positive sentence.", "이 문장은 긍정적인 문장입니다."]
labels = torch.tensor([1, 0])

# Convert the raw sentences into padded/truncated BERT input tensors.
inputs = tokenizer(
    sentences,
    return_tensors='pt',
    truncation=True,
    padding=True,
)

In [6]:
train = pd.read_csv('./processed_data/log_train.csv')
test = pd.read_csv('./processed_data/log_test.csv')


def _rows_to_text(frame, columns):
    """Serialise every row of `frame` as a 'col: value, col: value, ...' string.

    Args:
        frame: DataFrame holding at least the columns named in `columns`.
        columns: Column labels to include, in output order.

    Returns:
        A list with one string per row of `frame`.
    """
    # Select the feature columns *before* pulling rows out. A row Series takes a
    # common dtype across all of its columns, so leaving extra columns (such as a
    # float target) in the frame could change how feature values are rendered
    # (e.g. 3 -> 3.0) and make train text differ from test text.
    features = frame[list(columns)]
    return [
        ', '.join(f'{name}: {value}' for name, value in row.items())
        for _, row in features.iterrows()
    ]


# Tabular data -> natural-language-like text.
# Both splits are serialised over the test file's columns; presumably the test
# file lacks the target column, which keeps it out of the model input — TODO confirm.
feature_columns = test.columns

# Text -> token tensors (padded to the longest row, truncated to the tokenizer limit).
train_texts = tokenizer(
    _rows_to_text(train, feature_columns),
    padding=True, truncation=True, return_tensors='pt',
)
# float32 targets: MSELoss requires predictions and targets to share a dtype, and
# the model produces float32 while pandas/numpy floats default to float64.
labels = torch.tensor(train.log_and_filtered_price.to_numpy(), dtype=torch.float32)

test_texts = tokenizer(
    _rows_to_text(test, feature_columns),
    padding=True, truncation=True, return_tensors='pt',
)

In [33]:
train_texts

{'input_ids': tensor([[  101, 22560,   131,  ...,     0,     0,     0],
        [  101, 22560,   131,  ...,     0,     0,     0],
        [  101, 22560,   131,  ...,     0,     0,     0],
        ...,
        [  101, 22560,   131,  ...,     0,     0,     0],
        [  101, 22560,   131,  ...,     0,     0,     0],
        [  101, 22560,   131,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

## Data Loader 정의

In [7]:
# Mini-batch loader over the tokenised training text.
# Each batch unpacks as (input_ids, token_type_ids, attention_mask, labels), which is
# the 4-tuple the training cell expects.
# NOTE: this wraps every row of `train_texts`; it is not restricted to a train split.
dataset = TensorDataset(
    train_texts['input_ids'],
    train_texts['token_type_ids'],
    train_texts['attention_mask'],
    labels.float(),  # float32 targets so they match the model's float32 predictions
)
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)

## Model 정의

In [44]:
# Model definition: multilingual BERT followed by a small fully connected regression head.
class JSNet(nn.Module):
    """BERT-based regressor producing one scalar per input sequence.

    The BERT sequence-classification head is used as a projection to
    ``hidden_size * 4`` features, which are then narrowed by three linear layers
    (``fc1`` -> ``fc2`` -> ``fc3``) down to a single output value.

    Args:
        hidden_size: Base width of the regression head. The head's layer widths
            are ``4 * hidden_size -> 2 * hidden_size -> hidden_size -> 1``.
    """

    def __init__(self, hidden_size):
        super().__init__()
        # The projection width must equal fc1's input width, otherwise the first
        # linear layer fails with a shape mismatch.
        self.bert_model = BertForSequenceClassification.from_pretrained(
            'bert-base-multilingual-cased', num_labels=hidden_size * 4
        )

        self.fc1 = nn.Linear(hidden_size * 4, hidden_size * 2)
        self.fc2 = nn.Linear(hidden_size * 2, hidden_size)
        self.fc3 = nn.Linear(hidden_size, 1)
        # Without a non-linearity the stacked linear layers collapse into a single
        # linear map, so one is applied between them.
        self.activation = nn.ReLU()

    def forward(self, inputs, tokens, attentions):
        """Compute predictions for a batch.

        Args:
            inputs: Token id tensor (the tokenizer's ``input_ids``).
            tokens: Segment id tensor (the tokenizer's ``token_type_ids``).
            attentions: Padding mask tensor (the tokenizer's ``attention_mask``).

        Returns:
            Tensor of shape ``(batch, 1)`` with one prediction per sequence.
        """
        # Keyword arguments are required here: BERT's positional order is
        # (input_ids, attention_mask, token_type_ids), which differs from this
        # method's (inputs, tokens, attentions) order.
        features = self.bert_model(
            input_ids=inputs,
            token_type_ids=tokens,
            attention_mask=attentions,
        ).logits
        features = self.activation(self.fc1(features))
        features = self.activation(self.fc2(features))
        return self.fc3(features)

## Train-Test Set 정의

In [11]:
# Hold out 20% of the rows for validation. All four aligned arrays go through one
# call, so each of them is cut at exactly the same row indices.
(
    X_train_input, X_valid_input,
    X_train_token, X_valid_token,
    X_train_attention, X_valid_attention,
    y_train, y_val,
) = train_test_split(
    train_texts.input_ids,
    train_texts.token_type_ids,
    train_texts.attention_mask,
    labels,
    test_size=0.2,
    random_state=42,
)

In [32]:
# Regroup the split tensors under the same keys the tokenizer output uses.
X_train = {
    'input_ids': X_train_input,
    'token_type_ids': X_train_token,
    'attention_mask': X_train_attention,
}

X_valid = {
    'input_ids': X_valid_input,
    'token_type_ids': X_valid_token,
    'attention_mask': X_valid_attention,
}

## 모델 호출

In [45]:
# Instantiate the regressor; this loads the pretrained BERT weights.
js_net = JSNet(hidden_size=256)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 모델 변수 선언

In [46]:
# Optimiser and loss for the regression task.
# torch.optim.AdamW is used in place of the AdamW class that transformers deprecated.
# eps and weight_decay are passed explicitly to mirror that class's defaults
# (PyTorch's own defaults are eps=1e-8 and weight_decay=1e-2).
optimizer = torch.optim.AdamW(js_net.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)
criterion = torch.nn.MSELoss()

# Early-stopping bookkeeping
best_val_loss = float('inf')
patience = 30  # consecutive non-improving validation checks tolerated before stopping
counter = 0

## 모델 학습

In [47]:
# Mini-batch fine-tuning with one validation pass per epoch and early stopping.
NUM_EPOCHS = 3
BATCH_SIZE = 4
CHECKPOINT_PATH = './weights/best_model.pth'

# Batches are moved to whatever device the model currently lives on.
device = next(js_net.parameters()).device


def _make_loader(input_ids, token_type_ids, attention_mask, targets, shuffle):
    """Wrap one data split in a DataLoader yielding (ids, segments, mask, targets)."""
    split = TensorDataset(
        torch.as_tensor(input_ids),
        torch.as_tensor(token_type_ids),
        torch.as_tensor(attention_mask),
        # float32 so the targets share the dtype of the model's predictions
        torch.as_tensor(targets, dtype=torch.float32),
    )
    return DataLoader(split, batch_size=BATCH_SIZE, shuffle=shuffle)


def _run_epoch(loader, training):
    """Run one pass over `loader` and return its mean squared error.

    When `training` is True the model is put in train mode and the optimiser is
    stepped after every batch; otherwise the model is in eval mode and no
    gradients are tracked.
    """
    js_net.train(training)
    squared_error_sum, n_rows = 0.0, 0
    with torch.set_grad_enabled(training):
        for input_ids, token_type_ids, attention_mask, targets in loader:
            input_ids = input_ids.to(device)
            token_type_ids = token_type_ids.to(device)
            attention_mask = attention_mask.to(device)
            targets = targets.to(device).view(-1)

            # Flatten predictions so they line up element-wise with the targets
            # instead of being broadcast against them inside MSELoss.
            predictions = js_net(input_ids, token_type_ids, attention_mask).view(-1)
            batch_loss = criterion(predictions, targets)

            if training:
                optimizer.zero_grad()
                batch_loss.backward()
                optimizer.step()

            # criterion averages over the batch; re-weight by batch size so the
            # epoch value is a true per-row mean even with a short last batch.
            squared_error_sum += batch_loss.item() * targets.size(0)
            n_rows += targets.size(0)
    return squared_error_sum / max(n_rows, 1)


# Train only on the training split so the validation rows stay unseen.
train_loader = _make_loader(X_train_input, X_train_token, X_train_attention, y_train, shuffle=True)
valid_loader = _make_loader(X_valid_input, X_valid_token, X_valid_attention, y_val, shuffle=False)

# torch.save does not create missing directories.
os.makedirs(os.path.dirname(CHECKPOINT_PATH), exist_ok=True)

for epoch in range(NUM_EPOCHS):
    train_loss = _run_epoch(train_loader, training=True)
    val_loss = _run_epoch(valid_loader, training=False)

    # Losses are reported as RMSE; the comparison below uses the underlying MSE.
    print(f"epoch : {epoch} / Train loss : {train_loss ** 0.5} / Validation loss: {val_loss ** 0.5}")

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(js_net.state_dict(), CHECKPOINT_PATH)
        counter = 0
    else:
        counter += 1

    # Stop once validation loss has failed to improve `patience` times in a row.
    if counter >= patience:
        print("Early stopping.")
        break

KeyboardInterrupt: 