In [24]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
from transformers import BertTokenizer, BertModel
from torchviz import make_dot

# 데이터 전처리
class DataPreprocessor:
    """Remove stop words from whitespace-tokenized text.

    Args:
        stop_words: Iterable of words to drop. Matching is case-insensitive.
    """

    def __init__(self, stop_words):
        self.stop_words = stop_words
        # Lower-case the stop words once: preprocess() lower-cases every
        # candidate word, so a stop word containing upper-case letters could
        # otherwise never match. A frozenset also gives O(1) membership tests.
        self._stop_word_set = frozenset(word.lower() for word in stop_words)

    def preprocess(self, text):
        """Return ``text`` with stop words removed and whitespace collapsed.

        Args:
            text: The text to clean. ``None`` and NaN (for example an empty
                spreadsheet cell) yield an empty string; any other non-string
                value is converted with ``str`` first.

        Returns:
            The remaining words joined by single spaces.
        """
        # NaN is the only float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        if not isinstance(text, str):
            text = str(text)
        return ' '.join(
            word for word in text.split()
            if word.lower() not in self._stop_word_set
        )

# 데이터셋
class ScoringDataset:
    """Load an Excel scoring sheet and turn it into token ids and score targets.

    Intended call order: ``preprocess_data()``, ``tokenize_data()``, then
    ``split_data()``.

    Args:
        file_path: Path of the Excel file read with ``pandas.read_excel``.
        preprocessor: Object exposing ``preprocess(text) -> str``.
    """

    # Columns concatenated (in this order) into the model input text.
    TEXT_COLUMNS = ('주제글', '모범글', '수험자 응답글')
    # Columns used as the three regression targets.
    SCORE_COLUMNS = ('루블릭1점수', '루블릭2점수', '루블릭3점수')

    def __init__(self, file_path, preprocessor):
        self.data = pd.read_excel(file_path)
        self.preprocessor = preprocessor
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
        self.max_len = 512

    def preprocess_data(self):
        """Build ``self.X`` (list of cleaned texts) and ``self.y`` (score array).

        Also stores the cleaned text in a new ``'text'`` column of ``self.data``.
        """
        # Empty cells arrive as NaN/None; concatenating them with strings turns
        # the whole row into NaN, which then crashes the preprocessor. Treat
        # them as empty strings and coerce everything else to str.
        parts = [self.data[col].fillna('').astype(str) for col in self.TEXT_COLUMNS]
        combined = parts[0]
        for part in parts[1:]:
            combined = combined + ' ' + part
        self.data['text'] = combined.apply(self.preprocessor.preprocess)
        self.X = self.data['text'].tolist()
        self.y = self.data[list(self.SCORE_COLUMNS)].values

    def tokenize_data(self):
        """Replace ``self.X`` with a padded tensor of token ids.

        Texts are truncated to ``self.max_len`` tokens and padded to the longest
        sequence in the data set. Only ``input_ids`` are kept; the tokenizer's
        attention mask is discarded.
        """
        encoded = self.tokenizer.batch_encode_plus(
            self.X,
            max_length=self.max_len,
            padding=True,
            truncation=True,
            return_tensors='pt'
        )
        self.X = encoded['input_ids']

    def split_data(self, test_size=0.2, val_size=0.2):
        """Split into train / validation / test sets with a fixed seed.

        Args:
            test_size: Fraction of all samples held out for testing.
            val_size: Fraction of the *remaining* (non-test) samples held out
                for validation, so the defaults give a 64/16/20 split.

        Returns:
            ``(X_train, X_val, X_test, y_train, y_val, y_test)`` where the
            ``y_*`` values are ``float32`` tensors.
        """
        X_rest, X_test, y_rest, y_test = train_test_split(
            self.X, self.y, test_size=test_size, random_state=42
        )
        X_train, X_val, y_train, y_val = train_test_split(
            X_rest, y_rest, test_size=val_size, random_state=42
        )
        return (
            X_train,
            X_val,
            X_test,
            torch.tensor(y_train, dtype=torch.float32),
            torch.tensor(y_val, dtype=torch.float32),
            torch.tensor(y_test, dtype=torch.float32),
        )

# 어텐션 레이어
class AttentionLayer(nn.Module):
    """Single-head scaled dot-product self-attention.

    Args:
        hidden_dim: Size of the last dimension of the input; also the size of
            the query, key and value projections.
    """

    def __init__(self, hidden_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        # Projections are created in query, key, value order so that seeded
        # parameter initialization is reproducible.
        self.query = nn.Linear(hidden_dim, hidden_dim)
        self.key = nn.Linear(hidden_dim, hidden_dim)
        self.value = nn.Linear(hidden_dim, hidden_dim)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        """Attend every position of ``x`` to every other position.

        Args:
            x: Tensor whose last dimension is ``hidden_dim`` and whose
                second-to-last dimension indexes sequence positions.

        Returns:
            ``(context_vector, attention_weights)``: the attention-weighted
            values and the softmax-normalized score matrix.
        """
        q = self.query(x)
        k = self.key(x)
        v = self.value(x)
        # Scale by sqrt(hidden_dim) before the softmax.
        scale = np.sqrt(self.hidden_dim)
        weights = self.softmax((q @ k.transpose(-2, -1)) / scale)
        return weights @ v, weights

# 채점 모델
class ScoringModel(nn.Module):
    """BERT encoder followed by three attention heads, one per rubric score.

    Args:
        hidden_dim: Width of the shared hidden layer that sits between each
            attention head and its scalar output layer.
    """

    def __init__(self, hidden_dim):
        super(ScoringModel, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-multilingual-cased')
        self.attention1 = AttentionLayer(self.bert.config.hidden_size)
        self.attention2 = AttentionLayer(self.bert.config.hidden_size)
        self.attention3 = AttentionLayer(self.bert.config.hidden_size)
        self.hidden = nn.Linear(self.bert.config.hidden_size, hidden_dim)
        self.output1 = nn.Linear(hidden_dim, 1)
        self.output2 = nn.Linear(hidden_dim, 1)
        self.output3 = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        """Score a batch of token-id sequences.

        Args:
            x: Integer tensor of token ids, either ``(batch, seq_len)`` or a
                single 1-D sequence (a batch dimension is added).

        Returns:
            ``(output1, output2, output3, attention_weights1,
            attention_weights2, attention_weights3)``. Each output comes from a
            ``Linear(hidden_dim, 1)`` layer applied to the first sequence
            position of the corresponding attention head's context.
        """
        # Add the batch dimension first: x.size(1) does not exist on 1-D input.
        if x.dim() == 1:
            x = x.unsqueeze(0)

        # Truncate to the longest sequence the encoder's position embeddings
        # support (512 for the checkpoint loaded above).
        max_len = self.bert.config.max_position_embeddings
        if x.size(1) > max_len:
            x = x[:, :max_len]

        # Callers pass padded id tensors without a mask, so rebuild it from the
        # pad token id; otherwise BERT attends to padding positions.
        # NOTE(review): the three AttentionLayer heads below still attend over
        # padded positions; masking them needs a mask argument on that layer.
        pad_id = self.bert.config.pad_token_id
        attention_mask = None if pad_id is None else (x != pad_id).long()

        outputs = self.bert(x, attention_mask=attention_mask)
        last_hidden_state = outputs[0]

        heads = (
            (self.attention1, self.output1),
            (self.attention2, self.output2),
            (self.attention3, self.output3),
        )
        scores = []
        weights = []
        for attention, output_layer in heads:
            context_vector, attention_weights = attention(last_hidden_state)
            # Only the first position's context feeds the regression head.
            hidden = torch.relu(self.hidden(context_vector[:, 0, :]))
            scores.append(output_layer(hidden))
            weights.append(attention_weights)

        return scores[0], scores[1], scores[2], weights[0], weights[1], weights[2]

# 학습
class Trainer:
    """Mini-batch training loop for a three-output scoring model.

    Args:
        model: Module whose forward pass returns at least three score tensors
            of shape ``(batch, 1)`` as its first three outputs.
        train_data: ``(X, y)`` pair; ``y`` has one column per rubric.
        val_data: ``(X, y)`` pair used for validation after every epoch.
        epochs: Number of passes over the training data.
        batch_size: Number of samples per forward pass (training and validation).
        learning_rate: Learning rate for the Adam optimizer.
    """

    def __init__(self, model, train_data, val_data, epochs, batch_size, learning_rate):
        self.model = model
        self.train_data = train_data
        self.val_data = val_data
        self.epochs = epochs
        self.batch_size = batch_size
        self.criterion = nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=learning_rate)
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model.to(self.device)

    def _rubric_loss(self, batch_X, batch_y):
        """Return the sum of the per-rubric MSE losses for one batch."""
        outputs = self.model(batch_X)[:3]
        return sum(
            self.criterion(output, batch_y[:, k].unsqueeze(1))
            for k, output in enumerate(outputs)
        )

    def _mean_loss(self, data):
        """Return the sample-weighted mean loss over ``data`` without gradients.

        Runs in mini-batches so the whole set never has to fit on the device
        at once. Returns NaN when ``data`` is empty.
        """
        inputs, targets = data
        n_samples = len(inputs)
        if n_samples == 0:
            return float('nan')

        total = 0.0
        self.model.eval()
        with torch.no_grad():
            for start in range(0, n_samples, self.batch_size):
                batch_X = inputs[start:start + self.batch_size].to(self.device)
                batch_y = targets[start:start + self.batch_size].to(self.device)
                # Weight by batch length so a short final batch does not skew
                # the mean; the result equals the loss over the full set.
                total += self._rubric_loss(batch_X, batch_y).item() * len(batch_X)
        return total / n_samples

    def train(self):
        """Train for ``self.epochs`` epochs.

        Returns:
            ``(train_losses, val_losses)``: one mean loss per epoch each.

        Raises:
            ValueError: If the training set is empty.
        """
        train_losses = []
        val_losses = []

        n_train = len(self.train_data[0])
        if n_train == 0:
            raise ValueError("train_data is empty")

        for epoch in range(self.epochs):
            self.model.train()
            running_loss = 0.0
            for i in range(0, n_train, self.batch_size):
                batch_X = self.train_data[0][i:i + self.batch_size].to(self.device)
                batch_y = self.train_data[1][i:i + self.batch_size].to(self.device)

                self.optimizer.zero_grad()
                loss = self._rubric_loss(batch_X, batch_y)
                loss.backward()
                self.optimizer.step()
                running_loss += loss.item() * len(batch_X)

            # Sample-weighted mean. Dividing by len / batch_size, as before,
            # used a fractional batch count whenever the last batch was short.
            train_loss = running_loss / n_train
            train_losses.append(train_loss)

            val_losses.append(self._mean_loss(self.val_data))

            print(f"Epoch [{epoch+1}/{self.epochs}], Train Loss: {train_loss:.4f}, Val Loss: {val_losses[-1]:.4f}")

        return train_losses, val_losses

# 평가
class Evaluator:
    """Compute test metrics and draw attention / performance plots.

    Args:
        model: Trained model returning three scores and three attention maps.
        test_data: ``(X, y)`` pair; ``X`` holds token ids, ``y`` is a CPU
            tensor with one column per rubric.
        tokenizer: Tokenizer used to turn token ids back into strings.
        batch_size: Number of samples per forward pass during evaluation.
    """

    def __init__(self, model, test_data, tokenizer, batch_size=16):
        self.model = model
        self.test_data = test_data
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model.to(self.device)

    def evaluate(self):
        """Evaluate on the test set, print metrics and show both plots.

        Returns:
            ``(mse, mae, r2)`` computed over all three rubric columns.

        Raises:
            ValueError: If the test set is empty.
        """
        test_X = self.test_data[0]
        test_y = self.test_data[1].numpy()
        if len(test_X) == 0:
            raise ValueError("test_data is empty")

        predictions = []
        first_attention = None
        self.model.eval()
        with torch.no_grad():
            # Mini-batches keep peak memory bounded; one pass over the whole
            # set also materialized a seq_len x seq_len map for every sample.
            for start in range(0, len(test_X), self.batch_size):
                batch_X = test_X[start:start + self.batch_size].to(self.device)
                output1, output2, output3, attn1, attn2, attn3 = self.model(batch_X)
                predictions.append(torch.cat((output1, output2, output3), dim=1).cpu())
                if first_attention is None:
                    # Only sample 0 is visualized, so the first batch suffices.
                    first_attention = (attn1.cpu(), attn2.cpu(), attn3.cpu())
        y_pred = torch.cat(predictions, dim=0).numpy()

        mse = mean_squared_error(test_y, y_pred)
        mae = mean_absolute_error(test_y, y_pred)
        r2 = r2_score(test_y, y_pred)

        print(f"Test MSE: {mse:.4f}, Test MAE: {mae:.4f}, Test R2: {r2:.4f}")

        # Attention visualization
        self.visualize_attention(test_X, *first_attention)

        # Performance visualization
        self.visualize_performance(test_y, y_pred)

        return mse, mae, r2

    def visualize_attention(self, test_X, attention_weights1, attention_weights2, attention_weights3):
        """Plot the three attention maps of the first test sample as heat maps.

        Args:
            test_X: Token-id tensor; row 0 supplies the tick labels.
            attention_weights1: Attention map batch for rubric 1; entry 0 is plotted.
            attention_weights2: Same for rubric 2.
            attention_weights3: Same for rubric 3.
        """
        sample_idx = 0
        sample_input_ids = test_X[sample_idx].cpu().numpy()
        sample_tokens = self.tokenizer.convert_ids_to_tokens(sample_input_ids)

        fig, axs = plt.subplots(1, 3, figsize=(20, 6))
        all_weights = [attention_weights1, attention_weights2, attention_weights3]
        for i, (ax, attention_weights) in enumerate(zip(axs, all_weights)):
            attention = attention_weights[sample_idx].cpu().numpy()
            n_queries, n_keys = attention.shape
            ax.imshow(attention, cmap='hot', interpolation='nearest', aspect='auto')
            ax.set_xticks(range(n_keys))
            ax.set_xticklabels(sample_tokens[:n_keys], rotation=45, ha='right')
            # Rows are query token positions, not layers, so label them with
            # the tokens as well.
            ax.set_yticks(range(n_queries))
            ax.set_yticklabels(sample_tokens[:n_queries])
            ax.set_xlabel('Key token')
            ax.set_ylabel('Query token')
            ax.set_title(f'Attention Weights - Rubric {i+1}')
        plt.tight_layout()
        plt.show()

    def visualize_performance(self, test_y, y_pred):
        """Scatter true against predicted scores, one panel per rubric.

        Args:
            test_y: Array of true scores, one column per rubric.
            y_pred: Array of predicted scores with the same layout.
        """
        fig, axs = plt.subplots(1, 3, figsize=(20, 6))
        for i, ax in enumerate(axs):
            true_scores = test_y[:, i]
            pred_scores = y_pred[:, i]
            ax.scatter(true_scores, pred_scores, alpha=0.5)
            # Draw the y = x reference line across the actual data range
            # instead of assuming scores lie in [0, 1].
            low = min(true_scores.min(), pred_scores.min())
            high = max(true_scores.max(), pred_scores.max())
            ax.plot([low, high], [low, high], 'r--')
            ax.set_xlabel('True Score')
            ax.set_ylabel('Predicted Score')
            ax.set_title(f'Rubric {i+1}')
        plt.tight_layout()
        plt.show()

# 예측
class Predictor:
    """Score new (prompt, exemplar, response) triples with a trained model.

    Args:
        model: Trained model returning three score tensors as its first outputs.
        tokenizer: Tokenizer providing ``batch_encode_plus``.
    """

    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model.to(self.device)

    def predict(self, prompts, exemplars, responses, batch_size=16):
        """Predict the three rubric scores for each input triple.

        Args:
            prompts: Prompt texts.
            exemplars: Exemplar-answer texts, aligned with ``prompts``.
            responses: Test-taker response texts, aligned with ``prompts``.
            batch_size: Number of samples per forward pass.

        Returns:
            NumPy array with one row per triple and one column per rubric;
            shape ``(0, 3)`` when the inputs are empty.

        Raises:
            ValueError: If the three inputs differ in length.
        """
        prompts = list(prompts)
        exemplars = list(exemplars)
        responses = list(responses)
        # zip() would silently drop the unmatched tail and return fewer scores
        # than the caller supplied inputs.
        if not (len(prompts) == len(exemplars) == len(responses)):
            raise ValueError(
                "prompts, exemplars and responses must have the same length "
                f"(got {len(prompts)}, {len(exemplars)}, {len(responses)})"
            )
        if not prompts:
            return np.empty((0, 3), dtype=np.float32)

        input_texts = [
            f"{prompt} {exemplar} {response}"
            for prompt, exemplar, response in zip(prompts, exemplars, responses)
        ]
        # Tokenize everything at once so padding length matches a single-call
        # encode; only the forward pass is split into mini-batches.
        input_ids = self.tokenizer.batch_encode_plus(
            input_texts,
            max_length=512,
            padding=True,
            truncation=True,
            return_tensors='pt'
        )['input_ids']

        batch_scores = []
        self.model.eval()
        with torch.no_grad():
            for start in range(0, len(input_ids), batch_size):
                batch_ids = input_ids[start:start + batch_size].to(self.device)
                output1, output2, output3, _, _, _ = self.model(batch_ids)
                batch_scores.append(torch.cat((output1, output2, output3), dim=1).cpu())

        return torch.cat(batch_scores, dim=0).numpy()

In [23]:


def main():
    """Run the pipeline: load data, train, evaluate and score sample inputs."""
    # Load and preprocess the data
    file_path = 'scoring_data.xlsx'
    stop_words = ['은', '는', '이', '가', '을', '를', '에', '의', '과', '도', '으로', '만', '겠다', '습니다', '니다', '하다']
    preprocessor = DataPreprocessor(stop_words)
    dataset = ScoringDataset(file_path, preprocessor)
    dataset.preprocess_data()
    dataset.tokenize_data()
    X_train, X_val, X_test, y_train, y_val, y_test = dataset.split_data()

    # Build the model
    hidden_dim = 64
    model = ScoringModel(hidden_dim)

    # Visualize the model structure
    # Use random but valid token ids. torch.randn(...).long() truncates
    # normally distributed floats and therefore yields negative ids, which are
    # not valid embedding indices.
    input_data = torch.randint(0, model.bert.config.vocab_size, (1, 512))
    output1, _, _, _, _, _ = model(input_data)
    graph = make_dot(output1, params=dict(model.named_parameters()))
    graph.render("model_graph", format="png")
    # Drop the dummy forward pass so its autograd graph can be freed before
    # training starts.
    del output1, graph

    # Train
    epochs = 5
    batch_size = 16
    learning_rate = 2e-5
    trainer = Trainer(model, (X_train, y_train), (X_val, y_val), epochs, batch_size, learning_rate)
    train_losses, val_losses = trainer.train()

    # Plot the training history
    plt.figure(figsize=(10, 5))
    plt.plot(train_losses, label='Train Loss')
    plt.plot(val_losses, label='Validation Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.title('Training and Validation Loss')
    plt.show()

    # Evaluate
    evaluator = Evaluator(model, (X_test, y_test), dataset.tokenizer)
    evaluator.evaluate()

    # Predict
    prompts = ['새로운 주제글 1', '새로운 주제글 2']
    exemplars = ['새로운 모범글 1', '새로운 모범글 2']
    responses = ['새로운 응답글 1', '새로운 응답글 2']
    predictor = Predictor(model, dataset.tokenizer)
    # Apply the same stop-word removal the training texts went through, so
    # the model sees prediction inputs in the form it was trained on.
    scores = predictor.predict(
        [preprocessor.preprocess(text) for text in prompts],
        [preprocessor.preprocess(text) for text in exemplars],
        [preprocessor.preprocess(text) for text in responses],
    )

    print("\n예측 결과:")
    for prompt, exemplar, response, score in zip(prompts, exemplars, responses, scores):
        print(f"주제글: {prompt}")
        print(f"모범글: {exemplar}")
        print(f"응답글: {response}")
        print(f"예측 점수: 루브릭1 - {score[0]:.2f}, 루브릭2 - {score[1]:.2f}, 루브릭3 - {score[2]:.2f}")
        print()

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 MiB (GPU 0; 7.78 GiB total capacity; 352.00 MiB already allocated; 3.06 MiB free; 352.00 MiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF