In [16]:
import re

import pandas as pd
import torch
from konlpy.tag import Mecab
from torch import nn
from torch.utils.data import DataLoader
from transformers import BertConfig, BertModel, BertTokenizer

In [17]:
class KoBERTDataset(torch.utils.data.Dataset):
    """Dataset of (topic, rubric, response) texts paired with a score.

    Every sample is combined into one text, cleaned, and tokenized once at
    construction time. Token sequences are padded and truncated to
    ``max_length`` so that all samples share one length, and ``__getitem__``
    returns tensors, which lets the default ``DataLoader`` collation stack
    samples into batch tensors.

    Args:
        topics: Iterable of topic texts, one per sample.
        rubrics: Iterable of rubric texts, one per sample.
        responses: Iterable of response texts, one per sample.
        scores: Iterable of regression targets, one per sample (a number or a
            sequence of numbers).
        tokenizer: Callable returning a mapping with ``"input_ids"`` and
            ``"attention_mask"``. It is called as ``tokenizer(text,
            max_length=..., truncation=True, padding="max_length")``.
        max_length: Token length every sample is padded/truncated to.
    """

    # Morphemes filtered out as stopwords before tokenization. Kept as a
    # class-level frozenset so it is built once and lookups are O(1).
    _STOPWORDS = frozenset(
        ['은', '는', '이', '가', '의', '을', '를', '으로', '에서', '에', '한', '하는']
    )

    def __init__(self, topics, rubrics, responses, scores, tokenizer, max_length):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.mecab = Mecab()
        self.data = self._preprocess_data(topics, rubrics, responses, scores)

    def _preprocess_data(self, topics, rubrics, responses, scores):
        """Build the list of ``(input_ids, attention_mask, score)`` samples."""
        data = []
        for topic, rubric, response, score in zip(topics, rubrics, responses, scores):
            text = self._combine_text(topic, rubric, response)
            text = self._preprocess_text(text)
            inputs = self.tokenizer(
                text,
                max_length=self.max_length,
                truncation=True,
                # Without padding the samples have different lengths and
                # cannot be stacked into a batch tensor.
                padding="max_length",
            )
            data.append((inputs["input_ids"], inputs["attention_mask"], score))
        return data

    def _combine_text(self, topic, rubrics, response):
        """Format topic, rubric and response into a single prompt-like text."""
        text = f"**주제:** {topic}\n\n"
        text += f"**평가항목:** {rubrics} ({response})\n\n"
        return text

    def _preprocess_text(self, text):
        """Drop stopword morphemes, then normalize digits, Latin letters and punctuation."""
        text = ' '.join(
            word for word in self.mecab.morphs(text) if word not in self._STOPWORDS
        )
        text = re.sub(r'\d+', '0', text)  # Collapse every digit run to "0"
        text = re.sub(r'[a-zA-Z]+', 'a', text)  # Collapse every Latin-letter run to "a"
        text = re.sub(r'[^\w\s]', ' ', text)  # Replace punctuation/symbols with a space
        return text

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, score)`` as tensors.

        ``input_ids`` and ``attention_mask`` are ``torch.long``; the score is
        ``torch.float`` so it matches the dtype of a float regression output.
        """
        input_ids, attention_mask, score = self.data[idx]
        return (
            torch.tensor(input_ids, dtype=torch.long),
            torch.tensor(attention_mask, dtype=torch.long),
            torch.tensor(score, dtype=torch.float),
        )

In [13]:
import re
import torch
from konlpy.tag import Mecab
from transformers import BertTokenizer

class KoBERTDataset(torch.utils.data.Dataset):
    """Multi-rubric scoring dataset with scores normalized by each rubric's maximum.

    Every sample is combined into one text (topic plus one line per
    rubric/response pair), cleaned, and tokenized once at construction time.
    Token sequences are padded and truncated to ``max_length`` so that all
    samples share one length, and ``__getitem__`` returns tensors, which lets
    the default ``DataLoader`` collation stack samples into batch tensors.

    Args:
        topics: Iterable of topic texts, one per sample.
        rubrics: Iterable of per-sample rubric sequences.
        responses: Iterable of per-sample response sequences, paired
            element-wise with the rubrics of the same sample.
        scores: Iterable of per-sample score sequences, one score per rubric.
        rubric_max_scores: Sequence of divisors, one per rubric; each score is
            divided by the entry at the same position.
        tokenizer: Callable returning a mapping with ``"input_ids"`` and
            ``"attention_mask"``. It is called as ``tokenizer(text,
            max_length=..., truncation=True, padding="max_length")``.
        max_length: Token length every sample is padded/truncated to.

    Raises:
        ValueError: If a sample's score sequence and ``rubric_max_scores``
            have different lengths.
    """

    # Morphemes filtered out as stopwords before tokenization. Kept as a
    # class-level frozenset so it is built once and lookups are O(1).
    _STOPWORDS = frozenset(
        ['은', '는', '이', '가', '의', '을', '를', '으로', '에서', '에', '한', '하는']
    )

    def __init__(self, topics, rubrics, responses, scores, rubric_max_scores, tokenizer, max_length):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.mecab = Mecab()
        self.rubric_max_scores = rubric_max_scores
        self.data = self._preprocess_data(topics, rubrics, responses, scores)

    def _preprocess_data(self, topics, rubrics, responses, scores):
        """Build the list of ``(input_ids, attention_mask, normalized_scores)`` samples."""
        max_scores = list(self.rubric_max_scores)
        data = []
        for topic, rubric, response, score in zip(topics, rubrics, responses, scores):
            text = self._combine_text(topic, rubric, response)
            text = self._preprocess_text(text)
            inputs = self.tokenizer(
                text,
                max_length=self.max_length,
                truncation=True,
                # Without padding the samples have different lengths and
                # cannot be stacked into a batch tensor.
                padding="max_length",
            )
            score = list(score)
            # zip() would silently drop the extra entries; fail loudly instead
            # so a label/rubric mismatch is not discovered later as a shape error.
            if len(score) != len(max_scores):
                raise ValueError(
                    f"Expected {len(max_scores)} scores per sample, got {len(score)}"
                )
            normalized_scores = [s / m for s, m in zip(score, max_scores)]
            data.append((inputs["input_ids"], inputs["attention_mask"], normalized_scores))
        return data

    def _combine_text(self, topic, rubrics, response):
        """Format the topic followed by one line per (rubric, response) pair."""
        text = f"**주제:** {topic}\n\n"
        for r, s in zip(rubrics, response):
            text += f"**평가항목:** {r} ({s})\n\n"
        return text

    def _preprocess_text(self, text):
        """Drop stopword morphemes, then normalize digits, Latin letters and punctuation."""
        text = ' '.join(
            word for word in self.mecab.morphs(text) if word not in self._STOPWORDS
        )
        text = re.sub(r'\d+', '0', text)  # Collapse every digit run to "0"
        text = re.sub(r'[a-zA-Z]+', 'a', text)  # Collapse every Latin-letter run to "a"
        text = re.sub(r'[^\w\s]', ' ', text)  # Replace punctuation/symbols with a space
        return text

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, normalized_scores)`` as tensors.

        ``input_ids`` and ``attention_mask`` are ``torch.long``; the scores are
        ``torch.float`` so they match the dtype of a float regression output.
        """
        input_ids, attention_mask, normalized_scores = self.data[idx]
        return (
            torch.tensor(input_ids, dtype=torch.long),
            torch.tensor(attention_mask, dtype=torch.long),
            torch.tensor(normalized_scores, dtype=torch.float),
        )

In [15]:
class KoBertRegressorModel(nn.Module):
    """`monologg/kobert` encoder topped with dropout and a linear regression head.

    Args:
        num_labels: Number of regression outputs produced per input sequence.
        config: Optional BERT configuration; when omitted it is loaded from
            the `monologg/kobert` checkpoint.
    """

    def __init__(self, num_labels, config=None):
        super().__init__()
        bert_config = (
            config if config is not None
            else BertConfig.from_pretrained('monologg/kobert')
        )
        self.bert = BertModel.from_pretrained('monologg/kobert', config=bert_config)
        self.dropout = nn.Dropout(0.2)
        self.regressor = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        """Encode the inputs and map the pooled output to ``num_labels`` values."""
        encoded = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        # Dropout is applied to the pooled representation before the head.
        return self.regressor(self.dropout(encoded.pooler_output))

def train_model(model, train_dataloader, val_dataloader, optimizer, criterion, num_epochs, device):
    """Train ``model`` and report the mean validation loss after every epoch.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        train_dataloader: Iterable of ``(input_ids, attention_mask, labels)``
            tensor batches used for optimization.
        val_dataloader: Iterable of batches in the same layout, used only for
            computing the validation loss.
        optimizer: Optimizer updating ``model``'s parameters.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        num_epochs: Number of passes over ``train_dataloader``.
        device: Device every batch tensor is moved to.

    Returns:
        list[float]: Mean validation loss per epoch, averaged over batches.
        An epoch whose validation iterable yields no batch records ``nan``
        instead of raising ``ZeroDivisionError``.
    """
    val_history = []
    for epoch in range(num_epochs):
        model.train()
        for batch in train_dataloader:
            input_ids, attention_mask, labels = [data.to(device) for data in batch]
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        model.eval()
        loss_total = 0.0
        batch_count = 0
        with torch.no_grad():
            for batch in val_dataloader:
                input_ids, attention_mask, labels = [data.to(device) for data in batch]
                outputs = model(input_ids, attention_mask)
                loss_total += criterion(outputs, labels).item()
                batch_count += 1

        # Count batches while iterating so an empty validation set cannot
        # trigger a division by zero.
        val_loss = loss_total / batch_count if batch_count else float("nan")
        val_history.append(val_loss)
        print(f"Epoch {epoch+1}/{num_epochs}, Validation Loss: {val_loss:.4f}")

    return val_history

def evaluate_model(model, dataloader, device, num_rubrics, rubric_max_scores=None):
    """Evaluate a rubric-score regressor with exact-match accuracy and MSE.

    The model output is treated as one regression value per rubric (not as
    class logits). A prediction counts as correct for a rubric when it rounds
    to the same value as the label.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        dataloader: Iterable of ``(input_ids, attention_mask, labels)`` tensor
            batches.
        device: Device every batch tensor is moved to.
        num_rubrics: Number of leading output columns to report per-rubric
            accuracy for.
        rubric_max_scores: Optional sequence with one factor per output
            column. When given, predictions and labels are multiplied by it
            before rounding, so values that were divided by the maximum score
            are compared on the original score scale. When ``None`` the
            values are rounded as they are.

    Returns:
        tuple: ``(accuracy, rubric_scores, mse_loss)`` where ``accuracy`` is
        the fraction of samples whose every output matches, ``rubric_scores``
        is a list with the matching fraction of each rubric column, and
        ``mse_loss`` is the mean of the per-batch MSE values as a float.
        All three are ``nan`` when ``dataloader`` yields no batch.
    """
    model.eval()
    criterion = nn.MSELoss()
    scale = None
    if rubric_max_scores is not None:
        scale = torch.as_tensor(rubric_max_scores, dtype=torch.float, device=device)

    match_chunks = []
    batch_losses = []
    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_mask, labels = [data.to(device) for data in batch]
            outputs = model(input_ids, attention_mask)
            # A 1-D label batch would broadcast against (batch, 1) outputs.
            if labels.dim() == 1:
                labels = labels.unsqueeze(1)
            labels = labels.to(outputs.dtype)
            batch_losses.append(criterion(outputs, labels).item())

            predicted, expected = outputs, labels
            if scale is not None:
                predicted, expected = predicted * scale, expected * scale
            # Rounding absorbs float noise such as 2.9999 vs 3.0.
            match_chunks.append((torch.round(predicted) == torch.round(expected)).cpu())

    if not match_chunks:
        nan = float("nan")
        return nan, [nan] * num_rubrics, nan

    matches = torch.cat(match_chunks, dim=0)
    accuracy = matches.all(dim=1).float().mean().item()
    rubric_scores = [
        matches[:, rubric_idx].float().mean().item() for rubric_idx in range(num_rubrics)
    ]
    mse_loss = sum(batch_losses) / len(batch_losses)

    return accuracy, rubric_scores, mse_loss

def load_data(file_path, num_rubrics=3):
    """Load topics, rubrics, responses, scores and rubric maxima from a table.

    The table must contain a ``topic`` column and, for every ``i`` in
    ``1..num_rubrics``, the columns ``rubric{i}``, ``response{i}``,
    ``score{i}`` and ``rubric{i}_max_score``.

    Args:
        file_path: Path to the data file. Paths ending in ``.xlsx``/``.xls``
            are read with ``pd.read_excel``; anything else is read as
            tab-separated text.
        num_rubrics: Number of rubric column groups to read (default 3).

    Returns:
        tuple: ``(topics, rubrics, responses, scores, rubric_max_scores)``.
        ``topics`` is a list with one entry per row; ``rubrics``,
        ``responses`` and ``scores`` are lists of per-row tuples of length
        ``num_rubrics``; ``rubric_max_scores`` holds the first row's value of
        each ``rubric{i}_max_score`` column.

    Raises:
        ValueError: If the file contains no data rows.
        KeyError: If an expected column is missing.
    """
    if str(file_path).lower().endswith(('.xlsx', '.xls')):
        # read_csv cannot parse Excel workbooks.
        data = pd.read_excel(file_path)
    else:
        data = pd.read_csv(file_path, sep='\t')

    if data.empty:
        raise ValueError(f"No data rows found in {file_path!r}")

    rubric_ids = range(1, num_rubrics + 1)

    def _rows(column_template):
        # Transpose the per-rubric columns into one tuple per data row.
        return list(zip(*(data[column_template.format(i)].tolist() for i in rubric_ids)))

    topics = data['topic'].tolist()
    rubrics = _rows('rubric{}')
    responses = _rows('response{}')
    scores = _rows('score{}')
    # Only the first row is consulted for each rubric's maximum score.
    rubric_max_scores = [data[f'rubric{i}_max_score'].tolist()[0] for i in rubric_ids]
    return topics, rubrics, responses, scores, rubric_max_scores

In [None]:
# Load the scoring data into notebook-level variables.
# NOTE(review): the original comment described this as a CSV file path, yet
# the file name ends in .xlsx — confirm the actual format of this file.
data_file = 'scoring_data.xlsx'  # path to the data file
topics, rubrics, responses, scores, rubric_max_scores = load_data(data_file)

In [None]:
def main(data_file='data.tsv', seed=42):
    """Train the KoBERT rubric-score regressor and evaluate it on held-out data.

    The data loaded from ``data_file`` is split 80/10/10 into train,
    validation and test subsets. The test subset replaces the ``[...]``
    placeholder lists the function previously built its test dataset from.

    Args:
        data_file: Path passed to ``load_data``.
        seed: Seed for ``torch.manual_seed`` so that the split and the
            initialization of the regression head are reproducible.

    Raises:
        ValueError: If the dataset is too small for every split to be
            non-empty.
    """
    topics, rubrics, responses, scores, rubric_max_scores = load_data(data_file)

    num_epochs = 10
    batch_size = 16
    learning_rate = 2e-5
    max_length = 512
    num_labels = len(rubric_max_scores)  # one regression output per rubric
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    torch.manual_seed(seed)

    tokenizer = BertTokenizer.from_pretrained('monologg/kobert')
    dataset = KoBERTDataset(
        topics, rubrics, responses, scores, rubric_max_scores, tokenizer,
        max_length=max_length,
    )

    total_size = len(dataset)
    train_size = int(0.8 * total_size)
    val_size = int(0.1 * total_size)
    test_size = total_size - train_size - val_size
    if min(train_size, val_size, test_size) < 1:
        raise ValueError(
            f"Dataset of {total_size} samples is too small for an 80/10/10 split"
        )
    train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(
        dataset, [train_size, val_size, test_size]
    )

    # Fully qualified torch names keep this function independent of which
    # torch submodules the notebook happens to have imported.
    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size)
    test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size)

    model = KoBertRegressorModel(num_labels)
    model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    criterion = torch.nn.MSELoss()

    train_model(model, train_dataloader, val_dataloader, optimizer, criterion, num_epochs, device)

    accuracy, rubric_scores, mse_loss = evaluate_model(model, test_dataloader, device, num_labels)
    print(f"Test Accuracy: {accuracy:.4f}")
    print(f"Rubric Scores: {rubric_scores}")
    print(f"MSE Loss: {mse_loss:.4f}")

if __name__ == "__main__":
    main()