# In [None]:
# This script was originally written in Korean. Comments and string literals have been translated into English for wider accessibility.

import torch
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from kobert_transformers import get_tokenizer
from transformers import BertForSequenceClassification

# Location of the cleaned input CSV
input_filename = "03_cleaned_dc_bitcoin_data.csv"
input_file_path = f"../data/{input_filename}"

# Destination CSV for the inference results
output_filename = "05_kobert_baseline_test_output_1k.csv"
output_file_path = f"../data/{output_filename}"

# 1. Set up the tokenizer, the model and the compute device
MODEL_NAME = "monologg/kobert"
NUM_LABELS = 3  # three classes assumed (positive, negative, neutral)

# Prefer the GPU when one is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

kobert_tokenizer = get_tokenizer()

# NOTE(review): if this checkpoint ships without a fine-tuned classification
# head, the head is freshly initialised here and the predictions are only an
# untrained baseline -- confirm before interpreting the output.
model = BertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS,
)
model.to(device)
model.eval()  # inference mode: disables dropout

# 2. Define Test Dataset
class TestDataset(Dataset):
    """Map-style dataset that tokenizes raw texts on access for inference.

    Args:
        texts: Indexable collection of inputs; every item is converted with
            ``str()`` before it is tokenized.
        tokenizer: Object exposing an ``encode_plus`` method in the
            Hugging Face tokenizer style.
        max_length: Length each sequence is padded or truncated to.
    """

    def __init__(self, texts, tokenizer, max_length=128):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize item ``idx`` and return its model inputs.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` tensors, each
            with the tokenizer's leading batch axis removed.
        """
        encoding = self.tokenizer.encode_plus(
            str(self.texts[idx]),
            return_tensors='pt',
            max_length=self.max_length,
            padding='max_length',
            truncation=True
        )
        # Drop only the leading batch axis added by return_tensors='pt'.
        # A bare squeeze() would also collapse the sequence axis whenever
        # max_length == 1, leaving a 0-d tensor that cannot be batched into
        # the (batch, seq) layout the model call expects.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
        }

# 3. Load the CSV and pull the texts from its 'Title' column
# The header is expected to be the English 'Title' (translated from '제목').
df = pd.read_csv(input_file_path)

# Show the available columns as a quick sanity check
print("CSV Columns:", df.columns)

# Keep only the first 1000 titles for this test run; missing values are
# replaced by empty strings.
texts = df["Title"].head(1000).fillna("").tolist()

# 4. Wrap the texts in a Dataset and batch them in file order
test_dataset = TestDataset(texts, kobert_tokenizer, max_length=128)
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=32,
    shuffle=False,
)

# 5. Perform Inference
# Runs the model over every batch with gradients disabled and collects one
# record per input row: the arg-max class ("pred_label") plus the softmax
# probability of every class ("prob_0", "prob_1", ...).
results = []
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Inference"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        probs = torch.softmax(outputs.logits, dim=1)

        # Copy each batch to the host once instead of converting the
        # probabilities element by element from a (possibly GPU) tensor.
        pred_labels = torch.argmax(probs, dim=1).cpu().tolist()
        prob_rows = probs.cpu().tolist()

        for pred_label, prob_row in zip(pred_labels, prob_rows):
            record = {"pred_label": pred_label}
            # One column per class actually produced by the model, so no
            # probability is dropped when the label count is not 3.
            for class_idx, prob in enumerate(prob_row):
                record[f"prob_{class_idx}"] = prob
            results.append(record)

# 6. Save Results
# Joins the predictions column-wise onto the rows they were computed from
# and writes the combined table to the output CSV.
results_df = pd.DataFrame(results)

# Take exactly as many source rows as there are predictions rather than
# repeating the sample limit here, so the two cannot drift apart. Resetting
# the index keeps the column-wise concat aligned by position.
source_rows = df.iloc[:len(results_df)].reset_index(drop=True)
final_df = pd.concat([source_rows, results_df], axis=1)
final_df.to_csv(output_file_path, index=False)

print(f"Inference complete. Results saved to '{output_file_path}'.")
