In [None]:
import torch
import pandas as pd
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from tqdm import tqdm
import matplotlib.pyplot as plt
import re

# Device selection: prefer the Apple Silicon GPU (MPS), then CUDA, and finally
# fall back to the CPU so the notebook also runs on machines without MPS.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

In [None]:
# Both the tokenizer and the 2-label classification model start from the same
# pretrained checkpoint name.
base_checkpoint = "microsoft/codebert-base"
tokenizer = RobertaTokenizer.from_pretrained(base_checkpoint)
model = RobertaForSequenceClassification.from_pretrained(base_checkpoint, num_labels=2)

# Replace the model weights with the state dict stored in codebert_finetuned1.pt,
# mapping its tensors onto the selected device while loading.
# NOTE(review): torch.load unpickles the file -- only load checkpoints from a trusted source.
finetuned_state = torch.load("codebert_finetuned1.pt", map_location=device)
model.load_state_dict(finetuned_state)

# Move the model to the device and switch it to evaluation mode.
model.to(device)
model.eval()

In [None]:
# Load the evaluation pairs; later cells read the pair_id, code1, code2 and similar columns.
test_df = pd.read_csv("codebert_validation.csv")

def clean_code(code: str) -> str:
    """Normalise a source snippet into a single whitespace-collapsed line.

    Three regex substitutions are applied in order:

    1. Remove every triple-quoted span (both double- and single-quote
       delimiters), including spans that cover several lines.
    2. Remove everything from a ``#`` to the end of its line.
    3. Collapse each run of whitespace (newlines included) into one space.

    The matching is purely textual: a ``#`` inside a string literal is
    treated as the start of a comment, and triple-quoted strings that are
    not docstrings are removed as well.

    Args:
        code: Raw source text.

    Returns:
        The cleaned text with leading and trailing whitespace stripped.
    """
    # (pattern, replacement, flags) triples, applied top to bottom.
    substitutions = (
        (r'""".*?"""|\'\'\'.*?\'\'\'', '', re.DOTALL),
        (r'#.*', '', 0),
        (r'\s+', ' ', 0),
    )
    for pattern, replacement, flags in substitutions:
        code = re.sub(pattern, replacement, code, flags=flags)
    return code.strip()

# Run clean_code over both snippets of every pair. Values are cast to str
# first, so any non-string entry is cleaned via its string representation.
for code_column in ('code1', 'code2'):
    test_df[code_column] = test_df[code_column].astype(str).map(clean_code)

In [None]:
# Classify each (code1, code2) pair one at a time and collect the predicted
# class index (argmax over the logits) for every row, in row order.
predictions = []
code_pairs = zip(test_df['code1'], test_df['code2'])
with torch.no_grad():  # inference only -- no gradients are needed
    for code1, code2 in tqdm(code_pairs, total=len(test_df)):
        # Encode the two snippets as one paired sequence, truncated to 256 tokens.
        encoded = tokenizer(
            code1,
            code2,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=256,
        )
        # Move every encoded tensor to the same device as the model.
        encoded = {name: tensor.to(device) for name, tensor in encoded.items()}
        logits = model(**encoded).logits
        predictions.append(torch.argmax(logits, dim=1).item())

In [None]:
# Pair every pair_id with its predicted label, write the table to
# submission.csv (without the index column) and preview the first rows.
submission = test_df[["pair_id"]].assign(similar=predictions)
submission.to_csv("submission.csv", index=False)
submission.head()

In [None]:
# Count predictions per class. value_counts() omits classes that never occur,
# so reindex to [0, 1] to guarantee one bar per class in a fixed order;
# otherwise a single-class result would be drawn at position 0 and receive
# the "Not Similar" tick label even when it is class 1.
label_counts = submission["similar"].value_counts().reindex([0, 1], fill_value=0)

fig, ax = plt.subplots(figsize=(6, 4))
label_counts.plot(kind="bar", color=["skyblue", "salmon"], ax=ax)
ax.set_title("Prediction Distribution")
ax.set_xlabel("Similarity Label")
ax.set_ylabel("Count")
# Bar positions are 0 and 1, matching the reindexed class order above.
ax.set_xticks([0, 1])
ax.set_xticklabels(["Not Similar", "Similar"], rotation=0)
ax.grid(axis="y")
fig.tight_layout()
fig.savefig("prediction_distribution.png")
plt.show()

In [None]:
from sklearn.metrics import accuracy_score

# Compare the ground-truth labels in the "similar" column with the predictions.
true_labels = test_df["similar"].to_list()
accuracy = accuracy_score(y_true=true_labels, y_pred=predictions)

print(f"Validation Accuracy: {accuracy:.4f}")