In [None]:
import os
# Restrict CUDA to device index 2 and enable CUDA_LAUNCH_BLOCKING.
# Both variables are written before the cell that imports torch runs.
os.environ.update({
    'CUDA_VISIBLE_DEVICES': '2',
    'CUDA_LAUNCH_BLOCKING': "1",
})


In [2]:
import csv
import itertools
import pickle
import random
import re

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def seed_everything(seed):
    """Seed every random number source used by this notebook.

    Seeds Python's ``random``, NumPy, and torch (CPU and CUDA), exports the
    seed as ``PYTHONHASHSEED``, and switches cuDNN to deterministic mode with
    benchmarking turned off.

    Args:
        seed: integer seed handed to each generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Every seeding function takes the same single integer argument.
    for apply_seed in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


seed_everything(42)  # Fix the seed

In [4]:
# Identifier of the pretrained checkpoint, a filename-friendly tag for it,
# and the directory holding the training code files.
MODEL_NAME = "neulab/codebert-cpp"
MODEL_TAG = "neulab_codebert-cpp"
root_dir = "/home/leadawon5/decs_jupyter_lab/gitfiles/DACON/code_similarity/bigdata/train_code"


class CodeEncoder(nn.Module):
    """Thin wrapper around a pretrained transformer that yields one vector per input."""

    def __init__(self, model_name=MODEL_NAME):
        """Load the pretrained backbone identified by ``model_name``."""
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)

    def forward(self, input_ids, attention_mask):
        """Run the backbone and return its ``pooler_output``."""
        backbone_out = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        return backbone_out.pooler_output






In [6]:
class CodeComparisonModel(nn.Module):
    """Scores a pair of code snippets with a shared encoder and a linear head.

    Both snippets are embedded by the same ``CodeEncoder``; the two embeddings
    are concatenated, projected to a single value, and passed through a sigmoid.
    """

    def __init__(self, encoder_model_name):
        """Build the shared encoder and the linear head that follows it."""
        super().__init__()
        self.encoder = CodeEncoder(encoder_model_name)
        # The head consumes two concatenated embeddings, hence twice the hidden size.
        hidden_size = self.encoder.encoder.config.hidden_size
        self.fc = nn.Linear(hidden_size * 2, 1)

    def forward(self, input_ids1, attention_mask1, input_ids2, attention_mask2):
        """Return the sigmoid of the head output, with the last dimension squeezed.

        Note that the returned values are already probabilities, not logits.
        """
        first = self.encoder(input_ids1, attention_mask1)
        second = self.encoder(input_ids2, attention_mask2)
        # Join the two embeddings along the feature dimension.
        pair = torch.cat([first, second], dim=1)
        return torch.sigmoid(self.fc(pair)).squeeze(-1)




In [8]:
# Load the tokenizer published under the same name as the pretrained model (MODEL_NAME).
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)




In [9]:
# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Build the pair classifier on top of the pretrained encoder and move it to the device.
model = CodeComparisonModel(encoder_model_name=MODEL_NAME)
model.to(device)

# AdamW over all model parameters with a learning rate of 1e-5.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

Some weights of the model checkpoint at neulab/codebert-cpp were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at neulab/codebert-cpp and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0 epoch is running!: 1it [00:16, 16.46s/it]

Checkpoint saved to ./model/neulab_codebert-cpp_checkpoint_epoch_0_step_0.pth


0 epoch is running!: 3278it [54:50,  1.06s/it]

In [None]:
def load_checkpoint(model, optimizer, filepath):
    """Restore model and optimizer state from a checkpoint file, if it exists.

    Args:
        model: module whose ``load_state_dict`` receives
            ``checkpoint['model_state_dict']``.
        optimizer: optimizer whose ``load_state_dict`` receives
            ``checkpoint['optimizer_state_dict']``.
        filepath: path of the checkpoint file.

    Returns:
        The value stored under ``checkpoint['epoch']``, or ``None`` when
        ``filepath`` does not exist (a message is printed in both cases).
    """
    if not os.path.exists(filepath):
        print("No checkpoint found at specified path!")
        return None

    # Deserialize onto the CPU so a checkpoint written on a GPU can be read even
    # when that device is not present; load_state_dict then copies the values
    # into the tensors the model/optimizer already hold.
    # NOTE: torch.load unpickles the file — only load checkpoints you trust.
    checkpoint = torch.load(filepath, map_location="cpu")
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    epoch = checkpoint['epoch']
    print(f"Checkpoint loaded from {filepath} at epoch {epoch}")
    return epoch


# Path of the checkpoint file to restore.
# NOTE(review): the training log shown earlier in this notebook reports a save under
# "./model/" with a model-tag prefix — confirm this path points at an existing file.
checkpoint_path = "./bigdata/model/checkpoint_epoch_1.pth"

# Restore model/optimizer state; `epoch` is None when the file is missing.
epoch = load_checkpoint(model, optimizer, checkpoint_path)

In [None]:
def infer(model, tokenizer, text1, text2, device, threshold=0.5):
    """Score one pair of code snippets and print the score and predicted label.

    Args:
        model: callable taking ``(input_ids1, attention_mask1, input_ids2,
            attention_mask2)`` and returning a single probability.
        tokenizer: callable that maps a string to a mapping containing
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        text1: first code snippet.
        text2: second code snippet.
        device: device the input tensors are moved to before the forward pass.
        threshold: the pair is labelled "Same" when the score is strictly
            greater than this value.

    Returns:
        None. The score and the label are printed.
    """
    model.eval()  # switch the model to evaluation mode
    with torch.no_grad():  # no gradients are needed for inference
        # Tokenize both snippets and move the resulting tensors to the device.
        inputs1 = tokenizer(text1, return_tensors='pt', max_length=512, padding='max_length', truncation=True)
        inputs2 = tokenizer(text2, return_tensors='pt', max_length=512, padding='max_length', truncation=True)

        inputs1 = {k: v.to(device) for k, v in inputs1.items()}
        inputs2 = {k: v.to(device) for k, v in inputs2.items()}

        # Both encodings use the same keys, so unpacking them together with **
        # would collide; pass the four tensors positionally in the order the
        # model's forward expects.
        probabilities = model(
            inputs1['input_ids'],
            inputs1['attention_mask'],
            inputs2['input_ids'],
            inputs2['attention_mask'],
        )

        # Label is 1 when the probability exceeds the threshold, otherwise 0.
        predicted_label = (probabilities > threshold).long()
        print(f"Similarity score: {probabilities.item()}")
        print(f"Predicted label: {'Same' if predicted_label.item() == 1 else 'Different'}")

# Run inference on a single pair of snippets.
# NOTE(review): code_text1 / code_text2 are not defined in the cells visible here —
# confirm they are assigned before this cell runs.
infer(model, tokenizer, code_text1, code_text2, device, threshold=0.5)


In [None]:
# `tokenizer` and `model` must be defined beforehand.
# `device` may be 'cuda' or 'cpu'.

def predict(model, tokenizer, test_data, device, threshold=0.5):
    """Predict a 0/1 similarity label for every row of ``test_data``.

    Args:
        model: callable taking ``(input_ids1, attention_mask1, input_ids2,
            attention_mask2)`` and returning a single probability per call.
        tokenizer: callable that maps a string to a mapping containing
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        test_data: DataFrame with ``'code1'`` and ``'code2'`` columns.
        device: device the input tensors are moved to before the forward pass.
        threshold: a row is labelled 1 when its probability is strictly
            greater than this value, otherwise 0.

    Returns:
        A list of ints (0 or 1), one per row, in row order.
    """
    model.eval()  # switch the model to evaluation mode
    predictions = []

    with torch.no_grad():  # no gradients are needed for inference
        for _, row in tqdm(test_data.iterrows(), total=test_data.shape[0]):
            # Tokenize the code pair.
            inputs1 = tokenizer(row['code1'], return_tensors='pt', max_length=512, padding='max_length', truncation=True)
            inputs2 = tokenizer(row['code2'], return_tensors='pt', max_length=512, padding='max_length', truncation=True)

            # The model's output is used directly as a probability here: applying
            # another sigmoid to a value in [0, 1] yields at least 0.5, which
            # would label every pair as similar at the default threshold.
            probability = model(
                inputs1['input_ids'].to(device),
                inputs1['attention_mask'].to(device),
                inputs2['input_ids'].to(device),
                inputs2['attention_mask'].to(device),
            ).item()

            # Compare against the threshold to decide the label.
            predictions.append(1 if probability > threshold else 0)

    return predictions

# Example usage: read the test pairs and predict a label for each row.
test_data = pd.read_csv("./bigdata/test.csv")
# `model` and `tokenizer` must already be defined at this point.
predictions = predict(model, tokenizer, test_data, device, threshold=0.5)

# Save the results as a submission file: fill the 'similar' column of the
# sample submission with the predictions and write it out without the index.
submission = pd.read_csv('./bigdata/sample_submission.csv')
submission['similar'] = predictions
submission.to_csv('./bigdata/predictions_submit.csv', index=False)