In [None]:
import os
# Configure CUDA through environment variables. This cell comes before the
# `import torch` cell, so the values are already set when torch is imported.
os.environ.update({
    'CUDA_VISIBLE_DEVICES': '1',  # expose only the GPU with index 1 to this process
    'CUDA_LAUNCH_BLOCKING': '1',  # synchronous kernel launches (debugging aid)
})


In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModel, AutoTokenizer
import random
import numpy as np
import itertools
import pickle
from tqdm import tqdm
import csv
import re

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Sets ``PYTHONHASHSEED`` and seeds Python's ``random``, NumPy, and PyTorch
    (CPU and CUDA) with ``seed``, then switches cuDNN to deterministic mode
    with benchmarking disabled.

    Args:
        seed: Integer seed shared by all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


seed_everything(42)  # fix the seed for the whole notebook

In [4]:
# Pretrained checkpoint identifier, and a tag used when building file names
# (checkpoint files and the dataset CSV).
MODEL_NAME = "neulab/codebert-cpp"
MODEL_TAG = "neulab_codebert-cpp"
# Directory scanned by CodePairsDataset.create_csv_dataset: one sub-directory
# of .cpp files per problem.
root_dir = "/home/leadawon5/decs_jupyter_lab/gitfiles/DACON/code_similarity/bigdata/train_code"


class CodeEncoder(nn.Module):
    """Thin wrapper around a pretrained model that returns its pooled output."""

    def __init__(self, model_name=MODEL_NAME):
        """Load the pretrained backbone identified by ``model_name``."""
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)

    def forward(self, input_ids, attention_mask):
        """Run the backbone and return its ``pooler_output`` for the batch."""
        return self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
        ).pooler_output






In [5]:
class CodePairsDataset(Dataset):
    """Dataset of labelled C++ source-file pairs.

    Each sample is a ``(file_path1, file_path2, label)`` tuple, where the
    label is 1 for two files from the same problem directory and 0 for files
    from different problem directories. Files are read, cleaned, and tokenized
    lazily in ``__getitem__``.
    """

    def __init__(self, tokenizer, max_length=512):
        """Store the tokenizer and the padded/truncated sequence length."""
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.samples = []

    def create_csv_dataset(self, root_dir, csv_file_path):
        """Write a CSV of positive and negative file pairs found under ``root_dir``.

        ``root_dir`` is expected to contain one sub-directory per problem, with
        files named ``<dir>_<i>.cpp`` for ``i`` in ``1..count`` (the paths are
        built from the file count, not from the actual file names).

        Args:
            root_dir: Directory holding the per-problem sub-directories.
            csv_file_path: Destination CSV; overwritten if it exists.
        """
        DIVIDER = 2  # keep 1/DIVIDER of the positive pairs; also scales the negatives

        # Scan the directory tree before opening the CSV, so a bad root_dir
        # does not truncate an existing dataset file.
        problem_dirs = [d for d in os.listdir(root_dir) if os.path.isdir(os.path.join(root_dir, d))]
        problem_files_count = {
            d: len([f for f in os.listdir(os.path.join(root_dir, d)) if f.endswith('.cpp')])
            for d in problem_dirs
        }
        all_dirs = list(problem_files_count.keys())  # loop-invariant, built once
        non_empty_dir_count = sum(1 for count in problem_files_count.values() if count > 0)

        with open(csv_file_path, 'w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(["file_path1", "file_path2", "label"])  # CSV header

            # Positive pairs: every unordered pair of files within one problem,
            # then a random 1/DIVIDER subset of them is kept.
            for problem_dir, file_count in tqdm(problem_files_count.items(), desc="Creating positive pairs"):
                positive_pairs = [
                    [
                        os.path.join(root_dir, problem_dir, f"{problem_dir}_{i}.cpp"),
                        os.path.join(root_dir, problem_dir, f"{problem_dir}_{j}.cpp"),
                        1,
                    ]
                    for i, j in itertools.combinations(range(1, file_count + 1), 2)
                ]
                num_pairs_to_keep = len(positive_pairs) // DIVIDER
                writer.writerows(random.sample(positive_pairs, num_pairs_to_keep))

            # Negative pairs need a second, non-empty problem directory to draw
            # from; without one the rejection-sampling loop below could never
            # finish.
            if non_empty_dir_count < 2:
                print("Skipping negative pairs: need at least two problem directories containing .cpp files")
                return

            # Negative pairs: for every file, draw random files from other problems.
            for problem_dir, file_count in tqdm(problem_files_count.items(), desc="Creating negative pairs"):
                num_neg_samples_per_file = file_count // (DIVIDER * 2)

                for i in range(1, file_count + 1):
                    current_path = os.path.join(root_dir, problem_dir, f"{problem_dir}_{i}.cpp")
                    neg_samples_added = 0
                    while neg_samples_added < num_neg_samples_per_file:
                        other_dir = random.choice(all_dirs)
                        # Reject the same problem, and empty directories
                        # (random.randint(1, 0) would raise ValueError).
                        if other_dir == problem_dir or problem_files_count[other_dir] == 0:
                            continue

                        other_file_index = random.randint(1, problem_files_count[other_dir])
                        other_file = f"{other_dir}_{other_file_index}.cpp"

                        writer.writerow([current_path, os.path.join(root_dir, other_dir, other_file), 0])
                        neg_samples_added += 1

    def load_from_csv(self, csv_file_path):
        """Replace ``self.samples`` with the rows of a CSV written by ``create_csv_dataset``."""
        with open(csv_file_path, 'r', encoding='utf-8') as file:
            reader = csv.reader(file)
            next(reader, None)  # skip the header; tolerate a completely empty file
            # Blank lines come back from csv.reader as empty lists; skip them.
            self.samples = [(row[0], row[1], int(row[2])) for row in reader if row]

    def _remove_comments(self,cpp_code):
        """Strip comments and literal contents from C++ source and collapse whitespace.

        The steps are applied in order, so text removed by an earlier step is
        not seen by a later one.

        NOTE: comments are stripped before string literals, so a ``//`` or
        ``/*`` that appears inside a string literal is treated as the start of
        a comment.

        Args:
            cpp_code: Raw source text.

        Returns:
            The cleaned source as a single line with single spaces.
        """
        # Remove multi-line comments.
        code = re.sub(r'/\*.*?\*/', '', cpp_code, flags=re.DOTALL)
        # Remove single-line comments. This must operate on `code`; the
        # previous version referenced `cleaned_code` before it was assigned,
        # which raised UnboundLocalError on every call.
        code = re.sub(r'//.*', '', code)

        # Blank out the contents of "..." and '...' literals.
        code = re.sub(r'"(.*?)"', '""', code)
        code = re.sub(r"'(.*?)'", "''", code)
        # Remove blank lines.
        code = re.sub(r'\n\s*\n', '\n', code)
        # Collapse every run of whitespace (including newlines) into one space.
        code = re.sub(r'\s+', ' ', code)
        # Trim leading and trailing whitespace.
        return code.strip()

    def __len__(self):
        """Return the number of loaded pairs."""
        return len(self.samples)

    def _encode_file(self, file_path):
        """Read one source file, clean it, and tokenize it padded/truncated to ``max_length``."""
        with open(file_path, 'r', encoding='utf-8') as f:
            text = self._remove_comments(f.read())
        return self.tokenizer(text, return_tensors='pt', max_length=self.max_length, padding='max_length', truncation=True)

    def __getitem__(self, idx):
        """Return the tokenized pair at ``idx`` together with its float label."""
        file_path1, file_path2, label = self.samples[idx]
        inputs1 = self._encode_file(file_path1)
        inputs2 = self._encode_file(file_path2)

        # The tokenizer output has a leading batch dimension of 1; drop it so
        # the DataLoader can stack samples itself.
        return {
            "input_ids1": inputs1['input_ids'].squeeze(0),
            "attention_mask1": inputs1['attention_mask'].squeeze(0),
            "input_ids2": inputs2['input_ids'].squeeze(0),
            "attention_mask2": inputs2['attention_mask'].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.float)
        }

In [6]:
class CodeComparisonModel(nn.Module):
    """Scores a pair of token sequences with one shared encoder and a linear head."""

    def __init__(self, encoder_model_name):
        """Build the shared encoder and a linear layer over two concatenated embeddings."""
        super().__init__()
        self.encoder = CodeEncoder(encoder_model_name)
        # The head sees both embeddings side by side, hence twice the hidden size.
        hidden_size = self.encoder.encoder.config.hidden_size
        self.fc = nn.Linear(hidden_size * 2, 1)

    def forward(self, input_ids1, attention_mask1, input_ids2, attention_mask2):
        """Return one sigmoid probability per pair, with the trailing dimension removed."""
        # Encode both inputs with the same encoder and concatenate along dim 1.
        pair_embedding = torch.cat(
            [
                self.encoder(input_ids1, attention_mask1),
                self.encoder(input_ids2, attention_mask2),
            ],
            dim=1,
        )
        # Linear head -> logit -> probability.
        return torch.sigmoid(self.fc(pair_embedding)).squeeze(-1)




In [7]:
def save_checkpoint(model, optimizer, epoch,step, filepath):
    """Save model and optimizer state to ``filepath`` (a directory).

    The file is named ``{MODEL_TAG}_checkpoint_epoch_{epoch}_step_{step}.pth``.

    Args:
        model: Module whose ``state_dict()`` is saved.
        optimizer: Optimizer whose ``state_dict()`` is saved.
        epoch: Epoch index recorded in the file name and payload.
        step: Batch index recorded in the file name and payload.
        filepath: Directory that receives the checkpoint; created if missing.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(filepath, exist_ok=True)
    checkpoint_path = os.path.join(filepath, f"{MODEL_TAG}_checkpoint_epoch_{epoch}_step_{step}.pth")
    torch.save({
        'epoch': epoch,
        # Also stored in the payload so a mid-epoch checkpoint can be
        # identified without parsing its file name.
        'step': step,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }, checkpoint_path)
    print(f"Checkpoint saved to {checkpoint_path}")

def train(model, data_loader, optimizer, device, epochs=1, checkpoint_every=6000, checkpoint_dir="./model/"):
    """Train ``model`` on paired inputs with binary cross-entropy.

    Each batch must be a dict with ``input_ids1``, ``attention_mask1``,
    ``input_ids2``, ``attention_mask2`` and ``labels``. The model is expected
    to return probabilities, since the loss is ``nn.BCELoss``.

    Args:
        model: Module called as ``model(ids1, mask1, ids2, mask2)``.
        data_loader: Iterable of batches as described above.
        optimizer: Optimizer stepping ``model``'s parameters.
        device: Device the batch tensors are moved to.
        epochs: Number of passes over ``data_loader``.
        checkpoint_every: Save a checkpoint whenever the batch index is a
            multiple of this value; ``None`` or 0 disables periodic saves.
        checkpoint_dir: Directory passed to ``save_checkpoint``.
    """
    # The loss has no state, so one instance is shared by all steps.
    criterion = nn.BCELoss()

    # Give tqdm a total so it can show progress and an ETA; loaders without a
    # length fall back to a plain counter.
    try:
        batches_per_epoch = len(data_loader)
    except TypeError:
        batches_per_epoch = None

    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        num_batches = 0
        last_saved_step = None
        progress = tqdm(enumerate(data_loader), total=batches_per_epoch, desc=f"{epoch} epoch is running!")
        for idx, batch in progress:
            optimizer.zero_grad()
            input_ids1 = batch['input_ids1'].to(device)
            attention_mask1 = batch['attention_mask1'].to(device)
            input_ids2 = batch['input_ids2'].to(device)
            attention_mask2 = batch['attention_mask2'].to(device)
            labels = batch['labels'].to(device)

            # Forward pass.
            outputs = model(input_ids1, attention_mask1, input_ids2, attention_mask2)

            # Flatten both sides to shape (batch,). A bare squeeze() would
            # turn a batch of one into a 0-d tensor that no longer matches the
            # (1,)-shaped labels, and BCELoss rejects mismatched shapes.
            loss = criterion(outputs.reshape(-1), labels.float().reshape(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            num_batches += 1
            if checkpoint_every and idx % checkpoint_every == 0:
                save_checkpoint(model, optimizer, epoch, idx, checkpoint_dir)
                last_saved_step = idx

        # Avoid dividing by zero when the loader produced no batches.
        average_loss = total_loss / num_batches if num_batches else float('nan')
        print(f"Epoch {epoch+1}/{epochs}, Training loss: {average_loss}")

        # Save a checkpoint after each epoch so the steps after the last
        # periodic save are not lost (skipped if the final batch was just saved).
        if num_batches and last_saved_step != num_batches - 1:
            save_checkpoint(model, optimizer, epoch, num_batches - 1, checkpoint_dir)
        

In [8]:
# Build the tokenizer and dataset, then load the pair list from its CSV.
# Uncomment the create_csv_dataset line to (re)generate that CSV first.
dataset_csv_path = f'./bigdata/csvs/{MODEL_TAG}_dataset.csv'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
dataset = CodePairsDataset(tokenizer)
#dataset.create_csv_dataset(root_dir, dataset_csv_path)  # write the dataset out as a CSV file
dataset.load_from_csv(dataset_csv_path)  # load the dataset from the generated CSV file



In [9]:
# Training hyper-parameters.
BATCH_SIZE = 10
LEARNING_RATE = 1e-5

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
data_loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)
model = CodeComparisonModel(MODEL_NAME)
model = model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

Some weights of the model checkpoint at neulab/codebert-cpp were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at neulab/codebert-cpp and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Run one epoch of training (the default). The captured output below shows
# this took a little over nine hours for 54023 batches.
train(model=model, data_loader=data_loader, optimizer=optimizer, device=device)

0 epoch is running!: 1it [00:16, 16.46s/it]

Checkpoint saved to ./model/neulab_codebert-cpp_checkpoint_epoch_0_step_0.pth


0 epoch is running!: 6001it [1:35:52,  2.49s/it]

Checkpoint saved to ./model/neulab_codebert-cpp_checkpoint_epoch_0_step_6000.pth


0 epoch is running!: 12001it [2:54:38,  2.32s/it]

Checkpoint saved to ./model/neulab_codebert-cpp_checkpoint_epoch_0_step_12000.pth


0 epoch is running!: 18001it [4:01:59,  2.41s/it]

Checkpoint saved to ./model/neulab_codebert-cpp_checkpoint_epoch_0_step_18000.pth


0 epoch is running!: 24001it [5:01:01,  2.20s/it]

Checkpoint saved to ./model/neulab_codebert-cpp_checkpoint_epoch_0_step_24000.pth


0 epoch is running!: 30001it [5:55:19,  3.82s/it]

Checkpoint saved to ./model/neulab_codebert-cpp_checkpoint_epoch_0_step_30000.pth


0 epoch is running!: 36001it [6:46:51,  2.00s/it]

Checkpoint saved to ./model/neulab_codebert-cpp_checkpoint_epoch_0_step_36000.pth


0 epoch is running!: 42001it [7:38:04,  1.98s/it]

Checkpoint saved to ./model/neulab_codebert-cpp_checkpoint_epoch_0_step_42000.pth


0 epoch is running!: 48001it [8:26:48,  2.09s/it]

Checkpoint saved to ./model/neulab_codebert-cpp_checkpoint_epoch_0_step_48000.pth


0 epoch is running!: 54001it [9:14:54,  2.14s/it]

Checkpoint saved to ./model/neulab_codebert-cpp_checkpoint_epoch_0_step_54000.pth


0 epoch is running!: 54023it [9:15:20,  1.16s/it]

In [None]:
def load_checkpoint(model, optimizer, filepath, map_location=None):
    """Restore model and optimizer state from a checkpoint file.

    Args:
        model: Module that receives ``checkpoint['model_state_dict']``.
        optimizer: Optimizer that receives ``checkpoint['optimizer_state_dict']``.
        filepath: Path of the checkpoint file.
        map_location: Forwarded to ``torch.load``. When ``None``, the device of
            the model's first parameter is used, so a checkpoint saved on a GPU
            can still be loaded where that GPU is not available.

    Returns:
        The epoch stored in the checkpoint, or ``None`` if ``filepath`` is not
        an existing file.
    """
    # isfile (not exists) so a directory path is reported as missing instead
    # of failing inside torch.load.
    if not os.path.isfile(filepath):
        print("No checkpoint found at specified path!")
        return None

    if map_location is None:
        first_param = next(model.parameters(), None)
        if first_param is not None:
            map_location = first_param.device

    checkpoint = torch.load(filepath, map_location=map_location)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    epoch = checkpoint['epoch']
    print(f"Checkpoint loaded from {filepath} at epoch {epoch}")
    return epoch


# Path of the checkpoint file to restore.
# NOTE(review): save_checkpoint in this notebook writes files named
# "<MODEL_TAG>_checkpoint_epoch_<epoch>_step_<step>.pth" under "./model/";
# confirm that this path points at an existing file.
checkpoint_path = "./bigdata/model/checkpoint_epoch_1.pth"

# Load the checkpoint; `epoch` is None when no file is found at that path.
epoch = load_checkpoint(model, optimizer, filepath=checkpoint_path)

In [None]:
def infer(model, tokenizer, text1, text2, device, threshold=0.5):
    """Score one pair of code snippets and print the decision.

    NOTE(review): the training dataset cleans each file with
    ``CodePairsDataset._remove_comments`` before tokenizing; this function
    tokenizes ``text1``/``text2`` as given, so callers presumably need to apply
    the same cleaning first — confirm.

    Args:
        model: Module called as ``model(ids1, mask1, ids2, mask2)`` that
            returns a single probability for the pair.
        tokenizer: Callable returning a mapping with ``input_ids`` and
            ``attention_mask`` tensors.
        text1: First snippet.
        text2: Second snippet.
        device: Device the input tensors are moved to.
        threshold: Probabilities strictly above this value are labelled "Same".

    Returns:
        ``(score, label)`` where ``score`` is the probability as a float and
        ``label`` is 1 ("Same") or 0 ("Different").
    """
    model.eval()  # evaluation mode
    with torch.no_grad():  # no gradients are needed for inference
        # Tokenize both snippets to fixed-length tensors.
        inputs1 = tokenizer(text1, return_tensors='pt', max_length=512, padding='max_length', truncation=True)
        inputs2 = tokenizer(text2, return_tensors='pt', max_length=512, padding='max_length', truncation=True)

        # Pass the four tensors explicitly. Unpacking both dicts with ** would
        # repeat the `input_ids` / `attention_mask` keywords (a TypeError),
        # and those names do not match the model's forward() parameters.
        probabilities = model(
            inputs1['input_ids'].to(device),
            inputs1['attention_mask'].to(device),
            inputs2['input_ids'].to(device),
            inputs2['attention_mask'].to(device),
        )

        # 1 when the probability exceeds the threshold, otherwise 0.
        predicted_label = int((probabilities > threshold).long().item())
        score = probabilities.item()

    print(f"Similarity score: {score}")
    print(f"Predicted label: {'Same' if predicted_label == 1 else 'Different'}")
    return score, predicted_label

# Run inference on one pair of snippets.
# NOTE(review): code_text1 / code_text2 are not defined in the cells shown
# here; they must be assigned before this cell is run.
infer(model, tokenizer, text1=code_text1, text2=code_text2, device=device, threshold=0.5)
