In [2]:
import os

# Configure CUDA through environment variables at the very top of the
# notebook: expose only GPU index 3 to this process and enable blocking
# kernel launches.
os.environ.update({
    'CUDA_VISIBLE_DEVICES': '3',
    'CUDA_LAUNCH_BLOCKING': "1",
})


In [3]:
'''
"MickyMike/graphcodebert-c"
# tokenizer와 model은 미리 정의되어 있어야 합니다.
# device는 'cuda' 또는 'cpu'일 수 있습니다.

def predict(model, tokenizer, test_data, device, threshold=0.5):
    model.eval()  # 모델을 평가 모드로 설정
    predictions = []
    
    with torch.no_grad():  # 그래디언트 계산 비활성화
        for index, row in tqdm(test_data.iterrows(), total=test_data.shape[0]):
            # 코드 쌍을 토큰화합니다.
            text1 = remove_comments(row['code1'])
            text2 = remove_comments(row['code2'])
            inputs1 = tokenizer(text1, return_tensors='pt', max_length=512, padding='max_length', truncation=True).to(device)
            inputs2 = tokenizer(text2, return_tensors='pt', max_length=512, padding='max_length', truncation=True).to(device)
            
            # 모델을 통해 유사도 점수(로짓)를 계산합니다.
            logits = model(inputs1['input_ids'], inputs1['attention_mask'], inputs2['input_ids'], inputs2['attention_mask'])
            
            # 로짓을 확률로 변환하기 위해 sigmoid 함수를 적용합니다.
            probs = torch.sigmoid(logits).cpu().numpy()
            
            # 설정한 임계값을 기준으로 유사 여부를 판단합니다.
            prediction = 1 if probs > threshold else 0
            predictions.append(prediction)
    
    return predictions

# 예제 사용
test_data = pd.read_csv("./bigdata/test.csv")
# 모델과 tokenizer가 정의되어 있어야 합니다.
predictions = predict(model, tokenizer, test_data, device, threshold=0.5)

# 결과를 제출 파일로 저장
submission = pd.read_csv('./bigdata/sample_submission.csv')
submission['similar'] = predictions
submission.to_csv('./bigdata/predictions_submit.csv', index=False)

'''

'\n"MickyMike/graphcodebert-c"\n# tokenizer와 model은 미리 정의되어 있어야 합니다.\n# device는 \'cuda\' 또는 \'cpu\'일 수 있습니다.\n\ndef predict(model, tokenizer, test_data, device, threshold=0.5):\n    model.eval()  # 모델을 평가 모드로 설정\n    predictions = []\n    \n    with torch.no_grad():  # 그래디언트 계산 비활성화\n        for index, row in tqdm(test_data.iterrows(), total=test_data.shape[0]):\n            # 코드 쌍을 토큰화합니다.\n            text1 = remove_comments(row[\'code1\'])\n            text2 = remove_comments(row[\'code2\'])\n            inputs1 = tokenizer(text1, return_tensors=\'pt\', max_length=512, padding=\'max_length\', truncation=True).to(device)\n            inputs2 = tokenizer(text2, return_tensors=\'pt\', max_length=512, padding=\'max_length\', truncation=True).to(device)\n            \n            # 모델을 통해 유사도 점수(로짓)를 계산합니다.\n            logits = model(inputs1[\'input_ids\'], inputs1[\'attention_mask\'], inputs2[\'input_ids\'], inputs2[\'attention_mask\'])\n            \n            # 로짓을 확률로 변환하기 위해 sigm

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModel, AutoTokenizer
import random
import numpy as np
import itertools
import pickle
from tqdm import tqdm
import csv
import re

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
def seed_everything(seed):
    """Seed every random number generator this notebook touches.

    Exports ``PYTHONHASHSEED``, seeds Python's ``random`` module, NumPy's
    global generator and PyTorch (``torch.manual_seed`` and
    ``torch.cuda.manual_seed``), then sets the cuDNN flags
    ``deterministic=True`` and ``benchmark=False``.

    Args:
        seed: Integer seed handed to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


seed_everything(42)  # fix the seed for the whole notebook

In [6]:
# Hugging Face checkpoint id, a filesystem-safe tag derived from it (used in
# checkpoint and CSV file names), and the directory holding the training code.
MODEL_NAME = "MickyMike/graphcodebert-c"
MODEL_TAG = "MickyMike_graphcodebert-c"
root_dir = "/home/leadawon5/decs_jupyter_lab/gitfiles/DACON/code_similarity/bigdata/train_code"


class CodeEncoder(nn.Module):
    """Thin wrapper around a pretrained transformer that returns its pooled output."""

    def __init__(self, model_name=MODEL_NAME):
        """Load the pretrained encoder.

        Args:
            model_name: Checkpoint name or path passed to
                ``AutoModel.from_pretrained``.
        """
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)

    def forward(self, input_ids, attention_mask):
        """Encode a batch and return the encoder's ``pooler_output``.

        Args:
            input_ids: Token ids forwarded to the wrapped encoder.
            attention_mask: Attention mask forwarded to the wrapped encoder.

        Returns:
            The ``pooler_output`` attribute of the encoder's result.
        """
        # NOTE(review): the model-loading log in this notebook lists pooler
        # weights among the newly initialised ones, so this output appears to
        # rely on fine-tuning to become meaningful - confirm.
        encoded = self.encoder(attention_mask=attention_mask, input_ids=input_ids)
        pooled = encoded.pooler_output
        return pooled






In [7]:
class CodePairsDataset(Dataset):
    """Pairs of C++ source files labelled 1 (same problem) or 0 (different problems).

    The pairs live in ``self.samples`` as ``(file_path1, file_path2, label)``
    tuples. ``create_csv_dataset`` writes such pairs to a CSV file and
    ``load_from_csv`` reads them back; files are only opened, cleaned and
    tokenised when an item is requested.
    """

    def __init__(self, tokenizer, max_length=512):
        """Store the tokenizer and the fixed sequence length.

        Args:
            tokenizer: Callable invoked as ``tokenizer(text, return_tensors='pt',
                max_length=..., padding='max_length', truncation=True)`` whose
                result is indexed with ``'input_ids'`` and ``'attention_mask'``.
            max_length: Length every sequence is padded/truncated to.
        """
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.samples = []

    @staticmethod
    def _solution_path(root_dir, problem_dir, index):
        """Return ``root_dir/problem_dir/{problem_dir}_{index}.cpp``."""
        return os.path.join(root_dir, problem_dir, f"{problem_dir}_{index}.cpp")

    def create_csv_dataset(self, root_dir, csv_file_path, divider=8):
        """Write positive and negative file pairs to ``csv_file_path``.

        Every sub-directory of ``root_dir`` is treated as one problem, and its
        solutions are assumed to be named ``{dir}_1.cpp`` .. ``{dir}_{n}.cpp``
        where ``n`` is the number of ``.cpp`` files found in it.

        * Positive pairs (label 1): of all unordered pairs inside a problem, a
          random ``1 / divider`` fraction is kept.
        * Negative pairs (label 0): each file is paired with
          ``n // (2 * divider)`` random files from other problems.

        Sampling uses the global ``random`` module, so seed it beforehand for
        reproducible output. Problem directories are processed in sorted
        order so the result does not depend on ``os.listdir`` ordering.

        Args:
            root_dir: Directory containing one sub-directory per problem.
            csv_file_path: Destination CSV; missing parent directories are
                created.
            divider: Down-sampling factor (defaults to the previously
                hard-coded value 8).

        Raises:
            ValueError: If ``divider`` is smaller than 1.
        """
        if divider < 1:
            raise ValueError("divider must be >= 1")

        problem_dirs = sorted(
            d for d in os.listdir(root_dir) if os.path.isdir(os.path.join(root_dir, d))
        )
        problem_files_count = {
            d: sum(1 for f in os.listdir(os.path.join(root_dir, d)) if f.endswith('.cpp'))
            for d in problem_dirs
        }
        # Built once; the sampling loop below draws from it repeatedly.
        all_dirs = list(problem_files_count)

        parent_dir = os.path.dirname(csv_file_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        with open(csv_file_path, 'w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(["file_path1", "file_path2", "label"])  # CSV header

            # Positive pairs: enumerate all i < j pairs, keep a random 1/divider of them.
            for problem_dir, file_count in tqdm(problem_files_count.items(), desc="Creating positive pairs"):
                positive_pairs = [
                    [
                        self._solution_path(root_dir, problem_dir, i),
                        self._solution_path(root_dir, problem_dir, j),
                        1,
                    ]
                    for i, j in itertools.combinations(range(1, file_count + 1), 2)
                ]
                num_pairs_to_keep = len(positive_pairs) // divider
                writer.writerows(random.sample(positive_pairs, num_pairs_to_keep))

            # Negative pairs: pair each file with files drawn from other problems.
            for problem_dir, file_count in tqdm(problem_files_count.items(), desc="Creating negative pairs"):
                num_neg_samples_per_file = file_count // (divider * 2)
                if num_neg_samples_per_file == 0:
                    continue

                # Without at least one other non-empty problem the rejection
                # loop below could never succeed, so skip instead of spinning.
                has_partner = any(
                    d != problem_dir and problem_files_count[d] > 0 for d in all_dirs
                )
                if not has_partner:
                    continue

                for i in range(1, file_count + 1):
                    current_path = self._solution_path(root_dir, problem_dir, i)
                    neg_samples_added = 0
                    while neg_samples_added < num_neg_samples_per_file:
                        other_dir = random.choice(all_dirs)
                        # Reject the problem itself and problems with no files
                        # (randint(1, 0) would raise for the latter).
                        if other_dir == problem_dir or problem_files_count[other_dir] == 0:
                            continue

                        other_file_index = random.randint(1, problem_files_count[other_dir])
                        other_path = self._solution_path(root_dir, other_dir, other_file_index)
                        writer.writerow([current_path, other_path, 0])
                        neg_samples_added += 1

    def load_from_csv(self, csv_file_path):
        """Replace ``self.samples`` with the rows of a CSV written by ``create_csv_dataset``.

        The first row is treated as a header and skipped; blank rows are
        ignored. An empty file yields an empty sample list.

        Args:
            csv_file_path: Path of the CSV file to read.
        """
        with open(csv_file_path, 'r', encoding='utf-8') as file:
            reader = csv.reader(file)
            next(reader, None)  # skip the header (tolerates an empty file)
            self.samples = [(row[0], row[1], int(row[2])) for row in reader if row]

    def _remove_comments(self, cpp_code):
        """Strip comments and literal contents from C++ source and collapse whitespace.

        The regular expressions are applied in this order: block comments,
        line comments, contents of double-quoted then single-quoted literals
        (the quotes are kept), blank lines, whitespace runs; finally the
        result is stripped.

        NOTE(review): because comments are removed before literals, a ``//``
        inside a string literal is treated as a comment start. This is kept
        as-is so preprocessing matches the module-level ``remove_comments``.

        Args:
            cpp_code: Raw source text.

        Returns:
            The cleaned, single-line source text.
        """
        # Block comments (may span lines).
        code = re.sub(r'/\*.*?\*/', '', cpp_code, flags=re.DOTALL)
        # Line comments.
        code = re.sub(r'//.*', '', code)
        # Literal contents: keep the quotes, drop what is between them.
        code = re.sub(r'"(.*?)"', '""', code)
        code = re.sub(r"'(.*?)'", "''", code)
        # Blank lines, then any whitespace run becomes a single space.
        code = re.sub(r'\n\s*\n', '\n', code)
        code = re.sub(r'\s+', ' ', code)
        return code.strip()

    def __len__(self):
        """Return the number of loaded pairs."""
        return len(self.samples)

    def _encode_file(self, file_path):
        """Read ``file_path`` as UTF-8, clean it and tokenise it to ``max_length``."""
        with open(file_path, 'r', encoding='utf-8') as f:
            text = self._remove_comments(f.read())
        return self.tokenizer(
            text,
            return_tensors='pt',
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
        )

    def __getitem__(self, idx):
        """Load, clean and tokenise the pair at ``idx``.

        Returns:
            A dict with ``input_ids1``, ``attention_mask1``, ``input_ids2``,
            ``attention_mask2`` (each with the tokenizer's leading dimension
            squeezed away) and ``labels`` (a float tensor holding the label).
        """
        file_path1, file_path2, label = self.samples[idx]
        inputs1 = self._encode_file(file_path1)
        inputs2 = self._encode_file(file_path2)

        return {
            "input_ids1": inputs1['input_ids'].squeeze(0),
            "attention_mask1": inputs1['attention_mask'].squeeze(0),
            "input_ids2": inputs2['input_ids'].squeeze(0),
            "attention_mask2": inputs2['attention_mask'].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.float)
        }

In [8]:
class CodeComparisonModel(nn.Module):
    """Binary classifier over a pair of code snippets.

    Both snippets go through the same ``CodeEncoder``; their embeddings are
    concatenated and mapped to one value by a linear layer followed by a
    sigmoid.
    """

    def __init__(self, encoder_model_name):
        """Build the shared encoder and the classification layer.

        Args:
            encoder_model_name: Name passed to ``CodeEncoder``.
        """
        super().__init__()
        self.encoder = CodeEncoder(encoder_model_name)
        # The linear layer sees both embeddings side by side, hence 2x hidden size.
        hidden_size = self.encoder.encoder.config.hidden_size
        self.fc = nn.Linear(hidden_size * 2, 1)

    def forward(self, input_ids1, attention_mask1, input_ids2, attention_mask2):
        """Score a batch of pairs.

        Args:
            input_ids1, attention_mask1: Encoder inputs for the first snippet.
            input_ids2, attention_mask2: Encoder inputs for the second snippet.

        Returns:
            Sigmoid outputs with the trailing dimension squeezed away. These
            are probabilities, not logits.
        """
        first = self.encoder(input_ids1, attention_mask1)
        second = self.encoder(input_ids2, attention_mask2)
        pair_features = torch.cat([first, second], dim=1)
        return torch.sigmoid(self.fc(pair_features)).squeeze(-1)




In [9]:
def save_checkpoint(model, optimizer, epoch, step, filepath, MODEL_TAG):
    """Write model and optimizer state to a checkpoint file inside ``filepath``.

    The file is named ``{MODEL_TAG}_checkpoint_epoch_{epoch}_step_{step}.pth``
    and stores the keys ``epoch``, ``step``, ``model_state_dict`` and
    ``optimizer_state_dict``. The saved path is printed.

    Args:
        model: Module whose ``state_dict()`` is saved.
        optimizer: Optimizer whose ``state_dict()`` is saved.
        epoch: Epoch value stored in the file and used in its name.
        step: Step value stored in the file and used in its name.
        filepath: Target directory; created when it does not exist.
        MODEL_TAG: Prefix of the checkpoint file name.
    """
    if not os.path.exists(filepath):
        os.makedirs(filepath)

    file_name = f"{MODEL_TAG}_checkpoint_epoch_{epoch}_step_{step}.pth"
    checkpoint_path = os.path.join(filepath, file_name)

    state = dict(
        epoch=epoch,
        step=step,
        model_state_dict=model.state_dict(),
        optimizer_state_dict=optimizer.state_dict(),
    )
    torch.save(state, checkpoint_path)
    print(f"Checkpoint saved to {checkpoint_path}")

def train(model, data_loader, optimizer, device, start_epoch=0, start_step=0, epochs=2,
          MODEL_TAG=None, checkpoint_dir="./model/", checkpoint_every=6000):
    """Train ``model`` with binary cross-entropy, checkpointing periodically.

    The model is called as ``model(input_ids1, attention_mask1, input_ids2,
    attention_mask2)`` and must return probabilities (``nn.BCELoss`` is
    applied directly to its output). After each epoch the mean loss over the
    batches actually processed in that epoch is printed.

    Args:
        model: Module to train.
        data_loader: Sized iterable of batches; each batch is a dict with the
            keys ``input_ids1``, ``attention_mask1``, ``input_ids2``,
            ``attention_mask2`` and ``labels``.
        optimizer: Optimizer updating ``model``'s parameters.
        device: Device the batch tensors are moved to.
        start_epoch: First epoch index to run (for resuming).
        start_step: Number of batches of ``start_epoch`` that were already
            completed; batches ``1..start_step`` of that epoch are skipped.
        epochs: Exclusive upper bound on the epoch index.
        MODEL_TAG: Checkpoint file-name prefix; when ``None`` the module-level
            ``MODEL_TAG`` is used.
        checkpoint_dir: Directory handed to ``save_checkpoint``.
        checkpoint_every: A checkpoint is written whenever the 1-based batch
            index is a multiple of this value.
    """
    criterion = nn.BCELoss()  # stateless, so build it once rather than per batch

    for epoch in range(start_epoch, epochs):
        model.train()
        total_loss = 0.0
        batches_run = 0
        for idx, batch in enumerate(data_loader, start=1):  # 1-based so idx matches the saved step
            # A checkpoint tagged with step N is written after batch N has been
            # applied, so resuming continues with batch N + 1.
            if epoch == start_epoch and idx <= start_step:
                continue
            optimizer.zero_grad()
            input_ids1 = batch['input_ids1'].to(device)
            attention_mask1 = batch['attention_mask1'].to(device)
            input_ids2 = batch['input_ids2'].to(device)
            attention_mask2 = batch['attention_mask2'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids1, attention_mask1, input_ids2, attention_mask2)

            # Flatten both sides explicitly: a bare squeeze() turns a batch of
            # one into a 0-dim tensor whose shape no longer matches the labels.
            loss = criterion(outputs.reshape(-1), labels.float().reshape(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            batches_run += 1
            if idx % checkpoint_every == 0:
                # Resolve the tag lazily so the module-level constant is only
                # needed when a checkpoint is actually written.
                tag = MODEL_TAG if MODEL_TAG is not None else globals()["MODEL_TAG"]
                save_checkpoint(model, optimizer, epoch, idx, checkpoint_dir, tag)

        # Average over the batches processed in this epoch (nan if none were).
        avg_loss = total_loss / batches_run if batches_run else float("nan")
        print(f"Epoch {epoch+1}/{epochs}, Training loss: {avg_loss}")

        # Only the resumed epoch skips batches; later epochs run in full.
        start_step = 0
        

In [10]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
dataset = CodePairsDataset(tokenizer)

# Write the pair CSV from the training directory, then load it back into the dataset.
dataset_csv_path = f'./bigdata/csvs/{MODEL_TAG}_dataset.csv'
dataset.create_csv_dataset(root_dir, dataset_csv_path)
dataset.load_from_csv(dataset_csv_path)



Downloading tokenizer_config.json: 100%|████| 1.30k/1.30k [00:00<00:00, 116kB/s]
Downloading vocab.json: 100%|████████████████| 798k/798k [00:00<00:00, 1.07MB/s]
Downloading merges.txt: 100%|████████████████| 456k/456k [00:00<00:00, 41.9MB/s]
Downloading tokenizer.json: 100%|██████████| 1.36M/1.36M [00:00<00:00, 1.46MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████| 772/772 [00:00<00:00, 346kB/s]
Creating positive pairs: 100%|████████████████| 500/500 [05:01<00:00,  1.66it/s]
Creating negative pairs: 100%|████████████████| 500/500 [01:59<00:00,  4.17it/s]


In [11]:
# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

data_loader = DataLoader(dataset, shuffle=True, batch_size=240)

model = CodeComparisonModel(MODEL_NAME).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

Downloading config.json: 100%|█████████████████| 748/748 [00:00<00:00, 86.6kB/s]
Downloading pytorch_model.bin: 100%|█████████| 499M/499M [00:29<00:00, 17.2MB/s]
Some weights of the model checkpoint at MickyMike/graphcodebert-c were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at MickyMike/graphcodebert-c and are newly initialized: ['roberta.pooler

In [12]:
# Fine-tune with train()'s defaults: start_epoch=0, start_step=0, epochs=2.
train(model, data_loader, optimizer, device)

Checkpoint saved to ./model/MickyMike_graphcodebert-c_checkpoint_epoch_0_step_6000.pth
Checkpoint saved to ./model/MickyMike_graphcodebert-c_checkpoint_epoch_0_step_12000.pth
Checkpoint saved to ./model/MickyMike_graphcodebert-c_checkpoint_epoch_0_step_18000.pth
Checkpoint saved to ./model/MickyMike_graphcodebert-c_checkpoint_epoch_0_step_24000.pth
Checkpoint saved to ./model/MickyMike_graphcodebert-c_checkpoint_epoch_0_step_30000.pth
Checkpoint saved to ./model/MickyMike_graphcodebert-c_checkpoint_epoch_0_step_36000.pth
Checkpoint saved to ./model/MickyMike_graphcodebert-c_checkpoint_epoch_0_step_42000.pth
Checkpoint saved to ./model/MickyMike_graphcodebert-c_checkpoint_epoch_0_step_48000.pth
Checkpoint saved to ./model/MickyMike_graphcodebert-c_checkpoint_epoch_0_step_54000.pth
Checkpoint saved to ./model/MickyMike_graphcodebert-c_checkpoint_epoch_0_step_60000.pth
Checkpoint saved to ./model/MickyMike_graphcodebert-c_checkpoint_epoch_0_step_66000.pth
Checkpoint saved to ./model/Micky

KeyboardInterrupt: 

In [None]:
def load_checkpoint(model, optimizer, filepath):
    """Restore model and optimizer state from a checkpoint file.

    Args:
        model: Module that receives the saved ``model_state_dict``.
        optimizer: Optimizer that receives the saved ``optimizer_state_dict``.
        filepath: Path of the checkpoint file.

    Returns:
        ``(epoch, step)`` read from the checkpoint (``step`` falls back to 0
        when the key is absent), or ``(None, None)`` when ``filepath`` does
        not exist.
    """
    if not os.path.exists(filepath):
        print("No checkpoint found at specified path!")
        return None, None

    state = torch.load(filepath)
    model.load_state_dict(state['model_state_dict'])
    optimizer.load_state_dict(state['optimizer_state_dict'])

    # Checkpoints without a 'step' entry are treated as step 0.
    epoch, step = state['epoch'], state.get('step', 0)
    print(f"Checkpoint loaded from {filepath} at epoch {epoch}, step {step}")
    return epoch, step


# Path of the checkpoint file to restore.
# NOTE(review): train() saves checkpoints under "./model/" with names of the
# form "{MODEL_TAG}_checkpoint_epoch_{epoch}_step_{step}.pth"; this path does
# not follow that pattern - confirm the file exists before relying on it.
checkpoint_path = "./bigdata/model/checkpoint_epoch_1.pth"

# Load the checkpoint; epoch and step are both None when the file is missing.
epoch, step = load_checkpoint(model, optimizer, checkpoint_path)

In [None]:
# if epoch is not None:
#     train(model, data_loader, optimizer, device, start_epoch=epoch, start_step=step, epochs=desired_epochs, MODEL_TAG=MODEL_TAG)

In [None]:
def remove_comments(cpp_code):
    """Strip comments and literal contents from C++ source and collapse whitespace.

    The substitutions run in a fixed order: block comments, line comments,
    contents of double-quoted then single-quoted literals (the quotes stay),
    blank lines, and finally every whitespace run becomes one space. The
    result is stripped of leading and trailing whitespace.

    NOTE(review): comments are removed before literals, so a ``//`` inside a
    string literal is treated as the start of a comment.

    Args:
        cpp_code: Raw source text.

    Returns:
        The cleaned, single-line source text.
    """
    # (pattern, replacement, flags), applied top to bottom.
    substitutions = (
        (r'/\*.*?\*/', '', re.DOTALL),  # block comments, possibly multi-line
        (r'//.*', '', 0),               # line comments
        (r'"(.*?)"', '""', 0),          # contents of "..." literals
        (r"'(.*?)'", "''", 0),          # contents of '...' literals
        (r'\n\s*\n', '\n', 0),          # blank lines
        (r'\s+', ' ', 0),               # whitespace runs -> single space
    )

    code = cpp_code
    for pattern, replacement, flags in substitutions:
        code = re.sub(pattern, replacement, code, flags=flags)

    return code.strip()


In [None]:
# def infer(model, tokenizer, text1, text2, device, threshold=0.5):
#     model.eval()  # 모델을 평가 모드로 설정
#     with torch.no_grad():  # 그래디언트 계산을 비활성화
#         # 두 코드 스니펫을 토큰화하고 디바이스로 이동
#         text1 = remove_comments(text1)
#         text2 = remove_comments(text2)
#         inputs1 = tokenizer(text1, return_tensors='pt', max_length=512, padding='max_length', truncation=True)
#         inputs2 = tokenizer(text2, return_tensors='pt', max_length=512, padding='max_length', truncation=True)
        
#         inputs1 = {k: v.to(device) for k, v in inputs1.items()}
#         inputs2 = {k: v.to(device) for k, v in inputs2.items()}

#         # 모델을 통해 유사도 점수(확률) 계산
#         probabilities = model(**inputs1, **inputs2)

#         # 유사도 점수를 기반으로 판단
#         predicted_label = (probabilities > threshold).long()  # 확률이 임계값보다 크면 1, 아니면 0
#         print(f"Similarity score: {probabilities.item()}")
#         print(f"Predicted label: {'Same' if predicted_label.item() == 1 else 'Different'}")

# # 추론 실행
# infer(model, tokenizer, code_text1, code_text2, device, threshold=0.5)
