In [14]:
import os

# Expose only the GPU with index 1 to this process. This cell sits ahead of the
# torch import below so the setting is in place before torch is loaded.
os.environ.update({"CUDA_VISIBLE_DEVICES": "1"})




In [15]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModel, AutoTokenizer
import os
import random
import numpy as np
import itertools
import pickle
from tqdm import tqdm

In [16]:
def seed_everything(seed):
    """Seed every random number source this notebook uses.

    Seeds Python's ``random`` module, NumPy's global RNG and PyTorch (via
    ``torch.manual_seed`` and ``torch.cuda.manual_seed``), exports
    ``PYTHONHASHSEED`` and configures cuDNN for deterministic, non-benchmarked
    execution.

    Args:
        seed: Integer seed applied to all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # All four seeders take the seed as their only argument.
    for apply_seed in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True

seed_everything(42) # Fix the seed

In [17]:
# Hugging Face identifier used for both the tokenizer and the encoder.
MODEL_NAME = "microsoft/codebert-base"
# File-name-safe tag derived from MODEL_NAME (the "/" is replaced) so that pickle
# and checkpoint names always follow the selected model.
MODEL_TAG = MODEL_NAME.replace("/", "_")
# Directory with one sub-folder of .cpp files per problem. Set the
# CODE_SIM_TRAIN_DIR environment variable to point somewhere else without
# editing the notebook; the default is the original location.
root_dir = os.environ.get(
    "CODE_SIM_TRAIN_DIR",
    "/home/leadawon5/decs_jupyter_lab/gitfiles/DACON/code_similarity/bigdata/train_code",
)

class CodeEncoder(nn.Module):
    """Wraps a pretrained transformer and exposes its pooled output.

    Args:
        model_name: Identifier handed to ``AutoModel.from_pretrained``.
    """

    def __init__(self, model_name=MODEL_NAME):
        super().__init__()
        # The attribute name ``encoder`` becomes part of the state_dict keys.
        self.encoder = AutoModel.from_pretrained(model_name)

    def forward(self, input_ids, attention_mask):
        """Run the backbone and return its ``pooler_output``.

        Args:
            input_ids: Token ids, forwarded to the backbone as ``input_ids``.
            attention_mask: Mask, forwarded to the backbone as ``attention_mask``.

        Returns:
            The ``pooler_output`` attribute of the backbone's result.
        """
        backbone_result = self.encoder(
            attention_mask=attention_mask,
            input_ids=input_ids,
        )
        return backbone_result.pooler_output






In [18]:
class CodePairsDataset(Dataset):
    """Pairs of C++ source files labelled same-problem (1) or different-problem (0).

    ``root_dir`` must contain one sub-directory per problem. File paths are built
    from the pattern ``<dir>/<dir>_<k>.cpp`` with ``k`` running from 1 to the
    number of ``.cpp`` files found in that directory; the actual file names in
    the listing are not used, so the files must follow that numbering.

    Args:
        root_dir: Directory holding the per-problem sub-directories.
        tokenizer: Callable tokenizer (Hugging Face style) applied in
            ``__getitem__``.
        max_length: Requested padded/truncated sequence length. When the
            tokenizer exposes an integer ``model_max_length`` smaller than this
            value, the smaller value is used instead, because every sample is
            padded to ``max_length`` and the encoder presumably cannot take
            sequences longer than the tokenizer's limit (512 for RoBERTa-style
            models such as CodeBERT -- confirm against the model card).
    """

    def __init__(self, root_dir, tokenizer, max_length=1024):
        self.tokenizer = tokenizer
        tokenizer_limit = getattr(tokenizer, "model_max_length", None)
        if isinstance(tokenizer_limit, int) and 0 < tokenizer_limit < max_length:
            max_length = tokenizer_limit
        self.max_length = max_length
        self.samples = self._prepare_samples(root_dir)

    def _prepare_samples(self, root_dir):
        """Build the list of ``(path_a, path_b, label)`` triples.

        Positive pairs are all unordered pairs of files within one problem
        directory. For negatives, every file is paired with
        ``(total_files - files_in_its_dir) // number_of_dirs`` files, each drawn
        by picking a random other directory and then a random file inside it
        (duplicates are possible).

        Args:
            root_dir: Directory holding the per-problem sub-directories.

        Returns:
            List of ``(file1, file2, label)`` tuples, positives first.
        """
        problem_dirs = [d for d in os.listdir(root_dir) if os.path.isdir(os.path.join(root_dir, d))]
        problem_files_count = {
            d: sum(1 for f in os.listdir(os.path.join(root_dir, d)) if f.endswith('.cpp'))
            for d in problem_dirs
        }
        # Loop-invariant values, computed once instead of once per sampled pair.
        dir_names = list(problem_files_count)
        total_files = sum(problem_files_count.values())

        def path_of(problem_dir, index):
            # Files are addressed by naming convention, not by directory listing.
            return os.path.join(root_dir, problem_dir, f"{problem_dir}_{index}.cpp")

        samples = []

        # Positive pairs: every unordered pair inside one problem directory.
        for problem_dir, file_count in tqdm(problem_files_count.items(), desc="pos set"):
            for i in range(1, file_count + 1):
                file1 = path_of(problem_dir, i)
                for j in range(i + 1, file_count + 1):
                    samples.append((file1, path_of(problem_dir, j), 1))

        # Negative pairs: random files from other problem directories.
        for problem_dir, file_count in tqdm(problem_files_count.items(), desc="neg set"):
            # Same quota for every file of this directory.
            num_neg_samples_per_file = (total_files - file_count) // len(problem_files_count)
            for i in range(1, file_count + 1):
                current_file = path_of(problem_dir, i)
                neg_samples_added = 0
                while neg_samples_added < num_neg_samples_per_file:
                    other_dir = random.choice(dir_names)
                    # Reject the file's own directory, and directories without
                    # any .cpp file (randint(1, 0) would raise ValueError).
                    if other_dir == problem_dir or problem_files_count[other_dir] == 0:
                        continue
                    other_file_index = random.randint(1, problem_files_count[other_dir])
                    samples.append((current_file, path_of(other_dir, other_file_index), 0))
                    neg_samples_added += 1

        return samples

    def __len__(self):
        """Return the number of prepared pairs."""
        return len(self.samples)

    def _encode_file(self, file_path):
        """Read ``file_path`` as UTF-8 and tokenize it to exactly ``self.max_length`` tokens."""
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()
        return self.tokenizer(text, return_tensors='pt', max_length=self.max_length, padding='max_length', truncation=True)

    def __getitem__(self, idx):
        """Load and tokenize the pair at ``idx``.

        Returns:
            Dict with ``input_ids1``, ``attention_mask1``, ``input_ids2``,
            ``attention_mask2`` (leading batch dimension squeezed away) and
            ``labels`` (float scalar tensor, 1.0 or 0.0).
        """
        file_path1, file_path2, label = self.samples[idx]
        inputs1 = self._encode_file(file_path1)
        inputs2 = self._encode_file(file_path2)

        return {
            "input_ids1": inputs1['input_ids'].squeeze(0),
            "attention_mask1": inputs1['attention_mask'].squeeze(0),
            "input_ids2": inputs2['input_ids'].squeeze(0),
            "attention_mask2": inputs2['attention_mask'].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.float)
        }


In [19]:
class CodeComparisonModel(nn.Module):
    """Scores two batches of tokenized code against each other with one shared encoder.

    Args:
        encoder_model_name: Identifier passed on to ``CodeEncoder``.
    """

    def __init__(self, encoder_model_name=MODEL_NAME):
        super().__init__()
        self.encoder = CodeEncoder(encoder_model_name)

    def forward(self, input_ids1, attention_mask1, input_ids2, attention_mask2):
        """Embed both sides and return their all-pairs dot products.

        Returns:
            ``embedding1 @ embedding2.T``. Assuming the encoder returns one row
            per input, entry ``[i, j]`` is the dot product between item ``i`` of
            the first batch and item ``j`` of the second.
        """
        left, right = (
            self.encoder(ids, mask)
            for ids, mask in ((input_ids1, attention_mask1), (input_ids2, attention_mask2))
        )
        return left @ right.T




In [20]:
def save_checkpoint(model, optimizer, epoch, filepath):
    """Save model and optimizer state for ``epoch`` underneath ``filepath``.

    The file is written to
    ``<filepath>/./bigdata/model/<MODEL_TAG>_checkpoint_epoch_<epoch>.pth``.

    Args:
        model: Module whose ``state_dict()`` is stored.
        optimizer: Optimizer whose ``state_dict()`` is stored.
        epoch: Epoch number, stored in the file and used in its name.
        filepath: Base directory for the checkpoint tree.

    Returns:
        The path of the written checkpoint file.
    """
    checkpoint_path = os.path.join(filepath, f"./bigdata/model/{MODEL_TAG}_checkpoint_epoch_{epoch}.pth")
    # The file lives in a sub-directory of ``filepath``, so create the file's
    # actual parent directory; creating only ``filepath`` leaves it missing.
    os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }, checkpoint_path)
    print(f"Checkpoint saved to {checkpoint_path}")
    return checkpoint_path

def train(model, data_loader, optimizer, device, epochs=1):
    """Train ``model`` on labelled code pairs and checkpoint after every epoch.

    The model returns an all-pairs score matrix for the batch, but each label
    only describes pair ``i`` of the batch, i.e. entry ``[i, i]``. The loss is
    therefore binary cross-entropy between the matrix diagonal (used as logits)
    and the 0/1 labels. Treating the label as a class index into the matrix is
    not the pair objective and fails for a batch of size 1 with label 1.

    Args:
        model: Module called as ``model(input_ids1, attention_mask1,
            input_ids2, attention_mask2)``.
        data_loader: Iterable of dict batches with keys ``input_ids1``,
            ``attention_mask1``, ``input_ids2``, ``attention_mask2``, ``labels``.
        optimizer: Optimizer updating ``model``'s parameters.
        device: Device the batch tensors are moved to.
        epochs: Number of passes over ``data_loader``.
    """
    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        num_batches = 0
        for batch in data_loader:
            optimizer.zero_grad()
            input_ids1 = batch['input_ids1'].to(device)
            attention_mask1 = batch['attention_mask1'].to(device)
            input_ids2 = batch['input_ids2'].to(device)
            attention_mask2 = batch['attention_mask2'].to(device)
            labels = batch['labels'].to(device).float()
            outputs = model(input_ids1, attention_mask1, input_ids2, attention_mask2)
            # Keep only the score of each sample's own pair; fall back to a flat
            # view if the model already returns one score per pair.
            pair_logits = outputs.diagonal() if outputs.dim() == 2 else outputs.reshape(-1)
            loss = F.binary_cross_entropy_with_logits(pair_logits, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            num_batches += 1
        # Avoid dividing by zero when the loader yields no batches.
        mean_loss = total_loss / num_batches if num_batches else 0.0
        print(f"Epoch {epoch+1}/{epochs}, Training loss: {mean_loss}")

        # Save checkpoint after each epoch
        save_checkpoint(model, optimizer, epoch+1, "./model/")

In [None]:
# Example usage: load the tokenizer for MODEL_NAME and index the training pairs.


tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# The constructor builds the complete pair list up front (the "pos set" and
# "neg set" progress bars); source files are only read later, in __getitem__.
dataset = CodePairsDataset(root_dir, tokenizer)



pos set:  61%|███████████████████▋            | 307/500 [02:15<01:42,  1.88it/s]

In [None]:
def save_dataset(dataset, file_path):
    """Pickle ``dataset`` to ``file_path``.

    The parent directory of ``file_path`` is created when it does not exist,
    so saving into a fresh output tree does not fail.

    Args:
        dataset: Any picklable object.
        file_path: Destination file; overwritten if it already exists.
    """
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(file_path, 'wb') as file:
        pickle.dump(dataset, file)

# Create the dataset
# dataset = CodePairsDataset(root_dir, tokenizer)  # example of the code that creates the dataset
# Save the dataset created above as a pickle.
save_dataset(dataset, f'./bigdata/pickles/{MODEL_TAG}_dataset.pkl')


In [None]:
def load_dataset(file_path):
    """Return the object stored in the pickle file at ``file_path``.

    Note: unpickling executes code embedded in the file, so only load pickles
    that this notebook wrote itself.
    """
    with open(file_path, 'rb') as stream:
        return pickle.load(stream)

# Load the dataset back from the saved pickle file.
dataset = load_dataset(f'./bigdata/pickles/{MODEL_TAG}_dataset.pkl')


In [None]:
# Fetch the first pair to inspect what __getitem__ returns.
dataset[0]

In [None]:
# `device` is used by this cell and by the training, checkpoint and inference
# cells below, but no earlier cell defines it; pick the GPU when one is visible.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
data_loader = DataLoader(dataset, batch_size=2, shuffle=True)
model = CodeComparisonModel(MODEL_NAME).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

In [None]:
# Train for one epoch (the default); train() saves a checkpoint after each epoch.
train(model, data_loader, optimizer, device)

In [None]:
def load_checkpoint(model, optimizer, filepath, map_location=None):
    """Restore model and optimizer state from a checkpoint file.

    Args:
        model: Module receiving ``checkpoint['model_state_dict']``.
        optimizer: Optimizer receiving ``checkpoint['optimizer_state_dict']``.
        filepath: Path of the checkpoint file.
        map_location: Forwarded to ``torch.load``; pass e.g. ``'cpu'`` or the
            target device to load a checkpoint saved on a different device.
            ``None`` keeps ``torch.load``'s default behaviour.

    Returns:
        The epoch stored in the checkpoint, or ``None`` when ``filepath`` is
        not an existing file.
    """
    # isfile rather than exists: a directory path is not a loadable checkpoint.
    if not os.path.isfile(filepath):
        print("No checkpoint found at specified path!")
        return None

    checkpoint = torch.load(filepath, map_location=map_location)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    epoch = checkpoint['epoch']
    print(f"Checkpoint loaded from {filepath} at epoch {epoch}")
    return epoch


# Path of the checkpoint file to restore. save_checkpoint() writes to
# "<filepath>/./bigdata/model/<MODEL_TAG>_checkpoint_epoch_<n>.pth" and train()
# passes "./model/" as <filepath>, so the epoch-1 path is built the same way
# (a name without the MODEL_TAG prefix is never produced by save_checkpoint).
checkpoint_path = os.path.join("./model/", f"./bigdata/model/{MODEL_TAG}_checkpoint_epoch_1.pth")

# Load the checkpoint
epoch = load_checkpoint(model, optimizer, checkpoint_path)

In [None]:
def infer(model, tokenizer, text1, text2, device, max_length=1024):
    """Print the model's similarity score for one pair of code snippets.

    Args:
        model: Module called as ``model(input_ids1, attention_mask1,
            input_ids2, attention_mask2)``; its result must hold a single value.
        tokenizer: Callable tokenizer (Hugging Face style).
        text1: First code snippet.
        text2: Second code snippet.
        device: Device the tokenized inputs are moved to.
        max_length: Requested padded/truncated length. Capped at the tokenizer's
            integer ``model_max_length`` when that is smaller, since inputs are
            padded to this length and the encoder presumably cannot take more
            than the tokenizer's limit (confirm for the model in use).
    """
    tokenizer_limit = getattr(tokenizer, "model_max_length", None)
    if isinstance(tokenizer_limit, int) and 0 < tokenizer_limit < max_length:
        max_length = tokenizer_limit

    model.eval()  # put the model in evaluation mode
    with torch.no_grad():  # disable gradient tracking
        # Tokenize the two code snippets
        inputs1 = tokenizer(text1, return_tensors='pt', max_length=max_length, padding='max_length', truncation=True).to(device)
        inputs2 = tokenizer(text2, return_tensors='pt', max_length=max_length, padding='max_length', truncation=True).to(device)

        # Compute the similarity score with the model
        similarity_scores = model(inputs1['input_ids'], inputs1['attention_mask'], inputs2['input_ids'], inputs2['attention_mask'])
        # No decision threshold is applied here; the raw score is just printed.
        print("Similarity score:", similarity_scores.item())

# Example texts for the two code snippets
code_text1 = "def sum(a, b): return a + b"
code_text2 = "def add(x, y): return x + y"

# Run inference
infer(model, tokenizer, code_text1, code_text2, device)
