## Import

In [6]:
import os

# Set CUDA_VISIBLE_DEVICES to '1' in this first cell, before the cell that imports torch runs.
os.environ.update(CUDA_VISIBLE_DEVICES='1')




In [16]:
import pandas as pd
import numpy as np
import glob
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import torch
from transformers import AutoModel, AutoTokenizer
import re

In [17]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

## Load Test dataset

In [18]:
# Read the test pairs from disk and preview the first five rows.
test = pd.read_csv(filepath_or_buffer="./bigdata/test.csv")
test.head(n=5)

Unnamed: 0,pair_id,code1,code2
0,TEST_000000,#include <bits/stdc++.h>\nusing namespace std;...,"#include <bits/stdc++.h>\n#define rep(i, n) fo..."
1,TEST_000001,"#include<bits/stdc++.h>\n#define rep(i,n)for(i...",// //bitset操作\n// #include <iostream>\n// #inc...
2,TEST_000002,#include <bits/stdc++.h>\nusing namespace std;...,#include <bits/stdc++.h>\n#include <ext/pb_ds/...
3,TEST_000003,#include <bits/stdc++.h>\nusing namespace std;...,#include <bits/stdc++.h>\nusing namespace std;...
4,TEST_000004,#include<bits/stdc++.h>\nusing namespace std;\...,#include<iostream>\n#include<algorithm>\n#incl...


In [19]:
# One alternation scanned left to right, so whichever construct starts first wins:
# a `//` inside a string literal is part of the string, and a quote inside a
# comment is part of the comment.
_CPP_TOKEN_RE = re.compile(
    r'(?P<comment>/\*.*?\*/|//[^\n]*)'   # block comment or line comment
    r'|(?P<dq>"(?:\\.|[^"\\\n])*")'      # double-quoted literal, honoring backslash escapes
    r"|(?P<sq>'(?:\\.|[^'\\\n])*')",     # single-quoted literal, honoring backslash escapes
    flags=re.DOTALL,
)

# Replacement text per matched group: comments become one space so the tokens on
# either side are not glued together; literals keep their quotes but lose their content.
_CPP_TOKEN_REPLACEMENTS = {'comment': ' ', 'dq': '""', 'sq': "''"}


def remove_comments(cpp_code):
    """Normalize C++ source text before tokenization.

    The following is applied to ``cpp_code``:

    * block (``/* ... */``) and line (``// ...``) comments are removed;
    * the contents of double- and single-quoted literals are blanked, leaving
      ``""`` and ``''`` in place;
    * every run of whitespace (including newlines) is collapsed to a single
      space and the result is stripped.

    Comments and literals are recognized in a single pass, so comment markers
    inside a string (e.g. ``"http://x"``) and escaped quotes (``"a\\"b"``) are
    handled correctly. Unterminated literals and unterminated block comments are
    left untouched.

    Args:
        cpp_code: C++ source code as a string.

    Returns:
        The cleaned, single-line code string.
    """
    code = _CPP_TOKEN_RE.sub(
        lambda match: _CPP_TOKEN_REPLACEMENTS[match.lastgroup], cpp_code
    )
    # Collapse all whitespace (spaces, tabs, blank lines) into single spaces.
    code = re.sub(r'\s+', ' ', code)
    return code.strip()


## Define Model (CodeBERT embeddings + Cosine Similarity)

In [44]:
class BaselineModel():
    """Code-pair similarity scorer built on the ``neulab/codebert-cpp`` encoder.

    Each snippet is cleaned with ``remove_comments``, encoded, pooled into a
    single vector, and the two vectors are compared with cosine similarity.
    The encoder is used as loaded; no training happens in this class.
    """

    def __init__(self):
        super(BaselineModel, self).__init__()

        self.encoder = AutoModel.from_pretrained("neulab/codebert-cpp").to(device)
        self.tokenizer = AutoTokenizer.from_pretrained("neulab/codebert-cpp")

    def _embed(self, code):
        """Encode one code string into a ``(1, hidden_size)`` embedding tensor."""
        # A single sequence needs no padding; padding to max_length would only force
        # a 512-token forward pass for every snippet. Inputs are truncated to 512 tokens.
        inputs = self.tokenizer(
            remove_comments(code),
            return_tensors='pt',
            max_length=512,
            truncation=True,
        ).to(device)

        with torch.no_grad():
            outputs = self.encoder(**inputs)

        # Do not use `pooler_output`: the checkpoint ships no pooler weights (they are
        # reported as "newly initialized" at load time), so that vector would depend on
        # random weights. Average the token states under the attention mask instead.
        hidden = outputs.last_hidden_state
        mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
        token_count = mask.sum(dim=1).clamp(min=1)
        return (hidden * mask).sum(dim=1) / token_count

    def predict_proba(self, code1, code2):
        """Return the cosine similarity between the embeddings of two code strings.

        Args:
            code1: First C++ source string.
            code2: Second C++ source string.

        Returns:
            A scalar cosine similarity. It is a similarity score, not a
            calibrated probability.
        """
        code1_vec = self._embed(code1)
        code2_vec = self._embed(code2)

        # sklearn returns a (1, 1) matrix for two single-row inputs; unwrap the scalar.
        similarity = cosine_similarity(code1_vec.cpu().numpy(), code2_vec.cpu().numpy())[0][0]
        return similarity

    def predict(self, code1_series, code2_series):
        """Score every ``(code1, code2)`` pair.

        Args:
            code1_series: Iterable of first code strings.
            code2_series: Iterable of second code strings, aligned with ``code1_series``.

        Returns:
            A list with one similarity score per pair, in input order (empty for
            empty input). No threshold is applied here; binarizing the scores is
            left to the caller.
        """
        # Give tqdm a total when the input has a length so it can show progress/ETA.
        try:
            total = len(code1_series)
        except TypeError:
            total = None

        result = []
        for code1, code2 in tqdm(zip(code1_series, code2_series), total=total):
            result.append(self.predict_proba(code1, code2))
        # Return the whole list, not just the score of the last pair.
        return result

## Inference

In [45]:
# 모델 추론
# Model inference: build the scorer, then run it over the code1/code2 columns of the test set.
model = BaselineModel()
preds = model.predict(code1_series=test['code1'], code2_series=test['code2'])

Some weights of the model checkpoint at neulab/codebert-cpp were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at neulab/codebert-cpp and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
595000it [3:15:48,

In [46]:
# Binarize the similarity scores: 1 when strictly greater than 0.5, otherwise 0.
preds = np.where(np.greater(np.asarray(preds), 0.5), 1, 0)

## Submission

In [None]:
# Load the sample submission, set its 'similar' column to the predictions, and write it out.
submission = pd.read_csv('./bigdata/sample_submission.csv').assign(similar=preds)
submission.to_csv('./bigdata/base_submit.csv', index=False)