In [None]:
# Mount Google Drive at /content/drive; the dataset paths used below live under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%%capture
# NOTE(review): no version is pinned, so a re-run may install a different transformers release.
!pip install -U transformers

In [None]:
import numpy as np
import pandas as pd
import zipfile
import re
import string
import os
from tqdm import tqdm
import matplotlib.pyplot as plt 
import seaborn as sns
from transformers import AutoTokenizer

# Disable pandas' chained-assignment warning for the whole notebook.
pd.set_option('mode.chained_assignment',  None)
# Project root on the mounted Drive. It ends with '/', so sub-paths appended to it
# should not start with another separator.
path = '/content/drive/MyDrive/Dacon/Code_Similarity/'

---

In [None]:
# Load the test pairs. `path` already ends with '/', so build the location with
# os.path.join instead of prepending a second separator.
test = pd.read_csv(os.path.join(path, 'data/test.csv'))
# pair_id is dropped; only the two code columns are kept.
test = test.drop(columns='pair_id')
test.head(3)

Unnamed: 0,code1,code2
0,def main():\n s = input()\n if s.count('a') ...,"N,K = map(int,input().split())\nA = list(map(i..."
1,"N,K,Q = map(int,input().split())\npoints = [0]...","N, K, Q = map(int,input().split())\n\nif K > Q..."
2,from itertools import combinations\nn = int(in...,s = input()\nt = input()\nlength_s = len(s)\nl...


---

In [None]:
%%capture
!mkdir code
zipfile.ZipFile(path + 'data/code.zip').extractall('code/')

In [None]:
code_path = 'code/'
# os.listdir returns entries in arbitrary order; sort them so the rows of the
# resulting frame come out in the same order on every run.
problems = sorted(os.listdir(code_path))

def contains_NameIsMain(code):
    """Inline the body of an ``if __name__ == '__main__':`` guard.

    The guard line itself is removed and the block it guards (the run of blank
    or indented lines that directly follows it) is dedented by its common
    leading whitespace, so tab- and space-indented bodies are handled alike.
    Anything before the guard, and any unindented code after the guarded
    block, is kept unchanged.

    Args:
        code: Source text of a script.

    Returns:
        The source with the guard inlined, or ``code`` unchanged when neither
        the single-quoted nor the double-quoted guard spelling occurs in it.
    """
    import textwrap  # local import: only this helper needs it

    # Single-quoted spelling is checked first, then the double-quoted one.
    for guard in ("if __name__ == '__main__':", 'if __name__ == "__main__":'):
        if guard in code:
            cut = code.index(guard)
            break
    else:
        return code

    main_component = code[:cut]
    # Drop the guard line itself; what follows is the candidate body.
    following = code[cut:].split('\n')[1:]

    # The guarded body ends at the first non-blank line that is not indented.
    body_end = len(following)
    for i, line in enumerate(following):
        if line.strip() and not line[0].isspace():
            body_end = i
            break

    guarded = following[:body_end]
    trailing = following[body_end:]
    dedented = textwrap.dedent('\n'.join(guarded)).split('\n') if guarded else []

    return main_component + '\n'.join(dedented + trailing)

def clean_code(script):
    """Read a script from disk and normalise it line by line.

    The file is read as a single string so that ``contains_NameIsMain`` can do
    its substring match; passing the list returned by ``readlines()`` made that
    match an element-equality test that practically never succeeded.

    Per line: full-line comments are skipped, trailing whitespace is stripped,
    text is lower-cased, anything from the first '#' on is cut, non-ASCII
    characters are dropped, 'true'/'false' become '1'/'0', double quotes become
    single quotes and four spaces become a tab. Lines that end up empty are
    dropped.

    Args:
        script: Path of the script file (read as UTF-8).

    Returns:
        The cleaned lines joined with '\\n'.
    """
    with open(script, 'r', encoding='utf-8') as file:
        code = file.read()
    code = contains_NameIsMain(code)

    tmp = []
    for line in code.split('\n'):
        if line.lstrip().startswith('#'):
            continue
        line = line.rstrip()
        line = line.lower()
        if '#' in line:
            # Cuts at the first '#', including one that sits inside a string literal.
            line = line[:line.index('#')]
        line = line.encode('ascii', 'ignore').decode()
        line = line.replace('true', '1')
        line = line.replace('false', '0')
        # NOTE(review): str.replace treats the regex-looking patterns below as
        # literal text, so they only fire on a line containing that exact text.
        # They are kept as-is to leave the produced data unchanged; confirm
        # whether re.sub was intended.
        line = line.replace(r'\'\w+', '')
        line = line.replace(r'\w*\d+\w*', '')
        line = line.replace('"', "'")
        line = line.replace('    ','\t')
        line = line.replace('\n','')
        line = line.replace(r'https*\S+', '')
        line = line.replace(r'http*\S+', '')
        if line == '':
            continue
        tmp.append(line)
    return '\n'.join(tmp)

cleaned = []
p_nums = []
# Use a distinct loop variable so the `problems` list is not overwritten by its
# own elements.
for problem in tqdm(problems):
    scripts = os.listdir(os.path.join(code_path, problem))
    if not scripts:
        # Nothing to clean, and no file name to read the problem number from.
        continue
    # The problem number is the digits found in the part of the first file
    # name that precedes the first '_'.
    num = int(re.sub(r'[^0-9]', '', scripts[0].split('_')[0]))

    for script in scripts:
        script_file = os.path.join(code_path, problem, script)
        cleaned.append(clean_code(script_file))

    p_nums.extend([num] * len(scripts))

inputs = pd.DataFrame(data={"code" : cleaned, "problem_number" : p_nums})
print(f'\n{len(inputs)}')

100%|██████████| 300/300 [00:02<00:00, 127.62it/s]



45101


In [None]:
def clean_test_code(script):
    """Normalise the source text of one test-set script.

    The main guard is inlined via ``contains_NameIsMain`` and the text is
    lower-cased. Each line is then processed: full-line comments are skipped,
    trailing whitespace is stripped, everything from the first '#' on is cut,
    non-ASCII characters are dropped and the replacements below are applied in
    order. Lines that end up empty are dropped.

    Note that ``str.replace`` matches its first argument literally, so the
    regex-looking patterns only fire on lines containing that exact text.

    Args:
        script: Source text of a script.

    Returns:
        The cleaned lines joined with '\\n'.
    """
    replacements = (
        ('true', '1'),
        ('false', '0'),
        (r'\'\w+', ''),
        (r'\w*\d+\w*', ''),
        ('"', "'"),
        ('    ', '\t'),
        ('\n', ''),
        (r'https*\S+', ''),
        (r'http*\S+', ''),
    )

    lowered = contains_NameIsMain(script).lower()

    kept = []
    for raw_line in lowered.split('\n'):
        if raw_line.lstrip().startswith('#'):
            continue

        text = raw_line.rstrip()
        hash_at = text.find('#')
        if hash_at != -1:
            text = text[:hash_at]

        text = text.encode('ascii', 'ignore').decode()
        for old, new in replacements:
            text = text.replace(old, new)

        if text:
            kept.append(text)

    return "\n".join(kept)

# Run the same cleaner over both sides of every test pair.
test = test.assign(
    code1=test['code1'].apply(clean_test_code),
    code2=test['code2'].apply(clean_test_code),
)

In [None]:
# Load the tokenizer published with microsoft/codebert-base.
# NOTE(review): `do_lowercase` does not look like a standard tokenizer argument
# (BERT-style tokenizers spell it `do_lower_case`) — confirm it has any effect.
tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base", do_lowercase=False)
# Truncate from the left when an input is too long (presumably this keeps the
# tail of a script — confirm against the tokenizer documentation).
tokenizer.truncation_side = 'left'

In [None]:
def get_length(script):
    """Return the number of tokens the module-level `tokenizer` yields for `script`."""
    tokens = tokenizer.tokenize(script)
    return len(tokens)
    
# Token count of every cleaned script, used below to filter over-long samples.
inputs = inputs.assign(code_length=inputs['code'].apply(get_length))

Token indices sequence length is longer than the specified maximum sequence length for this model (716 > 512). Running this sequence through the model will result in indexing errors


In [None]:
# How many scripts fall on each side of the 512-token and 1000-token marks.
print(
    f"length <= 512 : {(inputs['code_length'] <= 512).sum()}",
    f"length > 512 : {(inputs['code_length'] > 512).sum()}",
    f"length > 1000 : {(inputs['code_length'] > 1000).sum()}",
    sep="\n",
)

length <= 512 : 43698
length > 512 : 1403
length > 1000 : 169


In [None]:
# Keep scripts of at most 1000 tokens, then drop the helper column and renumber the rows.
inputs = (
    inputs.loc[inputs['code_length'] <= 1000]
    .drop(columns='code_length')
    .reset_index(drop=True)
)

---

In [None]:
from random import shuffle, sample
from itertools import combinations, product


def split_data(x:list, frac=0.8):
    """Randomly split `x` into two parts.

    A shuffled copy is split, so the caller's list keeps its original order.

    Args:
        x: Items to split.
        frac: Share of the items that goes into the first part.

    Returns:
        Tuple ``(first, second)``; `first` holds ``int(len(x) * frac)`` items
        and `second` holds the rest.
    """
    shuffled = list(x)
    shuffle(shuffled)
    cut = int(len(shuffled) * frac)
    return shuffled[:cut], shuffled[cut:]

def get_pairs(x:list):
    """Split a list of 2-item pairs into two parallel lists.

    Returns:
        Tuple ``(firsts, seconds)`` with the first and the second element of
        every pair, in input order.
    """
    firsts, seconds = [], []
    for pair in x:
        firsts.append(pair[0])
        seconds.append(pair[1])
    return firsts, seconds

def stratified_sample(df, frac=0.6):
    """Sample the fraction `frac` of rows from every `problem_number` group of `df`."""
    per_problem = df.groupby("problem_number")
    return per_problem.sample(frac=frac)

In [None]:
def _sample_up_to(population, k):
    """Sample k items, or every item when the population holds fewer than k."""
    return sample(population, k=min(k, len(population)))


def _sample_cross_pairs(left, right, k):
    """Draw k distinct (left item, right item) pairs without building the full product."""
    n_right = len(right)
    # Index i of the row-major product left x right maps to (i // n_right, i % n_right).
    picks = sample(range(len(left) * n_right), k=k)
    return [(left[i // n_right], right[i % n_right]) for i in picks]


pos_pairs_train, pos_pairs_val = [], []
neg_pairs_train, neg_pairs_val = [], []
problem_number = inputs['problem_number'].unique().tolist()

# Pairs drawn per problem: 75000 in total for training, and 20% of that
# per-problem count for validation.
n_size = 75000 // len(problem_number)
n_size_val = round(n_size * 0.2)

for p_num in tqdm(problem_number):
    tmp_pos = inputs[inputs['problem_number']==p_num]['code'].tolist()
    tmp_pos_train, tmp_pos_val = split_data(tmp_pos)

    # Positive pairs: two different solutions of the same problem.
    pos_train = _sample_up_to(list(combinations(tmp_pos_train, 2)), n_size)
    # Validation positives must come from the held-out split; building them
    # from tmp_pos_train leaks training solutions into validation.
    pos_val = _sample_up_to(list(combinations(tmp_pos_val, 2)), n_size_val)

    pos_pairs_train.extend(pos_train)
    pos_pairs_val.extend(pos_val)

    # Negative pairs: a solution of this problem paired with one of another problem.
    tmp_neg = inputs[inputs['problem_number']!=p_num]
    tmp_neg = stratified_sample(tmp_neg, 0.9)['code'].to_list()

    # NOTE(review): the negatives are re-split for every problem, so a script
    # can land in the train part for one problem and the validation part for
    # another — confirm whether one global split was intended.
    tmp_neg_train, tmp_neg_val = split_data(tmp_neg)
    neg_train = _sample_cross_pairs(tmp_pos_train, tmp_neg_train, n_size)
    neg_val = _sample_cross_pairs(tmp_pos_val, tmp_neg_val, n_size_val)

    neg_pairs_train.extend(neg_train)
    neg_pairs_val.extend(neg_val)

100%|██████████| 300/300 [02:16<00:00,  2.19it/s]


In [None]:
# Sizes of the four pair collections built above.
print(
    f"Num. Positive Pairs for Train {len(pos_pairs_train)}",
    f"Num. Positive Pairs for Validation {len(pos_pairs_val)}",
    f"Num. Negative Pairs for Train {len(neg_pairs_train)}",
    f"Num. Negative Pairs for Validation {len(neg_pairs_val)}",
    sep="\n",
)

Num. Positive Pairs for Train 75000
Num. Positive Pairs for Validation 15000
Num. Negative Pairs for Train 75000
Num. Negative Pairs for Validation 15000


In [None]:
# Positives come first in every concatenation, so labels stay aligned with pairs.
pairs_train = pos_pairs_train + neg_pairs_train
pairs_val = pos_pairs_val + neg_pairs_val

# Label 1 for positive pairs, 0 for negative pairs.
pos_labels_train, neg_labels_train = [1] * len(pos_pairs_train), [0] * len(neg_pairs_train)
pos_labels_val, neg_labels_val = [1] * len(pos_pairs_val), [0] * len(neg_pairs_val)

labels_train = pos_labels_train + neg_labels_train
labels_val = pos_labels_val + neg_labels_val

In [None]:
# Split every (code1, code2) pair into two parallel lists, one per side.
code1_train, code2_train = get_pairs(pairs_train)
code1_val, code2_val = get_pairs(pairs_val)

In [None]:
# One row per pair: both code sides plus the 0/1 label.
df_train = pd.DataFrame(
    {"code1": code1_train, "code2": code2_train, "label": labels_train}
)
df_val = pd.DataFrame(
    {"code1": code1_val, "code2": code2_val, "label": labels_val}
)

In [None]:
# Shuffle the rows (positives and negatives were concatenated in blocks) and
# renumber the index from 0.
df_train = df_train.sample(frac=1, ignore_index=True)
df_val = df_val.sample(frac=1, ignore_index=True)

In [None]:
# Persist the train / validation pair frames and the cleaned test frame under the project's data folder.
data_dir = path + "data/"
df_train.to_pickle(data_dir + "train.pkl")
df_val.to_pickle(data_dir + "val.pkl")
test.to_pickle(data_dir + "test.pkl")