# IMPORT & SET LOGGER

In [1]:
from __future__ import absolute_import, division, print_function

import sys
import gc
import logging
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler,TensorDataset
from torch.utils.data.distributed import DistributedSampler
from transformers import (TrainingArguments,
                          Trainer,
                          EarlyStoppingCallback,
                          DataCollatorWithPadding,
                          RobertaConfig,
                          RobertaForSequenceClassification,
                          RobertaTokenizer)
from tqdm import tqdm, trange
import multiprocessing
from model import Model
from parser import DFG_python
from parser import ( tree_to_token_index,
                     index_to_code_token,
                     tree_to_variable_index )
from tree_sitter import Language, Parser
from parser import TextDataset
from datasets import load_metric
from transformers import DataCollatorWithPadding
import pandas as pd
from collections import deque
import re

# Notebook-wide logger that prints INFO-level (and above) messages to the cell output.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
if not logger.handlers:
    # Attach the stdout handler only once. Re-running this cell used to stack a
    # new handler on the same logger each time, so every message was printed
    # once per execution of the cell.
    logger.addHandler(logging.StreamHandler(stream=sys.stdout))
# Re-point the first handler at the current sys.stdout (kept from the original cell).
logger.handlers[0].stream = sys.stdout

# Load the Python grammar from the compiled tree-sitter library at this relative path.
LANGUAGE = Language('./clonedetection/parser/my-languages.so', 'python')
_parser = Parser()
_parser.set_language(LANGUAGE)
# `parser` is a two-item list [tree-sitter parser, DFG_python] that is handed to
# TextDataset further down.
# NOTE(review): this rebinds the name `parser`, which is also the name of the
# local module imported from above — later `parser.<attr>` lookups would hit this list.
parser = [_parser, DFG_python]

# SET MODEL, CONFIG, TOKENIZER

In [2]:
# Checkpoint name used for the config, the tokenizer and the encoder weights.
MODEL = "microsoft/graphcodebert-base"
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
config = RobertaConfig.from_pretrained(MODEL)
# NOTE(review): num_labels is set to 1 here, while evaluate()/test() read column 1
# of the model output (`logits[:,1]`) — confirm that `Model` produces a two-column output.
config.num_labels=1
tokenizer = RobertaTokenizer.from_pretrained(MODEL)
# Truncate over-long inputs on the left side instead of the tokenizer's default side.
tokenizer.truncation_side = 'left'
model = RobertaForSequenceClassification.from_pretrained(MODEL, config=config)
# Wrap the encoder in the project's `Model` class; from here on `model` is the wrapper.
model = Model(model, tokenizer=tokenizer, config=config)

Some weights of the model checkpoint at microsoft/graphcodebert-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/graphcodebert-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.d

# MAKE DATASETS

In [None]:
from collections import deque
import re
# PREPROCESSING FOR CODE SCRIPT
# Triple-quoted string literals (docstrings / multi-line strings), one pattern per quote style.
_TRIPLE_DOUBLE_RE = re.compile(r'("""[\w\W]*?""")')
_TRIPLE_SINGLE_RE = re.compile(r"('''[\w\W]*?''')")
# A URL anywhere in the text. The match stops at whitespace or a quote so that a
# surrounding string literal stays well-formed after the substitution.
_URL_RE = re.compile(
    r'\b(?:file|gopher|news|nntp|telnet|https?|ftps?|sftp)://'
    r'(?:[a-z0-9-]+\.)+[a-z0-9]{2,4}[^\s\'"]*',
    re.IGNORECASE,
)


def _strip_inline_comment(line: str) -> str:
    """Return ``line`` without its trailing ``#`` comment; a ``#`` inside quotes is kept."""
    open_quote = None
    pos = 0
    while pos < len(line):
        char = line[pos]
        if open_quote is not None:
            if char == '\\':
                pos += 2  # skip the escaped character, e.g. an escaped quote
                continue
            if char == open_quote:
                open_quote = None
        elif char in ('"', "'"):
            open_quote = char
        elif char == '#':
            return line[:pos]
        pos += 1
    return line


def _clean_code(code: str) -> str:
    """Normalise one source snippet: drop comments and blank lines, mask strings and URLs."""
    kept_lines = []
    for line in code.split('\n'):
        if line.lstrip().startswith('#'):  # skip lines that are only a comment
            continue
        # Keep only the code in front of an inline comment, without trailing whitespace.
        line = _strip_inline_comment(line).rstrip()
        line = line.replace('    ', '\t')  # convert each run of four spaces to a tab
        if line == '':  # skip lines that are empty after cleaning
            continue
        kept_lines.append(line)

    text = '\n'.join(kept_lines)
    text = _TRIPLE_DOUBLE_RE.sub('<str>', text)
    text = _TRIPLE_SINGLE_RE.sub('<str>', text)
    return _URL_RE.sub('<url>', text)


def preprocess_script(df:pd.DataFrame):
    """Clean the ``code1`` / ``code2`` columns of a code-pair frame.

    For every snippet: comment-only lines and blank lines are removed, inline
    ``#`` comments are cut off, runs of four spaces become tabs, triple-quoted
    strings are replaced by ``<str>`` and URLs by ``<url>``.

    Fixes compared with the previous version:
      * a ``#`` inside a string literal (e.g. ``print("#")``) is no longer
        treated as the start of a comment, which used to truncate the line in
        the middle of the string;
      * the URL pattern carried JavaScript-style ``/.../`` delimiters and
        whole-string anchors, so it could never match; URLs are now replaced
        wherever they occur;
      * regular expressions are raw strings (no invalid-escape warnings);
      * trailing whitespace left behind by a removed inline comment is stripped.

    NOTE: training and inference data must go through the same cleaning. This
    notebook defines ``preprocess_script`` in two cells — keep both in sync.

    Args:
        df: frame with string columns ``code1`` and ``code2`` and a ``similar`` column.

    Returns:
        A new ``DataFrame`` with the cleaned ``code1`` / ``code2`` columns and the
        ``similar`` column of ``df`` passed through unchanged.
    """
    cleaned = {
        column: [_clean_code(code) for code in tqdm(df[column])]
        for column in ('code1', 'code2')
    }
    return pd.DataFrame(data={'code1': cleaned['code1'],
                              'code2': cleaned['code2'],
                              'similar': df.similar})

In [None]:
# Clean the raw training pairs and write them to the CSV that TextDataset reads below.
train_data = pd.read_csv('data/train_data_lv1.csv')
preprocess_script(train_data).to_csv('data/train_data.csv', mode='w', index=False)
# Same cleaning for the validation pairs.
valid_data = pd.read_csv('data/valid_data_lv1.csv')
preprocess_script(valid_data).to_csv('data/valid_data.csv', mode='w', index=False)

In [None]:
# Build the training dataset from the cleaned CSV written by the preprocessing cell.
train_dataset = TextDataset(
    file_path='data/train_data.csv',
    tokenizer=tokenizer,
    parser=parser
)

In [None]:
# Build the validation dataset from the cleaned CSV written by the preprocessing cell.
valid_dataset = TextDataset(
    file_path='data/valid_data.csv',
    tokenizer=tokenizer,
    parser=parser
)

# FUNCTIONS

In [None]:
def evaluate(valid_dataset, model, eval_batch_size, eval_when_training=False):
    """Score ``model`` on ``valid_dataset`` and return classification metrics.

    Batches are read in order. Each batch is expected to unpack into seven
    tensors (ids / position indices / attention mask for both code snippets,
    then the labels), and the model is called with all seven and must return
    ``(loss, logit)``. Column 1 of the concatenated ``logit`` array is compared
    against a fixed 0.5 threshold to obtain the predictions
    (presumably the positive-class score — confirm against ``Model``).

    Args:
        valid_dataset: dataset yielding the seven-tensor batches described above.
        model: the model to score; switched to eval mode by this function.
        eval_batch_size: batch size of the evaluation loader.
        eval_when_training: when ``False`` and more than one GPU is visible the
            model is wrapped in ``DataParallel``. A model that is already a
            ``DataParallel`` instance is never wrapped a second time.

    Returns:
        dict with ``eval_recall``, ``eval_precision``, ``eval_f1``,
        ``eval_threshold`` and ``eval_accuracy``.
    """
    # Local import: scikit-learn is only needed here and is not imported at the top of the notebook.
    from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

    # Build the dataloader.
    eval_sampler = SequentialSampler(valid_dataset)
    eval_dataloader = DataLoader(valid_dataset, sampler=eval_sampler,
                                 batch_size=eval_batch_size, num_workers=4)

    # Multi-GPU evaluation; skip the wrap when the caller already passed a DataParallel model.
    already_parallel = isinstance(model, torch.nn.DataParallel)
    if torch.cuda.device_count() > 1 and eval_when_training is False and not already_parallel:
        model = torch.nn.DataParallel(model)

    logger.info("***** Running evaluation *****")
    logger.info("  Num examples = %d", len(valid_dataset))
    logger.info("  Batch size = %d", eval_batch_size)

    model.eval()
    logits = []
    y_trues = []
    for batch in tqdm(eval_dataloader):
        (inputs_ids_1, position_idx_1, attn_mask_1,
         inputs_ids_2, position_idx_2, attn_mask_2,
         labels) = [x.to(device) for x in batch]
        with torch.no_grad():
            # The loss returned by the model is not needed for the metrics below.
            _, logit = model(inputs_ids_1, position_idx_1, attn_mask_1,
                             inputs_ids_2, position_idx_2, attn_mask_2, labels)
        logits.append(logit.cpu().numpy())
        y_trues.append(labels.cpu().numpy())

    # Calculate scores.
    logits = np.concatenate(logits, 0)
    y_trues = np.concatenate(y_trues, 0)
    best_threshold = 0.5
    y_preds = logits[:, 1] > best_threshold

    result = {
        "eval_recall": float(recall_score(y_trues, y_preds)),
        "eval_precision": float(precision_score(y_trues, y_preds)),
        "eval_f1": float(f1_score(y_trues, y_preds)),
        "eval_threshold": best_threshold,
        "eval_accuracy": float(accuracy_score(y_trues, y_preds))
    }

    logger.info("***** Eval results *****")
    for key in sorted(result.keys()):
        logger.info("  %s = %s", key, str(round(result[key], 4)))

    return result

In [None]:
import os
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
from knockknock import discord_sender

# Discord webhook used for the training notification. Read it from the environment
# instead of pasting the secret URL into the notebook; empty when not configured.
webhook_url = os.environ.get('DISCORD_WEBHOOK_URL', '')

def train(model,
          train_dataset:TextDataset,
          valid_dataset:TextDataset,
          train_batch_size=4,
          eval_batch_size=8,
          epochs=1,
          weight_decay=0.0,
          learning_rate=2e-5,
          adam_epsilon=1e-8,
          gradient_accumulation_steps=4,
          max_grad_norm=1.0,
          output_dir='./models/'
          ):
    """Fine-tune ``model`` on ``train_dataset`` and keep the best-F1 checkpoint.

    The model is moved to the global ``device`` and optimised with AdamW under a
    linear warmup/decay schedule (warmup = one fifth of all optimizer updates).
    It is scored with ``evaluate`` on ``valid_dataset`` every ``save_steps``
    optimizer updates; whenever ``eval_f1`` improves, the state dict is written
    to ``<output_dir>/checkpoint-best-f1/model.bin``.

    Fixes compared with the previous version:
      * checkpoint paths are computed once; ``output_dir`` used to be overwritten
        with the file path, so the second improvement tried to create a directory
        underneath ``model.bin`` and failed;
      * the scheduler is sized in optimizer updates rather than batches, so the
        warmup/decay lengths are correct when gradients are accumulated;
      * gradients are clipped once, right before the update, instead of after
        every micro-batch;
      * the last, possibly shorter, accumulation group of an epoch triggers an
        update instead of leaking its gradients into the next epoch;
      * ``save_steps`` can no longer be 0 (division by zero on small datasets);
      * ``evaluate`` is told it runs during training, so an already
        ``DataParallel`` model is not wrapped again;
      * the progress bar shows the real mean batch loss, not the value divided
        by ``gradient_accumulation_steps``.

    Args:
        model: model returning ``(loss, logits)`` for a seven-tensor batch.
        train_dataset: dataset used for optimisation.
        valid_dataset: dataset passed to ``evaluate``.
        train_batch_size: batch size of the training loader.
        eval_batch_size: batch size passed to ``evaluate``.
        epochs: number of passes over ``train_dataset``.
        weight_decay: decay for all parameters except biases and LayerNorm weights.
        learning_rate: peak learning rate of AdamW.
        adam_epsilon: epsilon of AdamW.
        gradient_accumulation_steps: micro-batches accumulated per optimizer update.
        max_grad_norm: gradient-norm clipping threshold.
        output_dir: directory under which ``checkpoint-best-f1/model.bin`` is saved.

    Returns:
        None.
    """
    # Build the dataloader.
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler,
                                  batch_size=train_batch_size, num_workers=4)

    gradient_accumulation_steps = max(1, gradient_accumulation_steps)
    batches_per_epoch = len(train_dataloader)
    # Ceil division: the trailing partial accumulation group also yields an update.
    updates_per_epoch = -(-batches_per_epoch // gradient_accumulation_steps)
    max_steps = epochs * updates_per_epoch
    # Evaluation cadence, counted in optimizer updates (same value as before, never 0).
    save_steps = max(1, batches_per_epoch // 10)
    warmup_steps = max_steps // 5
    model.to(device)

    # Prepare optimizer and schedule (linear warmup and decay).
    no_decay = ['bias', 'LayerNorm.weight']
    decayed, undecayed = [], []
    for name, param in model.named_parameters():
        (undecayed if any(nd in name for nd in no_decay) else decayed).append(param)
    optimizer_grouped_parameters = [
        {'params': decayed, 'weight_decay': weight_decay},
        {'params': undecayed, 'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps,
                                                num_training_steps=max_steps)

    # Multi-GPU training.
    if torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)

    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", epochs)
    logger.info("  Instantaneous batch size per GPU = %d", train_batch_size//max(torch.cuda.device_count(), 1))
    logger.info("  Total train batch size = %d",train_batch_size*gradient_accumulation_steps)
    logger.info("  Gradient Accumulation steps = %d", gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", max_steps)

    # Fixed checkpoint location; never derived from a previously saved path.
    checkpoint_dir = os.path.join(output_dir, 'checkpoint-best-f1')
    checkpoint_path = os.path.join(checkpoint_dir, 'model.bin')

    global_step = 0
    best_f1 = 0

    model.zero_grad()

    for idx in range(epochs):
        bar = tqdm(train_dataloader, total=batches_per_epoch)
        tr_num = 0
        train_loss = 0.0
        for step, batch in enumerate(bar):
            (inputs_ids_1, position_idx_1, attn_mask_1,
             inputs_ids_2, position_idx_2, attn_mask_2,
             labels) = [x.to(device) for x in batch]
            # evaluate() leaves the model in eval mode, so re-enable training mode each step.
            model.train()
            loss, _ = model(inputs_ids_1, position_idx_1, attn_mask_1,
                            inputs_ids_2, position_idx_2, attn_mask_2, labels)

            if torch.cuda.device_count() > 1:
                loss = loss.mean()  # one loss per replica under DataParallel

            # Track the unscaled loss for the progress bar.
            tr_num += 1
            train_loss += loss.item()
            bar.set_description("epoch {} loss {}".format(idx, round(train_loss/tr_num, 5)))

            if gradient_accumulation_steps > 1:
                loss = loss / gradient_accumulation_steps
            loss.backward()

            is_last_batch = (step + 1) == batches_per_epoch
            if (step + 1) % gradient_accumulation_steps != 0 and not is_last_batch:
                continue  # keep accumulating gradients

            # Clip the fully accumulated gradient once, then apply the update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            optimizer.zero_grad()
            scheduler.step()
            global_step += 1

            if global_step % save_steps != 0:
                continue

            results = evaluate(valid_dataset, model, eval_batch_size, eval_when_training=True)

            # Save model checkpoint when the validation F1 improves.
            if results['eval_f1'] > best_f1:
                best_f1 = results['eval_f1']
                logger.info("  "+"*"*20)
                logger.info("  Best f1:%s",round(best_f1,4))
                logger.info("  "+"*"*20)

                os.makedirs(checkpoint_dir, exist_ok=True)
                # Unwrap DataParallel so the saved keys carry no `module.` prefix.
                model_to_save = model.module if hasattr(model,'module') else model
                torch.save(model_to_save.state_dict(), checkpoint_path)
                logger.info("Saving model checkpoint to %s", checkpoint_path)


# Attach the Discord notifier only when a webhook URL is configured.
# NOTE(review): the notifier presumably posts to the webhook as soon as the call
# starts, so an empty URL would fail before any training happens — confirm
# against the knockknock version in use.
if webhook_url:
    train = discord_sender(webhook_url=webhook_url)(train)

In [None]:
def test(test_dataset: TextDataset,
         model,
         eval_batch_size=16,
         best_threshold=0):
    """Predict a 0/1 label for every pair in ``test_dataset``.

    Batches are read in order and must unpack into seven tensors (ids /
    position indices / attention mask for both snippets, then labels). The
    model is called with all seven and must return ``(loss, logit)``; column 1
    of the concatenated ``logit`` array is compared against ``best_threshold``.

    The model is not moved to ``device`` here — the caller is expected to have
    done that. It is wrapped in ``DataParallel`` when several GPUs are visible,
    unless it already is a ``DataParallel`` instance.

    Args:
        test_dataset: dataset yielding the seven-tensor batches described above.
        model: the model used for prediction; switched to eval mode.
        eval_batch_size: batch size of the loader.
        best_threshold: a pair is predicted positive when its column-1 score
            is strictly greater than this value.

    Returns:
        NumPy integer array of 0/1 predictions, in dataset order.
    """
    # Build the dataloader.
    eval_sampler = SequentialSampler(test_dataset)
    eval_dataloader = DataLoader(test_dataset, sampler=eval_sampler,
                                 batch_size=eval_batch_size, num_workers=4)

    # Multi-GPU inference; never wrap an already wrapped model a second time.
    if torch.cuda.device_count() > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    logger.info("***** Running Test *****")
    logger.info("  Num examples = %d", len(test_dataset))
    logger.info("  Batch size = %d", eval_batch_size)

    model.eval()
    logits = []
    for batch in tqdm(eval_dataloader):
        (inputs_ids_1, position_idx_1, attn_mask_1,
         inputs_ids_2, position_idx_2, attn_mask_2,
         labels) = [x.to(device) for x in batch]
        with torch.no_grad():
            # The returned loss is ignored; only the scores are needed for prediction.
            _, logit = model(inputs_ids_1, position_idx_1, attn_mask_1,
                             inputs_ids_2, position_idx_2, attn_mask_2, labels)
        logits.append(logit.cpu().numpy())

    # Output result: boolean mask turned into 0/1 integers.
    logits = np.concatenate(logits, 0)
    y_preds = logits[:, 1] > best_threshold

    return y_preds*1

# DO TRAIN

In [None]:
# Run the garbage collector and ask PyTorch to empty its CUDA cache before training.
# (Both modules are already imported in the first cell; the import here is redundant.)
import torch, gc
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Start fine-tuning: batch size 2, three epochs, weight decay 0.001.
# Every other hyper-parameter keeps the default declared in train().
train(model,
      train_dataset=train_dataset,
      valid_dataset=valid_dataset,
      train_batch_size=2,
      epochs=3,
      weight_decay=0.001
     )

# MAKE SUBMISSION.csv

In [None]:
from collections import deque
import re
# PREPROCESSING FOR CODE SCRIPT
# Triple-quoted string literals (docstrings / multi-line strings), one pattern per quote style.
_TRIPLE_DOUBLE_RE = re.compile(r'("""[\w\W]*?""")')
_TRIPLE_SINGLE_RE = re.compile(r"('''[\w\W]*?''')")
# A URL anywhere in the text. The match stops at whitespace or a quote so that a
# surrounding string literal stays well-formed after the substitution.
_URL_RE = re.compile(
    r'\b(?:file|gopher|news|nntp|telnet|https?|ftps?|sftp)://'
    r'(?:[a-z0-9-]+\.)+[a-z0-9]{2,4}[^\s\'"]*',
    re.IGNORECASE,
)


def _strip_inline_comment(line: str) -> str:
    """Return ``line`` without its trailing ``#`` comment; a ``#`` inside quotes is kept."""
    open_quote = None
    pos = 0
    while pos < len(line):
        char = line[pos]
        if open_quote is not None:
            if char == '\\':
                pos += 2  # skip the escaped character, e.g. an escaped quote
                continue
            if char == open_quote:
                open_quote = None
        elif char in ('"', "'"):
            open_quote = char
        elif char == '#':
            return line[:pos]
        pos += 1
    return line


def _clean_code(code: str) -> str:
    """Normalise one source snippet: drop comments and blank lines, mask strings and URLs."""
    kept_lines = []
    for line in code.split('\n'):
        if line.lstrip().startswith('#'):  # skip lines that are only a comment
            continue
        # Keep only the code in front of an inline comment, without trailing whitespace.
        line = _strip_inline_comment(line).rstrip()
        line = line.replace('    ', '\t')  # convert each run of four spaces to a tab
        if line == '':  # skip lines that are empty after cleaning
            continue
        kept_lines.append(line)

    text = '\n'.join(kept_lines)
    text = _TRIPLE_DOUBLE_RE.sub('<str>', text)
    text = _TRIPLE_SINGLE_RE.sub('<str>', text)
    return _URL_RE.sub('<url>', text)


def preprocess_script(df:pd.DataFrame):
    """Clean the ``code1`` / ``code2`` columns of a code-pair frame.

    For every snippet: comment-only lines and blank lines are removed, inline
    ``#`` comments are cut off, runs of four spaces become tabs, triple-quoted
    strings are replaced by ``<str>`` and URLs by ``<url>``.

    Fixes compared with the previous version:
      * a ``#`` inside a string literal (e.g. ``print("#")``) is no longer
        treated as the start of a comment, which used to truncate the line in
        the middle of the string;
      * the URL pattern carried JavaScript-style ``/.../`` delimiters and
        whole-string anchors, so it could never match; URLs are now replaced
        wherever they occur;
      * regular expressions are raw strings (no invalid-escape warnings);
      * trailing whitespace left behind by a removed inline comment is stripped.

    NOTE: training and inference data must go through the same cleaning. This
    notebook defines ``preprocess_script`` in two cells — keep both in sync.

    Args:
        df: frame with string columns ``code1`` and ``code2`` and a ``similar`` column.

    Returns:
        A new ``DataFrame`` with the cleaned ``code1`` / ``code2`` columns and the
        ``similar`` column of ``df`` passed through unchanged.
    """
    cleaned = {
        column: [_clean_code(code) for code in tqdm(df[column])]
        for column in ('code1', 'code2')
    }
    return pd.DataFrame(data={'code1': cleaned['code1'],
                              'code2': cleaned['code2'],
                              'similar': df.similar})

In [None]:
# Bring the unlabeled test pairs into the same layout as the training CSVs.
test_data = pd.read_csv('data/test.csv')
test_data = test_data.drop('pair_id', axis=1)
# preprocess_script reads `df.similar`, so add a placeholder label column.
test_data['similar'] = [None] * len(test_data)
test_data = preprocess_script(test_data)
test_data.to_csv('data/test_1.csv', mode='w', index=False)

In [None]:
# Build the test dataset from the cleaned CSV written by the previous cell.
test_dataset = TextDataset(
    file_path='data/test_1.csv',
    tokenizer=tokenizer,
    parser=parser
)

In [None]:
# Predict the test pairs and write the submission file.
# NOTE(review): this uses whatever weights `model` holds in memory at this point;
# the best-F1 checkpoint saved by train() is not loaded here — confirm that is intended.
model.to(device)
sub = pd.read_csv('data/sample_submission.csv')
sub['similar'] = test(test_dataset, model, eval_batch_size=16, best_threshold=0.5)
sub.to_csv('submissions/graphcodebert_submission.csv', mode='w', index=False)

In [None]:
# Display the submission frame for a quick visual check.
sub

In [57]:
# Executed copy of the test-dataset cell above (same arguments); rebuilds `test_dataset`.
test_dataset = TextDataset(
    file_path='data/test_1.csv',
    tokenizer=tokenizer,
    parser=parser
)

100%|██████████████████████████████████| 179700/179700 [15:49<00:00, 189.18it/s]


In [58]:
# Executed copy of the submission cell above: predict the test pairs and write the CSV.
# NOTE(review): this uses whatever weights `model` holds in memory at this point;
# no checkpoint is loaded in this cell — confirm that is intended.
model.to(device)
sub = pd.read_csv('data/sample_submission.csv')
sub['similar'] = test(test_dataset, model, eval_batch_size=16, best_threshold=0.5)
sub.to_csv('submissions/graphcodebert_submission.csv', mode='w', index=False)

***** Running Test *****
  Num examples = 179700
  Batch size = 16


100%|███████████████████████████████████| 11232/11232 [1:56:44<00:00,  1.60it/s]


In [59]:
# Display the submission frame for a quick visual check.
# NOTE(review): every row visible in the saved output below has similar == 1 —
# worth checking the prediction distribution before submitting.
sub

Unnamed: 0,pair_id,similar
0,1,1
1,2,1
2,3,1
3,4,1
4,5,1
...,...,...
179695,179696,1
179696,179697,1
179697,179698,1
179698,179699,1
