This notebook attempts to reproduce the code search results reported in the paper ["CodeBERT: A Pre-Trained Model for Programming and Natural Languages"](https://arxiv.org/pdf/2002.08155.pdf).
It closely follows the paper's framework, but focuses only on the Python programming language.

In [3]:
from __future__ import absolute_import
import os
import sys
import torch
import json
import random
from pathlib import Path
import numpy as np
from io import open
from itertools import cycle
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler, TensorDataset
from transformers import (AdamW, get_linear_schedule_with_warmup,
                          RobertaConfig, RobertaModel, RobertaTokenizer, RobertaForSequenceClassification)

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [15]:
# Load the 'microsoft/codebert-base' checkpoint with a two-label sequence-classification model.
config = RobertaConfig.from_pretrained('microsoft/codebert-base', num_labels=2, finetuning_task="codesearch")
# NOTE(review): whether RobertaTokenizer acts on `do_lower_case` is not visible here — confirm.
tokenizer = RobertaTokenizer.from_pretrained('microsoft/codebert-base', do_lower_case=True)
model = RobertaForSequenceClassification.from_pretrained('microsoft/codebert-base', config=config)
# Move the model to the device selected earlier in the notebook.
model.to(device)

Some weights of the model checkpoint at microsoft/codebert-base were not used when initializing RobertaForSequenceClassification: ['pooler.dense.weight', 'pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be 

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

In [27]:
class RawTextData:
    """Plain container for one raw example: two text fields and a label."""

    def __init__(self, text0, text1, label):
        # Values are stored exactly as passed in; no conversion happens here.
        self.text0, self.text1, self.label = text0, text1, label

In [47]:
class TokenizedData:
    """Container for the tokenized form of one example."""

    def __init__(self, input_ids, input_mask, segment_ids, label):
        # The three per-token sequences are stored as given.
        self.input_ids, self.input_mask, self.segment_ids = input_ids, input_mask, segment_ids
        # The label is exposed under the attribute name `label`.
        self.label = label

In [48]:
def get_raw_text(dir_filename, encoding='utf-8'):
    """Read a '<CODESPLIT>'-delimited file into a list of RawTextData.

    Every line is stripped and split on '<CODESPLIT>'. Lines that do not have
    exactly five fields are skipped. For the remaining lines, field 0 becomes
    the label and fields 3 and 4 become the two text inputs.

    Args:
        dir_filename: Path of the file to read.
        encoding: Text encoding used to decode the file (default 'utf-8').

    Returns:
        A list of RawTextData in file order; labels are kept as strings.
    """
    raw_text = []
    # Decode with the caller-supplied encoding rather than a hard-coded one.
    with open(dir_filename, "r", encoding=encoding) as f:
        # Iterate over the handle so the whole file is not materialised as a list first.
        for line in f:
            fields = line.strip().split('<CODESPLIT>')
            if len(fields) != 5:
                continue
            raw_text.append(RawTextData(text0=fields[3], text1=fields[4], label=fields[0]))
    return raw_text

In [7]:
# Parse the training split into RawTextData records (path is relative to the working directory).
train_raw_text = get_raw_text("python_train_val/train.txt")

In [59]:
def tokenize_raw_text(text_data, tokenizer, max_seq_length=50):
    """Convert raw text pairs into fixed-length model inputs.

    Each example is laid out as `<cls> text0 <sep> text1 <sep>` using the
    tokenizer's own special tokens, truncated so that the whole sequence fits
    in `max_seq_length`, and right-padded with the tokenizer's padding id.

    Args:
        text_data: Sequence of objects exposing `text0`, `text1` and `label`.
        tokenizer: Object providing `tokenize`, `convert_tokens_to_ids`,
            `cls_token`, `sep_token` and `pad_token_id`.
        max_seq_length: Length of every produced sequence; must be at least 3.

    Returns:
        A list of TokenizedData, one per input example, in input order.

    Raises:
        ValueError: If `max_seq_length` cannot hold the three special tokens.
    """
    # One classification token plus two separators are always added.
    num_special_tokens = 3
    if max_seq_length < num_special_tokens:
        raise ValueError(
            "max_seq_length must be at least %d, got %d" % (num_special_tokens, max_seq_length))

    # Take the special tokens from the tokenizer instead of hard-coding BERT's
    # "[CLS]"/"[SEP]", which a tokenizer with a different vocabulary does not
    # treat as special tokens.
    cls_token = tokenizer.cls_token
    sep_token = tokenizer.sep_token
    pad_token = tokenizer.pad_token_id
    pad_token_segment_id = 0
    max_text_tokens = max_seq_length - num_special_tokens

    features = []
    print("*** Tokenizing Data *** ")
    for i, example in enumerate(text_data):
        # Lightweight progress output: a tick every 1600 examples, a counter every 64000.
        if i % 1600 == 1599:
            print("=", end="")
        if i % 64000 == 63999:
            print("[Processed " + str(i+1) + " / " + str(len(text_data)) + "] " )
        tokens0 = tokenizer.tokenize(example.text0)[:max_seq_length]
        tokens1 = tokenizer.tokenize(example.text1)
        # Trim the longer side one token at a time (text1 on ties) until both
        # texts plus the special tokens fit.
        while len(tokens0) + len(tokens1) > max_text_tokens:
            if len(tokens0) > len(tokens1):
                tokens0.pop()
            else:
                tokens1.pop()

        tokens = [cls_token] + tokens0 + [sep_token] + tokens1 + [sep_token]
        # Segment ids: 1 for the leading classification token, 0 for text0 and
        # its separator, 1 for text1 and its separator.
        segment_ids = [1] + [0] * (len(tokens0) + 1) + [1] * (len(tokens1) + 1)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        # Real tokens get mask 1; padding added below gets mask 0.
        input_mask = [1] * len(input_ids)

        padding_length = max_seq_length - len(input_ids)
        input_ids = input_ids + [pad_token] * padding_length
        input_mask = input_mask + [0] * padding_length
        segment_ids = segment_ids + [pad_token_segment_id] * padding_length

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        features.append(
            TokenizedData(input_ids=input_ids,
                          input_mask=input_mask,
                          segment_ids=segment_ids,
                          label=int(example.label)))
    return features

In [60]:
# Tokenize the training examples held in `train_raw_text` and show how many
# feature records were produced.
train_features = tokenize_raw_text(train_raw_text, tokenizer)
len(train_features)

*** Tokenizing Data *** 

824342

In [62]:
# Stack the per-example lists from `train_features` into long tensors. The
# label is read from the `label` attribute, which is what TokenizedData defines.
train_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
train_input_masks = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
train_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
train_label_ids = torch.tensor([f.label for f in train_features], dtype=torch.long)
# Tensor order matters downstream: (input ids, attention mask, segment ids, labels).
train_dataset = TensorDataset(train_input_ids, train_input_masks, train_segment_ids, train_label_ids)

In [65]:
# Optimizer hyper-parameters.
learning_rate = 1e-5
adam_epsilon = 1e-8
# AdamW over every model parameter, using the two settings above.
optimizer = AdamW(model.parameters(), lr=learning_rate, eps=adam_epsilon)

In [26]:
# from torch import optim
# optimizer = optim.Adam(model.parameters())

In [68]:
def train(train_dataset, model, optimizer, batch_size=100, num_epochs=8, max_grad_norm=0.25):
    """Fine-tune `model` on `train_dataset`.

    Batches are drawn in random order. Each batch is expected to be a tuple
    whose items 0, 1 and 3 are the input ids, attention mask and labels; they
    are moved to the module-level `device` before the forward pass. The
    learning rate follows a linear schedule with no warm-up that spans every
    optimizer step of the run.

    Args:
        train_dataset: Dataset yielding tuples of tensors as described above.
        model: Model called with `input_ids`, `attention_mask` and `labels`;
            the first element of its output is used as the loss.
        optimizer: Optimizer already bound to the model's parameters.
        batch_size: Number of examples per batch.
        num_epochs: Number of passes over the dataset.
        max_grad_norm: Gradient-norm clipping threshold.

    Returns:
        Tuple `(global_step, mean_loss)`: the number of optimizer steps taken
        and the mean per-step loss over the whole run (0.0 if no step ran).
    """
    train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)

    # The scheduler is stepped once per batch in every epoch, so the schedule
    # length is batches-per-epoch times the number of epochs.
    t_total = len(train_dataloader) * num_epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, 0, t_total)

    print("*** Training ***")

    log_every = 800
    global_step = 0
    total_loss = 0.0
    model.zero_grad()
    model.train()
    for epoch in range(num_epochs):
        # Loss accumulated since the last progress line of this epoch.
        window_loss = 0.0
        for step, batch in enumerate(train_dataloader):
            batch = tuple(t.to(device) for t in batch)
            outputs = model(input_ids=batch[0], attention_mask=batch[1], labels=batch[3])
            loss = outputs[0]
            loss.backward()
            # Clip after backward() so the freshly computed gradients are the
            # ones being clipped, and before the optimizer consumes them.
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=max_grad_norm)
            optimizer.step()
            scheduler.step()
            model.zero_grad()

            step_loss = loss.item()
            total_loss += step_loss
            window_loss += step_loss
            global_step += 1
            if step % 50 == 49:
                print("=", end="")
            if step % log_every == log_every - 1:
                print('[Epoch %d / %d, minibatch %d / %d] loss: %.5f' %
                      (epoch + 1, num_epochs, step + 1, len(train_dataloader), window_loss / log_every))
                window_loss = 0.0

    # Average over every step of the run; guard against a run with no steps.
    mean_loss = total_loss / global_step if global_step else 0.0
    return global_step, mean_loss

In [None]:
# Fine-tune on the tensor dataset built from the training features.
global_step, training_loss = train(train_dataset, model, optimizer)

In [45]:
# Persist only the model's state dict (weights), not the full model object.
torch.save(model.state_dict(), 'roberta-model__v2.pt')

In [70]:
# Restore the saved weights; tensors are mapped to the CPU while the file is read.
model.load_state_dict(torch.load('roberta-model__v2.pt', map_location=torch.device('cpu')))

<All keys matched successfully>

In [71]:
# Keep only the well-formed rows of the test batch: each kept entry is the list
# of five '<CODESPLIT>'-separated fields of one line.
with open('python_test/batch_0.txt', "r", encoding='utf-8') as f:
    split_rows = [raw.strip().split('<CODESPLIT>') for raw in f.readlines()]
    test0_lines = [fields for fields in split_rows if len(fields) == 5]

In [9]:
# Wrap every test row as an example: fields 3 and 4 are the two texts, and every
# example receives the same label "0".
# NOTE(review): `InputExample` is not defined or imported in the visible cells —
# confirm where it comes from before a fresh-kernel run.
test0_examples = [
    InputExample(guid="%s-%s" % ('test', i), text_a=line[3], text_b=line[4], label="0")
    for (i, line) in enumerate(test0_lines)
]

In [12]:
# Turn the test examples into model-ready features with a sequence length of 50.
# NOTE(review): `convert_examples_to_features` is not defined or imported in the
# visible cells — confirm its origin and that it tokenizes consistently with training.
test0_features = convert_examples_to_features(test0_examples,
                                       ['0', '1'],
                                       max_seq_length=50,
                                       tokenizer=tokenizer,
                                       output_mode='classification'
                                       )

In [13]:
# Stack the per-example test features into long tensors.
# NOTE(review): this reads `label_id`, so it presumably expects feature objects from an
# external helper rather than TokenizedData (which defines `label`) — confirm.
test0_input_ids = torch.tensor([f.input_ids for f in test0_features], dtype=torch.long)
test0_input_mask = torch.tensor([f.input_mask for f in test0_features], dtype=torch.long)
test0_segment_ids = torch.tensor([f.segment_ids for f in test0_features], dtype=torch.long)
test0_label_ids = torch.tensor([f.label_id for f in test0_features], dtype=torch.long)
# Tensor order: (input ids, attention mask, segment ids, labels).
test0_dataset = TensorDataset(test0_input_ids, test0_input_mask, test0_segment_ids, test0_label_ids)

In [58]:
def evaluate(model, tokenizer, dataset, lines, output_test_file, batch_size=32):
    """Score `dataset` with `model`, write the logits to a file, return accuracy.

    Batches are read in dataset order and moved to the module-level `device`.
    Items 0, 1 and 3 of each batch are used as input ids, attention mask and
    labels. For every example, one line is written to `output_test_file`: the
    ASCII-only fields of `lines[i]` followed by that example's logits, all
    joined with '<CODESPLIT>'.

    Args:
        model: Model called with `input_ids`, `attention_mask` and `labels`;
            the first two elements of its output are used as loss and logits.
        tokenizer: Unused; kept so existing callers do not need to change.
        dataset: Dataset yielding tuples of tensors as described above.
        lines: Per-example lists of strings, aligned with `dataset`.
        output_test_file: Path of the result file; missing parent directories
            are created.
        batch_size: Number of examples per batch.

    Returns:
        Fraction of examples whose arg-max logit equals the label.

    Raises:
        ValueError: If `dataset` yields no batches.
    """
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=batch_size)

    print("*** Evaluating ***")
    eval_loss = 0.0
    num_steps = 0
    # Collect per-batch arrays and concatenate once at the end instead of
    # re-allocating a growing array on every batch.
    logit_chunks = []
    label_chunks = []
    model.eval()
    for i, batch in enumerate(eval_dataloader):
        if i % 200 == 199:
            print("=", end="")
        if i % 5000 == 4999:
            print("[Step " + str(i+1) + " / " + str(len(eval_dataloader)) + "] " )
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            labels = batch[3]
            outputs = model(input_ids=batch[0], attention_mask=batch[1], labels=labels)
            tmp_eval_loss, logits = outputs[:2]
            eval_loss += tmp_eval_loss.mean().item()

        num_steps += 1
        logit_chunks.append(logits.detach().cpu().numpy())
        label_chunks.append(labels.detach().cpu().numpy())

    if num_steps == 0:
        raise ValueError("Cannot evaluate an empty dataset.")

    eval_loss = eval_loss / num_steps
    preds = np.concatenate(logit_chunks, axis=0)
    out_label_ids = np.concatenate(label_chunks, axis=0)

    preds_label = np.argmax(preds, axis=1)
    accuracy = (preds_label == out_label_ids).mean()

    # A bare file name has an empty directory part, which must not be created.
    output_dir = os.path.dirname(output_test_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output_test_file, "w") as writer:
        for i, logit in enumerate(preds.tolist()):
            # Drop non-ASCII characters from the original fields before writing.
            line = '<CODESPLIT>'.join(
                [item.encode('ascii', 'ignore').decode('ascii') for item in lines[i]])
            writer.write(line + '<CODESPLIT>' + '<CODESPLIT>'.join([str(l) for l in logit]) + '\n')

    print("Eval loss =", str(eval_loss))
    print("Accuracy =", str(accuracy))

    return accuracy

In [59]:
# Score the first test batch and write one result line per example to the given file.
results = evaluate(model, tokenizer, test0_dataset, test0_lines, "./python_results/batch0_result__v2.txt")

***** Running evaluation {} *****
  Num examples = 1000000
  Batch size = 32
acc_and_f1 = 0.4949405
f1 = 0.0


In [72]:
from more_itertools import chunked
import numpy as np

In [74]:
batch_size = 1000
file = 'python_results/batch0_result__v2.txt'

ranks = []
num_batch = 0
with open(file, encoding='utf-8') as f:
    # The result file is consumed in consecutive chunks of `batch_size` lines.
    for batch_idx, batch_data in enumerate(chunked(f.readlines(), batch_size)):
        num_batch += 1
        # The ranking score of a line is its last '<CODESPLIT>' field.
        scores = np.array([float(entry.strip().split('<CODESPLIT>')[-1]) for entry in batch_data])
        # Within chunk number `batch_idx`, the line at position `batch_idx` is
        # taken as the correct candidate.
        correct_score = scores[batch_idx]
        # Rank = how many candidates score at least as high as the correct one.
        ranks.append(np.sum(scores >= correct_score))

reciprocal_ranks = 1.0 / np.array(ranks)
mean_mrr = np.mean(reciprocal_ranks)
print("Python mrr: {}".format(mean_mrr))

Python mrr: 0.7634221178177741
