For CodeSearch, this notebook simply attempts to reproduce the results in the paper ["CodeBERT:
A Pre-Trained Model for Programming and Natural Languages,"](https://arxiv.org/pdf/2002.08155.pdf)

In [1]:
from __future__ import absolute_import
import os
import sys
import pickle
import torch
import json
import random
from pathlib import Path
import logging
import argparse
import numpy as np
from io import open
from itertools import cycle
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler,TensorDataset
from torch.utils.data.distributed import DistributedSampler
from transformers import (AdamW, get_linear_schedule_with_warmup,
                          RobertaConfig, RobertaModel, RobertaTokenizer, RobertaForSequenceClassification)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [2]:
config = RobertaConfig.from_pretrained('microsoft/codebert-base', num_labels=2, finetuning_task="codesearch")
tokenizer = RobertaTokenizer.from_pretrained('microsoft/codebert-base', do_lower_case=True)
model = RobertaForSequenceClassification.from_pretrained('microsoft/codebert-base', config=config)
model.to(device)

Some weights of the model checkpoint at microsoft/codebert-base were not used when initializing RobertaForSequenceClassification: ['pooler.dense.weight', 'pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be 

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

In [3]:
with open('train.txt', "r", encoding='utf-8') as f:
    lines = []
    for line in f.readlines():
        line = line.strip().split('<CODESPLIT>')
        if len(line) != 5:
            continue
        lines.append(line)

In [7]:
class InputExample(object):
    """A single training/test example for simple sequence classification."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.label = label

In [8]:
class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self, input_ids, input_mask, segment_ids, label_id):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_id = label_id

In [6]:
examples = []
for (i, line) in enumerate(lines):
    guid = "%s-%s" % ("train", i)
    text_a = line[3]
    text_b = line[4]
    label = line[0]
    examples.append(
        InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))

In [21]:
label_list=["0","1"]
label_map = {label: i for i, label in enumerate(label_list)}
features = []

In [22]:
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to the maximum length."""

    # This is a simple heuristic which will always truncate the longer sequence
    # one token at a time. This makes more sense than truncating an equal percent
    # of tokens from each, since if one sequence is very short then each token
    # that's truncated likely contains more information than a longer sequence.
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()

In [23]:
sep_token = "[SEP]"
cls_token = "[CLS]"
sequence_a_segment_id = 0
sequence_b_segment_id = 1
cls_token_segment_id = 1
max_seq_length = 50
pad_token=0
pad_token_segment_id=0


for (ex_index, example) in enumerate(examples):

    tokens_a = tokenizer.tokenize(example.text_a)[:max_seq_length]

    tokens_b = None
    if example.text_b:
        tokens_b = tokenizer.tokenize(example.text_b)
        _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
    else:
        if len(tokens_a) > max_seq_length - 2:
            tokens_a = tokens_a[:(max_seq_length - 2)]
    tokens = tokens_a + [sep_token]
    segment_ids = [sequence_a_segment_id] * len(tokens)

    if tokens_b:
        tokens += tokens_b + [sep_token]
        segment_ids += [sequence_b_segment_id] * (len(tokens_b) + 1)
    tokens = [cls_token] + tokens
    segment_ids = [cls_token_segment_id] + segment_ids

    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_ids)

    # Zero-pad up to the sequence length.
    padding_length = max_seq_length - len(input_ids)
    input_ids = input_ids + ([pad_token] * padding_length)
    input_mask = input_mask + ([0] * padding_length)
    segment_ids = segment_ids + ([pad_token_segment_id] * padding_length)

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length

    label_id = label_map[example.label]
    features.append(
        InputFeatures(input_ids=input_ids,
                      input_mask=input_mask,
                      segment_ids=segment_ids,
                      label_id=label_id))

In [24]:
all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)

In [28]:
learning_rate = 1e-5
adam_epsilon = 1e-8

no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)]},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)]}
]
optimizer = AdamW(model.parameters(), lr=learning_rate, eps=adam_epsilon)

In [26]:
# from torch import optim
# optimizer = optim.Adam(model.parameters())

In [60]:
def train(train_dataset, model, optimizer, batch_size=100, num_epochs=8):
    """ Train the model """
    train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)

    t_total = len(train_dataloader) // num_epochs

    scheduler = get_linear_schedule_with_warmup(optimizer, 0, t_total)
    
    print("***** Running training *****")
    print("  Num examples =", len(train_dataset))
    print("  Num Epochs =", num_epochs)
    print("  Total optimization steps =", t_total)

    global_step = 0
    model.zero_grad()
    model.train()
    for idx, _ in enumerate(range(num_epochs)):
        cumu_loss, curr_loss = 0.0, 0.0
        for step, batch in enumerate(train_dataloader):
            batch = tuple(t.to(device) for t in batch)
            inputs = {'input_ids': batch[0],
                      'attention_mask': batch[1],
                     'labels': batch[3]}
            outputs = model(input_ids=batch[0], attention_mask=batch[1], labels=batch[3])
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.25)
            loss = outputs[0]
            loss.backward()
            cumu_loss += loss.item()
            optimizer.step()
            scheduler.step()
            model.zero_grad()
            global_step += 1
            if step % 50 == 49:
                print("=", end="")
            if step % 800 == 799:
                print('[Epoch %d, minibatch %5d] loss: %.5f' %
                  (idx + 1, step + 1, (cumu_loss - curr_loss) / 800))
                curr_loss = cumu_loss
    return global_step, cumu_loss / global_step

In [None]:
global_step, training_loss = train(dataset, model, optimizer)

***** Running training *****
  Num examples = 824342
  Num Epochs = 8
  Total optimization steps = 1030

In [45]:
torch.save(model.state_dict(), 'roberta-model__v2.pt')

In [4]:
model.load_state_dict(torch.load('roberta-model__v2.pt', map_location=torch.device('cpu')))

<All keys matched successfully>

In [5]:
with open('python_test/batch_0.txt', "r", encoding='utf-8') as f:
    test0_lines = []
    for line in f.readlines():
        line = line.strip().split('<CODESPLIT>')
        if len(line) != 5:
            continue
        test0_lines.append(line)

In [9]:
test0_examples = []
for (i, line) in enumerate(test0_lines):
    guid = "%s-%s" % ('test', i)
    text_a = line[3]
    text_b = line[4]
    label = "0"
    test0_examples.append(
        InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))

In [10]:
def convert_examples_to_features(examples, label_list, max_seq_length,
                                 tokenizer, output_mode,
                                 cls_token_at_end=False, pad_on_left=False,
                                 cls_token='[CLS]', sep_token='[SEP]', pad_token=0,
                                 sequence_a_segment_id=0, sequence_b_segment_id=1,
                                 cls_token_segment_id=1, pad_token_segment_id=0,
                                 mask_padding_with_zero=True):
    """ Loads a data file into a list of `InputBatch`s
        `cls_token_at_end` define the location of the CLS token:
            - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
            - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
        `cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet)
    """

    label_map = {label: i for i, label in enumerate(label_list)}

    features = []
    for (ex_index, example) in enumerate(examples):

        tokens_a = tokenizer.tokenize(example.text_a)[:50]

        tokens_b = None
        if example.text_b:
            tokens_b = tokenizer.tokenize(example.text_b)
            # Modifies `tokens_a` and `tokens_b` in place so that the total
            # length is less than the specified length.
            # Account for [CLS], [SEP], [SEP] with "- 3"
            _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
        else:
            # Account for [CLS] and [SEP] with "- 2"
            if len(tokens_a) > max_seq_length - 2:
                tokens_a = tokens_a[:(max_seq_length - 2)]

        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids:   0   0   0   0  0     0   0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambiguously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
        tokens = tokens_a + [sep_token]
        segment_ids = [sequence_a_segment_id] * len(tokens)

        if tokens_b:
            tokens += tokens_b + [sep_token]
            segment_ids += [sequence_b_segment_id] * (len(tokens_b) + 1)

        if cls_token_at_end:
            tokens = tokens + [cls_token]
            segment_ids = segment_ids + [cls_token_segment_id]
        else:
            tokens = [cls_token] + tokens
            segment_ids = [cls_token_segment_id] + segment_ids

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding_length = max_seq_length - len(input_ids)
        if pad_on_left:
            input_ids = ([pad_token] * padding_length) + input_ids
            input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask
            segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids
        else:
            input_ids = input_ids + ([pad_token] * padding_length)
            input_mask = input_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
            segment_ids = segment_ids + ([pad_token_segment_id] * padding_length)

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        if output_mode == "classification":
            label_id = label_map[example.label]
        elif output_mode == "regression":
            label_id = float(example.label)
        else:
            raise KeyError(output_mode)

        features.append(
            InputFeatures(input_ids=input_ids,
                          input_mask=input_mask,
                          segment_ids=segment_ids,
                          label_id=label_id))
    return features

In [11]:
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to the maximum length."""

    # This is a simple heuristic which will always truncate the longer sequence
    # one token at a time. This makes more sense than truncating an equal percent
    # of tokens from each, since if one sequence is very short then each token
    # that's truncated likely contains more information than a longer sequence.
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()

In [12]:
test0_features = convert_examples_to_features(test0_examples,
                                       ['0', '1'],
                                       max_seq_length=50,
                                       tokenizer=tokenizer,
                                       output_mode='classification'
                                       )

In [13]:
test0_input_ids = torch.tensor([f.input_ids for f in test0_features], dtype=torch.long)
test0_input_mask = torch.tensor([f.input_mask for f in test0_features], dtype=torch.long)
test0_segment_ids = torch.tensor([f.segment_ids for f in test0_features], dtype=torch.long)
test0_label_ids = torch.tensor([f.label_id for f in test0_features], dtype=torch.long)
test0_dataset = TensorDataset(test0_input_ids, test0_input_mask, test0_segment_ids, test0_label_ids)

In [53]:
from sklearn.metrics import f1_score

def simple_accuracy(preds, labels):
    return (preds == labels).mean()


def acc_and_f1(preds, labels):
    acc = simple_accuracy(preds, labels)
    f1 = f1_score(y_true=labels, y_pred=preds)
    return {
        "acc": acc,
        "f1": f1,
        "acc_and_f1": (acc + f1) / 2,
    }

In [58]:
def evaluate(model, tokenizer, dataset, lines, output_test_file, batch_size=32):
    results = {}
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=batch_size)

    # Eval!
    print("***** Running evaluation {} *****")
    print("  Num examples =", len(dataset))
    print("  Batch size =", batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    preds = None
    out_label_ids = None
    for i, batch in enumerate(eval_dataloader):
        if i % 200 == 199:
            print("=", end="")
        if i % 5000 == 4999:
            print("[Step " + str(i+1) + " / " + str(len(eval_dataloader)) + "] " )
        model.eval()
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            labels = batch[3]
            outputs = model(input_ids=batch[0], attention_mask=batch[1], labels=labels)
            tmp_eval_loss, logits = outputs[:2]

            eval_loss += tmp_eval_loss.mean().item()
        nb_eval_steps += 1
        if preds is None:
            preds = logits.detach().cpu().numpy()
            out_label_ids = labels.detach().cpu().numpy()
        else:
            preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
            out_label_ids = np.append(out_label_ids, labels.detach().cpu().numpy(), axis=0)
            
    eval_loss = eval_loss / nb_eval_steps
    
    preds_label = np.argmax(preds, axis=1)
    
    accuracy = (preds_label == out_label_ids).mean()
    output_dir = os.path.dirname(output_test_file)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    with open(output_test_file, "w") as writer:
        all_logits = preds.tolist()
        for i, logit in enumerate(all_logits):
            line = '<CODESPLIT>'.join(
                [item.encode('ascii', 'ignore').decode('ascii') for item in lines[i]])

            writer.write(line + '<CODESPLIT>' + '<CODESPLIT>'.join([str(l) for l in logit]) + '\n')
        print("Accuracy =", str(accuracy))

    return accuracy

In [59]:
results = evaluate(model, tokenizer, test0_dataset, test0_lines, "./python_results/batch0_result__v2.txt")

***** Running evaluation {} *****
  Num examples = 1000000
  Batch size = 32
acc_and_f1 = 0.4949405
f1 = 0.0


In [None]:
from more_itertools import chunked
import numpy as np

In [None]:
batch_size = 1000

ranks = []
num_batch = 0
file = 'batch0_result__v2.txt'
with open(file, encoding='utf-8') as f:
    batched_data = chunked(f.readlines(), batch_size)
    for batch_idx, batch_data in enumerate(batched_data):
        num_batch += 1
        step1 = batch_data[batch_idx].strip().split('<CODESPLIT>')
        correct_score = float(step1[-1])
        scores = np.array([float(data.strip().split('<CODESPLIT>')[-1]) for data in batch_data])
        rank = np.sum(scores >= correct_score)
        ranks.append(rank)

mean_mrr = np.mean(1.0 / np.array(ranks))
print("Python mrr: {}".format(mean_mrr))