In [7]:
# Constants

ENTRY_COUNT = 5000
BATCH_SIZE = 300
OUTPUTS_BATCH_SIZE = 10

import torch

device = "cpu"

if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"

torch.set_default_device(device)

In [4]:
# Download corpus

import pyterrier as pt
import json

def download_dataset():
    if not pt.java.started():
        pt.java.init()
    dataset = pt.get_dataset('irds:codesearchnet')

    return list(map(lambda x: x["code"], list(filter(lambda x: x["language"]=='python', dataset.get_corpus_iter()))[:ENTRY_COUNT]))

corpus = download_dataset()

with open("entries.json", "w") as f:
    json.dump(corpus, f)

codesearchnet documents: 100%|██████████| 2070536/2070536 [00:31<00:00, 65195.42it/s] 


In [12]:
# Set up the tokenizer
from transformers import RobertaTokenizer, RobertaTokenizerFast

tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base', use_fast=True)

def tokenizer_func(data):
    return tokenizer(data["code"], padding="max_length", truncation=True)

# Set up dataset and create tokens

from datasets import Dataset

dataset = Dataset.from_dict({"code": corpus})

tokenized_dataset = dataset.map(tokenizer_func, batched=True, num_proc=6)

tokenized_dataset.to_json("tokens.json")

  from .autonotebook import tqdm as notebook_tqdm

Map (num_proc=6):   0%|          | 0/5000 [00:00<?, ? examples/s][A
Map (num_proc=6):  17%|█▋        | 833/5000 [00:00<00:02, 1545.35 examples/s][A
Map (num_proc=6):  50%|█████     | 2500/5000 [00:00<00:00, 4459.42 examples/s][A
Map (num_proc=6): 100%|██████████| 5000/5000 [00:00<00:00, 5433.43 examples/s][A

Creating json from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s][A
Creating json from Arrow format: 100%|██████████| 5/5 [00:00<00:00, 22.62ba/s][A


22280735

In [2]:
# Set up model and load tokens
from torch.utils.data import DataLoader
from transformers import RobertaModel
from datasets import Dataset

model = RobertaModel.from_pretrained('roberta-base').half()
model.eval()

tokenized_dataset = Dataset.from_json("tokens.json")

tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask"])

  from .autonotebook import tqdm as notebook_tqdm
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
# Run model
import os

data_loader = DataLoader(tokenized_dataset, batch_size=BATCH_SIZE)

results = []

with torch.no_grad():
    out_idx=0
    batch_idx=0
    for batch in data_loader:
        if os.path.isfile(f"./output_{out_idx}.pt"):
            batch_idx+=OUTPUTS_BATCH_SIZE
            out_idx+=1
            continue
        output = model(**batch, output_hidden_states=True)
        print("Progress:{} {:.5f}".format(batch_idx, batch_idx / len(data_loader)))
        results.append(output.last_hidden_state.cpu())

        if len(results) == OUTPUTS_BATCH_SIZE or batch_idx == len(data_loader) - 1:
            stacked_results = torch.cat(results)
            torch.save(stacked_results, f"output_{out_idx}.pt")

            out_idx += 1

            results = []
        batch_idx+=1

Progress:0 0.00000
Progress:1 0.05882
Progress:2 0.11765
Progress:3 0.17647
Progress:4 0.23529
Progress:5 0.29412
Progress:6 0.35294
Progress:7 0.41176
Progress:8 0.47059
Progress:9 0.52941
Progress:10 0.58824
Progress:11 0.64706
Progress:12 0.70588
Progress:13 0.76471
Progress:14 0.82353
Progress:15 0.88235
Progress:16 0.94118
