In [1]:
import torch
import torch.nn as nn
import pandas as pd
import glob
from transformers import AutoTokenizer, AutoModel, RobertaConfig, AutoModelForSequenceClassification
from random import randint

from torch.utils.data import Dataset, DataLoader
import tqdm

epochs = 1
batch_size = 3

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class Classifier(nn.Module):
    def __init__(self):
        super(type(self), self).__init__()
        config = RobertaConfig.from_pretrained("neulab/codebert-cpp")
        config.position_embedding_type = 'relative'
        config.output_past=False
        self.one = torch.ones((batch_size, 1)).to(device)

        self.encoder = AutoModel.from_pretrained("neulab/codebert-cpp", config=config, ignore_mismatched_sizes=True)
        self.output = nn.Sequential(
            nn.Linear(1536, 1536, bias=True),
            nn.Dropout(0.1),
            nn.Linear(1536,1, bias=True),
        )

    def forward(self, x0, m0, x1, m1):
        x0 = self.encoder(x0, attention_mask=m0)['pooler_output']
        x1 = self.encoder(x1, attention_mask=m1)['pooler_output']
        x = torch.cat([x0, x1], dim=1)
        x = self.output(x)

        return x

model = Classifier()
tokenizer = AutoTokenizer.from_pretrained("neulab/codebert-cpp")
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
model.to(device)


  return self.fget.__get__(instance, owner)()
Some weights of RobertaModel were not initialized from the model checkpoint at neulab/codebert-cpp and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Classifier(
  (encoder): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNor

In [3]:
class CustomDataset(Dataset):
    def __init__(self):
        super(type(self), self).__init__()
        self.codes = glob.glob('/home/code_inspector/train_code/**/*.cpp', recursive=True)

    def __getitem__(self, i):
        prob = (i // 500) + 1
        num = (i % 500) + 1

        with open(f'/home/code_inspector/train_code/problem{prob:03d}/problem{prob:03d}_{num}.cpp') as fs:
            code = ''.join(fs.readlines())

        num = num+randint(1,499)
        num = num - 500 if num > 500 else num
        with open(f'/home/code_inspector/train_code/problem{prob:03d}/problem{prob:03d}_{num}.cpp') as fs:
            code_same = ''.join(fs.readlines())

        prob = prob+randint(1,9)
        prob = prob - 10 if prob > 10 else prob
        with open(f'/home/code_inspector/train_code/problem{prob:03d}/problem{prob:03d}_{num}.cpp') as fs:
            code_diff = ''.join(fs.readlines())
        

        return code, code_same, code_diff

    def __len__(self):
        return 10*500

dataloader = DataLoader(CustomDataset(), batch_size=batch_size, shuffle=True, drop_last=True)
for code, code_same, code_diff in dataloader:
    encode = tokenizer(code, padding=True, truncation=True, max_length=514, return_tensors="pt")
    encode_same = tokenizer(code_same, padding=True, truncation=True, max_length=514, return_tensors="pt")
    encode_diff = tokenizer(code_diff, padding=True, truncation=True, max_length=514, return_tensors="pt")

    code = encode['input_ids']
    code_same = encode_same['input_ids']
    code_diff = encode_diff['input_ids']

    mask = encode['attention_mask']
    mask_same = encode_same['attention_mask']
    mask_diff = encode_diff['attention_mask']
    break

In [4]:
dataloader = DataLoader(CustomDataset(), batch_size=batch_size, shuffle=True, drop_last=True)

if False:
    checkpoint = torch.load('code_inspector.pt')
    model.load_state_dict(checkpoint['model'])
    optimizer.load_state_dict(checkpoint['optimizer'])

one = torch.ones((batch_size, 1)).to(device)
zero = torch.zeros((batch_size, 1)).to(device)

model.train()
for epoch in range(epochs):
    total_loss = 0.0
    precision = 0
    recall = 0
    for i, (code, code_same, code_diff) in enumerate(dataloader):
        encode = tokenizer(code, padding=True, truncation=True, max_length=514, return_tensors="pt").to(device)
        encode_same = tokenizer(code_same, padding=True, truncation=True, max_length=514, return_tensors="pt").to(device)
        encode_diff = tokenizer(code_diff, padding=True, truncation=True, max_length=514, return_tensors="pt").to(device)

        code = encode['input_ids']
        code_same = encode_same['input_ids']
        code_diff = encode_diff['input_ids']

        mask = encode['attention_mask']
        mask_same = encode_same['attention_mask']
        mask_diff = encode_diff['attention_mask']

        pred_same = model(code, mask, code_same, mask_same)
        pred_diff = model(code, mask, code_diff, mask_diff)
        loss = nn.functional.mse_loss(pred_same, one) + nn.functional.mse_loss(pred_diff, zero)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        precision += torch.sum(pred_same >= 0.5)
        recall += torch.sum(pred_diff < 0.5)

        if i % 100 == 99:
            print(f'loss: {total_loss:4.3f}, precision: {precision/batch_size:.1f}, recall: {recall/batch_size:.1f}')
            total_loss = 0.0
            precision = 0
            recall = 0
            
            checkpoint = {
                'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
            }
            torch.save(checkpoint, 'code_inspector.pt')


KeyboardInterrupt: 