In [36]:
import torch

from fairseq.models.roberta import RobertaModel
roberta = RobertaModel.from_pretrained('../roberta.large.mnli', checkpoint_file='model.pt')
roberta.eval()  # disable dropout (or leave in train mode to finetune)

# Encode a pair of sentences and make a prediction
tokens = roberta.encode('[ Ramona and Beezus ] Fox 2000 Pictures released the film on July 23 , 2010 .', 'Fox 2000 Pictures released the film Soul Food .')
print(roberta.predict('mnli', tokens).argmax())  # 0: contradiction

tokens = roberta.encode('Robert is a heavily optimized version of BERT', 'Roberta is based on BERT.')
print(roberta.predict('mnli', tokens).argmax()) # 2: entailment

tokens = roberta.encode('Jonty is a chef.', 'Jonty hates cooking.')
print(roberta.predict('mnli', tokens).argmax())  # 0: contradiction

loading archive file ../roberta.large.mnli
| dictionary: 50264 types
tensor(1)
tensor(2)
tensor(0)


In [1]:
from text_utils import TextEncoder
from datasets import _entailment, entailment
from utils import encode_dataset

text_encoder = TextEncoder('model/encoder_bpe_40000.json', 'model/vocab_40000.bpe')
test_prefix = '../data/fever/dev'
tst_premise, tst_hypothesis, tst_y = _entailment(test_prefix)
test_set = encode_dataset([(tst_premise, tst_hypothesis, tst_y)], encoder=text_encoder)

 48%|███████████████▉                 | 39913/82555 [00:00<00:00, 399125.62it/s]

Loading ../data/fever/dev


                                                                                

In [39]:
import torch
import re
import math
from tqdm import tqdm_notebook
from fairseq.models.roberta import RobertaModel
from fairseq.data.data_utils import collate_tokens


def predict(tst_premise, tst_hypothesis, batch_size, result_file):
    roberta = RobertaModel.from_pretrained('../roberta.large.mnli', checkpoint_file='model.pt')
    roberta.eval()  # disable dropout (or leave in train mode to finetune)
    predictions = []
    print("Running predictions")
    
    list_of_pairs = list(zip(tst_premise, tst_hypothesis))
    for i in tqdm_notebook(range(1)):#range(math.ceil(len(tst_premise)/batch_size))):
        batch = collate_tokens(
            [roberta.encode(pair[0], pair[1]) for pair in list_of_pairs[i*batch_size:(i+1)*batch_size]], pad_idx=1
        )
        try:
            logprobs = roberta.predict('mnli', batch)
            print(logprobs)
            predictions = predictions + logprobs.argmax(dim=1).tolist()
            print(predictions)
        except:
            print('I made it')
            predictions = predictions + torch.ones([batch.shape[0]], dtype=torch.long).tolist()
    
    print("Succeeded")
    
    with open(result_file, 'w') as f:
        f.write('{}\t{}\n'.format('index', 'prediction'))
        for i, prediction in enumerate(predictions):
            print(i)
            print(prediction)
            f.write('{}\t{}\n'.format(i, prediction))

In [40]:
predict(tst_premise, tst_hypothesis, 32, '../data/fever/roberta_results_file_dev_asdgfasldkfjsan')

loading archive file ../roberta.large.mnli
| dictionary: 50264 types
Running predictions


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

tensor([[-5.2041e+00, -7.0819e-02, -2.7666e+00],
        [-2.1173e-02, -3.9987e+00, -5.9478e+00],
        [-4.7036e+00, -1.9429e-02, -4.5874e+00],
        [-3.6511e+00, -1.3335e-01, -2.3139e+00],
        [-4.1180e+00, -2.2018e-02, -5.2031e+00],
        [-4.4458e-01, -1.0342e+00, -5.6868e+00],
        [-8.9010e-01, -5.3388e-01, -5.7897e+00],
        [-1.2008e+00, -3.6301e-01, -5.6686e+00],
        [-1.4514e-01, -2.0230e+00, -5.8640e+00],
        [-3.6590e-01, -1.1976e+00, -5.4004e+00],
        [-2.5911e+00, -8.6441e-02, -4.8448e+00],
        [-5.0400e+00, -4.7121e+00, -1.5581e-02],
        [-3.1549e+00, -4.5192e-02, -6.4733e+00],
        [-3.9246e+00, -7.0360e-02, -3.0326e+00],
        [-4.4295e+00, -4.1976e-02, -3.5340e+00],
        [-7.6944e-04, -7.4795e+00, -8.4944e+00],
        [-6.1946e-04, -7.8702e+00, -8.3460e+00],
        [-7.1738e-04, -7.6221e+00, -8.3880e+00],
        [-1.5905e-03, -7.0436e+00, -7.2414e+00],
        [-5.6353e-03, -5.4054e+00, -6.7881e+00],
        [-5.1409e+00

In [None]:
from text_utils import TextEncoder
from datasets import _entailment, entailment
from utils import encode_dataset

text_encoder = TextEncoder('model/encoder_bpe_40000.json', 'model/vocab_40000.bpe')
test_prefix = '../data/fever-copy/test'
tst_premise_short, tst_hypothesis_short, tst_y_short = _entailment(test_prefix)
test_set_short = encode_dataset([(tst_premise_short, tst_hypothesis_short, tst_y_short)], encoder=text_encoder)

In [17]:
import argparse
import json
import sys


predicted = []

with open('../data/fever/dev-predictions.jsonl',"r") as predictions_file:
    for line in predictions_file:
        predicted.append(json.loads(line)['predicted_label'])