In [9]:
import json
import jsonlines
import os
import glob
from tokenizers import ByteLevelBPETokenizer
import torch
import torch.nn.functional as F
from gpt2_model import GPT, generate_square_subsequent_mask
import pandas as pd
from collections import Counter

In [2]:
# Choose the compute device once: the GPU when CUDA is usable, otherwise the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(
    "Cuda is available. Using GPU."
    if device == "cuda"
    else "Cuda is not available. Using CPU."
)

Cuda is available. Using GPU.


In [13]:
# Load the saved checkpoint. `map_location=device` remaps any tensors that were
# stored from a GPU, so the load also succeeds when the notebook has fallen back
# to the CPU.
# NOTE(review): torch.load unpickles the file, which can execute arbitrary code;
# only load checkpoints that come from a trusted source.
model = torch.load(
    "saved_models/bllip/distilled/distilled_bllip_ltg_gpt2.pt",
    map_location=device,
)

In [14]:
# The loaded checkpoint is a container; the network itself sits under the
# 'model' key. The guard makes this cell safe to re-run: once `model` has been
# unwrapped into a module, executing the cell again is a no-op instead of an
# indexing error.
if not isinstance(model, torch.nn.Module):
    model = model['model']

In [15]:
def get_probs(sentence, model, tokenizer, bos_id=0, eos_id=2):
    """Return the summed log-probability the model assigns to `sentence`.

    The sentence is tokenized, wrapped in BOS/EOS ids, and scored next-token
    style: the model receives every token except the last, and the
    log-probability of each following token is gathered and summed. Despite
    the name, the result is a log-probability, not a probability.

    Args:
        sentence: Text to score.
        model: Module called as `model(input_ids=..., attention_mask=...)`
            that returns logits. The gather on dim 2 below requires them to be
            laid out as (batch, sequence, vocab).
        tokenizer: Object whose `encode(sentence).ids` yields the token ids.
        bos_id: Id prepended to the sequence (defaults to 0).
        eos_id: Id appended to the sequence (defaults to 2).

    Returns:
        A tensor with one entry per batch row (a single entry here) holding
        the total log-probability of the sequence.

    Side effects:
        Moves `model` to the module-level `device` and puts it in eval mode.
    """
    # Build a fresh list rather than mutating whatever the tokenizer returned.
    token_ids = [bos_id, *tokenizer.encode(sentence).ids, eos_id]

    model.to(device)
    # no_grad() only disables autograd; eval() is what switches off dropout and
    # other train-only behaviour, so scores do not depend on random masks.
    model.eval()

    with torch.no_grad():
        # Add a batch dimension and create the ids on the model's device.
        tokens = torch.tensor(token_ids, dtype=torch.long, device=device).unsqueeze(0)
        # Position i of `inputs` predicts position i of `labels`.
        inputs, labels = tokens[:, :-1], tokens[:, 1:]

        mask = generate_square_subsequent_mask(size=inputs.size(1), device=device)
        logits = model(input_ids=inputs, attention_mask=mask)
        log_probs = F.log_softmax(logits, dim=-1)

        # Pick the log-prob of each gold next token, drop the helper
        # dimension, then sum over the sequence.
        token_log_probs = log_probs.gather(2, labels.unsqueeze(2)).squeeze(2)
        return token_log_probs.sum(1)
    
def run_test_suite(model, files, tokenizer, seed=42):
    """Score the model on minimal-pair files and return per-file accuracy.

    Every file is read as JSON Lines; each record must carry the keys
    'sentence_good' and 'sentence_bad'. A pair counts as correct when
    `get_probs` scores the good sentence strictly higher than the bad one.

    Args:
        model: Model forwarded unchanged to `get_probs`.
        files: Iterable of paths to JSON Lines files.
        tokenizer: Tokenizer forwarded unchanged to `get_probs`.
        seed: Value passed to `torch.manual_seed` before any scoring.

    Returns:
        Dict mapping each file path to its accuracy in percent. A file with no
        records maps to NaN (accuracy is undefined) instead of raising
        ZeroDivisionError.
    """
    torch.manual_seed(seed)
    score_dict = {}
    for file in files:
        total_sents = 0
        total_correct = 0
        # Stream the records: only a running count and tally are needed.
        with jsonlines.open(file) as reader:
            for test in reader:
                total_sents += 1
                # A space is prepended to both sentences before tokenizing
                # (presumably to match how training text was tokenized — TODO confirm).
                good_score = get_probs(" " + test['sentence_good'], model, tokenizer)
                bad_score = get_probs(" " + test['sentence_bad'], model, tokenizer)
                if good_score > bad_score:
                    total_correct += 1

        if total_sents:
            score_dict[file] = (total_correct / total_sents) * 100
        else:
            score_dict[file] = float("nan")
    return score_dict

In [16]:
# Build the byte-level BPE tokenizer from its vocabulary and merges files.
tokenizer = ByteLevelBPETokenizer(
    'tokenizers/rnng/vocab.json',
    'tokenizers/rnng/merges.txt',
)

In [23]:
# Restrict the run to a single test file from blimp_data/.
files = [
    'blimp_data/sentential_negation_npi_scope.jsonl',
]

In [24]:
# Score the model on the selected files; `out` maps each file path to its score.
out = run_test_suite(model=model, files=files, tokenizer=tokenizer)

In [3]:
# Collect every .jsonl file directly under blimp_data/.
# glob returns matches in arbitrary, filesystem-dependent order, so the list is
# sorted to keep downstream results reproducible across machines. recursive=True
# is omitted because it only affects patterns that contain "**".
pattern = "blimp_data/*.jsonl"
files = sorted(glob.glob(pattern))

In [4]:
# Gather the 'linguistics_term' value of every record across all listed files.
all_terms = []
for file in files:
    with jsonlines.open(file) as reader:
        data = list(reader)
    all_terms.extend(row['linguistics_term'] for row in data)

In [10]:
# Tally how many records carry each linguistics term; as the cell's last
# expression, the resulting Counter is displayed as the cell output.
Counter(all_terms)

Counter({'island_effects': 8000,
         'argument_structure': 7000,
         'quantifiers': 4000,
         'determiner_noun_agreement': 8000,
         'control_raising': 5000,
         'subject_verb_agreement': 6000,
         's-selection': 2000,
         'filler_gap_dependency': 7000,
         'binding': 7000,
         'npi_licensing': 7000,
         'anaphor_agreement': 2000,
         'irregular_forms': 2000,
         'ellipsis': 2000})