In [1]:
import os
import csv
import torch

from collections import defaultdict
from sklearn.metrics import f1_score, classification_report
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from torch.utils.data import DataLoader
from tqdm import tqdm

In [2]:
torch.manual_seed(42)

<torch._C.Generator at 0x7fd1940a7fb0>

In [3]:
os.environ['TOKENIZERS_PARALLELISM'] = 'False'

In [4]:
train_data = defaultdict(list)
with open(f'../data/experimental splits/train_1ns.csv', 'r') as f:
    reader = csv.reader(f)
    next(f)
    for line in reader:
        sentence,label,concept,category,feature,idx = line
        train_data[concept].append(float(label))
        train_data[f'{concept}_{feature.split(" ")[0].strip()}']
#         data.append([idx, sentence, int(label), concept, category, feature])

In [5]:
concept_bias = {k: int((torch.tensor(v).mean() > 0.5).item()) for k,v in train_data.items()}

In [6]:
def load_split(split):
    data = []
    with open(f'../data/experimental splits/{split}_1ns.csv', 'r') as f:
        reader = csv.reader(f)
        next(f)
        for line in tqdm(reader):
            sentence,label,concept,category,feature,idx = line
            data.append([idx, sentence, int(label), concept, category, feature])
            
    return data

In [7]:
class PropertyJudge:
    def __init__(self, model_path, device='cpu'):
        self.model = AutoModelForSequenceClassification.from_pretrained(model_path)
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.device = device
        
        self.model.to(self.device)
        self.model.eval()
        
    def tokenize(self, batch):
        return self.tokenizer(batch, padding=True, return_tensors='pt').to(self.device)
        
    def infer(self, batch):
        with torch.no_grad():
            logits = self.model(**batch).logits.detach()
            logprob = logits - logits.logsumexp(1).unsqueeze(1)

            predicted_labels = logprob.argmax(1).tolist()
        
        return predicted_labels

In [8]:
MODEL = 'axxl-property'
PATH = f'../../induction/checkpoints/finetuned_models/{MODEL}'

propjudge = PropertyJudge(PATH, 'cuda:0')

In [10]:
test_data = load_split("test")

6788it [00:00, 689318.83it/s]


In [13]:
truth2 = []
cb_predicted = []
bg_predicted = []
catches = 0
for entry in test_data:
    idx, sentence, label, concept, category, feature = entry
    truth2.append(label)
    cb_predicted.append(concept_bias[concept])
    try:
        bg_predicted.append(concept_bias[f'{concept}_{feature.split(" ")[0].strip()}'])
    except:
        catches+=1
        bg_predicted.append(1)

In [17]:
truth = []
predicted = []
model_results = []
test_dl = DataLoader(test_data, batch_size = 32, num_workers = 16)

for batch in tqdm(test_dl):
    idx, sentences, labels, concept, category, feature = batch
    labels = labels.tolist()
    sentences = list(sentences)
    encoded = propjudge.tokenize(sentences)
    pred = propjudge.infer(encoded)
    
    truth.extend(labels)
    predicted.extend(pred)
    
    model_results.extend(list(zip(idx, sentences, concept, category, feature, labels, pred)))

100%|███████████████████████████████████████████████████████████████████████████| 213/213 [00:15<00:00, 13.49it/s]


In [9]:
with open(f'../data/results/property-judgment/{MODEL}.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerow(['model', 'idx', 'sentence', 'concept', 'category', 'feature', 'label', 'predicted'])
    for result in model_results:
        idx, sentences, concept, category, feature, labels, pred
        writer.writerow([MODEL, idx, sentences, concept, category, feature, labels, pred])

In [19]:
f1_score(truth, predicted)

0.78274311410905

In [18]:
import numpy as np

rng = np.random.default_rng()


def eval_with_paired_bootstrap(gold, sys1, sys2, num_samples=10000, sample_ratio=0.5):
    """Evaluate with paired boostrap
    This compares two systems, performing a significance tests with
    paired bootstrap resampling to compare the accuracy of the two systems.

    Parameters
    ----------
    gold
      The correct labels
    sys1
      The output of system 1
    sys2
      The output of system 2
    num_samples
      The number of bootstrap samples to take
    sample_ratio
      The ratio of samples to take every time

    """
    assert len(gold) == len(sys1)
    assert len(gold) == len(sys2)

    gold = np.array(gold)
    sys1 = np.array(sys1)
    sys2 = np.array(sys2)

    sys1_scores = []
    sys2_scores = []
    wins = [0, 0, 0]
    n = len(gold)

    for _ in range(num_samples):
        # Subsample the gold and system outputs
        subset_idxs = rng.choice(n, int(n * sample_ratio), replace=True)
#         sys1_score = (sys1[subset_idxs] == gold[subset_idxs]).mean()
#         sys2_score = (sys2[subset_idxs] == gold[subset_idxs]).mean()
        sys1_score = f1_score(gold[subset_idxs], sys1[subset_idxs])
        sys2_score = f1_score(gold[subset_idxs], sys2[subset_idxs])

        if sys1_score > sys2_score:
            wins[0] += 1
        elif sys1_score < sys2_score:
            wins[1] += 1
        else:
            wins[2] += 1

        sys1_scores.append(sys1_score)
        sys2_scores.append(sys2_score)

    # Print win stats
    wins = [x / float(num_samples) for x in wins]
    print("Win ratio: sys1=%.3f, sys2=%.3f, tie=%.3f" % (wins[0], wins[1], wins[2]))
    if wins[0] > wins[1]:
        print("(sys1 is superior with p value p=%.3f)\n" % (1 - wins[0]))
    elif wins[1] > wins[0]:
        print("(sys2 is superior with p value p=%.3f)\n" % (1 - wins[1]))

    # Print system stats
    sys1_scores.sort()
    sys2_scores.sort()
    print(
        "sys1 mean=%.3f, median=%.3f, 95%% confidence interval=[%.3f, %.3f], sd=%.3f"
        % (
            np.mean(sys1_scores),
            np.median(sys1_scores),
            sys1_scores[int(num_samples * 0.025)],
            sys1_scores[int(num_samples * 0.975)],
            np.mean(sys1_scores) - sys1_scores[int(num_samples * 0.025)]
        )
    )
    print(
        "sys2 mean=%.3f, median=%.3f, 95%% confidence interval=[%.3f, %.3f], sd=%.3f"
        % (
            np.mean(sys2_scores),
            np.median(sys2_scores),
            sys2_scores[int(num_samples * 0.025)],
            sys2_scores[int(num_samples * 0.975)],
            np.mean(sys2_scores) - sys2_scores[int(num_samples * 0.025)]
        )
    )

In [19]:
eval_with_paired_bootstrap(truth, predicted, cb_predicted)

Win ratio: sys1=1.000, sys2=0.000, tie=0.000
(sys1 is superior with p value p=0.000)

sys1 mean=0.790, median=0.791, 95% confidence interval=[0.776, 0.805], sd=0.015
sys2 mean=0.652, median=0.652, 95% confidence interval=[0.633, 0.670], sd=0.019


In [20]:
eval_with_paired_bootstrap(truth, predicted, [1]*len(predicted))

Win ratio: sys1=1.000, sys2=0.000, tie=0.000
(sys1 is superior with p value p=0.000)

sys1 mean=0.791, median=0.791, 95% confidence interval=[0.775, 0.805], sd=0.015
sys2 mean=0.667, median=0.667, 95% confidence interval=[0.652, 0.681], sd=0.015
