In [16]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, BertForSequenceClassification, BertTokenizer
from datasets import load_dataset, load_dataset_builder
from tqdm import tqdm
import numpy as np
import pandas as pd
from huggingface_hub import hf_hub_download
from sklearn.metrics import accuracy_score, f1_score

In [17]:
# model_handle = 'ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli'
# model_handle = 'chromeNLP/textattack_bert_base_MNLI_fixed'
# model_handle = 'facebook/bart-large-mnli'
# model_name = 'bert-base-uncased-snli-help'
# model_handle = '../models/bert-base-uncased-snli-help/'

model_handle = './models/infobert-checkpoint/'
model_name = 'infobert'

dataset_name = 'snli'
# split = 'validation_matched'
# split = 'test'

In [18]:
model = AutoModelForSequenceClassification.from_pretrained(model_handle, resume_download=True)
label2id = model.config.label2id
label2id

Some weights of the model checkpoint at ./models/infobert-checkpoint/ were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


{'LABEL_0': 0, 'LABEL_1': 1, 'LABEL_2': 2}

In [19]:
label2id = {
    'infobert': {
        'entailment': 2, 
        'neutral': 1, 
        'contradiction': 0,
    },

    'bert-base-uncased-snli': {
        'entailment': 1,
        'neutral': 2, 
        'contradiction': 0
    },

    'bert-base-uncased-snli-help': {
        'entailment': 1,
        'neutral': 2, 
        'contradiction': 2
    },
}


In [20]:
builder = load_dataset_builder(dataset_name)
dataset = load_dataset(dataset_name, split=split).filter(lambda x :  x['label']!=-1)
builder.info.features

Found cached dataset snli (/home/julia/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b)


ValueError: Unknown split "validation_matched". Should be one of ['test', 'train', 'validation'].

In [26]:
# 
# guess_label2id = {
#     'entailment': 0,
#     'neutral': 2, 
#     'contradiction': 1
# }

# mnli same as dataset label2id
# guess_label2id = {
#     'entailment': 0,
#     'neutral': 1, 
#     'contradiction': 2
# }

# best for roberta-large-mnli
# guess_label2id = {
#     'entailment': 2,
#     'neutral': 1, 
#     'contradiction': 0
# }

#guess for for bert-base-uncased-snli-help
guess_label2id = {
    'entailment': 1,
    'neutral': 2, 
    'contradiction': 2
}

# best for bert-base-uncased-snli
# guess_label2id = {
#     'entailment': 1,
#     'neutral': 2, 
#     'contradiction': 0
# }

dataset = dataset.align_labels_with_mapping(label2id[model_name], 'label')
# dataset = dataset.align_labels_with_mapping(label2id, 'label')
dataset.set_format(type="torch", device='cuda')

Loading cached processed dataset at /home/julia/.cache/huggingface/datasets/multi_nli/default/0.0.0/591f72eb6263d1ab527561777936b199b714cda156d35716881158a2bd144f39/cache-2b17587843f58083.arrow


In [27]:
results = {}
results = pd.DataFrame(results)
results["y_true"] = dataset['label'].to('cpu')
results["y_true"].apply(int)

0       1
1       0
2       2
3       0
4       0
       ..
9810    1
9811    0
9812    2
9813    2
9814    0
Name: y_true, Length: 9815, dtype: int64

In [28]:
try:
    tokenizer = AutoTokenizer.from_pretrained(model_handle)
except OSError:
    tokenizer = AutoTokenizer.from_pretrained('textattack/bert-base-uncased-snli')
max_length=256
def encode(examples):
    return tokenizer(examples["premise"], examples["hypothesis"], truncation=True, padding="max_length", max_length=max_length)

In [29]:
dataset = dataset.map(encode, batched=True)
dataset = dataset.map(lambda examples: {"labels": examples["label"]}, batched=True)

dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "premise", "hypothesis",  "labels"], device='cuda')
dataloader = torch.utils.data.DataLoader(dataset, batch_size=64)

100%|██████████| 10/10 [00:02<00:00,  3.85ba/s]
100%|██████████| 10/10 [00:01<00:00,  5.09ba/s]


In [30]:
df = dataset
df = df[:]
results['premise'] = df['premise']
results['hypothesis'] = df['hypothesis']
results.head()

Unnamed: 0,y_true,premise,hypothesis
0,1,The new rights are nice enough,Everyone really likes the newest benefits
1,0,This site includes a list of all award winners...,The Government Executive articles housed on th...
2,2,uh i don't know i i have mixed emotions about ...,"I like him for the most part, but would still ..."
3,0,yeah i i think my favorite restaurant is alway...,My favorite restaurants are always at least a ...
4,0,i don't know um do you do a lot of camping,I know exactly.


In [31]:
model.to('cuda')
model.eval()

with torch.no_grad():
    y_pred = []
    for inputs in tqdm(dataloader):
        batch_outputs = model(inputs['input_ids'], inputs['attention_mask'])
        batch_logits = batch_outputs['logits'].to('cpu')
        batch_predictions = np.argmax(batch_logits, axis=1)
        y_pred += batch_predictions

100%|██████████| 154/154 [04:07<00:00,  1.61s/it]


In [32]:
results["y_pred"] = y_pred
results["y_pred"] = results["y_pred"].apply(int)

In [33]:
# results.loc[results.y_true!=results.y_pred].value_counts()
# results.loc[results.y_true==results.y_pred].value_counts()

In [34]:
#y_pred
#2112
#y_true
#1021

In [35]:
accuracy_score(results["y_true"], results["y_pred"])
# f1_score(results["y_true"], results["y_pred"], average="macro")

0.9110545084055017

In [36]:
accuracy_score(results['y_pred'], results['y_true'])

0.9110545084055017

In [37]:
results["y_pred"].loc[results.y_true==0].value_counts()

0    3025
1     141
2      47
Name: y_pred, dtype: int64

In [38]:
results["y_pred"].loc[results.y_true==1].value_counts()

1    2769
0     177
2     177
Name: y_pred, dtype: int64

In [39]:
results["y_pred"].loc[results.y_true==2].value_counts()

2    3148
1     280
0      51
Name: y_pred, dtype: int64


| model | accuracy on snli| accuracy on mnli validation_matched|
| --- | --- | --- |
|ynie | 91.8464| --- |
| bert-base-uncased-snli-help | 73.381 | 61.82 |
| chromeNLP | | |