In [1]:
import config
from libs.hallucination import hallucination_dataset, hallucination_classifier
from libs.fixation_pred import FixNN
import numpy as np
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, precision_score, recall_score
import torch
from torch.utils.data import DataLoader

from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from utils import factcc_data
df_test = factcc_data.get_factcc_data(only_test=True)

y_test = df_test['label']

In [3]:
language_model =  "bert-base-uncased"
freeze_bert = config.freeze_bert
maxlen =  config.maxlen   
bs = config.batch_size   
iters_to_accumulate = config.iters_to_accumulate   
epochs = config.epochs 
lr = config.lr

# model trained on fixation data
checkpoint_tsm = "./models/gaze_model/bert-base-uncased_gaze.pt"
infer=True

In [4]:
def evaluation(model):

    predicted_proba = []
    
    if tsm_active:
        tsm_scores = []

    else:
        attention_scores = [] 
    

    for it, (seq, attn_masks, token_type_ids, labels) in enumerate(tqdm(test_loader)):
            


        # Converting to cuda tensors
        seq, attn_masks, token_type_ids, labels = \
            seq.to(device), attn_masks.to(device), token_type_ids.to(device), labels.to(device)

        
        if tsm_active:
            output, tsm_out = model(seq, attn_masks, token_type_ids)
            current_bs = tsm_out.size()[0]
            current_seq_len = tsm_out.size()[1]
            current_tsm_scores = tsm_out.view(current_bs, current_seq_len).tolist()
            tsm_scores += current_tsm_scores
        else:
            output = model(seq, attn_masks, token_type_ids)
            # attentions = torch.mean(torch.mean(attentions[-1], axis=1),axis=1).tolist()
            # attention_scores += attentions

            
        prediction = torch.sigmoid(output).tolist()

        predicted_proba += prediction

    hallucinated_proba = np.array(predicted_proba).reshape(len(predicted_proba))
    not_hallucinated_proba = 1 - hallucinated_proba
    
    y_probas = []
    for class_0, class_1 in zip(not_hallucinated_proba, hallucinated_proba):
        y_probas.append([class_0, class_1])
        
    y_probas = np.array(y_probas)
    y_pred = y_probas[:, 0] < 0.05
        
    if tsm_active:
        return y_pred, tsm_scores
    else:
        return y_pred

### GAB+LAB+GAZE

In [5]:
tsm_active = True # Local attention bias
fix_active = True # Gaze

In [6]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") #config.device
model = hallucination_classifier(language_model=language_model, maxlen=320, fix_active=fix_active, checkpoint_tsm=checkpoint_tsm,  freeze_bert=False, tsm_active=tsm_active, infer=infer)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_rela

Fixation Network Initialized with checkpoint
TSM Active


In [7]:
test_set = hallucination_dataset(df_test, maxlen, language_model)

test_loader = DataLoader(test_set, batch_size=6, num_workers=5)

model_path = "./models/bert-base-uncased_tsm_True_fix_True.pt"
# model_path = "./models/bert-base-uncased_tsm_True_fix_False.pt"
# model_path = "./models/bert-base-uncased_tsm_False_fix_False.pt"

loaded_model = torch.load(model_path)


In [8]:
model.load_state_dict(loaded_model)
_ = model.to(device)

y_pred, tsm_scores = evaluation(model)


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 84/84 [00:04<00:00, 18.32it/s]


In [9]:
print("Accuracy: ", accuracy_score(y_pred, y_test))
print("Precision: ", precision_score(y_pred, y_test, average="weighted", pos_label="INCORRECT"))
print("Recall: ", recall_score(y_pred, y_test, average="weighted", pos_label="INCORRECT"))
print("F1:", f1_score(y_pred, y_test, average="weighted", pos_label="INCORRECT"))
print("Balanced Accuracy:", balanced_accuracy_score(y_pred, y_test))

Accuracy:  0.8946322067594433
Precision:  0.9743225236395249
Recall:  0.8946322067594433
F1: 0.9262795158871333
Balanced Accuracy: 0.8710361067503924


### GAB+LAB

In [10]:
tsm_active = True # Local attention bias
fix_active = False # Gaze

In [11]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") #config.device
model = hallucination_classifier(language_model=language_model, maxlen=320, fix_active=fix_active, checkpoint_tsm=checkpoint_tsm,  freeze_bert=False, tsm_active=tsm_active, infer=infer)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_rela

Fixation Network Initialized randomly
TSM Active


In [12]:
test_set = hallucination_dataset(df_test, maxlen, language_model)

test_loader = DataLoader(test_set, batch_size=6, num_workers=5)

# model_path = "./models/bert-base-uncased_tsm_True_fix_True.pt"
model_path = "./models/bert-base-uncased_tsm_True_fix_False.pt"
# model_path = "./models/bert-base-uncased_tsm_False_fix_False.pt"

loaded_model = torch.load(model_path)


In [13]:
model.load_state_dict(loaded_model)
_ = model.to(device)

y_pred, tsm_scores = evaluation(model)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 84/84 [00:02<00:00, 30.05it/s]


In [14]:
print("Accuracy: ", accuracy_score(y_pred, y_test))
print("Precision: ", precision_score(y_pred, y_test, average="weighted", pos_label="INCORRECT"))
print("Recall: ", recall_score(y_pred, y_test, average="weighted", pos_label="INCORRECT"))
print("F1:", f1_score(y_pred, y_test, average="weighted", pos_label="INCORRECT"))
print("Balanced Accuracy:", balanced_accuracy_score(y_pred, y_test))

Accuracy:  0.8926441351888668
Precision:  0.9755640685911595
Recall:  0.8926441351888668
F1: 0.926033291032139
Balanced Accuracy: 0.8637135098438561


## GAB

In [15]:
tsm_active = False # Local attention bias
fix_active = False # Gaze

In [16]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") #config.device
model = hallucination_classifier(language_model=language_model, maxlen=320, fix_active=fix_active, checkpoint_tsm=checkpoint_tsm,  freeze_bert=False, tsm_active=tsm_active, infer=infer)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [17]:
test_set = hallucination_dataset(df_test, maxlen, language_model)

test_loader = DataLoader(test_set, batch_size=6, num_workers=5)

# model_path = "./models/bert-base-uncased_tsm_True_fix_True.pt"
# model_path = "./models/bert-base-uncased_tsm_True_fix_False.pt"
model_path = "./models/bert-base-uncased_tsm_False_fix_False.pt"

loaded_model = torch.load(model_path)

In [18]:
model.load_state_dict(loaded_model)
_ = model.to(device)

y_pred = evaluation(model)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 84/84 [00:01<00:00, 46.76it/s]


In [19]:
print("Accuracy: ", accuracy_score(y_pred, y_test))
print("Precision: ", precision_score(y_pred, y_test, average="weighted", pos_label="INCORRECT"))
print("Recall: ", recall_score(y_pred, y_test, average="weighted", pos_label="INCORRECT"))
print("F1:", f1_score(y_pred, y_test, average="weighted", pos_label="INCORRECT"))
print("Balanced Accuracy:", balanced_accuracy_score(y_pred, y_test))

Accuracy:  0.8767395626242545
Precision:  0.9547317077710753
Recall:  0.8767395626242545
F1: 0.9100306428449637
Balanced Accuracy: 0.694558521560575
