# Setup

In [1]:
import os

# Pin GPU enumeration to PCI bus order and expose only the selected devices.
# Both variables are set before torch is imported in the next cell.
os.environ.update(
    CUDA_DEVICE_ORDER="PCI_BUS_ID",
    CUDA_VISIBLE_DEVICES="5,6,7,8",
)

In [2]:
from pathlib import Path
import string
from typing import Any, Literal, get_args

import pandas as pd
import requests
from selfcheckgpt.modeling_selfcheck import SelfCheckNLI
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import torch
import torch.nn.functional as F
import tqdm.notebook as tqdm
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification

In [3]:
# Element-level aliases; the public aliases below are built from them.
_SampleName = Literal['train', 'val', 'test']
_TaskName = Literal['agnostic', 'aware']

# Where the model is run.
DeviceType = Literal['cpu', 'cuda']
# Dataset splits to process.
Samples = list[_SampleName]
# Track names; they appear in the data file names as "model-<task>".
Tasks = list[_TaskName]
# Every model identifier the notebook knows how to load.
Model = Literal[
    'FacebookAI/roberta-large-mnli',
    'deberta-selfchecknli',
    'google/t5_xxl_true_nli_mixture',
    'sentence-transformers/nli-roberta-large',
    'microsoft/deberta-base-mnli',
    'microsoft/deberta-large-mnli',
    'microsoft/deberta-xlarge-mnli',
    'microsoft/deberta-v2-xlarge-mnli',
    'microsoft/deberta-v2-xxlarge-mnli',
    'openchat',
]

In [4]:
# What to run.
MODEL: Model = 'deberta-selfchecknli'
DEVICE: DeviceType = 'cuda'
SAMPLES: Samples = ['val', 'test']
TASKS: Tasks = ['agnostic', 'aware']
# When True, premise and hypothesis are swapped before prediction.
INVERSE_NLI = False

# Where to read inputs and write submissions / scores.
INPUT_DATA_PATH = './data/'
OUTPUT_DATA_PATH = './output/'
REFERENCE_PATHS = {}
SUBMISSION_PATHS = {}
RESULT_PATHS = {}
# One output directory per model; '/' in hub ids is replaced so the id is a
# single path component, and inverse runs get their own "_inverse" directory.
_run_dir = MODEL.replace('/', '-') + ('_inverse' if INVERSE_NLI else '')
for sample in SAMPLES:
    submission_dir = OUTPUT_DATA_PATH + _run_dir + '/' + sample + '/'
    REFERENCE_PATHS[sample] = INPUT_DATA_PATH + sample + '/'
    SUBMISSION_PATHS[sample] = submission_dir
    RESULT_PATHS[sample] = submission_dir + sample + '.score.txt'

# For the OpenChat model only.
# NOTE: the constant name INSTRACTION is misspelled but is referenced by
# predict(), so it is kept as is.
TEMPERATURE = 0
MAX_TOKENS = 5
REQUEST_NUMBER = 1
INSTRACTION = "Is the Sentence supported by the Context above? Answer using ONLY yes or no."
TEMPLATE = "Context: {context}\n\nSentence: {sentence}\n\nSystem: {instruction}\n\n"

In [5]:
class UserError(Exception):
    """Raised when the notebook is configured or used incorrectly."""


# Validate the configuration BEFORE touching the filesystem, so a typo in
# MODEL / SAMPLES does not leave a stray output directory behind.
# The allowed values are read from the Literal aliases instead of being
# repeated here, so the type hints and the runtime checks cannot drift apart.
_valid_samples = get_args(get_args(Samples)[0])  # Samples is list[Literal[...]]
_valid_tasks = get_args(get_args(Tasks)[0])      # Tasks is list[Literal[...]]

assert DEVICE in get_args(DeviceType), 'Wrong device'
assert all(sample in _valid_samples for sample in SAMPLES), 'Wrong sample'
assert all(task in _valid_tasks for task in TASKS), 'Wrong task'
assert MODEL in get_args(Model), 'Wrong model'

# Configuration is valid: make sure every submission directory exists.
for sample in SAMPLES:
    Path(SUBMISSION_PATHS[sample]).mkdir(parents=True, exist_ok=True)

# Load data

In [6]:
# Load one DataFrame per (sample, task) pair, keyed as "<sample>-<task>".
data = {}
for sample in SAMPLES:
    for task in TASKS:
        file_path = f'{INPUT_DATA_PATH}{sample}/{sample}.model-{task}.json'
        data[f'{sample}-{task}'] = pd.read_json(file_path)

# Preprocess data.
if 'train-aware' in data:
    # Overwrite the 'src' cell of row 10519 with the string 'None'.
    # NOTE(review): presumably that row has a missing / unusable source text.
    data['train-aware'].at[10519, 'src'] = 'None'

# Drop the 'model' column from the model-agnostic frames.
# The validation key is 'val-agnostic' (samples are named 'val', never
# 'valid'); the previous 'valid-agnostic' check could never match.
# errors='ignore' keeps this safe if a file has no 'model' column.
for key in ('train-agnostic', 'val-agnostic'):
    if key in data:
        data[key] = data[key].drop(columns=['model'], errors='ignore')

# Predict

In [7]:
def get_texts(datapoint: pd.Series):
    """Pick the (premise, hypothesis) pair for one datapoint.

    The fields 'task', 'src', 'hyp' and 'tgt' are read and converted to str.
    The premise is 'tgt' for the DM and MT tasks and 'src' for the PG task;
    the hypothesis is always 'hyp'.

    Raises:
        UserError: if 'task' is not one of DM, MT or PG.
    """
    task, src, hyp, tgt = (
        str(datapoint[field]) for field in ('task', 'src', 'hyp', 'tgt')
    )

    premise_by_task = {
        'DM': tgt,
        'MT': tgt,  # may be src, but in another language
        'PG': src,  # may be either tgt or src
    }
    if task not in premise_by_task:
        raise UserError('Task should be either DM, MT or PG.')

    return premise_by_task[task], hyp


# Hugging Face MNLI checkpoints that share one loading / inference code path.
_HF_MNLI_MODELS = (
    'microsoft/deberta-base-mnli', 'microsoft/deberta-large-mnli',
    'microsoft/deberta-xlarge-mnli', 'microsoft/deberta-v2-xlarge-mnli',
    'microsoft/deberta-v2-xxlarge-mnli',
)


def get_model_object() -> Any:
    """Load the model selected by the global MODEL.

    Returns:
        The loaded model object, or None for 'openchat' (which is queried
        over HTTP in predict()).

    Side effects:
        For the microsoft/deberta-* checkpoints the matching tokenizer is
        stored in the module-level name ``tokenizer``, which predict() reads.

    Raises:
        UserError: if MODEL is not a supported identifier.
    """
    global tokenizer
    if MODEL == 'deberta-selfchecknli':
        model_object = SelfCheckNLI(device=torch.device(DEVICE))
    elif MODEL == 'google/t5_xxl_true_nli_mixture':
        # device_map="auto" leaves device placement to the library.
        model_object = pipeline("text2text-generation", model=MODEL, device_map="auto")
    elif MODEL == 'sentence-transformers/nli-roberta-large':
        model_object = SentenceTransformer(MODEL, device=DEVICE)
    elif MODEL == 'FacebookAI/roberta-large-mnli':
        model_object = torch.hub.load('pytorch/fairseq', 'roberta.large.mnli')
        model_object.eval()
        if DEVICE == 'cuda':
            model_object.cuda()
    elif MODEL in _HF_MNLI_MODELS:
        model_object = AutoModelForSequenceClassification.from_pretrained(MODEL)
        tokenizer = AutoTokenizer.from_pretrained(MODEL)
        # Honour DEVICE: previously these checkpoints always stayed on the CPU
        # even with DEVICE='cuda'. predict() moves its inputs to model.device.
        model_object.to(torch.device(DEVICE))
        model_object.eval()
    elif MODEL == 'openchat':
        model_object = None
    else:
        raise UserError('Wrong method or model')

    return model_object


def predict(premise: str, hyp: str, model: Any) -> tuple[float, float | None]:
    """Score one (premise, hypothesis) pair with the model selected by MODEL.

    Args:
        premise: reference text.
        hyp: text that is checked against the premise.
        model: object returned by get_model_object().

    Returns:
        (contradiction_proba, entailment_proba). entailment_proba is None for
        every model except 'FacebookAI/roberta-large-mnli' and the
        microsoft/deberta-* checkpoints.

    Raises:
        UserError: if MODEL is not a supported identifier.
        requests.HTTPError: if the OpenChat server answers with an error status.
    """
    entailment_proba = None
    if MODEL == 'deberta-selfchecknli':
        # One sentence against one passage; take the score of that sentence.
        contradiction_proba = model.predict(sentences=[hyp], sampled_passages=[premise])[0]
    elif MODEL == 'FacebookAI/roberta-large-mnli':
        tokens = model.encode(premise, hyp)
        result = model.predict('mnli', tokens)
        # exp() is applied to the head output, i.e. it is treated as
        # log-probabilities. NOTE(review): indices 0 / 2 are assumed to be
        # contradiction / entailment - confirm against the checkpoint.
        probs = torch.exp(result.detach()[0]).tolist()
        contradiction_proba, entailment_proba = probs[0], probs[2]
    elif MODEL == 'google/t5_xxl_true_nli_mixture':
        template = "premise: {premise} hypothesis: {hypothesis}"
        prompt = template.format(premise=premise, hypothesis=hyp)
        output = model([prompt])
        # The generated text is parsed as an integer and inverted.
        # NOTE(review): presumably "1" means entailed and "0" not - confirm.
        contradiction_proba = float(1 - int(output[0]['generated_text']))
    elif MODEL == 'sentence-transformers/nli-roberta-large':
        # Cosine distance between the two sentence embeddings.
        embeddings = model.encode([hyp, premise])
        contradiction_proba = 1 - cosine_similarity(embeddings, dense_output=False)[0][1]
    elif MODEL in _HF_MNLI_MODELS:
        inputs = tokenizer(premise, hyp, return_tensors="pt", padding=True, truncation=True)
        # Inputs must live on the same device as the model weights.
        inputs = inputs.to(model.device)
        with torch.no_grad():
            logits = model(**inputs).logits
        probabilities = F.softmax(logits, dim=1)
        # NOTE(review): indices 0 / 2 are assumed to be contradiction /
        # entailment - confirm against each checkpoint's id2label.
        contradiction_proba = probabilities[0][0].item()
        entailment_proba = probabilities[0][2].item()
    elif MODEL == 'openchat':
        # The request does not change between repetitions, so build it once.
        url = "http://localhost:18888/v1/chat/completions"
        prompt = TEMPLATE.format(context=premise, sentence=hyp, instruction=INSTRACTION)
        payload = {
            "temperature": TEMPERATURE,
            "max_tokens": MAX_TOKENS,
            "model": "openchat_3.5",
            "messages": [{"role": "user", "content": prompt}],
        }
        contradiction_proba = 0.0
        for _ in range(REQUEST_NUMBER):
            response = requests.post(url, json=payload)
            # Fail with a clear HTTP error instead of a KeyError on 'choices'.
            response.raise_for_status()
            answer = response.json()['choices'][0]['message']['content']
            answer = answer.strip(string.punctuation + string.whitespace).lower()
            if answer.startswith("yes"):
                pass  # supported: contributes 0
            elif answer.startswith("no"):
                contradiction_proba += 1.0
            else:
                contradiction_proba += 0.5  # unparseable answer: undecided
        # Average over the repeated requests.
        contradiction_proba /= REQUEST_NUMBER
    else:
        raise UserError('Wrong model')

    return contradiction_proba, entailment_proba


def predict_df(data: pd.DataFrame, sample: str, output_path: str, model: Any, round_probs=False):
    """Predict every row of ``data`` and write the results to a CSV file.

    Args:
        data: frame whose rows are passed to get_texts(); for the 'test'
            sample the 'id' column is read as well.
        sample: split name; only 'test' adds an ``id`` column to the output.
        output_path: CSV file to create.
        model: object returned by get_model_object().
        round_probs: if True, probabilities are rounded to multiples of 0.2.

    Raises:
        UserError: if ``output_path`` already exists.
    """
    print(output_path)
    # Refuse to touch an existing results file.
    if os.path.exists(output_path):
        raise UserError('The file already exists.')

    with_ids = (sample == 'test')
    header_pending = True
    buffer = []
    with open(output_path, 'a', encoding='utf-8') as fp:
        for _, row in tqdm.tqdm(data.iterrows(), total=len(data)):
            premise, hyp = get_texts(row)
            if INVERSE_NLI:
                # Swap the roles of premise and hypothesis.
                premise, hyp = hyp, premise

            contradiction_proba, entailment_proba = predict(premise, hyp, model)
            has_entailment = entailment_proba is not None

            if round_probs:
                # Snap to the nearest multiple of 0.2.
                contradiction_proba = round(contradiction_proba * 5) / 5
                if has_entailment:
                    entailment_proba = round(entailment_proba * 5) / 5

            # The header is written once; its columns follow the first prediction.
            if header_pending:
                columns = ['id'] if with_ids else []
                columns.append('p(Contr)')
                if has_entailment:
                    columns.append('p(Entl)')
                fp.write(','.join(columns) + '\n')
                header_pending = False

            fields = [f"{row['id']}"] if with_ids else []
            fields.append(f'{contradiction_proba:.6f}')
            if has_entailment:
                fields.append(f'{entailment_proba:.6f}')
            buffer.append(','.join(fields) + '\n')

            # Flush to disk in chunks of 50 rows.
            if len(buffer) == 50:
                fp.writelines(buffer)
                buffer = []

        # Write whatever is left from the last, incomplete chunk.
        if buffer:
            fp.writelines(buffer)
            buffer = []


def predict_dfs(data: dict[str, pd.DataFrame]):
    """Load the model once and predict every configured (sample, task) frame.

    Args:
        data: frames keyed as "<sample>-<task>".

    Each frame's predictions go to
    "<submission dir of the sample>/<sample>.model-<task>.csv".
    """
    model = get_model_object()
    for sample in SAMPLES:
        for task in TASKS:
            csv_path = SUBMISSION_PATHS[sample] + f'{sample}.model-{task}.csv'
            frame = data[f'{sample}-{task}']
            predict_df(frame, sample, csv_path, model)

In [8]:
# Load the configured model and write one prediction CSV per (sample, task) pair.
# predict_df() raises UserError if a target CSV already exists.
predict_dfs(data)

tokenizer_config.json:   0%|          | 0.00/400 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/883 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


SelfCheck-NLI initialized to device cuda
./output/deberta-selfchecknli/val/val.model-agnostic.csv


  0%|          | 0/499 [00:00<?, ?it/s]

./output/deberta-selfchecknli/val/val.model-aware.csv


  0%|          | 0/501 [00:00<?, ?it/s]

./output/deberta-selfchecknli/test/test.model-agnostic.csv


  0%|          | 0/1500 [00:00<?, ?it/s]

./output/deberta-selfchecknli/test/test.model-aware.csv


  0%|          | 0/1500 [00:00<?, ?it/s]

In [9]:
# Read every prediction CSV back from disk and render it inline.
df = {}
for sample in SAMPLES:
    for task in TASKS:
        key = f'{sample}-{task}'
        path = SUBMISSION_PATHS[sample] + f'{sample}.model-{task}.csv'
        df[key] = pd.read_csv(path)
        display(df[key])

Unnamed: 0,p(Contr)
0,0.002097
1,0.643224
2,0.707190
3,0.008859
4,0.004337
...,...
494,0.000138
495,0.000263
496,0.000291
497,0.000053


Unnamed: 0,p(Contr)
0,0.736973
1,0.014666
2,0.058523
3,0.001738
4,0.986732
...,...
496,0.997198
497,0.000630
498,0.751895
499,0.976152


Unnamed: 0,id,p(Contr)
0,1,0.591021
1,2,0.603800
2,3,0.000358
3,6,0.990409
4,7,0.059645
...,...,...
1495,2992,0.000220
1496,2993,0.996383
1497,2994,0.017649
1498,2996,0.000135


Unnamed: 0,id,p(Contr)
0,0,0.001624
1,4,0.002131
2,5,0.003519
3,9,0.347478
4,11,0.000133
...,...,...
1495,2988,0.010280
1496,2991,0.001613
1497,2995,0.000221
1498,2997,0.000605


# Evaluate
## Convert .csv to .json

In [11]:
def norm_probs(a, b):
    """Rescale two scores by their sum.

    Returns (a / (a + b), b / (a + b)), each capped at 1.0. If the sum is not
    strictly positive, (0.5, 0.5) is returned instead.
    """
    total = a + b
    # "not total > 0" (rather than "total <= 0") also sends NaN down this path.
    if not total > 0:
        return 0.5, 0.5
    return min(a / total, 1.0), min(b / total, 1.0)


# Turn each prediction CSV into a JSON file of records with a
# 'p(Hallucination)' score and a 'label' (plus any other CSV columns).
df = {}
for sample in SAMPLES:
    for task in TASKS:
        stem = f'{SUBMISSION_PATHS[sample]}{sample}.model-{task}'
        frame = pd.read_csv(stem + '.csv').reset_index(drop=True)

        has_entailment = 'p(Entl)' in frame
        if has_entailment:
            # Both probabilities available: use the normalised contradiction share.
            frame['p(Hallucination)'] = frame.apply(
                lambda row: norm_probs(row['p(Contr)'], row['p(Entl)'])[0],
                axis=1,
            )
        else:
            frame['p(Hallucination)'] = frame['p(Contr)']

        # Strictly above 0.5 counts as a hallucination.
        frame['label'] = frame.apply(
            lambda row: 'Hallucination' if row['p(Hallucination)'] > 0.5 else 'Not Hallucination',
            axis=1,
        )

        # The raw probability columns are not part of the JSON output.
        raw_columns = ['p(Entl)', 'p(Contr)'] if has_entailment else ['p(Contr)']
        frame = frame.drop(columns=raw_columns)

        df[task] = frame
        frame.to_json(stem + '.json', orient='records')

## Check file format

In [12]:
# Run the format checker script on each submission directory.
# Only 'val' and 'test' are handled; any other sample is skipped.
for sample in SAMPLES:
    if sample == 'val':
        # The validation split is checked with the extra --is_val flag.
        !python ./check_output.py {SUBMISSION_PATHS[sample]} --is_val
    elif sample == 'test':
        !python ./check_output.py {SUBMISSION_PATHS[sample]}

all clear!
all clear!


## Evaluate

In [13]:
# Run the scoring script for each sample, passing the submission directory,
# the reference directory and the path of the score file.
# Only 'val' and 'test' are scored; any other sample is skipped.
for sample in SAMPLES:
    if sample == 'val':
        # The validation split is scored with the extra --is_val flag.
        !python ./score.py {SUBMISSION_PATHS[sample]} {REFERENCE_PATHS[sample]} {RESULT_PATHS[sample]} --is_val
    elif sample == 'test':
        !python ./score.py {SUBMISSION_PATHS[sample]} {REFERENCE_PATHS[sample]} {RESULT_PATHS[sample]}

    # Print the score file if it exists.
    if os.path.isfile(RESULT_PATHS[sample]):
        with open(RESULT_PATHS[sample]) as fp:
            print(sample)
            print(fp.read())

val
aware_acc:0.716566866267465
aware_rho:0.6714934562884112
agnostic_acc:0.751503006012024
agnostic_rho:0.6994542370904473

test
agnostic_acc:0.7533333333333333
agnostic_rho:0.6829601168067542
aware_acc:0.7586666666666667
aware_rho:0.6831961651299768



## Delete .json files

In [None]:
# Remove the JSON files generated for scoring; the CSV predictions are kept.
for sample in SAMPLES:
    for task in TASKS:
        json_file = Path(f'{SUBMISSION_PATHS[sample]}{sample}.model-{task}.json')
        if json_file.is_file():
            json_file.unlink()