In [1]:
import os
# Switch the working directory before the remaining imports run.
# NOTE(review): presumably required so that the `evaluation.prompted_sampling`
# import and the relative data/output paths used in later cells resolve against
# the repository root — confirm. The absolute path is machine-specific.
os.chdir('/home/s3/hyeryung/mucoco')

import argparse
import logging
import os
from pathlib import Path

import evaluate
import numpy as np
import pandas as pd
import torch
from sklearn.metrics import accuracy_score
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

import wandb
from evaluation.prompted_sampling.evaluate import (
    conditional_perplexity,
    distinctness,
    fluency_classify,
    formality_score_ext,
    formality_score_int,
    repetition,
    sentiment_classify_big,
    sentiment_classify_own2,
    toxicity_score,
    toxicity_score_energy,
    toxicity_score_int,
    toxicity_score_mucola,
)

In [2]:
## logging-related
# The root handler prints bare messages at DEBUG verbosity; the notebook's own
# logger is registered under the name "le" and set to the same level.
logging.basicConfig(
    format="%(message)s",
    level=logging.DEBUG,
)
logger = logging.getLogger("le")
logger.setLevel(logging.DEBUG)


In [9]:
def _load_texts(path):
    """Return the list of texts stored in ``path``.

    Two layouts are accepted:

    * JSON lines, where every record has a ``generations`` list of objects
      carrying a ``text`` field (one returned entry per generation);
    * plain text, one sentence per line (only the trailing newline is removed).

    The JSON-lines layout is tried first; if it cannot be parsed or does not
    have the expected fields, the file is re-read as plain text.
    """
    try:
        frame = pd.read_json(path, lines=True)
        # One row per generation, then keep only the generated text.
        frame = frame.explode("generations")
        return frame["generations"].apply(lambda gen: gen["text"]).tolist()
    except Exception as err:  # best-effort: any parse/schema failure means "not our jsonl"
        logging.getLogger("le").debug(
            "Could not read %s as generations-jsonl (%s); reading it as plain text.",
            path,
            err,
        )
    with open(path, "r") as f:
        return [line.rstrip("\n") for line in f]


def evaluate_lewis_metrics(sources_file,outputs_file,results_file,update_wandb,wandb_run_path):
    """Score edited outputs against their sources with sacreBLEU and BERTScore.

    Args:
        sources_file: Path to the source texts (generations-jsonl or plain text).
        outputs_file: Path to the predictions, in either of the same layouts.
            Entry ``i`` is compared with source entry ``i``.
        results_file: If not ``None``, the two aggregate scores are appended to
            this file and the per-sentence BERTScore F1 values are written to
            ``<results_file>.sbertscore`` as CSV.
        update_wandb: If truthy, the aggregate scores are written to the summary
            of the wandb run identified by ``wandb_run_path``.
        wandb_run_path: wandb run path passed to ``wandb.Api().run``; only used
            when ``update_wandb`` is truthy.

    Returns:
        ``(sbleu_score, sbert_score * 100)``: the corpus-level sacreBLEU score
        and the mean BERTScore F1 multiplied by 100.

    Raises:
        ValueError: If the numbers of sources and predictions differ.
    """
    sources = _load_texts(sources_file)
    # The original fallback read ``args.outputs_file`` (an undefined name here);
    # the function argument is what must be used.
    predictions = _load_texts(outputs_file)

    # Both metrics pair entry i of the predictions with entry i of the sources.
    if len(sources) != len(predictions):
        raise ValueError(
            f"Mismatch in the number of sources ({len(sources)}) and "
            f"predictions ({len(predictions)}): {sources_file} vs {outputs_file}"
        )

    ## start evaluation
    ## -- BLEU, SBLEU
    # https://huggingface.co/spaces/evaluate-metric/sacrebleu
    sacrebleu = evaluate.load("sacrebleu")
    # Only the corpus-level score is computed; per-sentence scores were dropped
    # because they took a while to compute.
    sbleu_score = sacrebleu.compute(
        predictions=predictions, references=[[sent] for sent in sources]
    )["score"]

    ## -- BERTScore, SBERTScore
    # https://huggingface.co/spaces/evaluate-metric/bertscore
    # The result is a dictionary with precision, recall, f1 and hashcode; f1
    # holds one value per sentence.
    bertscore = evaluate.load("bertscore")
    sbert_score_raw = np.array(
        bertscore.compute(
            predictions=predictions,
            references=sources,
            lang="en",
            rescale_with_baseline=True,
        )["f1"]
    )
    # Take the mean of f1 scores for all the predictions
    sbert_score = np.mean(sbert_score_raw)

    if update_wandb:
        api = wandb.Api()
        run = api.run(wandb_run_path)
        run.summary.update(
            {
                "sbleu": sbleu_score,
                "sbert": sbert_score * 100,
            }
        )
        run.update()

    if results_file is not None:
        # Append, so scores already recorded in the file are kept.
        with open(results_file, 'a') as f:
            f.write("-"*50 + "\n")
            f.write(f"sbleu: {sbleu_score}\n")
            f.write(f"sbert_score: {sbert_score * 100}\n")

        sbertscore_outputs = pd.DataFrame(
            {"sbert_score": sbert_score_raw}
        )
        # f-string instead of ``+`` so a pathlib.Path results_file also works.
        sbertscore_outputs.to_csv(f"{results_file}.sbertscore", index=False)

    return sbleu_score, sbert_score*100



In [15]:
# Toxicity avoidance — GBI run duf2r40i: score the outputs against the GPT-2 test set.
var_sources_file = 'new_module/data/toxicity-avoidance/testset_gpt2_2500.jsonl'
var_outputs_file = 'outputs/toxicity/roberta-base-jigsaw-toxicity-classifier-with-gpt2-large-embeds-energy-training/gbi-word-netps-1-nls-1-os200-es40-allsat-toxic-to-nontoxic-attention-duf2r40i/outputs_epsilon-1.09861228867.txt'
var_results_file = 'outputs/toxicity/roberta-base-jigsaw-toxicity-classifier-with-gpt2-large-embeds-energy-training/gbi-word-netps-1-nls-1-os200-es40-allsat-toxic-to-nontoxic-attention-duf2r40i/results_epsilon-1.09861228867-test.txt'
var_update_wandb = True
var_wandb_run_path = 'hayleyson/toxicity_gbi/duf2r40i'

ret_sbleu_score, ret_sbert_score = evaluate_lewis_metrics(
    sources_file=var_sources_file,
    outputs_file=var_outputs_file,
    results_file=var_results_file,
    update_wandb=var_update_wandb,
    wandb_run_path=var_wandb_run_path,
)
print(ret_sbleu_score, ret_sbert_score)

Starting new HTTPS connection (1): s3.amazonaws.com:443
https://s3.amazonaws.com:443 "HEAD /datasets.huggingface.co/datasets/metrics/evaluate-metric/sacrebleu/evaluate-metric/sacrebleu.py HTTP/1.1" 404 0
Starting new HTTPS connection (1): huggingface.co:443
https://huggingface.co:443 "HEAD /spaces/evaluate-metric/sacrebleu/resolve/v0.4.1/sacrebleu.py HTTP/1.1" 404 0
Starting new HTTPS connection (1): huggingface.co:443
https://huggingface.co:443 "HEAD /spaces/evaluate-metric/sacrebleu/resolve/main/sacrebleu.py HTTP/1.1" 200 0
Attempting to acquire lock 140502026722512 on /home/s3/hyeryung/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--sacrebleu.lock
Lock 140502026722512 acquired on /home/s3/hyeryung/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--sacrebleu.lock
Attempting to release lock 140502026722512 on /home/s3/hyeryung/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--sacrebleu.lock
Lock 140502026722512 released on /hom

79.62055911987534 80.2123210121598


In [12]:
# Toxicity avoidance — MLM beam-search (v0) run oc6pto51.
# NOTE(review): the saved output of this cell shows sBLEU ≈ 0.19 and a negative
# sBERT value; this may mean the outputs are not line-aligned with the sources —
# TODO confirm before trusting the numbers pushed to wandb.
var_sources_file = 'new_module/data/toxicity-avoidance/testset_gpt2_2500.jsonl'
var_outputs_file = 'outputs/toxicity/mlm-reranking/roberta-base-jigsaw-toxicity-classifier-with-gpt2-large-embeds-energy-training/mlm-beamsearch-v0-word-nps5-k10-beam5-allsat_primary-toxic-to-nontoxic-grad_norm-oc6pto51/outputs_epsilon0.75_filled.txt'
var_results_file = 'outputs/toxicity/mlm-reranking/roberta-base-jigsaw-toxicity-classifier-with-gpt2-large-embeds-energy-training/mlm-beamsearch-v0-word-nps5-k10-beam5-allsat_primary-toxic-to-nontoxic-grad_norm-oc6pto51/results_epsilon0.75-test.txt'
var_update_wandb = True
var_wandb_run_path = 'hayleyson/toxicity-decoding/oc6pto51'

ret_sbleu_score, ret_sbert_score = evaluate_lewis_metrics(
    sources_file=var_sources_file,
    outputs_file=var_outputs_file,
    results_file=var_results_file,
    update_wandb=var_update_wandb,
    wandb_run_path=var_wandb_run_path,
)
print(ret_sbleu_score, ret_sbert_score)

Starting new HTTPS connection (1): s3.amazonaws.com:443


https://s3.amazonaws.com:443 "HEAD /datasets.huggingface.co/datasets/metrics/evaluate-metric/sacrebleu/evaluate-metric/sacrebleu.py HTTP/1.1" 404 0
Starting new HTTPS connection (1): huggingface.co:443
https://huggingface.co:443 "HEAD /spaces/evaluate-metric/sacrebleu/resolve/v0.4.1/sacrebleu.py HTTP/1.1" 404 0
Starting new HTTPS connection (1): huggingface.co:443
https://huggingface.co:443 "HEAD /spaces/evaluate-metric/sacrebleu/resolve/main/sacrebleu.py HTTP/1.1" 200 0
Attempting to acquire lock 140502021291280 on /home/s3/hyeryung/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--sacrebleu.lock
Lock 140502021291280 acquired on /home/s3/hyeryung/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--sacrebleu.lock
Attempting to release lock 140502021291280 on /home/s3/hyeryung/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--sacrebleu.lock
Lock 140502021291280 released on /home/s3/hyeryung/.cache/huggingface/modules/evaluate_module

0.19385010053656257 -3.5599555759959913


In [14]:
# Toxicity avoidance — MLM beam-search (v1) run gb09bmmt.
# NOTE(review): the saved output of this cell shows sBLEU ≈ 0.20 and a negative
# sBERT value; this may mean the outputs are not line-aligned with the sources —
# TODO confirm before trusting the numbers pushed to wandb.
var_sources_file = 'new_module/data/toxicity-avoidance/testset_gpt2_2500.jsonl'
var_outputs_file = 'outputs/toxicity/mlm-reranking/roberta-base-jigsaw-toxicity-classifier-with-gpt2-large-embeds-energy-training/mlm-beamsearch-v1-word-nps5-k10-beam3-allsat_primary-toxic-to-nontoxic-grad_norm-gb09bmmt/outputs_epsilon0.75_filled.txt'
var_results_file = 'outputs/toxicity/mlm-reranking/roberta-base-jigsaw-toxicity-classifier-with-gpt2-large-embeds-energy-training/mlm-beamsearch-v1-word-nps5-k10-beam3-allsat_primary-toxic-to-nontoxic-grad_norm-gb09bmmt/results_epsilon0.75-test.txt'
var_update_wandb = True
var_wandb_run_path = 'hayleyson/toxicity-decoding/gb09bmmt'

ret_sbleu_score, ret_sbert_score = evaluate_lewis_metrics(
    sources_file=var_sources_file,
    outputs_file=var_outputs_file,
    results_file=var_results_file,
    update_wandb=var_update_wandb,
    wandb_run_path=var_wandb_run_path,
)
print(ret_sbleu_score, ret_sbert_score)

Starting new HTTPS connection (1): s3.amazonaws.com:443
https://s3.amazonaws.com:443 "HEAD /datasets.huggingface.co/datasets/metrics/evaluate-metric/sacrebleu/evaluate-metric/sacrebleu.py HTTP/1.1" 404 0
Starting new HTTPS connection (1): huggingface.co:443
https://huggingface.co:443 "HEAD /spaces/evaluate-metric/sacrebleu/resolve/v0.4.1/sacrebleu.py HTTP/1.1" 404 0
Starting new HTTPS connection (1): huggingface.co:443
https://huggingface.co:443 "HEAD /spaces/evaluate-metric/sacrebleu/resolve/main/sacrebleu.py HTTP/1.1" 200 0
Attempting to acquire lock 140502028035728 on /home/s3/hyeryung/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--sacrebleu.lock
Lock 140502028035728 acquired on /home/s3/hyeryung/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--sacrebleu.lock
Attempting to release lock 140502028035728 on /home/s3/hyeryung/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--sacrebleu.lock
Lock 140502028035728 released on /hom

0.19523889530459831 -3.66357445439513


In [17]:
# Sentiment (negative-to-positive) — GBI run ugr3ts53.
var_sources_file = 'new_module/data/sentiment/outputs.txt.init.jsonl'
var_outputs_file = 'outputs/sentiment/roberta-base-yelp-sentiment-classifier-with-gpt2-large-embeds-energy-training/gbi-word-netps-1-nls-1-os200-es40-allsat-negative-to-positive-attention-ugr3ts53/outputs_epsilon-1.09861228867.txt'
var_results_file = 'outputs/sentiment/roberta-base-yelp-sentiment-classifier-with-gpt2-large-embeds-energy-training/gbi-word-netps-1-nls-1-os200-es40-allsat-negative-to-positive-attention-ugr3ts53/results_epsilon-1.09861228867-test.txt'
var_update_wandb = True
var_wandb_run_path = 'hayleyson/sentiment_gbi/ugr3ts53'

ret_sbleu_score, ret_sbert_score = evaluate_lewis_metrics(
    sources_file=var_sources_file,
    outputs_file=var_outputs_file,
    results_file=var_results_file,
    update_wandb=var_update_wandb,
    wandb_run_path=var_wandb_run_path,
)
print(ret_sbleu_score, ret_sbert_score)

Starting new HTTPS connection (1): s3.amazonaws.com:443
https://s3.amazonaws.com:443 "HEAD /datasets.huggingface.co/datasets/metrics/evaluate-metric/sacrebleu/evaluate-metric/sacrebleu.py HTTP/1.1" 404 0
Starting new HTTPS connection (1): huggingface.co:443
https://huggingface.co:443 "HEAD /spaces/evaluate-metric/sacrebleu/resolve/v0.4.1/sacrebleu.py HTTP/1.1" 404 0
Starting new HTTPS connection (1): huggingface.co:443
https://huggingface.co:443 "HEAD /spaces/evaluate-metric/sacrebleu/resolve/main/sacrebleu.py HTTP/1.1" 200 0
Attempting to acquire lock 140503347880720 on /home/s3/hyeryung/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--sacrebleu.lock
Lock 140503347880720 acquired on /home/s3/hyeryung/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--sacrebleu.lock
Attempting to release lock 140503347880720 on /home/s3/hyeryung/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--sacrebleu.lock
Lock 140503347880720 released on /hom

56.25560423686509 52.70575179475054


In [18]:
# Sentiment — MLM beam-search (v0) run 4tml0ox0.
# NOTE(review): the saved output of this cell shows an sBERT value of about -243,
# far below the other runs in this notebook; possibly caused by empty or
# misaligned outputs — TODO confirm before trusting the numbers pushed to wandb.
var_sources_file = 'new_module/data/sentiment/outputs.txt.init.jsonl'
var_outputs_file = 'outputs/sentiment/mlm-reranking/roberta-base-yelp-sentiment-classifier-with-gpt2-large-embeds-energy-training/mlm-beamsearch-v0-word-nps5-k10-beam5-allsat_primary-4tml0ox0-positive/outputs_epsilon0.75.txt'
var_results_file = 'outputs/sentiment/mlm-reranking/roberta-base-yelp-sentiment-classifier-with-gpt2-large-embeds-energy-training/mlm-beamsearch-v0-word-nps5-k10-beam5-allsat_primary-4tml0ox0-positive/results_epsilon0.75-test.txt'
var_update_wandb = True
var_wandb_run_path = 'hayleyson/sentiment-decoding/4tml0ox0'

ret_sbleu_score, ret_sbert_score = evaluate_lewis_metrics(
    sources_file=var_sources_file,
    outputs_file=var_outputs_file,
    results_file=var_results_file,
    update_wandb=var_update_wandb,
    wandb_run_path=var_wandb_run_path,
)
print(ret_sbleu_score, ret_sbert_score)

Starting new HTTPS connection (1): s3.amazonaws.com:443
https://s3.amazonaws.com:443 "HEAD /datasets.huggingface.co/datasets/metrics/evaluate-metric/sacrebleu/evaluate-metric/sacrebleu.py HTTP/1.1" 404 0
Starting new HTTPS connection (1): huggingface.co:443
https://huggingface.co:443 "HEAD /spaces/evaluate-metric/sacrebleu/resolve/v0.4.1/sacrebleu.py HTTP/1.1" 404 0
Starting new HTTPS connection (1): huggingface.co:443
https://huggingface.co:443 "HEAD /spaces/evaluate-metric/sacrebleu/resolve/main/sacrebleu.py HTTP/1.1" 200 0
Attempting to acquire lock 140501971197264 on /home/s3/hyeryung/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--sacrebleu.lock
Lock 140501971197264 acquired on /home/s3/hyeryung/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--sacrebleu.lock
Attempting to release lock 140501971197264 on /home/s3/hyeryung/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--sacrebleu.lock
Lock 140501971197264 released on /hom

20.132076667258044 -243.74623417854312


In [20]:
# Formality (informal-to-formal) — MLM beam-search (v0) run ddtoxbcr.
# The sources here are a plain-text file rather than a .jsonl file.
var_sources_file = 'data/formality/GYAFC_Corpus/Entertainment_Music/test/informal'
var_outputs_file = 'outputs/formality/mlm-reranking/roberta-base-pt16-formality-classifier-with-gpt2-large-embeds-energy-training/mlm-beamsearch-v0-word-nps5-k10-beam5-allsat_primary-informal-to-formal-grad_norm-ddtoxbcr/outputs_epsilon0.75.txt'
var_results_file = 'outputs/formality/mlm-reranking/roberta-base-pt16-formality-classifier-with-gpt2-large-embeds-energy-training/mlm-beamsearch-v0-word-nps5-k10-beam5-allsat_primary-informal-to-formal-grad_norm-ddtoxbcr/results_epsilon0.75-test.txt'
var_update_wandb = True
var_wandb_run_path = 'hayleyson/formality-decoding/ddtoxbcr'

ret_sbleu_score, ret_sbert_score = evaluate_lewis_metrics(
    sources_file=var_sources_file,
    outputs_file=var_outputs_file,
    results_file=var_results_file,
    update_wandb=var_update_wandb,
    wandb_run_path=var_wandb_run_path,
)
print(ret_sbleu_score, ret_sbert_score)

Starting new HTTPS connection (1): s3.amazonaws.com:443


https://s3.amazonaws.com:443 "HEAD /datasets.huggingface.co/datasets/metrics/evaluate-metric/sacrebleu/evaluate-metric/sacrebleu.py HTTP/1.1" 404 0
Starting new HTTPS connection (1): huggingface.co:443
https://huggingface.co:443 "HEAD /spaces/evaluate-metric/sacrebleu/resolve/v0.4.1/sacrebleu.py HTTP/1.1" 404 0
Starting new HTTPS connection (1): huggingface.co:443
https://huggingface.co:443 "HEAD /spaces/evaluate-metric/sacrebleu/resolve/main/sacrebleu.py HTTP/1.1" 200 0
Attempting to acquire lock 140501946768080 on /home/s3/hyeryung/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--sacrebleu.lock
Lock 140501946768080 acquired on /home/s3/hyeryung/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--sacrebleu.lock
Attempting to release lock 140501946768080 on /home/s3/hyeryung/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--sacrebleu.lock
Lock 140501946768080 released on /home/s3/hyeryung/.cache/huggingface/modules/evaluate_module

30.285047990257716 42.60947119000775


In [22]:
# Formality (informal-to-formal) — GBI run 1nnb9kcv.
# NOTE(review): the saved output of this cell shows sBLEU ≈ 0.09 and a negative
# sBERT value; this may mean the outputs are not line-aligned with the sources —
# TODO confirm before trusting the numbers pushed to wandb.
var_sources_file = 'data/formality/GYAFC_Corpus/Entertainment_Music/test/informal'
var_outputs_file = 'outputs/formality/roberta-base-pt16-formality-classifier-with-gpt2-large-embeds-energy-training/gbi-word-netps-1-nls-1-os200-es40-allsat-informal-to-formal-attention-1nnb9kcv/outputs_epsilon-1.09861228867.txt'
var_results_file = 'outputs/formality/roberta-base-pt16-formality-classifier-with-gpt2-large-embeds-energy-training/gbi-word-netps-1-nls-1-os200-es40-allsat-informal-to-formal-attention-1nnb9kcv/results_epsilon-1.09861228867-test.txt'
var_update_wandb = True
var_wandb_run_path = 'hayleyson/formality_gbi/1nnb9kcv'

ret_sbleu_score, ret_sbert_score = evaluate_lewis_metrics(
    sources_file=var_sources_file,
    outputs_file=var_outputs_file,
    results_file=var_results_file,
    update_wandb=var_update_wandb,
    wandb_run_path=var_wandb_run_path,
)
print(ret_sbleu_score, ret_sbert_score)

Starting new HTTPS connection (1): s3.amazonaws.com:443


https://s3.amazonaws.com:443 "HEAD /datasets.huggingface.co/datasets/metrics/evaluate-metric/sacrebleu/evaluate-metric/sacrebleu.py HTTP/1.1" 404 0
Starting new HTTPS connection (1): huggingface.co:443
https://huggingface.co:443 "HEAD /spaces/evaluate-metric/sacrebleu/resolve/v0.4.1/sacrebleu.py HTTP/1.1" 404 0
Starting new HTTPS connection (1): huggingface.co:443
https://huggingface.co:443 "HEAD /spaces/evaluate-metric/sacrebleu/resolve/main/sacrebleu.py HTTP/1.1" 200 0
Attempting to acquire lock 140501960720720 on /home/s3/hyeryung/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--sacrebleu.lock
Lock 140501960720720 acquired on /home/s3/hyeryung/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--sacrebleu.lock
Attempting to release lock 140501960720720 on /home/s3/hyeryung/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--sacrebleu.lock
Lock 140501960720720 released on /home/s3/hyeryung/.cache/huggingface/modules/evaluate_module

0.09152930778814619 -7.3246804683523585
