In [1]:
import os
os.chdir('/home/s3/hyeryung/mucoco')

import argparse
import logging
import os
from pathlib import Path

import evaluate
import numpy as np
import pandas as pd
import torch
from sklearn.metrics import accuracy_score
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

import wandb
from evaluation.prompted_sampling.evaluate import (
    conditional_perplexity,
    distinctness,
    fluency_classify,
    formality_score_ext,
    formality_score_int,
    repetition,
    sentiment_classify_big,
    sentiment_classify_own2,
    toxicity_score,
    toxicity_score_energy,
    toxicity_score_int,
    toxicity_score_mucola,
)

In [2]:
## logging-related
logging.basicConfig(level=logging.DEBUG, format="%(message)s")
logger = logging.getLogger("le")
logger.setLevel(logging.DEBUG)


In [15]:
def evaluate_lewis_metrics(sources_file,outputs_file,results_file,update_wandb,wandb_run_path,task):
    
    if task in ['toxicity','sentiment']:
        sources = pd.read_json(sources_file, lines=True)
        sources.prompt=sources.prompt.apply(lambda x: x['text'])
        
        predictions = pd.read_json(outputs_file, lines=True)
        predictions.prompt=predictions.prompt.apply(lambda x: x['text'])
        if task=='toxicity':
            source_predictions=pd.merge(sources,predictions,on='prompt',how='inner',suffixes=('_source','_prediction'))
        elif task=='sentiment':
            source_predictions=pd.concat([sources,predictions],axis=1)
            source_predictions=source_predictions.iloc[:, [0,1,4]].copy()
            source_predictions.columns=['prompt','generations_source','generations_prediction']
            
        prompt_list=[]
        source_list=[]
        prediction_list=[]
        for _, row in source_predictions.iterrows():
            prompt_list.extend([row.prompt]*len(row.generations_source))
            for i in range(len(row.generations_source)):
                source_list.append(row.generations_source[i]['text'])
                prediction_list.append(row.generations_prediction[i]['text'])
        source_predictions_=pd.DataFrame({'prompt':prompt_list,'source':source_list,'prediction':prediction_list})
        
    elif task=='formality':
        with open(sources_file,'r') as f:
            sources = [line.rstrip('\n') for line in f.readlines()]
            
        predictions = pd.read_json(outputs_file, lines=True)
        predictions = predictions.explode('generations')
        predictions['generations']=predictions['generations'].apply(lambda x: x['text'])
        
        source_predictions_ = pd.DataFrame({'source': sources, 'prediction': predictions['generations'].tolist()}) 
        

    ## start evaluation
    ## -- BLEU, SBLEU
    # https://huggingface.co/spaces/evaluate-metric/sacrebleu
    sacrebleu = evaluate.load("sacrebleu")
    # decided not to save raw sbleu score since it took a while to compute
    # sbleu_score_raw = [sacrebleu.compute(predictions=[predictions[i]], references=[sources[i]])['score'] for i in range(len(predictions))]
    sbleu_score = sacrebleu.compute(
        predictions=source_predictions_['prediction'].tolist(), references=source_predictions_['source'].tolist()
    )["score"]

    ## -- BERTScore, SBERTScore
    # https://huggingface.co/spaces/evaluate-metric/bertscore
    # The function returns a dictionary with the following keys - precision, recall, f1, hashcode - and corresponding values for each sentence
    bertscore = evaluate.load("bertscore")
    sbert_score_raw = np.array(
        bertscore.compute(
            predictions=source_predictions_['prediction'].tolist(),
            references=source_predictions_['source'].tolist(),
            lang="en",
            rescale_with_baseline=True,
        )["f1"]
    )
    # Take the mean of f1 scores for all the predictions
    sbert_score = np.mean(sbert_score_raw)

    if update_wandb:
        api = wandb.Api()
        run = api.run(wandb_run_path)
        run.summary.update(
            {
                "sbleu": sbleu_score,
                "sbert": sbert_score * 100,
            }
        )
        run.update()
        
    if results_file is not None:
        
        with open(results_file, 'a') as f:
            f.write("-"*50 + "\n")
            f.write(f"sbleu: {sbleu_score}\n")
            f.write(f"sbert_score: {sbert_score * 100}\n")
        
        sbertscore_outputs = pd.DataFrame(
            {"sbert_score": sbert_score_raw}
        )
        sbertscore_outputs.to_csv(results_file + ".sbertscore", index=False)

    return sbleu_score, sbert_score*100, source_predictions_



In [7]:
var_sources_file='new_module/data/toxicity-avoidance/dev_set.jsonl'
var_outputs_file='outputs/toxicity/mlm-reranking/roberta-base-jigsaw-toxicity-classifier-energy-training/devset/mlm-beamsearch-v0-word-nps5-k10-beam5-allsat_primary-toxic-to-nontoxic-grad_norm-4928q9xt/outputs_epsilon0.75.txt'
var_results_file='outputs/toxicity/mlm-reranking/roberta-base-jigsaw-toxicity-classifier-energy-training/devset/mlm-beamsearch-v0-word-nps5-k10-beam5-allsat_primary-toxic-to-nontoxic-grad_norm-4928q9xt/results_epsilon0.75-test.txt'
var_update_wandb=True
var_wandb_run_path='hayleyson/toxicity-decoding/4928q9xt'

ret_sbleu_score, ret_sbert_score, data = evaluate_lewis_metrics(var_sources_file,var_outputs_file,var_results_file,var_update_wandb,var_wandb_run_path)
print(ret_sbleu_score, ret_sbert_score)

Starting new HTTPS connection (1): s3.amazonaws.com:443
https://s3.amazonaws.com:443 "HEAD /datasets.huggingface.co/datasets/metrics/evaluate-metric/sacrebleu/evaluate-metric/sacrebleu.py HTTP/1.1" 404 0
Starting new HTTPS connection (1): huggingface.co:443
https://huggingface.co:443 "HEAD /spaces/evaluate-metric/sacrebleu/resolve/v0.4.1/sacrebleu.py HTTP/1.1" 404 0
Starting new HTTPS connection (1): huggingface.co:443
https://huggingface.co:443 "HEAD /spaces/evaluate-metric/sacrebleu/resolve/main/sacrebleu.py HTTP/1.1" 200 0
Attempting to acquire lock 140639909713040 on /home/s3/hyeryung/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--sacrebleu.lock
Lock 140639909713040 acquired on /home/s3/hyeryung/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--sacrebleu.lock
Attempting to release lock 140639909713040 on /home/s3/hyeryung/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--sacrebleu.lock
Lock 140639909713040 released on /hom

68.58318450748477 69.65290718736583


In [6]:
var_sources_file='new_module/data/toxicity-avoidance/dev_set.jsonl'
var_outputs_file='outputs/toxicity/mlm-reranking/roberta-base-jigsaw-toxicity-classifier-with-gpt2-large-embeds-energy-training/devset/mlm-beamsearch-v0-word-nps5-k10-beam5-allsat_primary-toxic-to-nontoxic-grad_norm-f108tpx2/outputs_epsilon0.75.txt'
var_results_file='outputs/toxicity/mlm-reranking/roberta-base-jigsaw-toxicity-classifier-with-gpt2-large-embeds-energy-training/devset/mlm-beamsearch-v0-word-nps5-k10-beam5-allsat_primary-toxic-to-nontoxic-grad_norm-f108tpx2/results_epsilon0.75-test.txt'
var_update_wandb=True
var_wandb_run_path='hayleyson/toxicity-decoding/f108tpx2'

ret_sbleu_score, ret_sbert_score, data = evaluate_lewis_metrics(var_sources_file,var_outputs_file,var_results_file,var_update_wandb,var_wandb_run_path)
print(ret_sbleu_score, ret_sbert_score)

Starting new HTTPS connection (1): s3.amazonaws.com:443
https://s3.amazonaws.com:443 "HEAD /datasets.huggingface.co/datasets/metrics/evaluate-metric/sacrebleu/evaluate-metric/sacrebleu.py HTTP/1.1" 404 0
Starting new HTTPS connection (1): huggingface.co:443
https://huggingface.co:443 "HEAD /spaces/evaluate-metric/sacrebleu/resolve/v0.4.1/sacrebleu.py HTTP/1.1" 404 0
Starting new HTTPS connection (1): huggingface.co:443
https://huggingface.co:443 "HEAD /spaces/evaluate-metric/sacrebleu/resolve/main/sacrebleu.py HTTP/1.1" 200 0
Attempting to acquire lock 140640096345680 on /home/s3/hyeryung/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--sacrebleu.lock
Lock 140640096345680 acquired on /home/s3/hyeryung/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--sacrebleu.lock
Attempting to release lock 140640096345680 on /home/s3/hyeryung/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--sacrebleu.lock
Lock 140640096345680 released on /hom

67.47658510328634 66.71529064781421


In [5]:
var_sources_file='new_module/data/toxicity-avoidance/testset_gpt2_2500.jsonl'
var_outputs_file='outputs/toxicity/roberta-base-jigsaw-toxicity-classifier-with-gpt2-large-embeds-energy-training/gbi-word-netps-1-nls-1-os200-es40-allsat-toxic-to-nontoxic-attention-duf2r40i/outputs_epsilon-1.09861228867.txt'
var_results_file='outputs/toxicity/roberta-base-jigsaw-toxicity-classifier-with-gpt2-large-embeds-energy-training/gbi-word-netps-1-nls-1-os200-es40-allsat-toxic-to-nontoxic-attention-duf2r40i/results_epsilon-1.09861228867-test.txt'
var_update_wandb=True
var_wandb_run_path='hayleyson/toxicity_gbi/duf2r40i'

ret_sbleu_score, ret_sbert_score = evaluate_lewis_metrics(var_sources_file,var_outputs_file,var_results_file,var_update_wandb,var_wandb_run_path)
print(ret_sbleu_score, ret_sbert_score)

Starting new HTTPS connection (1): s3.amazonaws.com:443


https://s3.amazonaws.com:443 "HEAD /datasets.huggingface.co/datasets/metrics/evaluate-metric/sacrebleu/evaluate-metric/sacrebleu.py HTTP/1.1" 404 0
Starting new HTTPS connection (1): huggingface.co:443
https://huggingface.co:443 "HEAD /spaces/evaluate-metric/sacrebleu/resolve/v0.4.1/sacrebleu.py HTTP/1.1" 404 0
Starting new HTTPS connection (1): huggingface.co:443
https://huggingface.co:443 "HEAD /spaces/evaluate-metric/sacrebleu/resolve/main/sacrebleu.py HTTP/1.1" 200 0
Attempting to acquire lock 139672424758992 on /home/s3/hyeryung/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--sacrebleu.lock
Lock 139672424758992 acquired on /home/s3/hyeryung/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--sacrebleu.lock
Attempting to release lock 139672424758992 on /home/s3/hyeryung/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--sacrebleu.lock
Lock 139672424758992 released on /home/s3/hyeryung/.cache/huggingface/modules/evaluate_module

79.62055911987534 80.21232065733523


In [32]:
var_sources_file='new_module/data/toxicity-avoidance/testset_gpt2_2500.jsonl'
var_outputs_file='outputs/toxicity/mlm-reranking/roberta-base-jigsaw-toxicity-classifier-with-gpt2-large-embeds-energy-training/mlm-beamsearch-v0-word-nps5-k10-beam5-allsat_primary-toxic-to-nontoxic-grad_norm-oc6pto51/outputs_epsilon0.75_filled.txt'
var_results_file='outputs/toxicity/mlm-reranking/roberta-base-jigsaw-toxicity-classifier-with-gpt2-large-embeds-energy-training/mlm-beamsearch-v0-word-nps5-k10-beam5-allsat_primary-toxic-to-nontoxic-grad_norm-oc6pto51/results_epsilon0.75-test.txt'
var_update_wandb=True
var_wandb_run_path='hayleyson/toxicity-decoding/oc6pto51'

ret_sbleu_score, ret_sbert_score = evaluate_lewis_metrics(var_sources_file,var_outputs_file,var_results_file,var_update_wandb,var_wandb_run_path)
print(ret_sbleu_score, ret_sbert_score)

Starting new HTTPS connection (1): s3.amazonaws.com:443
https://s3.amazonaws.com:443 "HEAD /datasets.huggingface.co/datasets/metrics/evaluate-metric/sacrebleu/evaluate-metric/sacrebleu.py HTTP/1.1" 404 0
Starting new HTTPS connection (1): huggingface.co:443
https://huggingface.co:443 "HEAD /spaces/evaluate-metric/sacrebleu/resolve/v0.4.1/sacrebleu.py HTTP/1.1" 404 0
Starting new HTTPS connection (1): huggingface.co:443
https://huggingface.co:443 "HEAD /spaces/evaluate-metric/sacrebleu/resolve/main/sacrebleu.py HTTP/1.1" 200 0
Attempting to acquire lock 139666852686864 on /home/s3/hyeryung/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--sacrebleu.lock
Lock 139666852686864 acquired on /home/s3/hyeryung/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--sacrebleu.lock
Attempting to release lock 139666852686864 on /home/s3/hyeryung/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--sacrebleu.lock
Lock 139666852686864 released on /hom

89.65834577778747 90.469118703336


In [33]:
var_sources_file='new_module/data/toxicity-avoidance/testset_gpt2_2500.jsonl'
var_outputs_file='outputs/toxicity/mlm-reranking/roberta-base-jigsaw-toxicity-classifier-with-gpt2-large-embeds-energy-training/mlm-beamsearch-v1-word-nps5-k10-beam3-allsat_primary-toxic-to-nontoxic-grad_norm-gb09bmmt/outputs_epsilon0.75_filled.txt'
var_results_file='outputs/toxicity/mlm-reranking/roberta-base-jigsaw-toxicity-classifier-with-gpt2-large-embeds-energy-training/mlm-beamsearch-v1-word-nps5-k10-beam3-allsat_primary-toxic-to-nontoxic-grad_norm-gb09bmmt/results_epsilon0.75-test.txt'
var_update_wandb=True
var_wandb_run_path='hayleyson/toxicity-decoding/gb09bmmt'

ret_sbleu_score, ret_sbert_score = evaluate_lewis_metrics(var_sources_file,var_outputs_file,var_results_file,var_update_wandb,var_wandb_run_path)
print(ret_sbleu_score, ret_sbert_score)

Starting new HTTPS connection (1): s3.amazonaws.com:443


https://s3.amazonaws.com:443 "HEAD /datasets.huggingface.co/datasets/metrics/evaluate-metric/sacrebleu/evaluate-metric/sacrebleu.py HTTP/1.1" 404 0
Starting new HTTPS connection (1): huggingface.co:443
https://huggingface.co:443 "HEAD /spaces/evaluate-metric/sacrebleu/resolve/v0.4.1/sacrebleu.py HTTP/1.1" 404 0
Starting new HTTPS connection (1): huggingface.co:443
https://huggingface.co:443 "HEAD /spaces/evaluate-metric/sacrebleu/resolve/main/sacrebleu.py HTTP/1.1" 200 0
Attempting to acquire lock 139666929616720 on /home/s3/hyeryung/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--sacrebleu.lock
Lock 139666929616720 acquired on /home/s3/hyeryung/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--sacrebleu.lock
Attempting to release lock 139666929616720 on /home/s3/hyeryung/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--sacrebleu.lock
Lock 139666929616720 released on /home/s3/hyeryung/.cache/huggingface/modules/evaluate_module

89.68731386257139 90.3682221851137


In [5]:
sources = pd.read_json(var_sources_file, lines=True)
sources.prompt=sources.prompt.apply(lambda x: x['text'])

predictions = pd.read_json(var_outputs_file, lines=True)
predictions.prompt=predictions.prompt.apply(lambda x: x['text'])

source_predictions=pd.merge(sources,predictions,on='prompt',how='left',suffixes=('_source','_prediction'))
# prompt_list=[]
# source_list=[]
# prediction_list=[]
# for _, row in source_predictions.iterrows():
#     prompt_list.extend([row.prompt]*len(row.generations_source))
#     for i in range(len(row.generations_source)):
#         source_list.append(row.generations_source[i]['text'])
#         prediction_list.append(row.generations_prediction[i]['text'])
# source_predictions_=pd.DataFrame({'prompt':prompt_list,'source':source_list,'prediction':prediction_list})

In [6]:
source_predictions

Unnamed: 0,prompt,generations_source,generations_prediction
0,Once upon a time,"[{'text': ', the only way to receive a penny w...","[{'text': ', there was one man named John. He ..."
1,The book,"[{'text': 'was released in 2003, and has now s...",[{'text': ' is a great read and I recommend it...
2,The chicken,[{'text': 'is unable to be eaten like a normal...,[{'text': ' salad is a staple at the restauran...
3,The city,[{'text': 'was developing a high-demand restau...,[{'text': ' is also considering adding a bike ...
4,The country,[{'text': 'recently became embroiled in contro...,"[{'text': ''s first female president, who was ..."
5,The horse,[{'text': 'she was riding wearing a black coat...,[{'text': '-drawn carriage was driven to the c...
6,The lake,[{'text': '2177 State Road 7 head home along G...,[{'text': ' is a popular destination for boate...
7,The last time,[{'text': 'weather was a factor was the 2009 H...,"[{'text': ' the Cubs were in town, they came a..."
8,The movie,[{'text': 'has little in common with the docum...,[{'text': ' is about a young man who meets his...
9,The painting,"[{'text': 'marks his third show at the museum,...",[{'text': ' was created by artist and designer...


In [9]:
## BOLT 
var_sources_file='new_module/data/sentiment/outputs.txt.init.jsonl'
var_outputs_file='BOLT/sentiment/sentiment/locedit_clsf/pos.len20.jsonl'
var_results_file='BOLT/sentiment/sentiment/locedit_clsf/results-test.txt'
var_update_wandb=False
var_wandb_run_path=''

ret_sbleu_score, ret_sbert_score, data_bolt = evaluate_lewis_metrics(var_sources_file,var_outputs_file,var_results_file,var_update_wandb,var_wandb_run_path)
print(ret_sbleu_score, ret_sbert_score)

Starting new HTTPS connection (1): s3.amazonaws.com:443


https://s3.amazonaws.com:443 "HEAD /datasets.huggingface.co/datasets/metrics/evaluate-metric/sacrebleu/evaluate-metric/sacrebleu.py HTTP/1.1" 404 0
Starting new HTTPS connection (1): huggingface.co:443
https://huggingface.co:443 "HEAD /spaces/evaluate-metric/sacrebleu/resolve/v0.4.1/sacrebleu.py HTTP/1.1" 404 0
Starting new HTTPS connection (1): huggingface.co:443
https://huggingface.co:443 "HEAD /spaces/evaluate-metric/sacrebleu/resolve/main/sacrebleu.py HTTP/1.1" 200 0
Attempting to acquire lock 140376504665360 on /home/s3/hyeryung/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--sacrebleu.lock
Lock 140376504665360 acquired on /home/s3/hyeryung/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--sacrebleu.lock
Attempting to release lock 140376504665360 on /home/s3/hyeryung/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--sacrebleu.lock
Lock 140376504665360 released on /home/s3/hyeryung/.cache/huggingface/modules/evaluate_module

0.3776717930596336 1.7453763775223965


In [10]:
## BOLT-re (removing dot in the prompt..)
var_sources_file='new_module/data/sentiment/outputs.txt.init.jsonl'
var_outputs_file='BOLT/sentiment/sentiment/pos.len20.jsonl'
var_results_file='BOLT/sentiment/sentiment/results-test.txt'
var_update_wandb=False
var_wandb_run_path=''

ret_sbleu_score, ret_sbert_score, data_bolt = evaluate_lewis_metrics(var_sources_file,var_outputs_file,var_results_file,var_update_wandb,var_wandb_run_path)
print(ret_sbleu_score, ret_sbert_score)

Starting new HTTPS connection (1): s3.amazonaws.com:443
https://s3.amazonaws.com:443 "HEAD /datasets.huggingface.co/datasets/metrics/evaluate-metric/sacrebleu/evaluate-metric/sacrebleu.py HTTP/1.1" 404 0
Starting new HTTPS connection (1): huggingface.co:443
https://huggingface.co:443 "HEAD /spaces/evaluate-metric/sacrebleu/resolve/v0.4.1/sacrebleu.py HTTP/1.1" 404 0
Starting new HTTPS connection (1): huggingface.co:443
https://huggingface.co:443 "HEAD /spaces/evaluate-metric/sacrebleu/resolve/main/sacrebleu.py HTTP/1.1" 200 0
Attempting to acquire lock 140377037244624 on /home/s3/hyeryung/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--sacrebleu.lock
Lock 140377037244624 acquired on /home/s3/hyeryung/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--sacrebleu.lock
Attempting to release lock 140377037244624 on /home/s3/hyeryung/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--sacrebleu.lock
Lock 140377037244624 released on /hom

0.25254379962392104 1.1285302401520312


In [34]:
var_sources_file='new_module/data/sentiment/outputs.txt.init.jsonl'
var_outputs_file='outputs/sentiment/roberta-base-yelp-sentiment-classifier-with-gpt2-large-embeds-energy-training/gbi-word-netps-1-nls-1-os200-es40-allsat-negative-to-positive-attention-ugr3ts53/outputs_epsilon-1.09861228867.txt'
var_results_file='outputs/sentiment/roberta-base-yelp-sentiment-classifier-with-gpt2-large-embeds-energy-training/gbi-word-netps-1-nls-1-os200-es40-allsat-negative-to-positive-attention-ugr3ts53/results_epsilon-1.09861228867-test.txt'
var_update_wandb=True
var_wandb_run_path='hayleyson/sentiment_gbi/ugr3ts53'

ret_sbleu_score, ret_sbert_score = evaluate_lewis_metrics(var_sources_file,var_outputs_file,var_results_file,var_update_wandb,var_wandb_run_path)
print(ret_sbleu_score, ret_sbert_score)

Starting new HTTPS connection (1): s3.amazonaws.com:443


https://s3.amazonaws.com:443 "HEAD /datasets.huggingface.co/datasets/metrics/evaluate-metric/sacrebleu/evaluate-metric/sacrebleu.py HTTP/1.1" 404 0
Starting new HTTPS connection (1): huggingface.co:443
https://huggingface.co:443 "HEAD /spaces/evaluate-metric/sacrebleu/resolve/v0.4.1/sacrebleu.py HTTP/1.1" 404 0
Starting new HTTPS connection (1): huggingface.co:443
https://huggingface.co:443 "HEAD /spaces/evaluate-metric/sacrebleu/resolve/main/sacrebleu.py HTTP/1.1" 200 0
Attempting to acquire lock 139667515566032 on /home/s3/hyeryung/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--sacrebleu.lock
Lock 139667515566032 acquired on /home/s3/hyeryung/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--sacrebleu.lock
Attempting to release lock 139667515566032 on /home/s3/hyeryung/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--sacrebleu.lock
Lock 139667515566032 released on /home/s3/hyeryung/.cache/huggingface/modules/evaluate_module

56.25560423686509 52.70575179475054


In [42]:
var_sources_file='new_module/data/sentiment/outputs.txt.init.jsonl'
var_outputs_file='outputs/sentiment/mlm-reranking/roberta-base-yelp-sentiment-classifier-with-gpt2-large-embeds-energy-training/mlm-beamsearch-v0-word-nps5-k10-beam5-allsat_primary-4tml0ox0-positive/outputs_epsilon0.75_filled.txt'
var_results_file='outputs/sentiment/mlm-reranking/roberta-base-yelp-sentiment-classifier-with-gpt2-large-embeds-energy-training/mlm-beamsearch-v0-word-nps5-k10-beam5-allsat_primary-4tml0ox0-positive/results_epsilon0.75-test.txt'
var_update_wandb=True
var_wandb_run_path='hayleyson/sentiment-decoding/4tml0ox0'

ret_sbleu_score, ret_sbert_score = evaluate_lewis_metrics(var_sources_file,var_outputs_file,var_results_file,var_update_wandb,var_wandb_run_path)
print(ret_sbleu_score, ret_sbert_score)

Starting new HTTPS connection (1): s3.amazonaws.com:443


https://s3.amazonaws.com:443 "HEAD /datasets.huggingface.co/datasets/metrics/evaluate-metric/sacrebleu/evaluate-metric/sacrebleu.py HTTP/1.1" 404 0
Starting new HTTPS connection (1): huggingface.co:443
https://huggingface.co:443 "HEAD /spaces/evaluate-metric/sacrebleu/resolve/v0.4.1/sacrebleu.py HTTP/1.1" 404 0
Starting new HTTPS connection (1): huggingface.co:443
https://huggingface.co:443 "HEAD /spaces/evaluate-metric/sacrebleu/resolve/main/sacrebleu.py HTTP/1.1" 200 0
Attempting to acquire lock 139666838526096 on /home/s3/hyeryung/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--sacrebleu.lock
Lock 139666838526096 acquired on /home/s3/hyeryung/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--sacrebleu.lock
Attempting to release lock 139666838526096 on /home/s3/hyeryung/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--sacrebleu.lock
Lock 139666838526096 released on /home/s3/hyeryung/.cache/huggingface/modules/evaluate_module

81.80580058499434 84.10788252949715


In [16]:
var_sources_file='new_module/data/sentiment/dev_set.jsonl'
var_outputs_file='outputs/sentiment/mlm-reranking/roberta-base-yelp-sentiment-classifier-energy-training/mlm-beamsearch-v0-word-nps5-k10-beam5-allsat_primary-negative-to-positive-grad_norm-jyp8h4cu/outputs_epsilon0.9.txt'
var_results_file='outputs/sentiment/mlm-reranking/roberta-base-yelp-sentiment-classifier-energy-training/mlm-beamsearch-v0-word-nps5-k10-beam5-allsat_primary-negative-to-positive-grad_norm-jyp8h4cu/results_epsilon0.9-test.txt'
var_update_wandb=True
var_wandb_run_path='hayleyson/sentiment-decoding/jyp8h4cu'
var_task='sentiment'

ret_sbleu_score, ret_sbert_score, data = evaluate_lewis_metrics(var_sources_file,var_outputs_file,var_results_file,var_update_wandb,var_wandb_run_path,var_task)
print(ret_sbleu_score, ret_sbert_score)

Starting new HTTPS connection (1): s3.amazonaws.com:443


https://s3.amazonaws.com:443 "HEAD /datasets.huggingface.co/datasets/metrics/evaluate-metric/sacrebleu/evaluate-metric/sacrebleu.py HTTP/1.1" 404 0
Starting new HTTPS connection (1): huggingface.co:443
https://huggingface.co:443 "HEAD /spaces/evaluate-metric/sacrebleu/resolve/v0.4.1/sacrebleu.py HTTP/1.1" 404 0
Starting new HTTPS connection (1): huggingface.co:443
https://huggingface.co:443 "HEAD /spaces/evaluate-metric/sacrebleu/resolve/main/sacrebleu.py HTTP/1.1" 200 0
Attempting to acquire lock 140602864032336 on /home/s3/hyeryung/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--sacrebleu.lock
Lock 140602864032336 acquired on /home/s3/hyeryung/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--sacrebleu.lock
Attempting to release lock 140602864032336 on /home/s3/hyeryung/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--sacrebleu.lock
Lock 140602864032336 released on /home/s3/hyeryung/.cache/huggingface/modules/evaluate_module

75.21976958639337 70.40235562694983


In [58]:
var_sources_file='data/formality/GYAFC_Corpus/Entertainment_Music/test/informal'
var_outputs_file='outputs/formality/mlm-reranking/roberta-base-pt16-formality-classifier-with-gpt2-large-embeds-energy-training/mlm-beamsearch-v0-word-nps5-k10-beam5-allsat_primary-informal-to-formal-grad_norm-ddtoxbcr/outputs_epsilon0.75.txt'
var_results_file='outputs/formality/mlm-reranking/roberta-base-pt16-formality-classifier-with-gpt2-large-embeds-energy-training/mlm-beamsearch-v0-word-nps5-k10-beam5-allsat_primary-informal-to-formal-grad_norm-ddtoxbcr/results_epsilon0.75-test.txt'
var_update_wandb=True
var_wandb_run_path='hayleyson/formality-decoding/ddtoxbcr'

ret_sbleu_score, ret_sbert_score, data = evaluate_lewis_metrics(var_sources_file,var_outputs_file,var_results_file,var_update_wandb,var_wandb_run_path)
print(ret_sbleu_score, ret_sbert_score)

Starting new HTTPS connection (1): s3.amazonaws.com:443


https://s3.amazonaws.com:443 "HEAD /datasets.huggingface.co/datasets/metrics/evaluate-metric/sacrebleu/evaluate-metric/sacrebleu.py HTTP/1.1" 404 0
Starting new HTTPS connection (1): huggingface.co:443
https://huggingface.co:443 "HEAD /spaces/evaluate-metric/sacrebleu/resolve/v0.4.1/sacrebleu.py HTTP/1.1" 404 0
Starting new HTTPS connection (1): huggingface.co:443
https://huggingface.co:443 "HEAD /spaces/evaluate-metric/sacrebleu/resolve/main/sacrebleu.py HTTP/1.1" 200 0
Attempting to acquire lock 139663807331280 on /home/s3/hyeryung/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--sacrebleu.lock
Lock 139663807331280 acquired on /home/s3/hyeryung/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--sacrebleu.lock
Attempting to release lock 139663807331280 on /home/s3/hyeryung/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--sacrebleu.lock
Lock 139663807331280 released on /home/s3/hyeryung/.cache/huggingface/modules/evaluate_module

30.285047990257716 42.60946988996806


In [59]:
data

Unnamed: 0,source,prediction
0,Is Any Baby Really A Freak.,Is Any Baby Really A Baby?
1,"aspen colorado has he best music festivals, yo...","I think the music has heft music, and you can ..."
2,You can get almost anything on ebay!,"You can get almost anything on the internet,"
3,everybody is Dying to get in,If it is possible to get in
4,not idiots like 50 cent and his whole Gay unit...,It looks like 50 Cent and his whole Gayness Mo...
...,...,...
1411,I LUBB IT.. [8] ILL KEEP U MY DIRRTY LITTLE SE...,THE LUBBISHES [8] ILL KEEP U MY DIRRTY LITTLE ...
1412,I guess it is the blond from american idol,I think it is the truth from the beginning.
1413,(no offance) If you can answer that then it's ...,"If you do, and if you do do that then it's the..."
1414,"the song was called epic, i cant remember the ...","This song is called, and I cannot remember the..."


In [60]:
var_sources_file='data/formality/GYAFC_Corpus/Entertainment_Music/test/informal'
var_outputs_file='outputs/formality/roberta-base-pt16-formality-classifier-with-gpt2-large-embeds-energy-training/gbi-word-netps-1-nls-1-os200-es40-allsat-informal-to-formal-attention-1nnb9kcv/outputs_epsilon-1.09861228867.txt'
var_results_file='outputs/formality/roberta-base-pt16-formality-classifier-with-gpt2-large-embeds-energy-training/gbi-word-netps-1-nls-1-os200-es40-allsat-informal-to-formal-attention-1nnb9kcv/results_epsilon-1.09861228867-test.txt'
var_update_wandb=True
var_wandb_run_path='hayleyson/formality_gbi/1nnb9kcv'

ret_sbleu_score, ret_sbert_score, data_mucola = evaluate_lewis_metrics(var_sources_file,var_outputs_file,var_results_file,var_update_wandb,var_wandb_run_path)
print(ret_sbleu_score, ret_sbert_score)

Starting new HTTPS connection (1): s3.amazonaws.com:443


https://s3.amazonaws.com:443 "HEAD /datasets.huggingface.co/datasets/metrics/evaluate-metric/sacrebleu/evaluate-metric/sacrebleu.py HTTP/1.1" 404 0
Starting new HTTPS connection (1): huggingface.co:443
https://huggingface.co:443 "HEAD /spaces/evaluate-metric/sacrebleu/resolve/v0.4.1/sacrebleu.py HTTP/1.1" 404 0
Starting new HTTPS connection (1): huggingface.co:443
https://huggingface.co:443 "HEAD /spaces/evaluate-metric/sacrebleu/resolve/main/sacrebleu.py HTTP/1.1" 200 0
Attempting to acquire lock 139663806427216 on /home/s3/hyeryung/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--sacrebleu.lock
Lock 139663806427216 acquired on /home/s3/hyeryung/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--sacrebleu.lock
Attempting to release lock 139663806427216 on /home/s3/hyeryung/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--sacrebleu.lock
Lock 139663806427216 released on /home/s3/hyeryung/.cache/huggingface/modules/evaluate_module

0.09152930778814619 -7.324679019421447


In [63]:
var_sources_file='data/formality/GYAFC_Corpus/Entertainment_Music/test/informal'
var_outputs_file='outputs/formality/roberta-base-pt16-formality-classifier-with-gpt2-large-embeds/gbi-word-netps3-nls1-os20-es4-allsat-informal-to-formal-grad_norm-sarxwuw8/outputs_epsilon-1.09861228867.txt'
var_results_file='outputs/formality/roberta-base-pt16-formality-classifier-with-gpt2-large-embeds/gbi-word-netps3-nls1-os20-es4-allsat-informal-to-formal-grad_norm-sarxwuw8/results_epsilon-1.09861228867-test.txt'
var_update_wandb=True
var_wandb_run_path='hayleyson/formality_gbi/sarxwuw8'

ret_sbleu_score, ret_sbert_score, data_le_gbi = evaluate_lewis_metrics(var_sources_file,var_outputs_file,var_results_file,var_update_wandb,var_wandb_run_path)
print(ret_sbleu_score, ret_sbert_score)

Starting new HTTPS connection (1): s3.amazonaws.com:443
https://s3.amazonaws.com:443 "HEAD /datasets.huggingface.co/datasets/metrics/evaluate-metric/sacrebleu/evaluate-metric/sacrebleu.py HTTP/1.1" 404 0
Starting new HTTPS connection (1): huggingface.co:443
https://huggingface.co:443 "HEAD /spaces/evaluate-metric/sacrebleu/resolve/v0.4.1/sacrebleu.py HTTP/1.1" 404 0
Starting new HTTPS connection (1): huggingface.co:443
https://huggingface.co:443 "HEAD /spaces/evaluate-metric/sacrebleu/resolve/main/sacrebleu.py HTTP/1.1" 200 0
Attempting to acquire lock 139664105778768 on /home/s3/hyeryung/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--sacrebleu.lock
Lock 139664105778768 acquired on /home/s3/hyeryung/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--sacrebleu.lock
Attempting to release lock 139664105778768 on /home/s3/hyeryung/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--sacrebleu.lock
Lock 139664105778768 released on /hom

63.27760677841476 61.54954701691126


In [64]:
data_le_gbi

Unnamed: 0,source,prediction
0,Is Any Baby Really A Freak.,Is Any Baby Really A Freak.
1,"aspen colorado has he best music festivals, yo...",The allocations colorado has he best music fes...
2,You can get almost anything on ebay!,Beginning can get almost anything on your fili...
3,everybody is Dying to get in,TheDefinition is a to get in
4,not idiots like 50 cent and his whole Gay unit...,R idiots like 50 cent and his whole Gay unit. ...
...,...,...
1411,I LUBB IT.. [8] ILL KEEP U MY DIRRTY LITTLE SE...,I LUBBED\n [8] ILL KEEP U\n DIRRTY LITTLECOMM....
1412,I guess it is the blond from american idol,"I guess it is the "" from american idol"
1413,(no offance) If you can answer that then it's ...,The following is a information If you can rela...
1414,"the song was called epic, i cant remember the ...","The song was called epic, i cant remember the ..."


In [62]:
data_mucola.values

array([['Is Any Baby Really A Freak.',
        'The current assessment of the current and'],
       ['aspen colorado has he best music festivals, you sit all over the moutians its  on and just hang out',
        'Apeninado is a name that is in the United States, and that is the most common place that is the the'],
       ['You can get almost anything on ebay!',
        'The remains of a little-known phenomenon in'],
       ...,
       ["(no offance) If you can answer that then it's the same for the Egg",
        'The New York Times published a new investigation on the complicity of the United States in the torture'],
       ['the song was called epic, i cant remember the artist but it is a banging tune!!',
        'The results of the laboratory-based laboratory-based studies of the effects of the use of'],
       ["hey Mar...that commercial is for Dairy Queen...just thought i'd let ya know...lol.",
        'The revelations have been significant, the government\'s disclosure of the "inf