In [1]:
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from collections import OrderedDict
import numpy as np
import torchmetrics
from datasets import load_dataset, Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, GenerationConfig
import os
import pandas as pd

np.random.seed(42)

In [2]:
torch.cuda.is_available()

True

In [3]:
# Hyperparamters 
top_k = 50
top_p = 0.9
temp = 0.8
min_new_tokens = 10
max_new_tokens = 50
do_sample=True
num_beams=1

dataset_name="cnn"
model_name= "meta-llama/Meta-Llama-3-8B-Instruct"
batch_size=8
max_input_length=1024
DEVICE = "cuda:1" if torch.cuda.is_available() else "cpu"
access_token = "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"
cache_dir="/data/james/.cache"

In [4]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import BertForSequenceClassification, BertTokenizer

class Evaluator:
    def __init__(self, metrics=None):
        if not metrics:
            metrics = ["rouge", "sacre_bleu", "bertscore", "factkb"]
        self.metrics = metrics
    
    def evaluate(self, predictions, references, documents, metrics=["rouge", "bertscore", "factkb", "alignscore"]):
        result_dict = OrderedDict()
        if "rouge" in metrics:
            rouge_dict = self.calculate_rouge(predictions, references)
            for k, v in rouge_dict.items():
                result_dict[k] = v
        if "sacre_bleu" in metrics:
            sacre_bleu_dict = self.calculate_sacrebleu(predictions, references)
            for k, v in sacre_bleu_dict.items():
                result_dict[k] = v
        if "bertscore" in metrics:
            bertscore_dict = self.calculate_bertscore(predictions, references)
            for k, v in bertscore_dict.items():
                result_dict[k] = v
        if "factkb" in metrics:
            result_dict["factkb"] = self.calculate_factkb(predictions, documents)
            
        if "alignscore" in metrics:
            result_dict["alignscore"] = self.calculate_alignscore(predictions, documents) 

        for k, v in result_dict.items():
            print(f"{k} -> {v*100:.2f}")
        return result_dict

    def calculate_rouge(self, predictions, references):
        from torchmetrics.functional.text.rouge import rouge_score
        rouge_dict = rouge_score(preds=predictions, target=references)
        return {k: v.item() for k, v in rouge_dict.items()}

    def calculate_sacrebleu(self, predictions, references):
        from torchmetrics.functional.text import sacre_bleu_score
        score = sacre_bleu_score(preds=predictions, target=[[i] for i in references])
        return {"sacre_bleu": score.item()}

    def calculate_bertscore(self, predictions, references):
        import evaluate
        bertscore = evaluate.load("bertscore")
        bertscore_dict = bertscore.compute(predictions=predictions, references=references, model_type="roberta-large-mnli")
        res = {"bertscore_precision": np.mean(bertscore_dict["precision"]), "bertscore_recall": np.mean(bertscore_dict["recall"]), "bertscore_f1": np.mean(bertscore_dict["f1"])}
        return {k: v.item() for k, v in res.items()}
    
    def calculate_alignscore(self, predictions, documents):
        from AlignScore.src.alignscore import AlignScore
        ckpt_path = "models/AlignScore-base.ckpt"
        align_scorer = AlignScore(model='roberta-base', batch_size=8, device=DEVICE, ckpt_path=ckpt_path, evaluation_mode='nli_sp')
        alignscore_result = align_scorer.score(contexts=documents, claims=predictions)
        #total_result['AlignScore'] = 100*np.mean(alignscore_result)
        return np.mean(alignscore_result)

    def calculate_factkb(self, predictions, documents):
        tokenizer = AutoTokenizer.from_pretrained("roberta-base", padding="max_length", truncation=True, cache_dir=cache_dir)
        model = AutoModelForSequenceClassification.from_pretrained("bunsenfeng/FactKB", torch_dtype=torch.float16, cache_dir=cache_dir)
        model = model.to(DEVICE)
        res = []
        for i in range(len(predictions)):
            input_pretokenized = f"{predictions[i]} {tokenizer.sep_token} {documents[i]}"
            tokenized_input = tokenizer(input_pretokenized, return_tensors="pt", truncation=True, max_length=512)
            with torch.no_grad():
                output = model(input_ids=tokenized_input.input_ids.to(DEVICE))
            logits = torch.softmax(output.logits, dim=1)  # (bz, 2)
            res.append(logits.squeeze()[-1].item())
        return np.mean(res)    

In [5]:
# Utility functions

def xsum_pretokenize(dataset, tokenizer, max_input_length):
    data = {"context": [], "query": [], "summary": []}
    for i, row in tqdm(enumerate(dataset), desc="truncating documents..."):
        trunc_doc = tokenizer.batch_decode(tokenizer(row['document'], return_tensors="pt", max_length=max_input_length,  truncation=True).input_ids, skip_special_tokens=True)[0]
        data['context'].append(trunc_doc)
        data['summary'].append(row['summary'])
        data["query"].append("Summarize the article in one sentence. Summary:")
    return Dataset.from_dict(data)

def cnn_pretokenize(dataset, tokenizer, max_input_length):
    data = {"context": [], "query": [], "summary": []}
    for i, row in tqdm(enumerate(dataset), desc="truncating documents..."):
        trunc_doc = tokenizer.batch_decode(tokenizer(row['article'], return_tensors="pt", max_length=max_input_length,  truncation=True).input_ids, skip_special_tokens=True)[0]
        data['context'].append(trunc_doc)
        data['summary'].append(row['highlights'])
        data['query'].append("Summary of the above news article:")
    return Dataset.from_dict(data)

def pubmedqa_pretokenize(dataset, tokenizer, max_input_length):
    data = {"context": [], "query": [], "summary": []}
    for i, row in tqdm(enumerate(dataset), desc="truncating documents..."):
        context= ''.join(c for c in row['context']['contexts'])
        trunc_doc = tokenizer.batch_decode(tokenizer(context, return_tensors="pt", max_length=max_input_length, truncation=True).input_ids, skip_special_tokens=True)[0]
        data['context'].append(trunc_doc)
        data['summary'].append(row['long_answer'])
        data['query'].append(f"Question: {row['question']}. Answer:")
    return Dataset.from_dict(data)

def pretokenize(dataset_name, dataset, tokenizer, max_input_length):
    if dataset_name == "xsum":
        return xsum_pretokenize(dataset, tokenizer, max_input_length)
    elif dataset_name == "cnn":
        return cnn_pretokenize(dataset, tokenizer, max_input_length)
    elif dataset_name == "PubMedQA":
        return pubmedqa_pretokenize(dataset, tokenizer, max_input_length)
    return None

def template_input(row, dataset):
    if dataset == "xsum" or dataset == "cnn":
        return f"Article: {row['context']}. {row['query']}"
    elif dataset == "PubMedQA":
        return f"Document: {row['context']}. {row['query']}"
    else:
        return ""

def template_empty_input(row, dataset):
    if dataset == "xsum" or dataset == "cnn":
        return f"Article: . {row['query']}"
    elif dataset == "PubMedQA":
        return f"Document: . {row['query']}"
    else:
        return ""

In [6]:
tokenizer = AutoTokenizer.from_pretrained(model_name,
                                          padding_side="left",
                                          use_fast=False,
                                          token=access_token,
                                          trust_remote_code=True,
                                          cache_dir=cache_dir)
if tokenizer.pad_token is None:
    print("True")
    tokenizer.pad_token, tokenizer.pad_token_id = tokenizer.eos_token, tokenizer.eos_token_id

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


True


In [8]:
if dataset_name == "PubMedQA":
    raw_test_set = load_dataset("qiaojin/PubMedQA", "pqa_labeled", cache_dir=cache_dir)['train']
elif dataset_name == 'xsum':
    raw_test_set = load_dataset(dataset_name, split="test[:1000]")
elif dataset_name == 'cnn':
    raw_test_set = load_dataset("abisee/cnn_dailymail", "3.0.0", split="test[:1000]", cache_dir=cache_dir)

In [9]:
test_set = pretokenize(dataset_name, raw_test_set, tokenizer, max_input_length)

truncating documents...: 1000it [00:02, 387.71it/s]


In [10]:
# Code for Pure DP decoding 
import torch.nn.functional as F
from scipy.optimize import bisect

def calculate_memorization(p, q, idx):
    return abs(torch.log(p[idx]/q[idx])).numpy()#.cpu().numpy()

def entropy(p):
    return (-np.sum(p*np.log(p)))

def calc_partition_loss(proj_logit, proj_output, pub_output, alpha, temperature):
    max_loss = 0
    for i in range(proj_logit.shape[0]):
        proj_logit_i = torch.cat([proj_logit[:i, :], proj_logit[i+1:, :]])
        proj_output_i = F.softmax(proj_logit_i / temperature, dim=-1).mean(dim=0)
        ids = torch.nonzero(proj_output)
        eps = renyi_priv_loss(proj_output[ids], proj_output_i[ids], alpha)
        max_loss = max(max_loss, eps)
    return max_loss

def calc_group_memorization(output, ensemble_outputs, idx):
    return [calculate_memorization(output, ensemble_outputs[i, :], idx) for i in range(0, ensemble_outputs.shape[0])]

In [11]:
def post_calc_memorization(model,
                   context_aware_input_ids,
                   context_unaware_input_ids,
                   response_input_ids,
                   lambd,
                   temperature,
                   stop_token_ids,
                   min_length,
                   batch_size=None,
                   ensemble_context_aware_input_ids=None,
                  ):
    mem_vals = []
    for t in range(response_input_ids.shape[1]):
        priv_context_aware_input_ids = torch.cat([context_aware_input_ids,
                                      response_input_ids[:, :t]],
                                     dim=1)
        pub_logit = model(torch.cat([context_unaware_input_ids,
                                     response_input_ids[:, :t]],
                                    dim=1)
                         ).logits.squeeze()[-1, :].type(torch.float64).cpu()
        
        priv_logit = model(priv_context_aware_input_ids).logits[-1, -1, :].type(torch.float64).cpu()

        if batch_size != None:
            N = ensemble_context_aware_input_ids.shape[0]
            num_batch = N // batch_size + 1 if N % batch_size != 0 else N // batch_size
            ensemble_priv_context_aware_input_ids = torch.cat([ensemble_context_aware_input_ids,
                                      response_input_ids[:, :t].repeat(N, 1)],
                                     dim=1)
            ensemble_priv_logit = torch.cat([model(ensemble_priv_context_aware_input_ids[i*batch_size:(i+1)*batch_size]).logits[:, -1, :].type(torch.float64).cpu()
                     for i in range(0, num_batch)], axis=0)
            ensemble_proj_logit = lambd * ensemble_priv_logit + (1-lambd) * pub_logit.repeat(N, 1)
        
        proj_logit = lambd * priv_logit + (1-lambd) * pub_logit
        
        if t < min_length:
            pub_logit[stop_token_ids[0]] = -float("Inf")
            proj_logit[stop_token_ids[0]] = -float("Inf")
            ensemble_proj_logit[:, stop_token_ids[0]] = -float("Inf")
            
        if pub_logit.shape[0] > len(tokenizer):
            pub_logit[len(tokenizer):pub_logit.shape[0]] = -float("Inf")
            proj_logit[len(tokenizer):pub_logit.shape[0]] = -float("Inf")
            ensemble_proj_logit[:, len(tokenizer):pub_logit.shape[0]] = -float("Inf")
        
        pub_output = F.softmax(pub_logit / temperature, dim=-1)
        priv_output = F.softmax(priv_logit / temperature, dim=-1)
        proj_output = F.softmax(proj_logit / temperature, dim=-1)
        
        ids = torch.nonzero(pub_output)
        if ensemble_context_aware_input_ids == None:
            mem_val = calculate_memorization(proj_output[ids], pub_output[ids], response_input_ids[:, t].cpu())[0][0]
        else:
            ensemble_proj_output = F.softmax(ensemble_proj_logit / temperature, dim=-1)
            mem_val = max(calc_group_memorization(proj_output[ids], ensemble_proj_output[:, ids].squeeze(-1), response_input_ids[:, t].cpu()))[0][0]
        mem_vals.append(mem_val)
        
    return mem_vals

In [12]:
def partition(data, tokenizer, partition_length, dataset_name):
    document_ids = tokenizer(data['context']).input_ids
    ensemble = []
    for i in range(0, len(document_ids), partition_length):
        idx = (i+partition_length)
        #ensemble = torch.cat([ensemble, input_ids[-1:, idx:i]], dim=1)
        row = {'context': tokenizer.decode(document_ids[i:idx], skip_special_tokens=True), 'query': data['query']}
        ensemble.append(template_input(row, dataset_name))
    return ensemble

def group_partition(data, tokenizer, partition_length, dataset_name):
    document_ids = tokenizer(data['context']).input_ids
    groups = [template_input(data, dataset_name)]
    n_grams = []
    for i in range(0, len(document_ids), partition_length):
        idx = (i+partition_length)
        group_i = document_ids[:i] + document_ids[idx:]
        row = {'context': tokenizer.decode(group_i, skip_special_tokens=True), 'query': data['query']}
        groups.append(template_input(row, dataset_name))
    return groups


def partition_n_gram(data, tokenizer, dataset_name, n):
    document_ids = tokenizer(data['context']).input_ids
    length = len(document_ids)
    groups = []
    n_grams = []
    N = length - n + 1
    if N < 0:
        return [template_empty_input(data, dataset_name)]
    for i in range(N):
        removed_n_gram = document_ids[:i] + document_ids[i+n:]
        n_grams.append(document_ids[i:i+n])
        row = {'context': tokenizer.decode(removed_n_gram, skip_special_tokens=True), 'query': data['query']}
        groups.append(template_input(row, dataset_name))
    return groups, n_grams

In [13]:
def cmad_generation(model,
                  context_aware_input_ids,
                  context_unaware_input_ids,
                  lambd,
                  temperature,
                  max_length,
                  min_length,
                  stop_token_ids,
                  device,
                 ):
    response_input_ids = torch.LongTensor([[]]).to(device)
    for i in range(max_length):
        priv_context_aware_input_ids = torch.cat([context_aware_input_ids,
                                      response_input_ids.repeat(context_aware_input_ids.shape[0], 1)],
                                     dim=1)
        pub_logit = model(torch.cat([context_unaware_input_ids,
                                     response_input_ids],
                                    dim=1)
                         ).logits.squeeze()[-1, :].type(torch.float64)

        priv_logit = model(priv_context_aware_input_ids).logits[:, -1, :].type(torch.float64)
        proj_logit = lambd * priv_logit + (1-lambd) * pub_logit.repeat(priv_logit.shape[0], 1)
        
        if i < min_length:
            pub_logit[stop_token_ids[0]] = -float("Inf")
            proj_logit[:, stop_token_ids[0]] = -float("Inf")
            
        if pub_logit.shape[0] > len(tokenizer):
            pub_logit[len(tokenizer):pub_logit.shape[0]] = -float("Inf")
            proj_logit[:, len(tokenizer):pub_logit.shape[0]] = -float("Inf")
            
        pub_output = F.softmax(pub_logit / temperature, dim=-1)
        #priv_output = F.softmax(priv_logit, dim=-1)[-1]
        proj_output = F.softmax(proj_logit / temperature, dim=-1)

        pred_idx = proj_output[0].multinomial(1).view(1, -1).long()
        if pred_idx.cpu()[0].item() in stop_token_ids:
            break

        response_input_ids = torch.cat([response_input_ids, pred_idx], dim=1)
        del pred_idx
    return response_input_ids.cpu()[0], 0

In [14]:
def decode_experiment(test_set, model, tokenizer, lambd, temperature, dataset_name, min_length):
    dp_predictions = []
    stop_token_ids = [tokenizer.eos_token_id,
                      tokenizer.pad_token_id,
                     ]
    doc_priv_loss = [] 
    for idx, data in tqdm(enumerate(test_set), total=len(test_set)):
        context_unaware_tokenized_input = tokenizer(template_empty_input(data, dataset_name), return_tensors="pt", padding=True)
        context_aware_tokenized_input = tokenizer(template_input(data, dataset_name), return_tensors="pt", padding=True)
        with torch.no_grad():
            dp_output, doc_eps = cmad_generation(model,
                                    context_aware_tokenized_input.input_ids.to(DEVICE),
                                    context_unaware_tokenized_input.input_ids.to(DEVICE),
                                    lambd=lambd,
                                    temperature=temperature,
                                    max_length=max_new_tokens,
                                    min_length=min_length,
                                    stop_token_ids=stop_token_ids,
                                    device=DEVICE,
                                    )
        decode_dp_output = tokenizer.decode(dp_output, skip_special_tokens=True)
        dp_predictions.append(decode_dp_output)
        doc_priv_loss.append(doc_eps)
    return dp_predictions, doc_priv_loss

In [15]:
dir_name = "results"
m_name = "Meta-Llama-3-8B-Instruct"
model_name = "meta-llama/Meta-Llama-3-8B-Instruct"

In [None]:
os.makedirs(dir_name, exist_ok=True)
lambds = [0.5, 1.0, 1.5]

for lambd in lambds:
    #file_name = f'{dataset_name}_{m_name}_{lambd}_context{max_input_length}.csv'
    file_name = f'{dataset_name}_{m_name}_{lambd}.csv'
    dp_predictions, dp_loss = decode_experiment(test_set, model, tokenizer, lambd=lambd, temperature=0.8, dataset_name=dataset_name, min_length=10)
    df = pd.DataFrame({'generations': dp_predictions, 'privacy_loss': dp_loss})
    df.to_csv(os.path.join(dir_name, file_name))

  0%|                                                                                                                                                                                  | 0/1000 [00:00<?, ?it/s]We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
 52%|██████████████████████████████████████████████████████████████████████████████████████▌                                                                                 | 515/1000 [46:31<38:34,  4.77s/it]

In [26]:
documents, references = [], []
for idx, data in tqdm(enumerate(test_set), total=len(test_set)):
    documents.append(data['context'])
    references.append(data['summary'])
evaluator = Evaluator()

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 30138.86it/s]


In [31]:
lambd=0.5
file_name = f'{dataset_name}_{m_name}_{lambd}.csv'
df = pd.read_csv(os.path.join(dir_name, file_name))
doc_priv_loss = df['privacy_loss']
predictions = df['generations']

In [17]:
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    torch_dtype=torch.float16,
    token=access_token,
    cache_dir=cache_dir,
    #local_files_only=True,
    #device_map="auto",
    #max_memory = {0: "0GB", 1: "0GB", 2: "35GB", 3: "35GB", 4: "0GB", 5: "0GB", 6: "0GB", 7: "0GB"}
    ).to(DEVICE)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
partition_len = max_input_length
temperature=0.8
stop_token_ids = [tokenizer.eos_token_id,
                      tokenizer.pad_token_id,
                     ]
lambds = [1.0, 1.5, 0.5]
mean_vals = []

tokenizer = AutoTokenizer.from_pretrained(model_name,
                                      padding_side="left",
                                      use_fast=False,
                                      token=access_token,
                                      trust_remote_code=True,
                                      cache_dir=cache_dir)
if tokenizer.pad_token is None:
    print("True")
    tokenizer.pad_token, tokenizer.pad_token_id = tokenizer.eos_token, tokenizer.eos_token_id

batch_size = 32
for n_gram_size in [2048]:
    for lambd in lambds:
        file_name = f'{dataset_name}_{m_name}_{lambd}.csv'
        #file_name = f'{dataset_name}_{m_name}_{lambd}_context{context_len}.csv'
        df = pd.read_csv(os.path.join(dir_name, file_name))
        predictions = df['generations']
        vals = []
        print(file_name)
        
        test_set = pretokenize(dataset_name, raw_test_set, tokenizer, max_input_length)
        query_set = test_set.select(range(1000))

        for data, response in tqdm(zip(query_set, predictions), total=len(query_set)):
            context_unaware_tokenized_input = tokenizer(template_empty_input(data, dataset_name), return_tensors="pt", padding=True)
            context_aware_tokenized_input = tokenizer(template_input(data, dataset_name), return_tensors="pt", padding=True)
            if n_gram_size == None:
                ensemble_context_aware_tokenized_input_ids = None
                batch_size = None
            else:
                ensemble = partition_n_gram(data, tokenizer, dataset_name, n_gram_size)
                ensemble_context_aware_tokenized_input = tokenizer(ensemble, return_tensors="pt", padding=True)
                ensemble_context_aware_tokenized_input_ids = ensemble_context_aware_tokenized_input.input_ids.to(DEVICE)
            response_tokenized_input = tokenizer(response, return_tensors="pt")
            with torch.no_grad():
                cur_mem = post_calc_memorization(model,
                                           context_aware_tokenized_input.input_ids.to(DEVICE),
                                           context_unaware_tokenized_input.input_ids.to(DEVICE),
                                           response_tokenized_input.input_ids[:, 1:].to(DEVICE),
                                           lambd,
                                           temperature,
                                           stop_token_ids,
                                           min_new_tokens,
                                           batch_size,
                                           ensemble_context_aware_tokenized_input_ids
                                          )
            vals.append(cur_mem)

        mem_vals = np.zeros([len(vals),len(max(vals,key = lambda x: len(x)))])
        for i,j in enumerate(vals):
            mem_vals[i, 0:len(j)] = j
        print(f"N-gram size {n_gram_size}\t Memorization: {np.mean(np.sum(mem_vals, axis=1))}")
        mean_vals.append(np.mean(np.sum(mem_vals, axis=1)))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


True
PubMedQA_Meta-Llama-3-8B-Instruct_1.0.csv


truncating documents...: 1000it [00:00, 1030.99it/s]
 74%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                         | 745/1000 [4:42:09<1:38:24, 23.15s/it]

In [25]:
mean_vals

[66.38553924315379, 115.78231398467068, 17.25712741370103]

In [None]:
import matplotlib.pyplot as plt

mean_vals = np.mean(mem_vals, axis=0)
plt.plot(np.arange(0, len(mean_vals)), mean_vals)
plt.xlabel("Generation Length $|\mathbf{y}|$", fontsize=15)
plt.ylabel(r'$\mathbb{E}[f_{\text{Mem}}(\overline{p}_{\theta})]$', fontsize=15)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.savefig("gen_len_analysis.pdf", bbox_inches="tight")

In [None]:
file_name

In [32]:
result_dict = evaluator.evaluate(predictions, references, documents)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return torch.load(f, map_location=map_location)  # type: ignore[arg-type]
Lightning automatically upgraded your loaded checkpoint from v1.7.7 to v1.9.5. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint --file models/AlignScore-base.ckpt`
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  rank_zero_warn(
Evaluating: 100%|██████████████████████████████████████████████████████████████████████████████████

rouge1_fmeasure -> 29.12
rouge1_precision -> 29.99
rouge1_recall -> 31.10
rouge2_fmeasure -> 8.82
rouge2_precision -> 8.85
rouge2_recall -> 9.80
rougeL_fmeasure -> 20.17
rougeL_precision -> 20.58
rougeL_recall -> 21.81
rougeLsum_fmeasure -> 24.08
rougeLsum_precision -> 24.88
rougeLsum_recall -> 25.69
bertscore_precision -> 74.32
bertscore_recall -> 74.76
bertscore_f1 -> 74.51
factkb -> 51.64
alignscore -> 33.06





In [15]:
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    torch_dtype=torch.float16,
    token=access_token,
    cache_dir=cache_dir,
    local_files_only=True,
    #device_map="auto",
    #max_memory = {0: "0GB", 1: "0GB", 2: "35GB", 3: "35GB", 4: "35GB", 5: "35GB", 6: "35GB", 7: "35GB"}
    ).to(DEVICE)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [22]:
data, response = test_set[53], predictions[53]
n = 2
temperature=0.8

context_unaware_tokenized_input = tokenizer(template_empty_input(data, dataset_name), return_tensors="pt", padding=True)
ensemble, n_grams = partition_n_gram(data, tokenizer, dataset_name, n)
context_aware_tokenized_input = tokenizer(ensemble, return_tensors="pt", max_length=max_input_length+25, padding=True, truncation=True)
response_tokenized_input = tokenizer(response, return_tensors="pt")

context_aware_input_ids = context_aware_tokenized_input.input_ids.to(DEVICE)
response_input_ids = response_tokenized_input.input_ids.to(DEVICE)
context_unaware_input_ids = context_unaware_tokenized_input.input_ids.to(DEVICE)

N = context_aware_input_ids.shape[0]
priv_context_aware_input_ids = torch.cat([context_aware_input_ids,
                              response_input_ids[:, :20].repeat(N, 1)],
                             dim=1)
batch_size = 1
with torch.no_grad():
    pub_logit = model(torch.cat([context_unaware_input_ids,
                             response_input_ids[:, :20]],
                            dim=1)
                 ).logits.squeeze()[-1, :].type(torch.float64).cpu()
    if batch_size == None:
        priv_logit = model(priv_context_aware_input_ids).logits[:, -1, :].type(torch.float64).cpu()
    else:
        priv_logit = torch.stack([model(priv_context_aware_input_ids[i:(i+1)*batch_size]).logits[:, -1, :].type(torch.float64).cpu()
                 for i in range(0, N, batch_size)]).squeeze(1)

proj_logit = lambd * priv_logit + (1-lambd) * pub_logit.repeat(N, 1)

if pub_logit.shape[0] > len(tokenizer):
    pub_logit[len(tokenizer):pub_logit.shape[0]] = -float("Inf")
    proj_logit[:, len(tokenizer):pub_logit.shape[0]] = -float("Inf")

pub_output = F.softmax(pub_logit / temperature, dim=-1)
priv_output = F.softmax(priv_logit / temperature, dim=-1)
proj_output = F.softmax(proj_logit / temperature, dim=-1)

ids = torch.nonzero(pub_output)
mem_val = calc_group_memorization(proj_output[:, ids].squeeze(), response_input_ids[:, 20].cpu())

We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


TypeError: calc_group_memorization() missing 1 required positional argument: 'idx'

In [None]:
import matplotlib.colors as mcolors
import matplotlib.pyplot as plt

def value_to_color(value, min_value, max_value):
    """ Map a float value to a color based on its position in the value range. """
    norm = mcolors.Normalize(vmin=min_value, vmax=max_value)
    cmap = plt.get_cmap('coolwarm')  # You can choose different colormaps
    return mcolors.to_hex(cmap(norm(value)))

# Normalize and colorize
min_value = min(mem_vals)
max_value = max(mem_vals)

colors = [value_to_color(val, min_value, max_value) for val in mem_val]

In [None]:
from IPython.display import HTML

def colorize_text_html(vals, colors):
    colored_text = ' '.join(
        f'<span style="color: {color};">{tokenizer.decode(input_ids)}</span>'
        for input_ids, color in zip(vals, colors)
    )
    return f'<p>{colored_text}</p>'

colored_text_html = colorize_text_html(data_tokenized_input_ids[47:141], colors[47:141])
HTML(colored_text_html)

In [None]:
from reportlab.lib.pagesizes import letter
from reportlab.lib.colors import HexColor, black
from reportlab.pdfgen import canvas
from reportlab.lib.units import inch
import matplotlib.pyplot as plt
from matplotlib.cm import ScalarMappable
import numpy as np

def create_colored_pdf(vals, colors, filename):
    c = canvas.Canvas(filename, pagesize=letter)
    width, height = letter

    # Set starting position
    x, y = 50, height - 50
    line_height = 14

    # Draw text with colors
    for input_ids, color in zip(vals, colors):
        word = tokenizer.decode(input_ids)
        color = HexColor(color)

        # Set color and draw text
        c.setFillColor(color)
        c.drawString(x, y, word)
        x += c.stringWidth(word + ' ', 'Helvetica', 12)

        # Move to next line if needed
        if x > width - 50:
            x = 50
            y -= line_height

    # Draw the color scale image in PDF
    #c.drawImage(color_scale_image, scale_x, scale_y, width=scale_width, height=scale_height)
    
    c.save()

# Create the PDF
create_colored_pdf(data_tokenized_input_ids[47:141], colors[47:141], "colored_text.pdf")

In [None]:
c = canvas.Canvas("colored_text.pdf", pagesize=letter)

# Draw color scale
scale_width = 5 * inch
scale_height = 0.5 * inch
scale_x = 50
scale_y = 50
c.setFillColor(black)
c.rect(scale_x, scale_y, scale_width, scale_height, fill=0)

# Create color scale using matplotlib
fig, ax = plt.subplots(figsize=(5, 0.5), dpi=80)
fig.subplots_adjust(left=0, right=1, top=1, bottom=0)
cmap = plt.get_cmap('coolwarm')
norm = mcolors.Normalize(vmin=min_value, vmax=max_value)
sm = ScalarMappable(cmap=cmap, norm=norm)
sm.set_array([])
cb = plt.colorbar(sm, cax=ax, orientation='horizontal')
cb.ax.tick_params(labelsize=10)
#cb.ax.set_title('Value Scale', fontsize=10)
#plt.show()
# Save color scale to an image
color_scale_image = "color_scale.pdf"
plt.savefig(color_scale_image, bbox_inches='tight', pad_inches=0)
plt.close(fig)