In [1]:
import copy
import json
import os
import re
import requests
import sys
import time
from random import shuffle
from pprint import pprint
sys.path.insert(0, '../../KGWatermark') 
sys.path.insert(0, '../../KTHwatermark/demo')
sys.path.insert(0, '../../PRWWatermark')

import numpy as np
import torch
from datasets import load_dataset
from tqdm import tqdm

# Importing specifically from transformers
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    GenerationConfig,
    HfArgumentParser,
    LogitsProcessorList,
    Trainer,
    TrainingArguments,
)

# Importing specifically from your extended watermark processor module
from extended_watermark_processor import WatermarkLogitsProcessor as KGW, WatermarkDetector as KGWDetector
from generate import *
from gptwm import GPTWatermarkLogitsWarper


# Importing from langchain
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

# Importing exception handling for requests
from requests.exceptions import ConnectionError

# Setting device for PyTorch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda


In [24]:
def load_json_file(file_path):
    """
    Load json file
    """
    with open(file_path,'r',encoding='utf-8') as f:
        file = json.load(f)
        f.close()
    return file

def save_json_file(file, file_path):
    """
    Save json file
    """
    with open(file_path,'w',encoding='utf-8') as f:
        json.dump(file, f, indent=4, ensure_ascii=False)
        f.close()

def gen(model, tokenizer, prompt, max_length, temperature = 0.35, watermark = None):
    generation_config = GenerationConfig(
        max_new_tokens = max_length,
        repetition_penalty = 1.0,
        do_sample = True,
        #  num_beams = 5,
        temperature = temperature,
        # num_return_sequences=5,
        early_stopping=True, 
        no_repeat_ngram_size=4, 
        pad_token_id=tokenizer.eos_token_id)

    model_inputs = tokenizer(prompt, padding=True, truncation=True, return_tensors="pt",max_length=1024)
    # print(model_inputs['attention_mask'])
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model_inputs.to(device)
    if watermark is not None:
        outputs = model.generate(**model_inputs,
                                 logits_processor = LogitsProcessorList([watermark]),
                                generation_config = generation_config)
    else:
        outputs = model.generate(**model_inputs,
                                generation_config = generation_config)
    result = []
    for item in outputs:
        # print(item)
        result.append(tokenizer.decode(item, skip_special_tokens=True))
    return result


In [3]:
dataset = load_dataset("allenai/c4", "realnewslike")

Using the latest cached version of the dataset since allenai/c4 couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'realnewslike' at /home/ShenChenchen/.cache/huggingface/datasets/allenai___c4/realnewslike/0.0.0/1588ec454efa1a09f29cd18ddd04fe05fc8653a2 (last modified on Tue Mar 26 12:00:09 2024).


In [30]:
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf", device_map = 'auto', padding_side = 'left', add_eos_token=True, add_bos_token=True)
tokenizer.pad_token = tokenizer.eos_token
compute_dtype = getattr(torch, "float16")
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type='nf4',
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=False,
    )
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf",quantization_config = bnb_config, device_map = 'auto')

Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.46s/it]


In [19]:
KGW_1 = KGW(vocab=list(tokenizer.get_vocab().values()),
            gamma=0.25,
            delta=2.0,
            seeding_scheme="selfhash") #equivalent to `ff-anchored_minhash_prf-4-True-15485863`
KGW_detector = KGWDetector(vocab=list(tokenizer.get_vocab().values()),
                            gamma=0.25, # should match original setting
                            seeding_scheme="selfhash", # should match original setting
                            device = device, # must match the original rng device type
                            tokenizer=tokenizer,
                            z_threshold=4.0,
                            normalizers=[],
                            ignore_repeated_ngrams=True)

### Create truncated dataset (Token-wize)

In [5]:
truncated_tokenized_tensors = []
threshold = 70

print("Start Tokenization...")
tokenized_tensors = list(map(lambda s: tokenizer(s, return_tensors='pt'), dataset['validation']['text']))
print("End Tokenization...")

truncated_tokenized_tensors = []
decoded_text_inputs = []
for item in tqdm(tokenized_tensors, desc="Processing"):
    truncated_tokenized_tensors.append({'input_ids':item['input_ids'][:, :threshold].to(device), 
                                        'attention_mask': item['attention_mask'][:,:threshold].to(device)})

for item in tqdm(truncated_tokenized_tensors):
    # Assuming 'input_ids' are on the CPU for decoding
    input_ids = item['input_ids'][0].cpu().numpy()
    decoded_text_input = tokenizer.decode(input_ids, skip_special_tokens=True)
    decoded_text_inputs.append(decoded_text_input)

assert(len(decoded_text_inputs) == len(truncated_tokenized_tensors)), "There's something wrong with decoding."

Start Tokenization...
End Tokenization...


Processing: 100%|██████████| 13863/13863 [00:01<00:00, 10619.62it/s]
100%|██████████| 13863/13863 [00:00<00:00, 25233.78it/s]


### Paraphrase preparation

In [27]:
paraphrase_prompt = '''
«SYS»
Assume you are a helpful assistant. Your job is to paraphrase the given text.
«/SYS»
[INST]
{INPUT_TEXT}
[/INST]
Response template:
"You’re welcome! Here’s a paraphrased version of the original message: {PARAPHRASED_TEXT}"
'''

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


["Introduce Berkeley to me in general...and here's the rest of the band:\n"
 " nobody's fool\n"
 '\n'
 'The Berkeley band is a classic American rock band known for their energetic '
 "live performances and catchy hooks. The band's music is a mix of rock, "
 'blues, and soul, and their songs often explore themes of love, '
 'relationships, and the human condition.\n'
 '\n'
 "The band's lead singer, John, is a charismatic frontman with a powerful "
 'voice and a']


### PRWWatermark

In [3]:
from tqdm import tqdm
import time

data = range(10)
pbar = tqdm(total=len(data))

for i in data:
    pbar.set_description(f"Processing item {i}")
    time.sleep(0.5)  # Simulate some processing time
    pbar.update()  # Manually update the progress bar

pbar.close()


Processing item 9: 100%|██████████| 10/10 [00:05<00:00,  1.99it/s]


In [2]:
PRW_1 = LogitsProcessorList([GPTWatermarkLogitsWarper(fraction=0.5,
                                                    strength=2.0,
                                                    vocab_size=model.config.vocab_size,
                                                    watermark_key=0)])


generation_args = {
    'logits_processor': PRW_1,
    'output_scores': True,
    'return_dict_in_generate': True,
    'max_new_tokens': 300,
    'num_beams': None,
    'do_sample': True,
    'top_k': 50,
    'top_p': 0.95,
}

NameError: name 'args' is not defined

### KGWatermark

In [33]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
tokens = tokenizer.encode("Introduce Berkeley to me", return_tensors='pt', truncation=True, max_length=2048)

watermarked_tokens = generate_shift(model,tokens,len(tokenizer),40,50,42)[0]
watermarked_text = tokenizer.decode(watermarked_tokens, skip_special_tokens=True)

print(watermarked_text)

Introduce Berkeley to me*  I appreciate your response and I will make sure to clearly specify the use of the term "Berkeley" in the future.

The term "Berkeley" can refer to several different things depending on the context:

1.
