In [1]:
# Import necessary libraries

import pandas as pd
import transformers, torch
from transformers import GPTNeoForCausalLM, GPT2Tokenizer

In [2]:
# Load the GPT-Neo 1.3B causal language model onto the GPU, together with its
# tokenizer (the checkpoint is read through the GPT-2 tokenizer class).

tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")
model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B").to('cuda')

In [4]:
# Get essay prompts: `prompts` is the list of essay openings that the
# generation cells iterate over.

from list_generate import prompts


['The Vicar of Wakefield as a Failed Morality Story Morality stories and plays that espouse Christian values have been', 'Governmental Funding for the Development of Self-Sustaining Communities: The Stabilization of the HIV/AIDS Crisis At the end of 2006, over 39.5 million people were', 'Who Is God? One common problem for the Christian tradition is the idea', "The Absolute Necessity of College-Level Writing Courses Reading Mike Rose's article, The Language of Exclusion: Writing Instruction", "People or Property? In several of Shakespeare's plays, father – daughter relationships are", 'A (Solitary) Place For Fantasy in Reality Reality is composed of a rather tenuous fabric in the', "Human-Animal Nature in H.G. Wells and Edgar Allen Poe H.G. Wells' The Island of Dr. Moreau and Edgar Allen", "Pixels and Print: Effects of the Digital Age on Children's Literature The impact of the Internet and technology on children today", "Historical Places, Violent Spaces: A discussion on violence, perso

In [5]:
# Generate text with GPT Neo

def gpt_neo_generate(prompt, max_length=750):
    """Sample an unwatermarked continuation of ``prompt`` from the loaded model.

    Args:
        prompt: Text the model should continue.
        max_length: Kept for interface compatibility.
            NOTE(review): this value is not applied -- the generation config
            below hard-codes ``min_length=500`` / ``max_length=1000``. Wiring
            it in would change the generated essays (default 750 vs. 1000),
            so confirm the intended limit first.

    Returns:
        The decoded continuation only (prompt removed), with trailing
        whitespace stripped.
    """
    inputs = tokenizer(prompt, return_tensors="pt")

    # Follow the model's device instead of assuming a GPU is present.
    input_ids = inputs["input_ids"].to(model.device)

    generation_config = transformers.GenerationConfig(
        do_sample=True,
        temperature=0.7,
        top_p=0.92,
        top_k=80,
        no_repeat_ngram_size=2,
        min_length=500,
        max_length=1000,
        repetition_penalty=1.5,
    )

    with torch.no_grad():
        generation_output = model.generate(
            input_ids=input_ids,
            attention_mask=torch.ones_like(input_ids),
            pad_token_id=tokenizer.eos_token_id,
            generation_config=generation_config,
        )

    # Remove the prompt by token position rather than by searching for the
    # prompt string in the decoded text: decoding may not reproduce the prompt
    # character-for-character, in which case a string split has nothing to
    # split on and indexing the second piece raises IndexError.
    new_tokens = generation_output[0, input_ids.shape[-1]:]

    return tokenizer.decode(new_tokens, skip_special_tokens=True).rstrip()

**Generating Watermarked Text**

In [6]:
import sys

# Make the local watermarking sources importable. The membership check keeps
# the cell idempotent: re-running it no longer appends duplicate entries.
WATERMARK_REPO = '/dupe/lm_watermarking_main'
if WATERMARK_REPO not in sys.path:
    sys.path.append(WATERMARK_REPO)

In [7]:
from extended_watermark_processor import WatermarkLogitsProcessor
from transformers import LogitsProcessorList

In [8]:
# Logits processor handed to `model.generate` to produce watermarked text.
# It is built over every token id known to the tokenizer.
vocab_ids = list(tokenizer.get_vocab().values())

watermark_processor = WatermarkLogitsProcessor(
    vocab=vocab_ids,
    gamma=0.25,
    delta=2.0,
    seeding_scheme="simple_1",
)

In [9]:
# Function to generate watermarked text

def watermark_generate(prompt, max_length=750):
    """Sample a watermarked continuation of ``prompt`` from the loaded model.

    Generation is identical in structure to plain sampling, except that
    ``watermark_processor`` is applied to the logits at every step.

    Args:
        prompt: Text the model should continue.
        max_length: Kept for interface compatibility.
            NOTE(review): this value is not applied -- the generation config
            below hard-codes ``min_length=500`` / ``max_length=1000``. Wiring
            it in would change the generated essays (default 750 vs. 1000),
            so confirm the intended limit first.

    Returns:
        The decoded continuation only; the prompt tokens are sliced off
        before decoding.
    """
    inputs = tokenizer(prompt, return_tensors="pt")

    # Follow the model's device instead of assuming a GPU is present.
    input_ids = inputs["input_ids"].to(model.device)

    generation_config = transformers.GenerationConfig(
        do_sample=True,
        temperature=0.75,
        top_p=0.92,
        top_k=50,
        no_repeat_ngram_size=2,
        min_length=500,
        max_length=1000,
        repetition_penalty=1.5,
    )

    with torch.no_grad():
        generation_output = model.generate(
            input_ids=input_ids,
            attention_mask=torch.ones_like(input_ids),
            pad_token_id=tokenizer.eos_token_id,
            generation_config=generation_config,
            logits_processor=LogitsProcessorList([watermark_processor]),
        )

    # Keep only the newly generated tokens (everything after the prompt).
    output = generation_output[:, input_ids.shape[-1]:]

    output_text = tokenizer.batch_decode(output, skip_special_tokens=True)[0]

    return output_text

In [12]:
# Generate one watermarked essay per prompt and collect the
# (prompt, essay) pairs in a Pandas DataFrame

watermark_data = [
    {"Prompt": prompt, "Essay": watermark_generate(prompt)}
    for prompt in prompts
]

watermark_df = pd.DataFrame(watermark_data)
watermark_df

Unnamed: 0,Prompt,Essay
0,The Vicar of Wakefield as a Failed Morality St...,"around for thousands of years. In fact, they ..."
1,Governmental Funding for the Development of Se...,living with the disease worldwide. More than ...
2,Who Is God? One common problem for the Christi...,that God is one. The basic problem is that we...
3,The Absolute Necessity of College-Level Writin...,"and Race in America (New York: Routledge, 201..."
4,People or Property? In several of Shakespeare'...,"central to plots. In Hamlet, the two fathers ..."
...,...,...
207,"The composition of the Earth Thales, argued to...",a plan that would prove beneficial to mankind...
208,God and Omnipotence According to traditional t...,all-powerful. This is the basis of a lot of P...
209,Question 9: Determinism and Free Will At first...,are determined or free seems very straightfor...
210,Hume and Smith on Justice Adam Smith and David...,"can be derived from human conduct, and that t..."


In [17]:
# Write the watermarked essays to a CSV file (without the index column)
# in the specified directory

directory, filename = '/dupe/Data/', 'watermark_generated.csv'

watermark_df.to_csv(f'{directory}{filename}', index=False)

**Detecting Watermarked Text**

In [14]:
from extended_watermark_processor import WatermarkDetector

# Detector counterpart of the generation-time watermark processor.
detector_vocab = list(tokenizer.get_vocab().values())

watermark_detector = WatermarkDetector(
    vocab=detector_vocab,
    gamma=0.25,  # should match original setting
    seeding_scheme="simple_1",  # should match original setting
    device=model.device,  # must match the original rng device type
    tokenizer=tokenizer,
    z_threshold=4.0,
    normalizers=[],
    ignore_repeated_ngrams=False,
)

In [15]:
import pandas as pd

def run_detector(df, text_label):
    """Run the watermark detector on every text in ``df[text_label]``.

    Args:
        df: DataFrame holding the texts to score.
        text_label: Name of the column that contains the texts.

    Returns:
        A DataFrame with one row per successfully scored text and the columns
        'Text', 'Token Number' (tokens scored), 'Green Number' (green-list
        tokens) and 'P-Value'. Texts for which the detector raises are
        reported on stdout by their row position and left out of the result.
    """
    columns = ['Text', 'Token Number', 'Green Number', 'P-Value']
    records = []

    for i, text in enumerate(df[text_label]):
        # Run the module-level `watermark_detector` on this text.
        try:
            detection = watermark_detector.detect(text)
        except Exception as exc:
            # Skip the row as a whole. Recording the text before knowing
            # whether detection succeeded would leave the result columns with
            # different lengths and make the DataFrame construction fail.
            print('line:', i, '-', repr(exc))
            continue

        records.append({
            'Text': text,
            'Token Number': detection['num_tokens_scored'],
            'Green Number': detection['num_green_tokens'],
            'P-Value': detection['p_value'],
        })

    # Passing `columns` keeps the schema stable even when nothing was scored.
    return pd.DataFrame(records, columns=columns)


In [18]:
# Read the exported essays back from disk and score each one with the detector.
todetect_df = pd.read_csv('/dupe/Data/watermark_generated.csv')

watermarkdetect = run_detector(df=todetect_df, text_label='Essay')

In [19]:
# Display the per-essay detection results.
watermarkdetect

Unnamed: 0,Text,Token Number,Green Number,P-Value
0,"around for thousands of years. In fact, they ...",692,382,1.707299e-75
1,living with the disease worldwide. More than ...,960,525,1.927755e-100
2,that God is one. The basic problem is that we...,985,513,4.419774e-86
3,"and Race in America (New York: Routledge, 201...",969,571,1.104685e-131
4,"central to plots. In Hamlet, the two fathers ...",194,118,5.024021e-31
...,...,...,...,...
207,a plan that would prove beneficial to mankind...,981,557,3.186734e-117
208,all-powerful. This is the basis of a lot of P...,977,530,3.057021e-99
209,are determined or free seems very straightfor...,977,500,6.162649e-80
210,"can be derived from human conduct, and that t...",338,200,5.350521e-48


In [22]:
# Write the watermark detection DataFrame to a CSV file (without the index
# column) in the specified directory

directory, filename = '/dupe/Data/', 'watermark_detect.csv'

watermarkdetect.to_csv(f'{directory}{filename}', index=False)