In [2]:
import os
import sys
sys.path.append("../")

import pickle
import json
import glob
from tqdm.auto import trange, tqdm
import openai
from features import merge_entries, prepare_entry
import nltk
from utills import chunker, get_num_chunks
import numpy as np
from nltk.tokenize import sent_tokenize

In [31]:
# Input locations (relative to the notebook's working directory).
DATA_DIR = '../data/pan/'
PREPROCESSED_DATA_PATH = '../temp_data/pan/'
GROUND_TRUTH_PATH = os.path.join(
    DATA_DIR, 'pan20-authorship-verification-training-large-truth.jsonl'
)

# Output location for the generated human/AI records.
TEMP_DATA_PATH = '../temp_data/gpt3_new/'

# Threshold used by the data-loading loop to stop collecting same-author pairs.
MAX_RECORDS = 105

In [19]:
# SECURITY: never commit API keys to a notebook. The key that used to be
# hardcoded here must be treated as leaked and revoked. Export
# OPENAI_API_KEY in the environment before starting the kernel; a missing
# variable fails loudly with KeyError rather than sending unauthenticated calls.
openai.api_key = os.environ["OPENAI_API_KEY"]

# Define Methods

In [4]:
def fix_quotes_and_chunk(doc, sent_size=10):
    """
    Re-chunk a preprocessed document on sentence boundaries.

    The `preprocessed` text of every entry in `doc` is joined with single
    spaces, double quotes are replaced by single quotes, and the result is
    split into sentences with `sent_tokenize`. The sentences are regrouped
    through `chunker(..., sent_size)` and each group is turned back into an
    entry with `prepare_entry(mode='accurate', tokenizer='casual')`.

    Returns the list of re-prepared entries, one per sentence group.
    """
    joined = ' '.join(entry['preprocessed'] for entry in doc)
    # Normalise quotation marks: every double quote becomes a single quote.
    joined = joined.replace('"', '\'')
    sentences = sent_tokenize(joined)

    entries = []
    for sentence_group in chunker(sentences, sent_size):
        chunk_text = ' '.join(sentence_group)
        entries.append(prepare_entry(chunk_text, mode='accurate', tokenizer='casual'))
    return entries

In [5]:
def generate(text, fandom, num_paras):
    """
    Ask the completion model to continue a piece of fan fiction.

    Builds a prompt that asks for the next `num_paras` paragraphs of a
    fanfiction about `fandom`, in the same writing style as `text`, and sends
    it to `openai.Completion.create` (model "gpt-3.5-turbo-instruct",
    max_tokens=2500).

    Returns the `text` field of the first choice in the response.
    """
    instruction = f"Complete the next {num_paras} paragraphs from this fanfiction about {fandom}. "
    style_hint = "Ensure to use the same writing style as the original fanfiction:\n"
    prompt = instruction + style_hint + f" {text} \n" + ":"

    completion = openai.Completion.create(
        model="gpt-3.5-turbo-instruct",
        prompt=prompt,
        max_tokens=2500,
    )

    # Only the first returned choice is used.
    first_choice = completion.to_dict()['choices'][0]
    return first_choice['text']

In [17]:
def generate_gpt3_and_human_text_pair(preprocessed_doc, fandom, chunks_per_prompt, num_paras=100):
    """
    Produce model-generated continuations for a human-written document.

    `preprocessed_doc` is grouped via `chunker(..., chunks_per_prompt)` and
    each group is combined with `merge_entries`. Every second merged chunk
    (indices 1, 3, 5, ...) is used as a prompt for `generate`; each generated
    text is then passed through
    `prepare_entry(mode='accurate', tokenizer='casual')`.

    Parameters:
        preprocessed_doc: list of preprocessed entries for one document.
        fandom: fandom name inserted into the prompt.
        chunks_per_prompt: number of entries merged into one prompt chunk.
        num_paras: number of paragraphs requested from the model in the
            prompt (defaults to 100, the value previously hardcoded here).

    Returns:
        (preprocessed_doc, generated_entries): the input document, unchanged,
        and the list of prepared entries built from the generated texts.
    """
    chunks = [merge_entries(c) for c in chunker(preprocessed_doc, chunks_per_prompt)]
    # Odd-indexed merged chunks serve as prompts. The even-indexed chunks were
    # previously collected into an unused local; the full human document is
    # what gets returned.
    # NOTE(review): confirm the full document (rather than only the
    # even-indexed chunks) is the intended "human" side of the pair.
    prompt_texts = chunks[1::2]

    generated_texts = [generate(p['preprocessed'], fandom, num_paras) for p in prompt_texts]
    generated_texts_preprocessed = [
        prepare_entry(generated_text, mode='accurate', tokenizer='casual')
        for generated_text in generated_texts
    ]
    return preprocessed_doc, generated_texts_preprocessed

# Load Data

In [33]:
# Map every problem id to its ground-truth `same` label.
ground_truth = {}
with open(GROUND_TRUTH_PATH, 'r') as f:
    for l in f:
        d = json.loads(l)
        ground_truth[d['id']] = d['same']


# Collect pairs whose `same` label is true, re-chunking both documents and
# accumulating token statistics used later to size the prompts.
fanfic_recs = []
chunk_token_length_total = 0
chunk_count = 0
with open(PREPROCESSED_DATA_PATH + 'preprocessed_test.jsonl', 'r') as f:
    for l in tqdm(f):
        # Stop once the cap is reached. The previous `>` comparison, checked
        # after appending, let MAX_RECORDS + 1 records through.
        if len(fanfic_recs) >= MAX_RECORDS:
            break
        d = json.loads(l)
        if ground_truth[d['id']]:
            # Shallow copy: only the top-level 'pair' key is replaced below.
            fixed_d = d.copy()

            d1 = fix_quotes_and_chunk(d['pair'][0])
            d2 = fix_quotes_and_chunk(d['pair'][1])
            fixed_d['pair'] = [d1, d2]

            chunk_token_length_total += sum(len(e['tokens']) for e in d1)
            chunk_count += len(d1)
            chunk_token_length_total += sum(len(e['tokens']) for e in d2)
            chunk_count += len(d2)
            fanfic_recs.append(fixed_d)




In [8]:
# Mean number of tokens per re-chunked entry, over both documents of every
# loaded pair. Raises ZeroDivisionError if no chunks were loaded.
avg_chunk_length = chunk_token_length_total/chunk_count
print(f"Average token length of a chunk: {avg_chunk_length:.2f}")

Average token length of a chunk: 129.95


# Generate Text

In [22]:
# Token limit assumed for the completion model.
LLM_token_length = 4096
prompt_token_length = int((LLM_token_length / 4) * 0.9) # Slightly lower than 1/4 of LLM token length
# Number of chunks merged into one prompt. Clamped to at least 1 so that an
# average chunk longer than the prompt budget cannot yield a chunk size of 0.
chunks_per_prompt = max(1, int(prompt_token_length/avg_chunk_length))

In [24]:
# Display the derived prompt sizing values for inspection.
chunks_per_prompt, prompt_token_length

(7, 921)

In [35]:
# Ids already present in the output file; the generation loop skips these so
# an interrupted run can be resumed without repeating API calls.
processed_ids = []
processed_path = TEMP_DATA_PATH + 'new_human_gpt3_preprocessed.jsonl'
# On a fresh run the output file does not exist yet (it is created by the
# generation cell), so treat a missing file as "nothing processed".
if os.path.exists(processed_path):
    with open(processed_path, 'r') as f:
        for l in f:
            if not l.strip():
                # Tolerate blank lines instead of failing in json.loads.
                continue
            d = json.loads(l)
            processed_ids.append(d['id'])

In [34]:
# Generate AI continuations for every record not yet in the output file and
# append one JSON line per record. The file is opened in append mode and
# flushed after every record so partial progress survives an interruption.
with open(TEMP_DATA_PATH + 'new_human_gpt3_preprocessed.jsonl', 'a') as f_out:
    for d in tqdm(fanfic_recs):
        print(d['id'], d['id'] in processed_ids, flush=True)
        if d['id'] in processed_ids:
            continue
        try:
            d1_human, d1_ai = generate_gpt3_and_human_text_pair(d['pair'][0], d['fandoms'][0], chunks_per_prompt)
            print("Human Doc 1: ", len(merge_entries(d1_human)['tokens']))
            print("AI Doc 1: ", len(merge_entries(d1_ai)['tokens']))

            d2_human, d2_ai = generate_gpt3_and_human_text_pair(d['pair'][1], d['fandoms'][1], chunks_per_prompt)
            print("Human Doc 2: ", len(merge_entries(d2_human)['tokens']))
            print("AI Doc 2: ", len(merge_entries(d2_ai)['tokens']))

            preprocessed = {
                'id': d['id'],
                'fandoms': d['fandoms'],
                'pair': [
                    {'human': d1_human, 'ai': d1_ai},
                    {'human': d2_human, 'ai': d2_ai}
                ]
            }
            # Serialise fully before writing so a serialisation error cannot
            # leave a truncated line in the JSONL file.
            line = json.dumps(preprocessed)
            f_out.write(line + '\n')
            f_out.flush()
        except Exception as e:
            # Best-effort: skip the failing record but report why. Catching
            # Exception (not a bare except) keeps KeyboardInterrupt usable
            # for stopping the loop.
            print(f"Skipping {d['id']}: {type(e).__name__}: {e}", flush=True)
            continue

f8dfc29e-1adb-58f4-888d-a3236124727d True
b1a1257b-546b-5363-8d36-82ffa2280eb3 True
a71dede2-d06f-5df5-86fa-8784dd22ad5f True
ac5ae379-5d9e-5a32-bc66-2078efa70aa1 True
a9b8f0d1-aefe-58d8-a210-a276386d8c83 True
ef6e8d08-4bc8-581e-b43c-9ec8c10b7be7 True
8282b832-e689-581e-8de9-de7961d924fb True
dd88b38c-6a4d-5129-a681-b223f4305e5a True
86221920-cb49-587a-8236-ae41d3f8aa7c True
38cbfaa6-9b98-599a-9bf7-faa43d86f8c7 True
5bd631c9-92aa-51f1-b3bd-37a6bdb5eb65 True
5ee236b8-4d4b-55e1-93ad-4938d658066d True
5df1e0e4-3069-53a9-be3e-9e7f2c0e41f4 True
21b74444-5342-5cbf-a92f-cfa0c5066f50 True
d5ff3e2b-a4c1-5db0-9a6b-8a660a95125e True
d5777bb1-42fa-5c07-b545-2c6e619976f1 True
7acb4ce1-02fa-5e88-9e4b-63523ab14153 True
b2a5919f-cf65-58d2-9c16-7aa9acf978a4 True
0c7ba366-ab87-59d4-b240-4d5cb55aa07c True
d5777bb1-42fa-5c07-b545-2c6e619976f1 True
7acb4ce1-02fa-5e88-9e4b-63523ab14153 True
b2a5919f-cf65-58d2-9c16-7aa9acf978a4 True
0c7ba366-ab87-59d4-b240-4d5cb55aa07c True
9a5ab6f1-28bd-566c-9125-db4beb8c20

In [36]:
# Number of record ids read back from the output file.
len(processed_ids)

104

In [38]:
# Show which installed `openai` package the kernel is using. The previous
# `openai.C` was a leftover completion fragment that raises AttributeError.
openai

<module 'openai' from '/media/disk1/social/.local/lib/python3.6/site-packages/openai/__init__.py'>