In [None]:
# Use the %pip magic so the package is installed into the environment of the
# running kernel rather than whichever `pip` happens to be first on the shell PATH.
%pip install --upgrade ipython

In [None]:
# Remove any existing transformers install without prompting for confirmation.
# pip's confirmation-skip flag is `-y` / `--yes`; `--Y` is rejected as an unknown option.
# NOTE(review): a later cell imports `transformers` before the reinstall cells run,
# so on Restart & Run All that import will fail unless the install cell is executed first.
%pip uninstall -y transformers

In [None]:
import os
import sys
# Put the parent directory on the import path. Presumably this is where the
# project-local modules imported below (`features`, `utills`) live — TODO confirm.
sys.path.append("../")

In [None]:
import pickle
import json
import glob
from tqdm.auto import trange, tqdm  
from transformers import pipeline
from features import merge_entries, prepare_entry
import nltk
from utills import chunker

In [None]:
# A bare shell command is a SyntaxError inside a Python cell; run pip through the
# %pip magic so it executes and targets the running kernel's environment.
%pip install --upgrade transformers

In [None]:
# Install the development version of transformers straight from GitHub.
# %pip is used instead of `!pip3` so the install lands in the kernel's own
# environment, which `pip3` on the shell PATH is not guaranteed to match.
%pip install git+https://github.com/huggingface/transformers

In [None]:
import transformers
from transformers import AutoTokenizer
from transformers import LlamaForCausalLM

In [None]:
# Earlier repository-relative layout, kept for reference only (not in effect):
#   PREPROCESSED_DATA_PATH = '../temp_data/pan/'
#   DATA_DIR = '../data/pan/'
#   GROUND_TRUTH_PATH = DATA_DIR + 'pan20-authorship-verification-training-large-truth.jsonl'
#   TEMP_DATA_PATH = '../temp_data/ai/'

# Paths currently in effect.
# NOTE(review): DATA_DIR and TEMP_DATA_PATH are absolute paths into one user's
# home directory; adjust them before running on another machine.
PREPROCESSED_DATA_PATH = '../temp_data/pan/'
DATA_DIR = '/home/ovendra/authorship_verification-main/pan/'
# DATA_DIR already ends with a slash, so the file name is appended directly.
GROUND_TRUTH_PATH = f'{DATA_DIR}pan20-authorship-verification-training-large-truth.jsonl'
TEMP_DATA_PATH = '/home/ovendra/authorship_verification-main/temp_data/'

In [None]:
# Cap on how many same-author records the driver cell collects from the preprocessed file.
MAX_RECORDS = 10
# Number of machine instances the collected records are divided among.
NUM_MACHINES = 10

In [None]:
def generate_ai_and_human_text_pair(text_generation, nltk_tokenizer, preprocessed_doc):
    """Generate AI continuations for a human-written document and preprocess them.

    Every even-indexed chunk of ``preprocessed_doc`` is turned into a prompt
    (instruction prefix + the chunk's ``'preprocessed'`` text). The generated
    continuations, with the prompt stripped out, are joined with newlines,
    re-split into groups of token spans and passed through ``prepare_entry``.

    Args:
        text_generation: Callable invoked as ``text_generation(prompts, max_length=450)``.
            Its result is read as ``result[i][0]['generated_text']`` (presumably a
            transformers text-generation pipeline — confirm against the caller).
        nltk_tokenizer: Object exposing ``span_tokenize(text)`` that yields
            ``(start, end)`` character offsets.
        preprocessed_doc: Iterable of chunk mappings, each with a ``'preprocessed'``
            entry (assumed to be the chunk's text as a string — TODO confirm).

    Returns:
        A tuple ``(preprocessed_doc, generated_texts_preprocessed)``: the input
        document unchanged, and the list of ``prepare_entry`` results for the
        generated text (empty when there is nothing to prompt with).
    """
    # NOTE(review): the literal "X" and "{fandom}" placeholders are never filled in;
    # this function does not receive the fandom, so the prefix is sent verbatim.
    prompt_prefix = "Complete the next X paragraphs from this fanfiction about {fandom}. Ensure to use the same writing style as the original fanfiction: "

    # Fix: the original indexed the prefix string itself (`prompt_prefix['preprocessed']`),
    # which raises TypeError; the prompt is the prefix followed by the chunk text.
    prompt_texts = [
        prompt_prefix + chunk['preprocessed']
        for idx, chunk in enumerate(preprocessed_doc)
        if idx % 2 == 0
    ]
    if not prompt_texts:
        # Nothing to prompt with; avoid calling the generator on an empty batch.
        return preprocessed_doc, []

    generated_texts = text_generation(prompt_texts, max_length=450)
    # Keep only the first returned sequence per prompt and drop the echoed prompt.
    generated_text = '\n'.join(
        outputs[0]['generated_text'].replace(prompt, '')
        for outputs, prompt in zip(generated_texts, prompt_texts)
    )

    spans = list(nltk_tokenizer.span_tokenize(generated_text))
    # chunker presumably yields consecutive groups of up to 110 spans — verify in utills.
    groups = chunker(spans, 110)
    # Each group is mapped back to the character range it covers in the generated text.
    generated_texts_preprocessed = [
        prepare_entry(generated_text[group[0][0]:group[-1][1]], mode='accurate', tokenizer='casual')
        for group in groups
    ]
    return preprocessed_doc, generated_texts_preprocessed

In [None]:
# Notebook stand-in for the command-line instance id read by the driver cell.
# Slice assignment works whether or not argv already has a second element
# (plain `sys.argv[1] = ...` raises IndexError when it does not), and the value
# is stored as a string, like a real command-line argument.
sys.argv[1:2] = ['0']

In [None]:
if __name__ == "__main__":
    # Index of this worker; selects which slice of the records it processes.
    instance_id = int(sys.argv[1])
    print('Instance ID for this machine:', instance_id, flush=True)

    # Map each problem id to its 'same' flag from the ground-truth file.
    ground_truth = {}
    with open(GROUND_TRUTH_PATH, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines (e.g. a trailing newline)
            d = json.loads(line)
            ground_truth[d['id']] = d['same']

    # Collect at most MAX_RECORDS records whose ground truth is True.
    fanfic_recs = []
    with open(PREPROCESSED_DATA_PATH + 'preprocessed_test.jsonl', 'r', encoding='utf-8') as f:
        for line in tqdm(f):
            # Checked before appending so exactly MAX_RECORDS are kept
            # (the original `>` test let one extra record through).
            if len(fanfic_recs) >= MAX_RECORDS:
                break
            if not line.strip():
                continue
            d = json.loads(line)
            if ground_truth[d['id']] == True:
                fanfic_recs.append(d)

    print('Loading models...', flush=True)

    # Both the model and the tokenizer are loaded from the converted-weights
    # directory; from_pretrained expects the directory (or a hub id), not the
    # path of the tokenizer.model file inside it.
    llama_dir = "../temp_data/llama-weights/output/path"
    model = LlamaForCausalLM.from_pretrained(llama_dir)
    tokenizer = AutoTokenizer.from_pretrained(llama_dir)

    text_generation = pipeline('text-generation', model=model, tokenizer=tokenizer)

    # Split the records actually loaded across NUM_MACHINES using ceiling
    # division, so no remainder is dropped when the count is not an exact
    # multiple (or when fewer than MAX_RECORDS records were found).
    job_sz = -(-len(fanfic_recs) // NUM_MACHINES)
    start_rec = instance_id * job_sz
    end_rec = (instance_id + 1) * job_sz
    fanfic_recs = fanfic_recs[start_rec:end_rec]
    nltk_tokenizer = nltk.tokenize.WhitespaceTokenizer()

    # Report the real number of records in this slice, not the nominal job size.
    print('Recs on this machine:', len(fanfic_recs), flush=True)
    with open(TEMP_DATA_PATH + 'human_ai_preprocessed' + str(instance_id) + '.jsonl', 'w', encoding='utf-8') as f_out:
        for d in tqdm(fanfic_recs):
            # Each record holds a pair of documents; generate an AI counterpart for both.
            d1_human, d1_ai = generate_ai_and_human_text_pair(text_generation, nltk_tokenizer, d['pair'][0])
            d2_human, d2_ai = generate_ai_and_human_text_pair(text_generation, nltk_tokenizer, d['pair'][1])

            preprocessed = {
                'id': d['id'],
                'fandoms': d['fandoms'],
                'pair': [
                    {'human': d1_human, 'ai': d1_ai},
                    {'human': d2_human, 'ai': d2_ai}
                ]
            }
            json.dump(preprocessed, f_out)
            f_out.write('\n')
            # Flush after every record so partial results survive an interrupted run.
            f_out.flush()