In [15]:
import pandas as pd
from pathlib import Path
import together
import os
from openai import OpenAI
import random
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)  # for exponential backoff

In [16]:
from tqdm import tqdm

In [17]:
annotations = pd.read_csv('BMDS_story_annotations.csv')
annotations = annotations[annotations["Reveal border sentence"].notnull()]
border_sentences = annotations[["Story Code", "Reveal border sentence"]]

In [18]:
results_dict = {'story': [], 'reveal_sentence': []}

# sample 20 stories

In [19]:
import concurrent.futures
from tqdm import tqdm
client = OpenAI()

@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def completion_with_backoff(**kwargs):
    response = client.chat.completions.create(**kwargs)
    return response 

def summarize_one(story_id):
    prompt_base = "Create a 1 paragraph plot outline of the following story:\n"
    try:
        with open(f'./texts/{story_id}.txt') as f:
            story = f.read()
            prompt = prompt_base + story
            response = completion_with_backoff(
                model="gpt-3.5-turbo-0125",
                temperature=0.8,
                messages=[
                    {"role": "system", "content": "You are an author's assistant."},
                    {"role": "user", "content": prompt}
                ]
            )
            return story_id, response.choices[0].message.content
    except Exception as e:
        print(str(e))
        return story_id, None

def summarize():
    
    summaries = {}
    sample_stories = border_sentences.sample(5)
    
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [executor.submit(summarize_one, row["Story Code"]) for _, row in sample_stories.iterrows()]
        
        for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
            story_id, summary = future.result()
            summaries[story_id] = summary
        
    return summaries


# Write 20 stories

In [20]:
def process_story(story_id, seed_key, seed_prompt, seed_story):
    prompt_base = "Write a 500-1000 word story based on the provided outline.\n"
    try:

        # Construct the prompt for the current story_id
        prompt = prompt_base + summaries[story_id]
        # Call the completion function
        response = completion_with_backoff(
            model="gpt-3.5-turbo-0125",
            temperature=0.8,
            messages=[
                {"role": "system", "content": "You are an author. Your job is to write interesting mystery stories."},
                {"role": "user", "content": seed_prompt},
                {"role": "assistant", "content": seed_story},
                {"role": "user", "content": prompt}
            ]
        )

        return story_id, response.choices[0].message.content

    except Exception as e:
        error = str(e)
        print(f"Error in story {story_id}: {error}")
        return story_id, None

def write():
    stories = {}
    prompt_base = "Write a 500-1000 word story based on the provided outline.\n"
    seed_key = random.choice(list(summaries.keys()))
    seed_prompt = prompt_base + summaries[seed_key]
    with open(f'./texts/{seed_key}.txt') as f:
        seed_story = f.read()
    # Create a ThreadPoolExecutor
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Submit tasks to the executor
        futures = [executor.submit(process_story, story_id, seed_key, seed_prompt, seed_story) for story_id in summaries]
        for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
            story_id, summary = future.result()
            stories[story_id] = summary
    return stories


# border sentence of 20 stories

In [21]:
def read_story(story_id, seed_prompt, seed_sentence):
    try:
        # Construct the prompt for the current story_id
        prompt_base = "In the following story, identify the sentence where the mystery is revealed:"
        prompt = prompt_base + stories[story_id]
        # Call the completion function
        response = completion_with_backoff(
            model="gpt-3.5-turbo-0125",
            temperature=0.8,
            messages=[
                {"role": "system", "content": "You are an author. Your job is to read interesting mystery stories."},
                {"role": "user", "content": seed_prompt},
                {"role": "assistant", "content": seed_sentence},
                {"role": "user", "content": prompt}
            ]
        )

        return story_id, response.choices[0].message.content

    except Exception as e:
        error = str(e)
        print(f"Error in story {story_id}: {error}")
        return story_id, None

def find_border():
    border_sentence = {}
    prompt_base = "In the following story, identify the sentence where the mystery is revealed:"
    seed_key = random.choice(list(stories.keys()))
    seed_prompt = prompt_base + stories[seed_key]
    seed_sentence = border_sentences[border_sentences['Story Code'] == seed_key]['Reveal border sentence'].values[0]

    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Submit tasks to the executor and gather futures
        futures = [executor.submit(read_story, story_id, seed_prompt, seed_sentence) for story_id in tqdm(summaries)]

        # Iterate over completed tasks
        for future in tqdm(concurrent.futures.as_completed(futures)):
            result = future.result()
            if result is not None:
                story_id, border_sentence_content = result
                border_sentence[story_id] = border_sentence_content

    return border_sentence

In [35]:
import time
for i in tqdm(range(10)):
    time.sleep(10)
    summaries = summarize()
    condition = lambda key, value: value != None
    summaries = {key: value for key, value in summaries.items() if condition(key, value)}
    time.sleep(5)
    stories = write()
    stories = {key: value for key, value in stories.items() if condition(key, value)}
    time.sleep(5)
    border_sentence = find_border()
    border_sentence = {key: value for key, value in border_sentence.items() if condition(key, value)}
    
    for key in border_sentence.keys():
        if key in stories:
            story = stories[key]
            sentence = border_sentence[key]
            if story.find(sentence)!=-1:
                results_dict['story'].append(story)
                results_dict['reveal_sentence'].append(sentence)
    print(len(results_dict['story']))
    


  0%|                                                                              | 0/10 [00:00<?, ?it/s]
  0%|                                                                               | 0/5 [00:00<?, ?it/s][A
 20%|██████████████▏                                                        | 1/5 [00:03<00:15,  3.98s/it][A
 40%|████████████████████████████▍                                          | 2/5 [00:04<00:05,  1.89s/it][A
 80%|████████████████████████████████████████████████████████▊              | 4/5 [00:05<00:01,  1.05s/it][A
100%|███████████████████████████████████████████████████████████████████████| 5/5 [00:08<00:00,  1.72s/it][A

  0%|                                                                               | 0/5 [00:00<?, ?it/s][A
 20%|██████████████▏                                                        | 1/5 [00:12<00:49, 12.26s/it][A
 40%|████████████████████████████▍                                          | 2/5 [00:14<00:19,  6.43s/it][A
 60%|███████

198



  0%|                                                                               | 0/5 [00:00<?, ?it/s][A
 20%|██████████████▏                                                        | 1/5 [00:03<00:14,  3.68s/it][A
 40%|████████████████████████████▍                                          | 2/5 [00:05<00:08,  2.84s/it][A
 60%|██████████████████████████████████████████▌                            | 3/5 [00:08<00:05,  2.53s/it][A
 80%|████████████████████████████████████████████████████████▊              | 4/5 [00:18<00:05,  5.54s/it][A
100%|███████████████████████████████████████████████████████████████████████| 5/5 [00:34<00:00,  6.81s/it][A

  0%|                                                                               | 0/5 [00:00<?, ?it/s][A
 20%|██████████████▏                                                        | 1/5 [00:13<00:52, 13.23s/it][A
 40%|████████████████████████████▍                                          | 2/5 [00:18<00:25,  8.44s/it][A
 60%|███

199



  0%|                                                                               | 0/5 [00:00<?, ?it/s][A
 20%|██████████████▏                                                        | 1/5 [00:03<00:14,  3.66s/it][A
 40%|████████████████████████████▍                                          | 2/5 [00:06<00:09,  3.05s/it][A
 60%|██████████████████████████████████████████▌                            | 3/5 [00:12<00:09,  4.58s/it][A
 80%|████████████████████████████████████████████████████████▊              | 4/5 [00:25<00:07,  7.80s/it][A
100%|███████████████████████████████████████████████████████████████████████| 5/5 [00:40<00:00,  8.12s/it][A

  0%|                                                                               | 0/5 [00:00<?, ?it/s][A
 20%|██████████████▏                                                        | 1/5 [00:12<00:48, 12.06s/it][A
 40%|████████████████████████████▍                                          | 2/5 [00:13<00:18,  6.08s/it][A
 60%|███

203



  0%|                                                                               | 0/5 [00:00<?, ?it/s][A
 20%|██████████████▏                                                        | 1/5 [00:03<00:12,  3.08s/it][A
 40%|████████████████████████████▍                                          | 2/5 [00:03<00:04,  1.62s/it][A
 60%|██████████████████████████████████████████▌                            | 3/5 [00:16<00:13,  6.78s/it][A
 80%|████████████████████████████████████████████████████████▊              | 4/5 [00:28<00:08,  8.67s/it][A
100%|███████████████████████████████████████████████████████████████████████| 5/5 [00:43<00:00,  8.73s/it][A

  0%|                                                                               | 0/5 [00:00<?, ?it/s][A
 20%|██████████████▏                                                        | 1/5 [00:20<01:23, 20.79s/it][A
 40%|████████████████████████████▍                                          | 2/5 [00:28<00:39, 13.33s/it][A
 60%|███

207



  0%|                                                                               | 0/5 [00:00<?, ?it/s][A
 20%|██████████████▏                                                        | 1/5 [00:03<00:13,  3.36s/it][A
 40%|████████████████████████████▍                                          | 2/5 [00:05<00:08,  2.73s/it][A
 60%|██████████████████████████████████████████▌                            | 3/5 [00:12<00:09,  4.52s/it][A
 80%|████████████████████████████████████████████████████████▊              | 4/5 [00:20<00:05,  5.92s/it][A
100%|███████████████████████████████████████████████████████████████████████| 5/5 [00:30<00:00,  6.09s/it][A

  0%|                                                                               | 0/5 [00:00<?, ?it/s][A
 20%|██████████████▏                                                        | 1/5 [00:12<00:48, 12.00s/it][A
 40%|████████████████████████████▍                                          | 2/5 [00:22<00:32, 10.98s/it][A
 60%|███

212



  0%|                                                                               | 0/5 [00:00<?, ?it/s][A
 20%|██████████████▏                                                        | 1/5 [00:03<00:15,  3.82s/it][A
 40%|████████████████████████████▍                                          | 2/5 [00:04<00:05,  1.78s/it][A
 60%|██████████████████████████████████████████▌                            | 3/5 [00:10<00:07,  3.69s/it][A
 80%|████████████████████████████████████████████████████████▊              | 4/5 [00:21<00:06,  6.62s/it][A
100%|███████████████████████████████████████████████████████████████████████| 5/5 [00:36<00:00,  7.24s/it][A

  0%|                                                                               | 0/5 [00:00<?, ?it/s][A
 20%|██████████████▏                                                        | 1/5 [00:16<01:05, 16.38s/it][A
 40%|████████████████████████████▍                                          | 2/5 [00:32<00:48, 16.08s/it][A
 60%|███

214



  0%|                                                                               | 0/5 [00:00<?, ?it/s][A
 20%|██████████████▏                                                        | 1/5 [00:03<00:13,  3.44s/it][A
 40%|████████████████████████████▍                                          | 2/5 [00:03<00:05,  1.67s/it][A
 60%|██████████████████████████████████████████▌                            | 3/5 [00:04<00:02,  1.02s/it][A
 80%|████████████████████████████████████████████████████████▊              | 4/5 [00:15<00:05,  5.22s/it][A
100%|███████████████████████████████████████████████████████████████████████| 5/5 [00:26<00:00,  5.32s/it][A

  0%|                                                                               | 0/5 [00:00<?, ?it/s][A
 20%|██████████████▏                                                        | 1/5 [00:15<01:03, 15.81s/it][A
 40%|████████████████████████████▍                                          | 2/5 [00:29<00:43, 14.54s/it][A
 60%|███

218



  0%|                                                                               | 0/5 [00:00<?, ?it/s][A
 20%|██████████████▏                                                        | 1/5 [00:03<00:14,  3.71s/it][A
 40%|████████████████████████████▍                                          | 2/5 [00:03<00:04,  1.62s/it][A
 60%|██████████████████████████████████████████▌                            | 3/5 [00:11<00:08,  4.25s/it][A
 80%|████████████████████████████████████████████████████████▊              | 4/5 [00:21<00:06,  6.64s/it][A
100%|███████████████████████████████████████████████████████████████████████| 5/5 [00:32<00:00,  6.43s/it][A

  0%|                                                                               | 0/5 [00:00<?, ?it/s][A
 20%|██████████████▏                                                        | 1/5 [00:12<00:50, 12.66s/it][A
 40%|████████████████████████████▍                                          | 2/5 [00:24<00:37, 12.41s/it][A
 60%|███

222



  0%|                                                                               | 0/5 [00:00<?, ?it/s][A
 20%|██████████████▏                                                        | 1/5 [00:03<00:14,  3.68s/it][A
 40%|████████████████████████████▍                                          | 2/5 [00:03<00:04,  1.60s/it][A
 60%|██████████████████████████████████████████▌                            | 3/5 [00:06<00:03,  1.99s/it][A
 80%|████████████████████████████████████████████████████████▊              | 4/5 [00:15<00:04,  4.87s/it][A
100%|███████████████████████████████████████████████████████████████████████| 5/5 [00:28<00:00,  5.71s/it][A

  0%|                                                                               | 0/5 [00:00<?, ?it/s][A
 20%|██████████████▏                                                        | 1/5 [00:11<00:45, 11.45s/it][A
 40%|████████████████████████████▍                                          | 2/5 [00:11<00:14,  4.79s/it][A
 60%|███

227



  0%|                                                                               | 0/5 [00:00<?, ?it/s][A
 20%|██████████████▏                                                        | 1/5 [00:03<00:12,  3.25s/it][A
 40%|████████████████████████████▍                                          | 2/5 [00:04<00:06,  2.15s/it][A
 60%|██████████████████████████████████████████▌                            | 3/5 [00:08<00:05,  2.89s/it][A
 80%|████████████████████████████████████████████████████████▊              | 4/5 [00:23<00:07,  7.76s/it][A
100%|███████████████████████████████████████████████████████████████████████| 5/5 [00:41<00:00,  8.37s/it][A

  0%|                                                                               | 0/5 [00:00<?, ?it/s][A
 20%|██████████████▏                                                        | 1/5 [00:13<00:52, 13.05s/it][A
 40%|████████████████████████████▍                                          | 2/5 [00:13<00:16,  5.60s/it][A
 60%|███

227





In [36]:
df = pd.DataFrame(results_dict)
df.to_csv('synthetic_batch_11.csv')

In [43]:
directory = './'

# List to store DataFrames from each CSV
dfs = []

# Iterate over each file in the directory
for filename in os.listdir(directory):
    if filename.find('synthetic_batch')!=-1:
        file_path = os.path.join(directory, filename)
        # Read the CSV file into a DataFrame
        df = pd.read_csv(file_path, index_col=0)
        # Append the DataFrame to the list
        dfs.append(df)

# Concatenate all DataFrames in the list along rows
combined_df = pd.concat(dfs, ignore_index=True)

In [44]:
def truncate_story(row):
    sentence = row['reveal_sentence']
    story = row['story']
    index = story.find(sentence)
    return row['story'][:index]
combined_df['story_truncated'] = combined_df.apply(truncate_story, axis=1)
combined_df

Unnamed: 0,story,reveal_sentence,story_truncated
0,District Attorney Carton's urgent summons to K...,"Inside the package was a sophisticated bomb, d...",District Attorney Carton's urgent summons to K...
1,It was a cold and misty morning when Miss Viol...,The case took a dramatic turn when Miss Smith ...,It was a cold and misty morning when Miss Viol...
2,"Martin Hewitt, the renowned detective, receive...","In the letter, Mason confessed to the murder o...","Martin Hewitt, the renowned detective, receive..."
3,"The night was dark and foreboding as Kennedy, ...",In a dramatic confrontation with the family me...,"The night was dark and foreboding as Kennedy, ..."
4,It was a crisp autumn evening when Maude Eusto...,"With Maude's assistance, Kennedy pieced togeth...",It was a crisp autumn evening when Maude Eusto...
...,...,...,...
1002,"The 'Old Manor' in Clepperton-on-Rill, Yorkshi...",The discovery sent shockwaves through the town...,"The 'Old Manor' in Clepperton-on-Rill, Yorkshi..."
1003,It was a cold and foggy evening when Inspector...,"Holmes, with his unmatched powers of observati...",It was a cold and foggy evening when Inspector...
1004,Wendell Curtis Barrett sat by the open window ...,"The Thinking Machine, with his sharp intellect...",Wendell Curtis Barrett sat by the open window ...
1005,The evening at Mr. Sedgwick's grand Boston man...,"In a surprising turn of events, the Roman coin...",The evening at Mr. Sedgwick's grand Boston man...


In [47]:
combined_df.to_csv('unresolved_error_dataset.csv')

In [52]:
import re
def get_prop(row):
    story = row['story']
    truncate = row['story_truncated']
    sentences = [x.strip().replace('\n','') for x in story.split('.') if re.search(r'\w', x)!=0]
    truncated_sentences = [x.strip().replace('\n','') for x in truncate.split('.') if re.search(r'\w', x)!=0]
    row['story'] = truncate
    return 1-(len(truncated_sentences)/len(sentences))
    
combined_df['chunk_prop'] = combined_df.apply(get_prop, axis=1)
combined_df.head()

Unnamed: 0,story,reveal_sentence,story_truncated,chunk_prop
0,District Attorney Carton's urgent summons to K...,"Inside the package was a sophisticated bomb, d...",District Attorney Carton's urgent summons to K...,0.818182
1,It was a cold and misty morning when Miss Viol...,The case took a dramatic turn when Miss Smith ...,It was a cold and misty morning when Miss Viol...,0.48
2,"Martin Hewitt, the renowned detective, receive...","In the letter, Mason confessed to the murder o...","Martin Hewitt, the renowned detective, receive...",0.291667
3,"The night was dark and foreboding as Kennedy, ...",In a dramatic confrontation with the family me...,"The night was dark and foreboding as Kennedy, ...",0.32
4,It was a crisp autumn evening when Maude Eusto...,"With Maude's assistance, Kennedy pieced togeth...",It was a crisp autumn evening when Maude Eusto...,0.576923


In [58]:
def write_synthetic_datapoint_to_file(X, y, path, plot_hole_type):
    """
    write a synthetic datapoint to a file.
    :param X: synthetic document
    :param y: synthetic label
    :param path: path to write the file to
    :param plot_hole_type: type of plot hole, will be written at top of document
    :returns: None. file will be written at path. first line will be "plot_hole_type y", and
    rest of the lines will be X.
    """
    with open(path, "w", encoding="utf-8") as synthetic_document_f:
        synthetic_document_f.write(f"{plot_hole_type} {y}\n")
        synthetic_document_f.write(X)

In [59]:
for i,row in combined_df.iterrows():
    path = f"./test/test_unresolved{i}.txt"
    X = row['story_truncated']
    y = row['chunk_prop']
    write_synthetic_datapoint_to_file(X,y,path=path,plot_hole_type="unresolved")

In [61]:
from sklearn.model_selection import train_test_split
train_size = 0.8
validation_size = 0.1
test_size = 0.1

train_df, temp_df = train_test_split(combined_df, train_size=train_size, random_state=42)
validation_df, test_df = train_test_split(temp_df, test_size=test_size / (test_size + validation_size), random_state=42)

train_df.to_csv('train_dataset.csv', index=False)
validation_df.to_csv('validation_dataset.csv', index=False)
test_df.to_csv('test_dataset.csv', index=False)