In [1]:
import pandas as pd
import json
import requests
from nltk.tokenize import sent_tokenize
import nltk
import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

# Ensure NLTK punkt is downloaded
nltk.download('punkt')

# Function to generate a URL for a specific book
def generate_url(model_size='6b', book_num=0):
    return f"https://openaipublic.blob.core.windows.net/recursive-book-summ/website/data/booksum_book_trees/{model_size}/{book_num}/all.json"

# Function to scrape data from a URL
def scrape_data(url):
    response = requests.get(url)
    if response.status_code == 200:
        return json.loads(response.content)
    else:
        return None


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jonathanhu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Function to process a book and map summaries to text
def process_book(model_size, book_num):
    url = generate_url(model_size, book_num)
    data = scrape_data(url)
    if data is None or "3" not in data:
        return []
    
    mapped_summaries = []
    
    for entry in data["3"]:
        summary_sentences = sent_tokenize(entry["summary"])
        idx = 0
        for sentence in summary_sentences:
            mapped_summaries.append({
                "model_size": model_size,
                "book_num": book_num,
                "summary_sentence_num": idx,
                "summary_sentence": sentence,
                "text_chunk": entry["text"]
            })    
            idx += 1
    return mapped_summaries

# Function to process books in parallel
def process_books_parallel(model_sizes, book_nums):
    all_mappings = []
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [executor.submit(process_book, model_size, book_num) for model_size in model_sizes for book_num in book_nums]
        for future in tqdm.tqdm(as_completed(futures), total=len(futures), desc="Processing Books"):
            all_mappings.extend(future.result())
    
    return all_mappings

In [3]:
model_sizes = ['175b']
book_nums = list(range(268))

all_mappings = process_books_parallel(model_sizes, book_nums)

# Convert to DataFrame
df = pd.DataFrame(all_mappings)

df

Processing Books: 100%|██████████| 268/268 [00:27<00:00,  9.72it/s]


Unnamed: 0,model_size,book_num,summary_sentence_num,summary_sentence,text_chunk
0,175b,9,0,The narrator was born in a small town in Georg...,\n\nI know that in writing the following pages...
1,175b,9,1,"He has a faint recollection of his birthplace,...",\n\nI know that in writing the following pages...
2,175b,9,2,He remembers playing in the sand and digging u...,\n\nI know that in writing the following pages...
3,175b,9,3,"He also remembers a cow in an enclosure, and h...",\n\nI know that in writing the following pages...
4,175b,9,4,He only has a few memories of the people who l...,\n\nI know that in writing the following pages...
...,...,...,...,...,...
217891,175b,264,2,Doyce then comes in and tells Clennam that he ...,"Mr. F.'s Aunt holds out like a fortress, and F..."
217892,175b,264,3,"The next day, Little Dorrit visits the prison ...","Mr. F.'s Aunt holds out like a fortress, and F..."
217893,175b,264,4,"They quietly walk along the yard, and are marr...","Mr. F.'s Aunt holds out like a fortress, and F..."
217894,175b,264,5,"After the signing of the marriage register, th...","Mr. F.'s Aunt holds out like a fortress, and F..."


In [4]:
# Sort the dataframe by 'book_num' in ascending order while preserving the original order of 'summary_sentence_num'
df_sorted = df.sort_values(by=['book_num', 'summary_sentence_num']).reset_index(drop=True)

# Update 'summary_sentence_num' to increment for each 'book_num'
df_sorted['summary_sentence_num'] = df_sorted.groupby('book_num').cumcount()

# Save the updated dataframe to a new CSV file
df_sorted.to_csv("mapped_summaries_l3.csv", index=False)

df_sorted


Unnamed: 0,model_size,book_num,summary_sentence_num,summary_sentence,text_chunk
0,175b,0,0,"Uncle Pros is helping Laurella, his sick niece...","THE BIRTH OF A WOMAN-CHILD\n\n""Whose cradle's ..."
1,175b,0,1,"Laurella explains that her husband, Consadine,...",selfish--said she'd like to know how I was goi...
2,175b,0,2,Pros takes the cradle outside to get the last ...,"rich, broken light from the cavernous fireplac..."
3,175b,0,3,Mavity sends Bud and Mandy Ann to ask her fath...,"\n""Well, you go ask Pap to look in the green c..."
4,175b,0,4,Pros finishes repairing the cradle and places ...,"she wants--ain't ye, Pretty?""\n\nAnd, having m..."
...,...,...,...,...,...
217891,175b,266,1717,"Men of business may speak to Mr. Darnay now?""","Darnay: an intent look, deepening into a frown..."
217892,175b,266,1718,Carton replies that,"\n""I don't gainsay it. What has roughened your..."
217893,175b,266,1719,Mr. Lorry notices,"it, to know that it teems with interest; littl..."
217894,175b,266,1720,"I have seen it many times.""","linked hand in hand, and all spun round togeth..."
