In [1]:
import requests
import json
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize
from nltk.util import ngrams
from concurrent.futures import ThreadPoolExecutor, as_completed
import tqdm

nltk.download('punkt')

# Function to generate a URL for a specific book
def generate_url(model_size='6b', book_num=0):
    return f"https://openaipublic.blob.core.windows.net/recursive-book-summ/website/data/booksum_book_trees/{model_size}/{book_num}/all.json"

# Function to scrape data from a URL
def scrape_data(url):
    response = requests.get(url)
    if response.status_code == 200:
        return json.loads(response.content)
    else:
        return None

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jonathanhu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Function to tokenize text and create n-grams
def create_ngrams(text, n=2):
    words = nltk.word_tokenize(text)
    return list(ngrams(words, n))

# Function to calculate 2-gram overlap using set operations for efficiency
def calculate_overlap(summary_ngrams, chunk_ngrams):
    overlap = set(summary_ngrams) & set(chunk_ngrams)
    return len(overlap)

# Function to map depth 2 summary sentences to text chunks in depth 3
def map_summaries_to_chunks(data, model_size, book_num):
    mapping = []

    for entry in data["2"]:
        summary_sentences = sent_tokenize(entry["summary"])
        summarize_range = entry["summarize_range"]
        if not summarize_range:
            for idx, sentence in enumerate(summary_sentences):
                mapping.append({
                    "model_size": model_size,
                    "book_num": book_num,
                    "summary_sentence_num": idx,
                    "summary_sentence": sentence,
                    "text_chunk": entry["text"]
                })
            continue
        relevant_depth_3 = list(enumerate(data["3"][summarize_range[0]:summarize_range[1]]))



        # Precompute 2-grams for all relevant depth 3 summaries
        depth_3_ngrams = {i: create_ngrams(chunk["summary"]) for i, chunk in relevant_depth_3}

        for idx, sentence in enumerate(summary_sentences):
            summary_ngrams = create_ngrams(sentence)
            max_overlap = 0
            best_chunk = None

            for i, chunk in relevant_depth_3:
                chunk_ngrams = depth_3_ngrams[i]
                overlap = calculate_overlap(summary_ngrams, chunk_ngrams)
                if overlap > max_overlap:
                    max_overlap = overlap
                    best_chunk = chunk

            if best_chunk:
                mapping.append({
                    "model_size": model_size,
                    "book_num": book_num,
                    "summary_sentence_num": idx,
                    "summary_sentence": sentence,
                    "text_chunk": best_chunk["text"]
                })

    return mapping

In [3]:
# Function to process data for a single book
def process_book(model_size, book_num):
    url = generate_url(model_size, book_num)
    data = scrape_data(url)
    if data and '2' in data and '3' in data and '4' not in data:
        return map_summaries_to_chunks(data, model_size, book_num)
    return []

# Main function to process data for multiple books and model sizes using parallelization
def process_books(model_sizes, book_nums):
    all_mappings = []
    with ThreadPoolExecutor() as executor:
        futures = {executor.submit(process_book, model_size, book_num): (model_size, book_num) for model_size in model_sizes for book_num in book_nums}
        for future in tqdm.tqdm(as_completed(futures), total=len(futures), desc="Processing Books"):
            all_mappings.extend(future.result())
    return all_mappings

In [4]:
# Example usage
model_sizes = ['175b']
book_nums = list(range(268))

all_mappings = process_books(model_sizes, book_nums)

# Convert to DataFrame and save as CSV
df = pd.DataFrame(all_mappings)

Processing Books: 100%|██████████| 268/268 [00:33<00:00,  8.04it/s]


In [5]:
df

Unnamed: 0,model_size,book_num,summary_sentence_num,summary_sentence,text_chunk
0,175b,9,0,The narrator is born in a small town in Georgi...,\n\nI know that in writing the following pages...
1,175b,9,1,"His mother moves them to Connecticut, where sh...","one, my mother; and the other, a tall man with..."
2,175b,9,2,The narrator develops a love for music and pla...,me my name and age and tell my mother what a p...
3,175b,9,3,"As he grows older, the narrator becomes more s...","fancied some slight where, I am sure, none was..."
4,175b,9,4,He becomes interested in the Bible and reads e...,of each one without having to read the subject...
...,...,...,...,...,...
26982,175b,250,1,He also mentions that Em'ly never married but ...,marriage was broken off by death. No one knows...
26983,175b,250,2,Mr. Peggotty stays with them for a month and a...,Mr. Micawber was a diligent and esteemed corre...
26984,175b,250,3,The narrator looks back at his life and sees h...,"nutmeg-grater, is just the same, and when I se..."
26985,175b,250,4,"He also sees the Beauty, who is a widow with a...",I leave them; thus I always find them; thus th...


In [6]:
# Sort the dataframe by 'book_num' in ascending order while preserving the original order of 'summary_sentence_num'
df_sorted = df.sort_values(by=['book_num', 'summary_sentence_num']).reset_index(drop=True)

# Update 'summary_sentence_num' to increment for each 'book_num'
df_sorted['summary_sentence_num'] = df_sorted.groupby('book_num').cumcount()

# Save the updated dataframe to a new CSV file
output_file_path = 'mapped_summaries_l2.csv'  # Update with the desired output file path
df_sorted.to_csv(output_file_path, index=False)

print(df_sorted)


      model_size  book_num  summary_sentence_num  \
0           175b         0                     0   
1           175b         0                     1   
2           175b         0                     2   
3           175b         0                     3   
4           175b         0                     4   
...          ...       ...                   ...   
26982       175b       266                   213   
26983       175b       266                   214   
26984       175b       266                   215   
26985       175b       266                   216   
26986       175b       266                   217   

                                        summary_sentence  \
0      Johnnie Consadine, a determined and ambitious ...   
1      Johnnie and Shade walk through the town of Cot...   
2      Johnnie is used to hardship and is excited to ...   
3      Stoddard is a socialist who believes that marr...   
4      Mandy invites Johnnie to a party at the Uplift...   
...            

In [7]:
df_sorted

Unnamed: 0,model_size,book_num,summary_sentence_num,summary_sentence,text_chunk
0,175b,0,0,"Johnnie Consadine, a determined and ambitious ...","walk. Her mother would get up too, and that wa..."
1,175b,0,1,Johnnie and Shade walk through the town of Cot...,"sunbonnet, and cowhide shoes with the apparel ..."
2,175b,0,2,Johnnie is used to hardship and is excited to ...,body together--have to labour desperately at r...
3,175b,0,3,Stoddard is a socialist who believes that marr...,"and stared at the younger man.\n\n""Well!"" he e..."
4,175b,0,4,Mandy invites Johnnie to a party at the Uplift...,"""They never saw anything like you in their bor..."
...,...,...,...,...,...
26982,175b,266,213,The Marquis reflects on the events of the day ...,"the fountain, and the mender of roads with his..."
26983,175b,266,214,Jerry asks if he can become a Resurrection Man...,wife would let his trade alone altogether. Cal...
26984,175b,266,215,The Doctor explains that the man would likely ...,"there has been,"" he paused and took a deep bre..."
26985,175b,266,216,"So far, they are not being pursued.","the little street, bereft of horses, and with ..."
