In [1]:
import json
import csv
import requests

# Function to generate URLs
def generate_urls(model_sizes=('6b', '175b'), num_books=268):
    """Build the list of book-tree JSON URLs to download.

    Args:
        model_sizes: Model-size path segments to iterate over, in order.
            Defaults to the two sizes this notebook scrapes.
        num_books: Number of book indices per model size. Indices run from
            0 to ``num_books - 1`` (default 268, i.e. 0..267).

    Returns:
        A list of ``(url, model_size, book_num)`` tuples, ordered by model
        size first and by book index second.
    """
    base_url = "https://openaipublic.blob.core.windows.net/recursive-book-summ/website/data/booksum_book_trees"
    return [
        (f"{base_url}/{model_size}/{book_num}/all.json", model_size, book_num)
        for model_size in model_sizes
        for book_num in range(num_books)
    ]

# Function to scrape data from a URL
def scrape_data(url, timeout=30):
    """Download the document at ``url`` and parse it as JSON.

    Args:
        url: Address to fetch with an HTTP GET.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled
            connection, which would hang any loop calling this function.

    Returns:
        The parsed JSON payload when the server answers with status 200,
        otherwise ``None``. Callers must handle the ``None`` case.

    Raises:
        requests.exceptions.RequestException: On connection failure or when
            the timeout expires.
        ValueError: If a 200 response body is not valid JSON.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        return None
    return json.loads(response.content)


In [3]:
import pandas as pd
from model_helper import FILE_PATH

# Load the depth-3 table from FILE_PATH (imported from model_helper). The level-2 cell
# below filters this frame on book_num, model_size and document_index and joins the
# matching book_text values.
l3_df = pd.read_csv(FILE_PATH) # contains level 3 information

In [7]:
# Build the level-2 rows: pair every depth-2 summary with the book text it covers.
# When a tree has 4 levels, the text is assembled from the depth-3 rows in l3_df using the
# entry's summarize_range; when it has 3 levels, the entry's own "text" field is used.
level_2_data = []
for url, model_size, book_num in generate_urls():
    data = scrape_data(url)
    # scrape_data returns None on a non-200 response; skip the book instead of
    # crashing on len(None).
    if data is None:
        print(f"Skipping {url}: no data returned")
        continue
    # if there are 3 levels of summaries, then we need to append the text from depth 3 summaries
    if len(data) == 4:
        # Loop-invariant part of the filter: every depth-3 row of this book and model size.
        book_rows = l3_df[(l3_df['book_num'] == book_num) & (l3_df['model_size'] == model_size)]
        for level, entries in data.items():
            if int(level) != 2:
                continue
            for entry in entries:
                # ignore entries where summarize range does not exist
                if not entry["summarize_range"]:
                    continue
                l3_start_range, l3_end_range = entry["summarize_range"][0], entry["summarize_range"][1]
                chunk_index = entry["n_prev_summaries"]
                # Half-open interval [start, end) over the depth-3 document indices.
                filtered_df = book_rows[(book_rows["document_index"] >= l3_start_range) & (book_rows["document_index"] < l3_end_range)]
                sorted_filtered_df = filtered_df.sort_values(by='document_index')
                concatenated_text = "\n\n".join(sorted_filtered_df['book_text'])
                level_2_data.append({
                    "depth_2_summary": entry["summary"],
                    "book_text": concatenated_text,
                    "model_size": model_size,
                    "book_num": book_num,
                    "document_index": chunk_index
                })
    # else we only care about the depth 2 summaries, and add the summaries and text normally
    elif len(data) == 3:
        for level, entries in data.items():
            if int(level) != 2:
                continue
            for entry in entries:
                # Read the index from this entry. Previously this branch reused a
                # chunk_index left over from an earlier entry (or raised NameError
                # when no earlier entry had set it).
                chunk_index = entry["n_prev_summaries"]
                level_2_data.append({
                    "depth_2_summary": entry["summary"],
                    "book_text": entry["text"],
                    "model_size": model_size,
                    "book_num": book_num,
                    "document_index": chunk_index
                })
        
                    
# Persist the collected level-2 rows as CSV: one header line, then one line per entry.
l2_columns = ['depth_2_summary', 'book_text', "model_size", "book_num", "document_index"]
with open('data/l2_extracted_data.csv', 'w', newline='', encoding='utf-8') as out_file:
    l2_writer = csv.DictWriter(out_file, fieldnames=l2_columns)
    l2_writer.writeheader()
    # writerows emits every dict in order, same as calling writerow in a loop.
    l2_writer.writerows(level_2_data)

In [8]:
# Reload the level-2 rows written to data/l2_extracted_data.csv by the cell above; the
# level-1 cell below filters this frame on book_num, model_size and document_index.
# NOTE: read_csv parses empty fields as NaN by default, so a book_text that was written
# as an empty string comes back as NaN rather than "".
l2_df = pd.read_csv("data/l2_extracted_data.csv")

In [10]:
# Build the level-1 rows: pair every depth-1 summary with the book text it covers.
# When a tree has more than 2 levels, the text is assembled from the level-2 rows in l2_df
# using the entry's summarize_range; when it has exactly 2, the entry's own "text" is used.
level_1_data = []
for url, model_size, book_num in generate_urls():
    data = scrape_data(url)
    # scrape_data returns None on a non-200 response; skip the book instead of
    # crashing on len(None).
    if data is None:
        print(f"Skipping {url}: no data returned")
        continue
    # if there is only depth_0 and depth_1 summary, then we get book text from depth_1 summary; else we get from depth_2
    if len(data) > 2:
        # Loop-invariant part of the filter: every level-2 row of this book and model size.
        book_rows = l2_df[(l2_df['book_num'] == book_num) & (l2_df['model_size'] == model_size)]
        for level, entries in data.items():
            if int(level) != 1:
                continue
            for entry in entries:
                # ignore entries where summarize range does not exist
                if not entry["summarize_range"]:
                    continue
                l2_start_range, l2_end_range = entry["summarize_range"][0], entry["summarize_range"][1]
                chunk_index = entry["n_prev_summaries"]
                # Half-open interval [start, end) over the level-2 document indices.
                filtered_df = book_rows[(book_rows["document_index"] >= l2_start_range) & (book_rows["document_index"] < l2_end_range)]
                sorted_filtered_df = filtered_df.sort_values(by='document_index')
                # Empty book_text values round-trip through the CSV as NaN (a float),
                # which str.join rejects; drop them before joining.
                concatenated_text = "\n\n".join(sorted_filtered_df['book_text'].dropna())
                level_1_data.append({
                    "depth_1_summary": entry["summary"],
                    "book_text": concatenated_text,
                    "model_size": model_size,
                    "book_num": book_num,
                    "document_index": chunk_index
                })
    # else we only care about the depth 1 summaries, and add the summaries and text normally
    elif len(data) == 2:
        for level, entries in data.items():
            if int(level) != 1:
                continue
            for entry in entries:
                # Read the index from this entry. Previously this branch reused a
                # chunk_index left over from an earlier entry (or raised NameError
                # when no earlier entry had set it).
                chunk_index = entry["n_prev_summaries"]
                level_1_data.append({
                    "depth_1_summary": entry["summary"],
                    "book_text": entry["text"],
                    "model_size": model_size,
                    "book_num": book_num,
                    "document_index": chunk_index
                })
        
                    
# Persist the collected level-1 rows as CSV: one header line, then one line per entry.
l1_columns = ['depth_1_summary', 'book_text', "model_size", "book_num", "document_index"]
with open('data/l1_extracted_data.csv', 'w', newline='', encoding='utf-8') as out_file:
    l1_writer = csv.DictWriter(out_file, fieldnames=l1_columns)
    l1_writer.writeheader()
    # writerows emits every dict in order, same as calling writerow in a loop.
    l1_writer.writerows(level_1_data)