In [1]:
# Import necessary libraries
import pandas as pd
import tiktoken
import json

# Load the JSON file into a pandas DataFrame
# Replace 'your_file.json' with the actual path to your JSON file
df = pd.read_json('../govuk-policy-qa-pairs/policy_papers.json')

# Combine the 'header' and 'content' into a single feature
df['combined'] = df['header'] + " " + df['content']

# Load the TikToken tokenizer (using GPT-2 as an example; replace with correct tokenizer if needed)
enc = tiktoken.get_encoding("gpt2")

# Tokenize the 'combined' column and count the tokens for each row
df['token_count'] = df['combined'].apply(lambda text: len(enc.encode(text)))

# Calculate statistics
most_tokens = df['token_count'].max()
least_tokens = df['token_count'].min()
average_tokens = df['token_count'].mean()

# Display the results
print(f"Most tokens: {most_tokens}")
print(f"Least tokens: {least_tokens}")
print(f"Average tokens: {average_tokens:.2f}")

# Optional: Display the DataFrame with token counts
df[['url', 'header', 'content', 'token_count']].head()


Most tokens: 164456
Least tokens: 7
Average tokens: 4634.21


Unnamed: 0,url,header,content,token_count
0,https://www.gov.uk/government/publications/blu...,Blue Planet Fund,Introduction\nThe Blue Planet Fund is the UK’s...,2968
1,https://www.gov.uk/government/publications/blu...,Ocean Country Partnership Programme (OCPP),The Ocean Country Partnership Programme (OCPP)...,730
2,https://www.gov.uk/government/publications/blu...,Ocean Community Empowerment and Nature (OCEAN)...,The Ocean Community Empowerment and Nature (OC...,596
3,https://www.gov.uk/government/publications/alt...,Alternative ways of managing English fishing q...,Defra committed to considering alternative way...,368
4,https://www.gov.uk/government/publications/alt...,Cornwall community quota trial management plan,Group information \nRoles \nThe group is a par...,5214


In [5]:
# Import necessary libraries
import pandas as pd
import tiktoken
import json
from tqdm import tqdm  # Progress bar

# Load the JSON file into a pandas DataFrame
df = pd.read_json('../govuk-policy-qa-pairs/policy_papers.json')

print('Size of dataset: ', len(df))

# Combine the 'header' and 'content' into a single feature
df['combined'] = df['header'] + " " + df['content']

# Load the TikToken tokenizer
enc = tiktoken.get_encoding("gpt2")

# Tokenize the 'combined' column and count the tokens for each row
df['token_count'] = df['combined'].apply(lambda text: len(enc.encode(text)))

# Find entries with more than 10k tokens
threshold = 8000
large_docs = df[df['token_count'] > threshold]

# List how many documents have more than 10k tokens
num_large_docs = len(large_docs)
print(f"Number of documents with more than threshold tokens: {num_large_docs}")

# Function to split a document into chunks of less than 10k tokens, splitting on newlines where possible
def split_document_iteratively(text, token_limit=8000, token_overlap=500):
    # Tokenize the text
    tokens = enc.encode(text)
    
    chunks = []
    start_index = 0

    # Iteratively split the document
    while start_index < len(tokens):
        end_index = min(start_index + token_limit, len(tokens))
        
        # Try to split at a newline if possible
        while end_index > start_index and tokens[end_index - 1] != enc.encode('\n')[0]:
            end_index -= 1
        
        # If no newline is found, split at the token limit
        if end_index == start_index:
            end_index = min(start_index + token_limit, len(tokens))
        
        chunk_tokens = tokens[start_index:end_index]
        chunk_text = enc.decode(chunk_tokens)
        
        # Add notes to indicate the document is split
        if start_index > 0:
            chunk_text = "[Note: This is a continuation from the previous part of the document.]\n\n" + chunk_text
        
        if end_index < len(tokens):
            chunk_text += "\n\n[Note: This document was truncated here due to token length limitations. Please see the next part for the continuation.]"
        
        chunks.append(chunk_text)
        
        # Move the start index forward with overlap, ensuring it increases
        next_start_index = end_index - token_overlap
        if next_start_index <= start_index:
            start_index = end_index  # Ensure progress
        else:
            start_index = next_start_index

    return chunks

# Initialize the progress bar for large documents
progress_bar = tqdm(total=len(large_docs), desc="Processing large documents")

# For tracking stats
chunk_stats = []  # To store how many chunks each document is split into
new_entries = []
processed_docs = 0  # Counter for tracking how many large docs have been processed

# For each large document, split it into parts and collect the new entries
for _, row in large_docs.iterrows():
    # Split the document and create new rows for each part
    split_texts = split_document_iteratively(row['combined'])
    chunk_stats.append((row['url'], len(split_texts)))  # Track the number of chunks

    # Add each part as a new row, ensuring continuation headers where needed
    for i, part_text in enumerate(split_texts):
        header = row['header'] if i == 0 else row['header'] + " (continued)"
        new_entries.append({
            'url': row['url'],
            'header': header,
            'content': part_text,
            'split_from_large_doc': True  # Mark this row as part of a split document
        })

    # Update progress bar after processing each document
    progress_bar.update(1)
    processed_docs += 1

# Close the progress bar
progress_bar.close()

# Create a DataFrame from the new entries
new_entries_df = pd.DataFrame(new_entries)

# Remove the original large documents from the DataFrame
df_small_docs = df[df['token_count'] <= threshold]

# Create a separate dataset of just the datapoints less than 10k tokens
df_small_docs.to_json('../govuk-policy-qa-pairs/policy_papers_small.json', orient='records', indent=2)

# Concatenate the newly split entries back into the main DataFrame
df_combined = pd.concat([df_small_docs, new_entries_df], ignore_index=True)

# Add a column to mark documents that were not split from a large document
df_combined['split_from_large_doc'] = df_combined['split_from_large_doc'].fillna(False)

# Recalculate the token counts for the updated dataset
df_combined['combined'] = df_combined['header'] + " " + df_combined['content']
df_combined['token_count'] = df_combined['combined'].apply(lambda text: len(enc.encode(text)))

# Save the updated dataset to the specified file path
output_file = '../govuk-policy-qa-pairs/policy_papers_truncated.json'
df_combined[['url', 'header', 'content', 'split_from_large_doc']].to_json(output_file, orient='records', indent=2)

# Display stats about how many chunks each large document was split into
chunk_stats_df = pd.DataFrame(chunk_stats, columns=['url', 'num_chunks'])
print(chunk_stats_df.describe())  # Print stats like min, max, mean, etc.

# Perform the token length analysis again
most_tokens = df_combined['token_count'].max()
least_tokens = df_combined['token_count'].min()
average_tokens = df_combined['token_count'].mean()

# Display the updated token statistics
print(f"Most tokens after split: {most_tokens}")
print(f"Least tokens after split: {least_tokens}")
print(f"Average tokens after split: {average_tokens:.2f}")


Size of dataset:  5613
Number of documents with more than 10k tokens: 831


Processing large documents: 100%|██████████| 831/831 [00:05<00:00, 160.36it/s]
  df_combined['split_from_large_doc'] = df_combined['split_from_large_doc'].fillna(False)


       num_chunks
count  831.000000
mean     4.590854
std      1.851372
min      3.000000
25%      4.000000
50%      4.000000
75%      5.000000
max     20.000000
Most tokens after split: 10078
Least tokens after split: 7
Average tokens after split: 3172.34


In [5]:
output_file = '../govuk-policy-qa-pairs/policy_papers_truncated.json'
df_truncated = pd.read_json(output_file)

# Output the head of the DataFrame
df_truncated.head()

Unnamed: 0,url,header,content
0,https://www.gov.uk/government/publications/blu...,Blue Planet Fund,Introduction\nThe Blue Planet Fund is the UK’s...
1,https://www.gov.uk/government/publications/blu...,Ocean Country Partnership Programme (OCPP),The Ocean Country Partnership Programme (OCPP)...
2,https://www.gov.uk/government/publications/blu...,Ocean Community Empowerment and Nature (OCEAN)...,The Ocean Community Empowerment and Nature (OC...
3,https://www.gov.uk/government/publications/alt...,Alternative ways of managing English fishing q...,Defra committed to considering alternative way...
4,https://www.gov.uk/government/publications/alt...,Cornwall community quota trial management plan,Group information \nRoles \nThe group is a par...
