# Generate AI summaries
Use the ChatGPT API to generate summaries from OCR text. First, split the document into 6000-character chunks and get a summary of each chunk. Then concatenate the chunk summaries and ask the model to condense them into a single final summary.



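Warning: every run makes paid API calls and can incur nontrivial cost. Use with caution and avoid duplicate work: files that already have a summary in the destination folder are skipped.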

In [1]:
import os
import re
from openai import OpenAI
import fitz
import shutil
from tqdm import tqdm
import pandas as pd
import tiktoken
from dotenv import load_dotenv

# Load custom environment variables from a .env file in the same directory.
load_dotenv()

# Shared API client; the key is read from the OPENAI_API_KEY environment variable.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

In [2]:
def create_folder_structure(src, dst):
    """
    Copies folder structure from src to dst without copying the files.

    Args:
        src: Root directory whose directory tree is mirrored.
        dst: Root directory under which the mirrored tree is created.
            It is created as well if it does not exist yet.

    Calling this again on an already-mirrored tree is a no-op.
    """
    for dirpath, _, _ in os.walk(src):
        # For the root of the walk, relpath is ".", so this also creates dst itself.
        structure = os.path.join(dst, os.path.relpath(dirpath, src))
        # exist_ok replaces the exists()-then-makedirs() pair, which could race
        # with another process creating the same directory in between.
        os.makedirs(structure, exist_ok=True)

def summarize_chunk(chunk, model="gpt-4"):
    """
    Request a two-paragraph summary of a single chunk of text.

    Sends one chat-completion request through the module-level client and
    returns the message content of the first choice in the response.
    """
    prompt = f"Please summarize the following text into two concise paragraphs: {chunk}"
    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]
    completion = client.chat.completions.create(model=model, messages=conversation)
    return completion.choices[0].message.content

def summarize_text(text, model="gpt-4", max_chunk_size=6000):
    """
    Summarize a text by summarizing fixed-size chunks and condensing the results.

    The text is cut into consecutive slices of at most ``max_chunk_size``
    characters, each slice is summarized with ``summarize_chunk``, and the
    chunk summaries are then condensed by one further chat-completion request.
    The condensing step runs exactly once; the joined summaries are not
    re-chunked.

    Args:
        text: The text to summarize.
        model: Name of the chat model used for every request.
        max_chunk_size: Maximum number of characters per chunk. Adjust this to
            balance input size against output.

    Returns:
        The condensed summary, or an empty string when ``text`` is empty or
        contains only whitespace (no API request is made in that case).
    """
    # Nothing to summarize: skip the paid API calls instead of asking the model
    # to condense an empty list of summaries.
    if not text or not text.strip():
        return ""

    # Split the text into chunks to fit the token limit of the API
    chunks = [text[i:i + max_chunk_size] for i in range(0, len(text), max_chunk_size)]
    summaries = [summarize_chunk(chunk, model=model) for chunk in chunks]

    # Now condense all the summarized chunks into one summary.
    # Falsy entries (e.g. a None message content) are skipped so join() cannot fail.
    final_summary_prompt = (
        "Condense the following summaries into a single paragraph with bullet points as needed: "
        + ' '.join(s for s in summaries if s)
    )

    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": final_summary_prompt},
        ],
    )
    return response.choices[0].message.content

def count_tokens(text, model="gpt-4"):
    """
    Return the number of tokens in ``text`` under the tiktoken encoding
    that tiktoken associates with ``model``.
    """
    return len(tiktoken.encoding_for_model(model).encode(text))

def _summary_output_path(output_dir, filename):
    # Build "<name>_summary.txt" by swapping only the trailing ".txt"; a plain
    # str.replace would also rewrite any ".txt" occurring earlier in the name.
    return os.path.join(output_dir, filename[:-len('.txt')] + '_summary.txt')


def _clean_summary(summary):
    # Flatten the summary to a single line and shorten the association's name.
    # NOTE(review): the character class removes every hyphen, including those
    # inside hyphenated words and dates, not only bullet markers. Kept as is.
    summary = re.sub(r'[\n•-]', ' ', summary)
    summary = re.sub(r'\s+', ' ', summary).strip()
    for long_name in (
        'Sandia Heights Homeowners Association (SHHA)',
        'Sandia Heights Homeowner Association (SHHA)',
        'Sandia Heights Homeowner\'s Association (SHHA)',
    ):
        summary = summary.replace(long_name, 'SHHA')
    return summary


def process_files_in_folders(src, dst):
    """
    Copy folder structure from src to dst and summarize text files in the source folder.

    For every file below ``src`` whose name ends in ``.txt``, a cleaned-up
    summary is written to ``<name>_summary.txt`` at the same relative location
    below ``dst``. Files whose summary file already exists are skipped, so an
    interrupted run can be resumed without repeating API calls.

    Args:
        src: Root folder containing the text files to summarize.
        dst: Root folder that receives the mirrored folder tree and summaries.
    """
    create_folder_structure(src, dst)

    for dirpath, _, filenames in os.walk(src):
        output_dir = os.path.join(dst, os.path.relpath(dirpath, src))

        for filename in filenames:
            if not filename.endswith('.txt'):
                continue

            file_path = os.path.join(dirpath, filename)
            output_file = _summary_output_path(output_dir, filename)

            if os.path.exists(output_file):
                print('already exists: ' + file_path)
                continue

            os.makedirs(output_dir, exist_ok=True)

            with open(file_path, 'r', encoding='utf-8') as f:
                text_content = f.read()

            # Count tokens using tiktoken (informational only)
            token_count = count_tokens(text_content, model="gpt-4")
            print(f'{filename} token count: {token_count}')

            # Summarize the content; nothing is written when no summary comes back
            summary = summarize_text(text_content)
            if not summary:
                continue

            with open(output_file, 'w', encoding='utf-8') as f:
                f.write(_clean_summary(summary))


In [4]:
# SANITY CHECK: send one example question through the client and print the reply
GPT_MODEL = "gpt-3.5-turbo"
query = 'Which athletes won the gold medal in curling at the 2022 Winter Olympics?'

response = client.chat.completions.create(
    messages=[
        {"role": "system", "content": 'You answer questions about the 2022 Winter Olympics.'},
        {"role": "user", "content": query},
    ],
    model=GPT_MODEL,
    temperature=0,
)

print(response.choices[0].message.content)

The Swedish team won the gold medal in the men's curling event at the 2022 Winter Olympics. The Swiss team won the gold medal in the women's curling event.


In [None]:
# Folder holding the OCR text files; replace with the path to your folder.
source_folder = '/Users/heidi/Documents/SHHA/GRIT/GRIT_archive_OCRtext/'
# Folder that receives the new folder structure and the summaries; replace with your own path.
destination_folder = '/Users/heidi/Documents/SHHA/GRIT/GRIT_archive_AI_summaries/'

process_files_in_folders(src=source_folder, dst=destination_folder)