In [1]:
import copy
import json
import os
import re

import openpyxl
import pandas as pd
from langchain.chains.summarize import load_summarize_chain
from langchain.document_loaders import PyPDFLoader
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import ChatOpenAI

In [None]:
# Set Working Directory
# Fill in the project folder below. While it is left empty the notebook stays
# in the current directory: os.chdir('') raises FileNotFoundError, so the call
# is skipped for the blank placeholder.
working_dir = ''
if working_dir:
    os.chdir(working_dir)

# Folder that is scanned for PDFs, and the workbook the summaries are saved to.
papers_path = "/papers"
excel_filename = "combined_summaries.xlsx"

In [2]:
# Setting API Key
# Do not paste the key into the notebook. A key that is already exported in the
# environment is kept untouched; the blank placeholder is only written when the
# variable is missing, so a valid key is never overwritten with an empty one.
os.environ.setdefault('OPENAI_API_KEY', '')
if not os.environ['OPENAI_API_KEY']:
    print("OPENAI_API_KEY is empty: export it in your shell before starting the notebook.")

In [6]:
# Step 1: Define a Custom Prompt
# The template asks the model for a seven-part summary of one research paper
# and to answer strictly in JSON with a fixed set of keys (listed at the end of
# the template). {text} is the only placeholder in the template.
prompt_template_json = """
You are an expert summarizer with a background in economics. Read the following research paper and create a detailed summary of it, which includes the following elements:
1. Title, Authors, and Date of the Paper
2. The research questions being addressed.
3. The main findings.
4. The data used in the study. If no data are used, state that.
5. If there is a theoretical model, provide a summary of the model and the key results the model provides. If no model is used, state that.
6. The empirical methodology employed, including identification assumptions.
7. The strands of the economics literature that the paper contributes to.

Your responses in steps 2-7 should be in full sentences, and providing sufficient detail to understand the paper. Provide your answer strictly in JSON format with the following keys:"title", "authors", "date" ,"research_questions", "main_findings", "data_used", "theoretical_model", "empirical_methodology", "literature_contributions".

Text:
{text}
"""

# Wrap the template; "text" is declared as its single input variable.
prompt = PromptTemplate(template=prompt_template_json, input_variables=["text"])


# Step 2: Initialize the LLM and the Summarization Chain
# temperature=0 is used for more deterministic outputs, and replies are capped
# at 1500 tokens. The OpenAI API key must already be set in the environment.
llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0,
    max_tokens=1500,
)

# Build the summarization chain ("stuff" chain type) around the custom prompt.
chain = load_summarize_chain(llm, chain_type="stuff", prompt=prompt)

# Drop the parts of the document that come after the "References" section
def filter_text_before_last_marker(documents, marker="References"):
    """
    Return the documents up to the last occurrence of ``marker``.

    The input is searched for the last document whose ``page_content``
    contains ``marker``. Every document before it is kept as-is, that document
    is kept with its text cut just before the final occurrence of ``marker``,
    and every later document is dropped.

    Args:
        documents: Sequence of objects exposing a string ``page_content``
            attribute.
        marker: Text that marks the start of the section to discard.

    Returns:
        A new list of documents. When no document contains ``marker`` (or the
        input is empty) the list holds all input documents unchanged. The
        input documents are never modified: the truncated entry is a shallow
        copy of the original document.
    """
    # Walk backwards so the search stops at the last document with the marker.
    last_index = next(
        (i for i in range(len(documents) - 1, -1, -1)
         if marker in documents[i].page_content),
        None,
    )

    # No document contains the marker: nothing to trim.
    if last_index is None:
        return list(documents)

    # Everything before the marker document is kept untouched.
    filtered_docs = list(documents[:last_index])

    # Truncate a copy, not the caller's object, so the loaded documents still
    # hold their full text after this call.
    source_doc = documents[last_index]
    cut_at = source_doc.page_content.rfind(marker)
    truncated_doc = copy.copy(source_doc)
    truncated_doc.page_content = source_doc.page_content[:cut_at]
    filtered_docs.append(truncated_doc)

    return filtered_docs

In [9]:
# Step 3: Load existing summaries if the Excel file exists
if os.path.exists(excel_filename):
    existing_df = pd.read_excel(excel_filename)
    # Filenames already summarized in a previous run. A workbook without a
    # "filename" column (e.g. an empty sheet) means nothing has been processed
    # yet, rather than a KeyError.
    if "filename" in existing_df.columns:
        processed_files = set(existing_df["filename"].dropna().tolist())
    else:
        processed_files = set()
else:
    existing_df = pd.DataFrame()
    processed_files = set()

# Summaries produced in this run, one dict per paper.
new_summaries = []

def _parse_summary_output(raw_output):
    """
    Turn the model's reply into a dict of summary fields.

    The reply is first parsed as JSON directly; if that fails, a JSON object
    inside a ```json fenced block is tried. Only a parsed JSON *object* is
    accepted. Anything else (invalid JSON, or valid JSON that is not an object)
    falls back to ``{"raw_summary": raw_output}`` so the text is never lost.
    """
    candidates = [raw_output]
    fenced = re.search(r"```json\s*(\{.*?\})\s*```", raw_output, re.DOTALL)
    if fenced:
        candidates.append(fenced.group(1))

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        # The caller adds a "filename" key, so only a dict is usable.
        if isinstance(parsed, dict):
            return parsed

    return {"raw_summary": raw_output}


# Loop over each PDF file in the folder (sorted so every run visits the files
# in the same order; the extension check ignores case).
for filename in sorted(os.listdir(papers_path)):
    if not filename.lower().endswith(".pdf"):
        continue
    if filename in processed_files:
        print("Skipping {}: already processed.".format(filename))
        continue

    file_path = os.path.join(papers_path, filename)
    try:
        documents = PyPDFLoader(file_path).load()

        # Preprocess: remove content after the last "References" marker
        filtered_docs = filter_text_before_last_marker(documents)

        # Run the summarization chain (it accepts a list of documents)
        summary_output = chain.run(filtered_docs)
    except Exception as exc:
        # One unreadable PDF or failed request must not discard the summaries
        # already collected. Report it and move on; the file is not recorded
        # as processed, so a later run picks it up again.
        print("Failed to summarize {}: {}".format(filename, exc))
        continue

    summary_data = _parse_summary_output(summary_output)

    # Add the filename to the summary data for reference
    summary_data["filename"] = filename
    new_summaries.append(summary_data)

# Step 4: Update and save the Excel file
def _to_excel_cell(value):
    """
    Convert one parsed summary value into something a spreadsheet cell can hold.

    Lists become comma-separated strings, dicts (nested JSON objects) become
    JSON text, and every other value is returned unchanged.
    """
    if isinstance(value, list):
        return ", ".join(map(str, value))
    if isinstance(value, dict):
        # A dict is not a valid Excel cell value; keep its content as JSON text.
        return json.dumps(value, ensure_ascii=False)
    return value


if new_summaries:
    new_df = pd.DataFrame(new_summaries)

    # Flatten list/dict values column by column before writing.
    for col in new_df.columns:
        new_df[col] = new_df[col].apply(_to_excel_cell)

    # Append to the previously saved summaries when there are any.
    if not existing_df.empty:
        combined_df = pd.concat([existing_df, new_df], ignore_index=True)
    else:
        combined_df = new_df

    combined_df.to_excel(excel_filename, index=False)
    print(f"Updated summaries saved to {excel_filename}")
else:
    print("No new files to summarize.")

Skipping w32620.pdf: already processed.
Skipping ssrn-4154564.pdf: already processed.
Skipping The_Perceived_Sources_of_Unexpected_Inflation.pdf: already processed.
Skipping ssrn-4280699.pdf: already processed.
Updated summaries saved to inflation surprises/combined_summaries.xlsx
