In [None]:
# First, install the necessary packages
# Run this cell first if you're setting up a new environment

!pip install -U langchain langchain-openai langchain-community faiss-cpu pypdf unstructured pandas openai

In [None]:
import getpass
import os

# Resolve the OpenAI API key. Prefer the conventional OPENAI_API_KEY variable,
# fall back to the lowercase name this notebook used originally, and only
# prompt when neither is set (environment variable names are case-sensitive
# on Linux/macOS, so the two spellings are different variables there).
openai_api_key = os.getenv("OPENAI_API_KEY") or os.getenv("openai_api_key")
if not openai_api_key:
    openai_api_key = getpass.getpass("Enter your OpenAI API key: ")

# Export under both names so code that reads either spelling sees the same key
os.environ["OPENAI_API_KEY"] = openai_api_key
os.environ["openai_api_key"] = openai_api_key

In [None]:
# Import necessary libraries
import pandas as pd

from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.document_loaders import PyPDFLoader, UnstructuredMarkdownLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS

from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

from IPython.display import Markdown, display

In [None]:
def read_papers(papers_folder, docformat="pdf"):
    """
    Read all papers of one format from a folder and load their content.

    Only regular files whose extension matches ``docformat`` are considered
    ('.pdf' for 'pdf'; '.md' / '.markdown' for 'md'), so stray entries such as
    hidden files or sub-folders are ignored. Files that fail to load are
    reported and skipped. A summary table of the successfully loaded papers
    is displayed.

    Args:
        papers_folder: Path to the folder containing papers
        docformat: Format of documents ('pdf' or 'md')

    Returns:
        Flat list of all loaded documents from all papers (for PDFs these are
        the pieces returned by ``PyPDFLoader.load_and_split``)

    Raises:
        ValueError: If ``docformat`` is not 'pdf' or 'md'
    """
    extensions_by_format = {"pdf": (".pdf",), "md": (".md", ".markdown")}
    if docformat not in extensions_by_format:
        raise ValueError(
            f"Unsupported docformat {docformat!r}; "
            f"expected one of {sorted(extensions_by_format)}"
        )
    extensions = extensions_by_format[docformat]

    # Sorted so the summary table and the document order are reproducible
    papers = sorted(
        name
        for name in os.listdir(papers_folder)
        if name.lower().endswith(extensions)
        and os.path.isfile(os.path.join(papers_folder, name))
    )

    # Names and documents are collected together so the summary table stays
    # aligned even when some files fail to load.
    loaded_names = []
    pages_per_paper = []
    for name in papers:
        path = os.path.join(papers_folder, name)
        try:
            if docformat == "pdf":
                pages = PyPDFLoader(path).load_and_split()
            else:
                pages = UnstructuredMarkdownLoader(path).load()
        except Exception as e:
            # Best effort: one unreadable file should not abort the whole run
            print(f"Error loading {name}: {e}")
            continue
        loaded_names.append(name)
        pages_per_paper.append(pages)

    # Flatten all documents into a single list
    all_pages = [page for pages in pages_per_paper for page in pages]

    # Create a DataFrame to display paper information
    df_papers = pd.DataFrame(
        {"Paper": loaded_names, "Page": [len(pages) for pages in pages_per_paper]}
    )
    display(df_papers)
    print(f"Total {len(df_papers)} Papers with {sum(df_papers['Page'])} Pages are read")

    return all_pages

In [None]:
def chunk_papers(all_pages, chunk_size=4096, chunk_overlap=200):
    """
    Split paper content into smaller chunks for better processing.

    The text of all pages is merged into one string before splitting, so
    chunks may span page and paper boundaries and carry no per-page metadata.

    Args:
        all_pages: List of document pages (objects with a ``page_content`` string)
        chunk_size: Maximum size of each chunk
        chunk_overlap: Overlap between chunks to maintain context

    Returns:
        List of chunked text documents
    """
    # Create a text splitter with specified parameters
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )

    # Collapse every run of whitespace (including line breaks) to one space.
    # Simply deleting '\n' would glue the last word of a line to the first
    # word of the next one.
    combined_text = " ".join(
        " ".join(page.page_content.split()) for page in all_pages
    )

    # Split the combined text into chunks
    return text_splitter.create_documents([combined_text])

In [None]:
def chunk_papers_df(chunked_texts):
    """
    Build a table of the unique chunk texts and their lengths.

    Intended for inspection only; retrieval does not use this table.

    Args:
        chunked_texts: List of chunked text documents

    Returns:
        DataFrame with a 'chunked_text' column (first occurrence of each
        distinct text, original index labels kept) and a 'length' column
        holding the character count of each text
    """
    texts = [document.page_content for document in chunked_texts]

    # Keep only the first occurrence of every distinct chunk
    unique_chunks = pd.DataFrame({"chunked_text": texts}).drop_duplicates(
        subset=["chunked_text"], keep="first"
    )

    # Character count per chunk
    unique_chunks["length"] = unique_chunks["chunked_text"].apply(len)

    return unique_chunks

In [None]:
def get_openai_embeddings(df, model="text-embedding-3-small"):
    """
    Generate embeddings for chunked text using OpenAI.

    The input DataFrame is left untouched; the embeddings are added to a copy.

    Args:
        df: DataFrame containing 'chunked_text' column
        model: The embedding model to use

    Returns:
        Copy of ``df`` with an added 'embeddings' column, or None if
        generating the embeddings failed (the error is printed)
    """
    # Work on a copy so re-running the cell never changes the caller's frame
    result = df.copy()

    if len(result) == 0:
        # Nothing to embed: skip the API round-trip entirely
        result["embeddings"] = []
        return result

    # Initialize the OpenAI embeddings model
    embeddings_model = OpenAIEmbeddings(
        model=model,
        openai_api_key=openai_api_key
    )

    try:
        # Generate embeddings for all chunks and attach them row by row
        result["embeddings"] = embeddings_model.embed_documents(
            result["chunked_text"].tolist()
        )
    except Exception as e:
        # Best effort: this step is only used for display
        print(f"Error generating embeddings: {e}")
        return None

    return result

In [None]:
def setup_vector_database(
    chunked_texts,
    db_path="siop_demo_code_assessment_db_siop_principles.db",
    k=4,
    model=None,
):
    """
    Create and configure a FAISS vector database for document retrieval.

    Args:
        chunked_texts: List of chunked text documents
        db_path: Folder the FAISS index is saved to
        k: Number of documents the retriever returns per query
        model: Embedding model name; None keeps the OpenAIEmbeddings default

    Returns:
        Configured retriever object

    Raises:
        ValueError: If ``chunked_texts`` is empty
    """
    # Fail early with a clear message instead of an error from inside FAISS
    if not chunked_texts:
        raise ValueError("chunked_texts is empty; there is nothing to index")

    # Initialize the OpenAI embeddings model
    embedding_kwargs = {"openai_api_key": openai_api_key}
    if model is not None:
        embedding_kwargs["model"] = model
    embeddings_model = OpenAIEmbeddings(**embedding_kwargs)

    # Create a FAISS vector store from documents
    vectorstore = FAISS.from_documents(chunked_texts, embeddings_model)

    # Save the vector store locally for future use
    vectorstore.save_local(db_path)

    # Create a retriever from the vector store
    return vectorstore.as_retriever(search_kwargs={"k": k})

In [None]:
def generate_similar_question(question):
    """
    Ask GPT-4o for a reworded variant of a question.

    Args:
        question: Original user question

    Returns:
        The reworded question, stripped of surrounding whitespace, when the
        model response exposes a ``content`` attribute; otherwise the raw
        response object
    """
    # A non-zero temperature gives the rewording some variation
    rephrasing_model = ChatOpenAI(
        temperature=0.7,
        model_name="gpt-4o",
        openai_api_key=openai_api_key
    )

    # Instruction sent to the model
    rephrase_request = f"""
    Generate a question that is similar to but worded differently from this one: 
    '{question}'
    
    Return only the reworded question without any explanations or quotation marks.
    """

    response = rephrasing_model.invoke(rephrase_request)

    # Responses without a content attribute are handed back unchanged
    if not hasattr(response, 'content'):
        return response
    return response.content.strip()

In [None]:
def enhanced_retrieval(question, retriever, top_k=2, similar_question=None):
    """
    Enhance retrieval by using both the original question and a similar question.

    Args:
        question: Original user question
        retriever: Document retriever (any object with an ``invoke(query)``
            method returning a list of documents)
        top_k: How many of the retriever's results to keep per query
        similar_question: Reworded question to use; when None, one is
            generated with ``generate_similar_question``. Passing it
            explicitly makes the retrieval reproducible.

    Returns:
        Dictionary with the keys 'combined_docs' (unique documents, original
        question's results first), 'original_docs', 'similar_docs' and
        'overlapping_contents' (set of texts returned for both queries)

    Raises:
        ValueError: If ``top_k`` is smaller than 1
    """
    if top_k < 1:
        raise ValueError(f"top_k must be at least 1, got {top_k}")

    # Generate a similar question unless the caller supplied one
    if similar_question is None:
        similar_question = generate_similar_question(question)
    print(f"Original question: {question}")
    print(f"Similar question: {similar_question}")

    # Keep the best top_k documents for each phrasing
    original_docs = retriever.invoke(question)[:top_k]
    similar_docs = retriever.invoke(similar_question)[:top_k]

    # Texts that were returned for both phrasings
    original_contents = {doc.page_content for doc in original_docs}
    similar_contents = {doc.page_content for doc in similar_docs}
    overlapping_contents = original_contents & similar_contents

    # Combine results in order, dropping documents whose text was already seen
    combined_docs = []
    seen_contents = set()
    for doc in original_docs + similar_docs:
        if doc.page_content not in seen_contents:
            combined_docs.append(doc)
            seen_contents.add(doc.page_content)

    return {
        "combined_docs": combined_docs,
        "original_docs": original_docs,
        "similar_docs": similar_docs,
        "overlapping_contents": overlapping_contents
    }

In [None]:
def create_qa_chain():
    """
    Build the question-answering chain.

    The chain expects a dict with the keys 'documents' (documents whose
    ``page_content`` becomes the context) and 'question', and produces the
    model's answer as a string.

    Returns:
        A runnable QA chain
    """
    # temperature=0 asks the model for deterministic output
    chat_model = ChatOpenAI(
        temperature=0,
        model_name="gpt-4o",
        openai_api_key=openai_api_key
    )

    answer_prompt = PromptTemplate.from_template("""
    You act as a helpful question answer assistant. 
    Given the following context as information source, answer any questions. 
    Always answer in two sections.

    Generate a concise answer from the given context as information source or mention that the source does not contain relevant information

    If you can't find the answer in the context below, just say "I'm not sure." Don't try to make up an answer.
    If the context or question is R codes, explain very details of these R codes.
    
    Context:
    {context}
    
    Question: {question}
    
    Always answer in Markdown format:
    """)

    def _context_from_documents(payload):
        # Document texts separated by blank lines
        return "\n\n".join([doc.page_content for doc in payload["documents"]])

    def _question_from_payload(payload):
        return payload["question"]

    # Map the input dict onto the prompt variables, then prompt -> model -> text
    prompt_inputs = {
        "context": _context_from_documents,
        "question": _question_from_payload,
    }
    return prompt_inputs | answer_prompt | chat_model | StrOutputParser()

In [None]:
def format_chunk_display(doc, is_overlapping=False, max_length=200):
    """
    Format a document chunk for display with proper highlighting.

    Args:
        doc: The document to format (its ``page_content`` is shown)
        is_overlapping: Whether this chunk appears in both searches
        max_length: Maximum length to display before truncating

    Returns:
        Formatted HTML string for the chunk
    """
    # Local import: html is not imported in the notebook's import cell
    import html

    content = doc.page_content

    # Truncate if needed
    if len(content) > max_length:
        content = content[:max_length] + "..."

    # Escape after truncating so an HTML entity is never cut in half
    displayed_content = html.escape(content)

    # Overlapping chunks get a highlighted background and left border
    overlap_style = (
        "background-color: #fff3cd; padding: 5px; border-left: 3px solid #ffc107;"
        if is_overlapping
        else ""
    )

    return f"<div style='margin-bottom: 10px; padding: 10px; border: 1px solid #ddd; border-radius: 5px; {overlap_style}'>{displayed_content}</div>"

In [None]:
def display_retrieval_results(retrieval_result):
    """
    Display the retrieval results in a more visual format.

    Renders one section for the chunks found with the original question, one
    for the chunks found with the similar question, a summary box, and an
    "Answer" heading under which the caller is expected to show the answer.

    Args:
        retrieval_result: Dictionary containing retrieval details (keys
            'original_docs', 'similar_docs', 'overlapping_contents',
            'combined_docs'). Any other value is treated as a plain
            sequence of documents and printed as text.
    """
    # Local import: HTML is not imported in the notebook's import cell
    from IPython.display import display, HTML

    # Safety check for backward compatibility
    if not isinstance(retrieval_result, dict):
        print("Unable to display results in visual format. Using text format instead.")
        print("\n==== Retrieved Chunks ====\n")
        for i, doc in enumerate(retrieval_result):
            truncated_content = doc.page_content[:200] + "..." if len(doc.page_content) > 200 else doc.page_content
            print(f"* Chunk {i+1}: {truncated_content}\n")
        return

    overlapping_contents = retrieval_result["overlapping_contents"]

    html_content = """
    <style>
    .chunk-container {
        margin-bottom: 20px;
    }
    .chunk-header {
        font-weight: bold;
        margin-bottom: 10px;
        padding: 5px;
        background-color: #f8f9fa;
    }
    .overlap-note {
        font-style: italic;
        color: #856404;
        background-color: #fff3cd;
        padding: 3px 6px;
        border-radius: 3px;
        margin-left: 5px;
    }
    .summary-box {
        background-color: #e9ecef;
        padding: 10px;
        border-radius: 5px;
        margin-top: 20px;
        margin-bottom: 20px;
    }
    </style>

    <h3>Retrieved Chunks</h3>
    """

    # One properly closed container per query
    html_content += _render_chunk_section(
        "Chunks from ORIGINAL question:",
        retrieval_result["original_docs"],
        overlapping_contents,
    )
    html_content += _render_chunk_section(
        "Chunks from SIMILAR question:",
        retrieval_result["similar_docs"],
        overlapping_contents,
    )

    # Add summary
    html_content += f"""
    <div class="summary-box">
        <strong>Summary:</strong><br>
        Found {len(retrieval_result['original_docs'])} chunks from original question, 
        {len(retrieval_result['similar_docs'])} chunks from similar question, 
        with {len(overlapping_contents)} overlapping chunks.<br>
        Combined unique chunks for answer generation: {len(retrieval_result['combined_docs'])}
    </div>

    <h3>Answer</h3>
    """

    display(HTML(html_content))


def _render_chunk_section(title, docs, overlapping_contents):
    """Return the HTML for one titled, self-contained list of chunks."""
    parts = [
        '<div class="chunk-container">',
        f'<div class="chunk-header">{title}</div>',
    ]
    for i, doc in enumerate(docs, start=1):
        is_overlapping = doc.page_content in overlapping_contents
        overlap_mark = "<span class='overlap-note'>OVERLAP</span>" if is_overlapping else ""
        parts.append(f"<div><strong>Chunk {i}</strong> {overlap_mark}</div>")
        parts.append(format_chunk_display(doc, is_overlapping))
    # Close the container so the next section is not nested inside this one
    parts.append("</div>")
    return "\n".join(parts)
In [None]:
def ask(question, show_chunks=True, visual_display=True, retriever=None):
    """
    Ask a question and get an answer using our enhanced RAG system.

    Args:
        question: The user's question
        show_chunks: Whether to display the retrieved chunks (default: True)
        visual_display: Use visual HTML display for chunks (default: True)
        retriever: Document retriever to query. When None, the notebook-level
            variable ``retriever`` (created by ``setup_vector_database``) is used.

    Returns:
        String containing the answer (can be displayed with Markdown())

    Raises:
        NameError: If no retriever is passed and none exists in the notebook
    """
    if retriever is None:
        # Fall back to the notebook-level retriever, with a readable error
        # when the setup cell has not been run yet
        retriever = globals().get("retriever")
        if retriever is None:
            raise NameError(
                "name 'retriever' is not defined: run setup_vector_database() "
                "first or pass retriever=... explicitly"
            )

    # Get relevant documents using enhanced retrieval
    retrieval_result = enhanced_retrieval(question, retriever)

    # Handle both dictionary and list return types for backward compatibility
    if not isinstance(retrieval_result, dict):
        # Old format: a bare list of combined documents
        legacy_docs = retrieval_result
        retrieval_result = {
            "combined_docs": legacy_docs,
            "original_docs": legacy_docs[:1],  # Simplification
            "similar_docs": legacy_docs[1:] if len(legacy_docs) > 1 else [],
            "overlapping_contents": set()
        }
    combined_docs = retrieval_result["combined_docs"]

    # Display the chunks if requested
    if show_chunks:
        if visual_display:
            display_retrieval_results(retrieval_result)
        else:
            _print_retrieval_results(retrieval_result)

    # Create and use the QA chain
    qa_chain = create_qa_chain()

    # Return the answer as a string (not wrapped in Markdown)
    return qa_chain.invoke({"documents": combined_docs, "question": question})


def _print_retrieval_results(retrieval_result):
    """Print the retrieved chunks, a summary and the answer heading as plain text."""
    overlapping_contents = retrieval_result["overlapping_contents"]
    sections = (
        ("Chunks from ORIGINAL question:", retrieval_result["original_docs"]),
        ("\nChunks from SIMILAR question:", retrieval_result["similar_docs"]),
    )

    print("\n==== Retrieved Chunks ====\n")
    for heading, docs in sections:
        print(heading)
        for i, doc in enumerate(docs, start=1):
            overlap_mark = "* [OVERLAP] " if doc.page_content in overlapping_contents else "* "
            truncated_content = doc.page_content[:200] + "..." if len(doc.page_content) > 200 else doc.page_content
            print(f"{overlap_mark}Chunk {i}: {truncated_content}\n")

    print(f"\nSummary: Found {len(retrieval_result['original_docs'])} chunks from original question, " +
          f"{len(retrieval_result['similar_docs'])} chunks from similar question, " +
          f"with {len(overlapping_contents)} overlapping chunks")
    print(f"Combined unique chunks for answer generation: {len(retrieval_result['combined_docs'])}")
    print("\n==== Answer ====\n")

Example usage:

In [None]:
# 0. Create the papers directory (e.g. SIOP Principles) if it doesn't exist.
#    Place the documents to index in this folder before running the next cell.
principles_dir = './Principles/'
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race
os.makedirs(principles_dir, exist_ok=True)

In [None]:
# 1. Read papers from the directory prepared in step 0
all_papers = read_papers(principles_dir)
# Preview the first few documents instead of dumping every page into the output
all_papers[:3]

In [None]:
# 2. Chunk papers into segments
#    chunked_texts is what the vector database is built from in step 4;
#    df_chunks is a de-duplicated table of the same chunks for inspection.
chunked_texts = chunk_papers(all_papers, chunk_size=1000, chunk_overlap=200)
df_chunks = chunk_papers_df(chunked_texts)

In [None]:
# Inspect the chunk table: one row per unique chunk with its character length
df_chunks

In [None]:
# 3. Generate embeddings (optional display step)
#    The result is only inspected below; step 4 computes its own embeddings
#    from chunked_texts. The value is None if the embedding call failed.
df_with_embeddings = get_openai_embeddings(df_chunks)

In [None]:
# Inspect the chunks together with their embedding vectors
df_with_embeddings

In [None]:
# 4. Setup vector database
#    Builds the FAISS index from the chunks, saves it to disk, and returns the
#    retriever that ask() picks up from the notebook namespace.
retriever = setup_vector_database(chunked_texts)

In [None]:
# 5. Ask questions
#    ask() returns the answer as a string; wrapping it in Markdown renders it.
display(Markdown(ask("tell me about the Generalizing Validity Evidence")))