# **NarrRAG** - An orchestrated RAG framework for narrative extraction from topic model output

This implementation of NarrRAG uses:

*   llama3.2 from Ollama for all LLM tasks
*   ChromaDB and BM25 retrievers
*   LangChain and LangGraph for orchestration
*   Pydantic for data structures

The goal of NarrRAG is to extract narratives from document clusters. It is independent of the model used to create those clusters. Alternative LLMs, retrievers and orchestration frameworks can also be used.

### Setup and Dependencies

In [None]:
# Clone the NarrRAG repository, change into it, and run its setup script.
# The setup file installs system packages, the ollama installer (change if
# necessary) and the python libraries.
!git clone https://github.com/lisagrobels/NarrRAG.git
%cd NarrRAG
!bash setup.sh

Cloning into 'narrRAG'...
fatal: could not read Username for 'https://github.com': No such device or address
[Errno 2] No such file or directory: 'narrRAG'
/content
bash: setup.sh: No such file or directory


In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): the input/output paths configured later in this notebook point
# into the cloned repository, not into Drive — confirm this mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# REMOVE BEFORE UPLOAD
# Refresh the apt package index and install pciutils
# (presumably so the Ollama installer can detect GPU hardware — confirm),
# then download the Ollama install script and pipe it to sh.
! sudo apt update
! sudo apt install -y pciutils
!curl -fsSL https://ollama.com/install.sh |sh

import threading
import subprocess
import time

def run_ollama_serve():
    """Spawn ``ollama serve`` as a child process without waiting for it to exit."""
    serve_command = ["ollama", "serve"]
    subprocess.Popen(serve_command)


# Launch the server from a helper thread, then pause for five seconds
# before the following commands talk to it.
thread = threading.Thread(target=run_ollama_serve)
thread.start()
time.sleep(5)
# Fetch the llama3.2 model with the Ollama CLI
!ollama pull llama3.2

# Install the Python libraries used by this notebook
!pip install langchain_community langchain-ollama chromadb torch jq rank-bm25 langgraph

[33m0% [Working][0m            Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
[33m0% [Connecting to archive.ubuntu.com (185.125.190.39)] [Connecting to security.[0m[33m0% [Connecting to archive.ubuntu.com (185.125.190.39)] [Connecting to security.[0m                                                                               Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:3 https://cli.github.com/packages stable InRelease [3,917 B]
Get:4 https://cli.github.com/packages stable/main amd64 Packages [346 B]
Get:5 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:6 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Hit:7 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:8 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:9 https://r2u.stat.illinois.edu/ubuntu jammy/main all Packages [9,278 kB]
Get:10 https://developer.downloa

In [None]:
# import libraries

# general libraries
import pandas as pd
import numpy as np
import json
import traceback
import threading
import subprocess
import time
from enum import Enum
from pathlib import Path
from typing import List, Optional, Dict
from IPython.display import Markdown
# pydantic
from pydantic import BaseModel, Field
from pydantic import TypeAdapter
# langchain libraries, adjust for the LLM and orchestration framework used
from langchain_ollama import ChatOllama
from langchain_ollama import OllamaEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.retrievers import BM25Retriever
from langchain_community.document_loaders import JSONLoader
from langchain.schema import Document
from langchain.prompts import PromptTemplate
from langchain.output_parsers import PydanticOutputParser
from langchain_core.tools import tool
from langchain_core.runnables import RunnableConfig
from langgraph.graph import StateGraph, START, END

from rag_pipeline.utils import load_json_documents
from rag_pipeline.pipeline_functions import retrieve_node, extract_narrative, grade_narrative
from rag_pipeline.pipeline_functions import run_narrative_extraction



In [None]:
# start ollama server
def run_ollama_serve():
    """Spawn ``ollama serve`` as a child process without waiting for it to exit."""
    serve_command = ["ollama", "serve"]
    subprocess.Popen(serve_command)


# Launch the server from a helper thread, then pause for five seconds
# before the following commands talk to it.
thread = threading.Thread(target=run_ollama_serve)
thread.start()
time.sleep(5)
# Fetch the llama3.2 model with the Ollama CLI; this notebook uses it for
# both the chat models and the embedding model.
!ollama pull llama3.2

[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l


### Setup Paths and Retrievers

We use a cleaned subsample (1,000 messages) of the X dataset in https://github.com/sinking8/x-24-us-election (Publication: https://arxiv.org/abs/2411.00376). Shared under CC BY-SA 4.0.

For NarrRAG, three input files are needed:


1.   a .csv file with at least the columns `Document` (text of the short message) and `Topic` (number of the topic assigned to the document),
2.   a .json file with topic keywords (see sample file), and
3.   a .json file with news data (see sample file).



In [None]:
# `sys` is needed for the sys.path update at the end of this cell; it is not
# part of the notebook's import cell, so it is imported here.
import sys

# Root of the cloned NarrRAG repository; every other path is derived from it
REPO_PATH = Path("/content/NarrRAG")
CSV_PATH = REPO_PATH / "data" / "testdata_seedtopics.csv"  # csv file with document text and topic number
KEYW_PATH = REPO_PATH / "data" / "testdata_topic_keywords.json"  # json file with topic keywords
NEWS_PATH = REPO_PATH / "data" / "testdata_news.json"  # json file with news data
OUTPUT_DIR = REPO_PATH / "results"  # output directory for the narratives

# Create the output directory (including missing parent directories) if it doesn't exist
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Add repo root to Python path for imports; skipped when already present so
# that re-running this cell does not append duplicate entries.
# NOTE(review): the `rag_pipeline` imports in the import cell run before this
# cell — presumably they resolve through the working directory set by
# `%cd NarrRAG`; confirm.
if str(REPO_PATH) not in sys.path:
    sys.path.append(str(REPO_PATH))


In [None]:
# Load the topic-labelled short messages; later cells read the `Document`
# and `Topic` columns of this frame.
df = pd.read_csv(CSV_PATH)

# Load the topic keywords. The file is opened explicitly as UTF-8 so the
# result does not depend on the platform's default text encoding.
with open(KEYW_PATH, "r", encoding="utf-8") as f:
    topic_keywords = json.load(f)

In [None]:
# OllamaEmbeddings is imported in the notebook's import cell.
# Embedding model backed by the llama3.2 model in Ollama; it is handed to the
# Chroma vector store as its embedding function.
embedding_model = OllamaEmbeddings(
    model="llama3.2",
)

In [None]:
# Chroma vector store that embeds documents with `embedding_model`.
# NOTE(review): no persist directory is given, so the store presumably lives
# only for this session — confirm.
chroma = Chroma(
    embedding_function=embedding_model
)

  chroma = Chroma(


In [None]:
# Load news documents from NEWS_PATH, using the "description" key as content.
# load_json_documents comes from rag_pipeline.utils; it presumably returns
# LangChain Document objects, since the result is added to Chroma — confirm.
docs = load_json_documents(NEWS_PATH, content_key="description")

{'source': '/content/drive/My Drive/narrRAG/testdata_news.json', 'seq_num': 1, 'publisher_title': 'Brennan Center for Justice', 'published_date': 'Fri, 19 Jul 2024 07:00:00 GMT'}


In [None]:
# add news documents to chroma vector storage; the call's return value
# (shown as the cell output) is the list of IDs assigned to the stored documents
chroma.add_documents(docs)

['d9544728-edff-4bb3-a65f-01aa9fad45be',
 '78947c9a-8b00-4c6b-a325-ce30b98242c9',
 'cbf2e2ed-f262-484e-a274-3e07141f3f03',
 '09e0bed9-fd51-40d7-8a06-ef7e6d64ee23',
 'b1cc177f-2409-4119-b096-0606564a64b8',
 'f552c650-1c06-4e5c-bc10-866a8f67fae1',
 '77eabe5b-6177-48d3-b9db-b8ae54e9babb',
 '25aca266-0cf0-4a53-ab85-111a4d8d0227',
 'ae66c0b8-89d4-46bf-a18e-66cc88b95cb0',
 '562d20ad-64bd-44db-ab74-ab451b9ae6bb',
 '296fd981-1cf5-45f6-97a6-71565d2ca6cf',
 'd816a8d5-e4fa-4e4a-9642-ac4d8bd8e38e',
 '4b616a88-8490-4b4e-b951-87765641d996',
 '6f60451c-747d-4164-abb7-7457dec6add8',
 '3815e8bf-b415-4513-91a4-a5741a0d800a',
 '551bb293-a57f-44ae-b876-28c103599651',
 '05454950-0dd9-4801-b303-1f0041b14482',
 '965dfd05-fb02-43a4-85c9-3f773125d477',
 '19324f4a-7870-4eca-8690-b463a7317a57',
 '503aa1e1-9192-427d-a541-c852fa52f409',
 '960a8a56-a188-4fe3-b093-79ebac490382',
 '5badfde4-4468-498e-b4b5-a303f2ec849b',
 '2a919b6d-1aa6-40d2-83c4-feea6cf04deb',
 'a216ee63-2f3a-4d7a-b376-7be50ddab92c',
 '63d9ba46-924e-

In [None]:
# Build a retriever on top of the Chroma vector store
chroma_retriever = chroma.as_retriever(search_kwargs={"k": 5})  # retrieve top 5 docs

In [None]:
# Build one BM25 retriever per topic, keyed by topic ID
bm25_retrievers = {}

# Topic values are cast to strings; make sure this matches the key type
# used in your topic keyword JSON
df['Topic'] = df['Topic'].astype(str)

# sort=False keeps the topics in order of first appearance in the data
for topic_id, topic_docs in df.groupby('Topic', sort=False):
    # Wrap every message of this topic in a LangChain Document
    documents = [
        Document(page_content=text, metadata={"topic": topic_id})
        for text in topic_docs['Document']
    ]

    # Index this topic's messages; each query returns up to 10 documents
    retriever = BM25Retriever.from_documents(documents)
    retriever.k = 10

    bm25_retrievers[topic_id] = retriever

### Setup pydantic and graph state

In [None]:
# Setup pydantic models and graph state models
# Structured description of a single narrative (topic, actor, action, event
# and a one-sentence description). Also used as the structured-output schema
# of `llm_struct`.
# NOTE(review): documented with comments instead of a docstring because a
# class docstring would presumably end up in the JSON schema given to the
# LLM — confirm before adding one.
class Narrative(BaseModel):
    topic_id: str = Field(description="The topic ID of the narrative.")
    actor: str = Field(description="The actor(s) of the narrative.")
    action: str = Field(description="Action that is carried out by actor(s) or other entities or individuals.")
    event: str = Field(description="The event linking the actor(s) and their action.")
    description: str = Field(description="A one sentence long description of the narrative.")

# Result of merging narratives: the merged narrative plus the list of
# narratives it was merged from. Used as the structured-output schema of
# `llm_struct_merge`.
class MergedNarratives(BaseModel):
    merged_narrative: Narrative  # the combined narrative
    merged_from: List[Narrative]  # the narratives that were combined

# Closed set of grading outcomes. The `str` mixin makes each member compare
# equal to its plain string value.
class Grade(str, Enum):
    approved = "approved"
    refine = "refine"

# Grading verdict with a free-text explanation. Used as the
# structured-output schema of `llm_grader`.
class GradedNarrative(BaseModel):
    grade: Grade  # Enum ensures only valid values
    explanation: str  # explanation accompanying the grade

# A narrative bundled with two lists of documents, one per retriever type
# (presumably the documents retrieved for it — confirm against the pipeline).
class ApprovedNarrativeWithDocs(BaseModel):
    narrative: Narrative
    documents_bm25: List[Document]  # documents associated with the BM25 retriever
    documents_chroma: List[Document]  # documents associated with the Chroma retriever

# State model for one topic's run (presumably the LangGraph state passed
# between pipeline nodes — confirm against rag_pipeline). Only `topic_id`
# is required; every other field has a default.
class GraphState(BaseModel):
    topic_id: str = Field(description="The topic ID of the narrative.")
    query: Optional[str] = None
    documents_bm25: Optional[List[Document]] = None
    documents_chroma: Optional[List[Document]] = None
    # default_factory gives every instance its own fresh list/dict
    narratives: Optional[List[Narrative]] = Field(default_factory=list)
    grade_result: Optional[GradedNarrative] = None
    approved_narratives: Optional[List[ApprovedNarrativeWithDocs]] = Field(default_factory=list)
    # string-keyed lookups; presumably keyed by a narrative identifier — confirm
    pending_narratives_with_docs: Dict[str, ApprovedNarrativeWithDocs] = Field(default_factory=dict)
    refine_counts: Dict[str, int] = Field(default_factory=dict)

# Setup LLMs: one plain chat model plus three structured-output variants,
# each bound to the pydantic schema its replies must follow
_OLLAMA_MODEL = "llama3.2"

llm = ChatOllama(model=_OLLAMA_MODEL)
llm_struct = ChatOllama(model=_OLLAMA_MODEL).with_structured_output(
    Narrative, method="json_schema"
)
llm_struct_merge = ChatOllama(model=_OLLAMA_MODEL).with_structured_output(
    MergedNarratives, method="json_schema"
)
llm_grader = ChatOllama(model=_OLLAMA_MODEL).with_structured_output(
    GradedNarrative, method="json_schema"
)

### Run NarrRAG

In [None]:
# Run narrative extraction for the topics in `topic_keywords`, passing the
# output directory (run_narrative_extraction is imported from
# rag_pipeline.pipeline_functions).
# NOTE(review): the retrievers and LLMs configured in this notebook are not
# passed to this call — confirm how the pipeline module obtains them.
all_approved_narratives, topic_results = run_narrative_extraction(topic_keywords, OUTPUT_DIR)

# Report how many approved narratives were returned
print(f"Total approved narratives: {len(all_approved_narratives)}")

In [None]:
# in case ollama disconnects: start the server again
def run_ollama_serve():
    """Spawn ``ollama serve`` as a child process without waiting for it to exit."""
    serve_command = ["ollama", "serve"]
    subprocess.Popen(serve_command)


# Launch the server from a helper thread, then pause for five seconds
# before the following commands talk to it.
thread = threading.Thread(target=run_ollama_serve)
thread.start()
time.sleep(5)

# Fetch the llama3.2 model again with the Ollama CLI
!ollama pull llama3.2