# Current Version (20-05): Separates extraction tasks by 2 LLMs. Uploads posts while processing them, removes duplicates, optimized for context retrieval.

### Using deepseek-r1:8b for arguments and motivations due to deep think capabilities.

In [1]:

from langchain_community.graphs import Neo4jGraph
from langchain_community.graphs.graph_document import GraphDocument, Node, Relationship
import os
from langchain_openai import ChatOpenAI
from langchain_ollama import ChatOllama
from langchain_core.prompts import  PromptTemplate
from langchain.docstore.document import Document
import json
import uuid
from sentence_transformers import SentenceTransformer, util

from dotenv import load_dotenv

load_dotenv(override=True)

True

In [2]:
graph = Neo4jGraph()

  graph = Neo4jGraph()


# Extract comments from a thread

Scraper

In [4]:
import praw
import re

class RedditThreadScraper:
    def __init__(self, client_id, client_secret, user_agent):
        self.reddit = praw.Reddit(client_id=client_id, client_secret=client_secret, user_agent=user_agent)
        self.comment_tree = {}
        self.root_id = None
        self.user_map = {}  # Maps real usernames to pseudonyms
        self.user_counter = 1

    def _get_pseudonym(self, username):
        if username is None:
            return "Unknown"
        if username not in self.user_map:
            self.user_map[username] = f"User{self.user_counter}"
            self.user_counter += 1
        return self.user_map[username]

    def limpar_texto(self, text):
        # ... (keep your existing limpar_texto code unchanged)
        parts = text.strip().split('\n\n')
        converted_parts = []
        for part in parts:
            part = part.strip()
            if part.startswith('>'):
                citation_content = part[1:].strip()
                converted_parts.append(f"**Quoting** {citation_content} **End of Quote**")
            else:
                converted_parts.append(part)
        text = '\n\n'.join(converted_parts)
        text = re.sub(r'[^\w\s.,!?*]', '', text)
        lines = text.split('\n\n')
        cleaned_lines = []
        for line in lines:
            cleaned_line = re.sub(r'\s+', ' ', line).strip()
            if cleaned_line:
                cleaned_lines.append(cleaned_line)
        return '\n\n'.join(cleaned_lines)

    def build_comment_tree(self, thread_url):
        try:
            submission = self.reddit.submission(url=thread_url)
            titulo_op = self.limpar_texto(submission.title)
            submission.comments.replace_more(limit=0)
            self.comment_tree = {}
            self.root_id = submission.id
            op_author = self._get_pseudonym(submission.author.name if submission.author else None)
            self.comment_tree[self.root_id] = {
                "id": self.root_id,
                "title": titulo_op,
                "tags": [submission.link_flair_text] if submission.link_flair_text else [],
                "text": self.limpar_texto(submission.selftext),
                "author": op_author,
                "parent_id": None,
                "children": []
            }
            for comment in submission.comments.list():
                if comment.author != "AutoModerator":
                    comment_id = comment.id
                    parent_id = comment.parent_id.split("_")[-1]
                    author = self._get_pseudonym(comment.author.name if comment.author else None)
                    self.comment_tree[comment_id] = {
                        "id": comment_id,
                        "text": self.limpar_texto(comment.body),
                        "author": author,
                        "parent_id": parent_id,
                        "children": []
                    }
                    if parent_id in self.comment_tree:
                        self.comment_tree[parent_id]["children"].append(comment_id)
            return True
        except Exception as e:
            print(f"Erro ao construir a árvore de comentários: {e}")
            self.comment_tree = {}
            self.root_id = None
            return False

    def get_comment_tree(self):
        return self.comment_tree

    def get_root_id(self):
        return self.root_id

    def print_tree(self, node_id=None, level=0):
        if not self.comment_tree:
            print("Árvore de comentários vazia.")
            return
        if node_id is None:
            node_id = self.root_id
        if node_id not in self.comment_tree:
            print(f"Nó {node_id} não encontrado na árvore.")
            return
        node = self.comment_tree[node_id]
        prefix = "➡" if level == 0 else " " * (level * 4) + "↳"
        if node_id == self.root_id:
            print(f"{prefix} {node['author']}: {node['text'][:100]} (Tags: {', '.join(node['tags'])})")
        else:
            print(f"{prefix} {node['author']}: {node['text'][:100]}")
        for child_id in node["children"]:
            self.print_tree(child_id, level + 1)


Thread scraping

In [5]:

scraper = RedditThreadScraper(
        client_id=os.getenv("REDDIT_CLIENT_ID"),
        client_secret=os.getenv("REDDIT_CLIENT_SECRET"),
        user_agent="meu_bot_para_scraping"
    )
    
thread_url = "https://www.reddit.com/r/PoliticalDiscussion/comments/1lfqdh3/could_us_involvement_in_iran_trigger_a_larger/"

if scraper.build_comment_tree(thread_url):
    print(f"\nTítulo do OP: {scraper.comment_tree[scraper.root_id]['title']}\n")
    scraper.print_tree()




Título do OP: Could U.S. involvement in Iran trigger a larger global war?

➡ User1: This post is speculative and is not intended to fearmonger.

President Donald Trump has stated that  (Tags: International Politics)
    ↳ User2: I dont believe China will ever invade Taiwan unless something very majorly changes with regard to th
        ↳ User42: I would hesitate to use the term ever in regards to a Chinese invasion of Taiwan but I think for the
            ↳ User92: **Quoting** Right now the conventional military gap between China and the US is closing but it still
        ↳ User46: Its also good to remember that Americas power is declining relative to other countries, whereas Chin
            ↳ User93: Taiwan isnt going anywhere. China is just waiting for the U.S. to waste money on wars and lose soft 
            ↳ User94: This is a garbage take on so many levels. The only neighbors China is on good terms with are Russia 
                ↳ User113: Isnt this exactly what we are seein

In [5]:
scraper.comment_tree

{'1lfqdh3': {'id': '1lfqdh3',
  'title': 'Could U.S. involvement in Iran trigger a larger global war?',
  'tags': ['International Politics'],
  'text': 'This post is speculative and is not intended to fearmonger.\n\nPresident Donald Trump has stated that he has an attack plan ready for Irans nuclear enrichment facility and will decide within the next two weeks whether to authorize a strike. Israel supposedly needs the U.S. to carry out the strike because it lacks the bunkerbuster bomb and other equipment necessary to destroy the facility on its own. A U.S. strike could be the firstand possibly the lastdirect military action against Irans nuclear infrastructure, or it could be the event that triggers a larger regional war. Depending on how Iran and its allies respond, any strike could escalate tensions in the region and potentially draw in other Western allies alongside the U.S. and Israel.\n\nIf the situation in Iran spirals into a larger conflict, it raises the question could this ins

# Context retrieval and prompt creation

Prompt:

Functions to retrieve context (with a summarizer and optimized to store responses/process only large comments)

In [None]:
summarizer = ChatOllama(model="llama3.1")

summary_cache = {}  # To store summaries of previous comments

# Estimate tokens based on words
def estimate_tokens(text: str) -> int:
    return int(len(text.split()) * 1.3)  # reasonable approximation

# Function to fetch the parents of a comment (up to 3 levels)
def get_parent_comments(comment_id: str, max_levels: int = 3):
    query = f"""
    MATCH (child:Comment {{id: $comment_id}})-[:RESPONDS_TO*1..{max_levels}]->(parent)
    WHERE (parent:Comment OR parent:OriginalPost) AND toLower(parent.text) <> '[removed]'
    RETURN parent.text AS text, parent.id AS id
    ORDER BY size(parent.text) DESC
    """
    results = graph.query(query, params={"comment_id": comment_id})
    return [{"id": row["id"], "text": row["text"]} for row in results]

# Function to fetch the text of the current comment
def get_comment_text(comment_id: str):
    return scraper.comment_tree.get(comment_id).get("text")

# Function to summarize text with LLM, checking size and using cache
def summarize_text(text: str, token_threshold: int = 150) -> str:
    if text in summary_cache:
        return summary_cache[text]

    if estimate_tokens(text) <= token_threshold:
        summary = text.strip()
    else:
        prompt = f"""
        Summarize the following comment in a concise and informative way. If you refer to the author, make sure you refer to them as 'an author of previous comments'. Only keep what is essential to understand the point made, and return only the summary:

        \"\"\"{text}\"\"\"
        """
        summary = summarizer.invoke(prompt).content.strip()

    summary_cache[text] = summary
    return summary

# Main function to build summarized context from previous comments
def get_context(comment_id: str, max_parents: int = 3, token_threshold: int = 200):
    parents = get_parent_comments(comment_id, max_levels=max_parents)
    child_comment = get_comment_text(comment_id)

    if not child_comment:
        raise ValueError("Child comment not found.")

    summarized_context = []
    for p in parents[:max_parents]:
        summary = summarize_text(p["text"], token_threshold=token_threshold)
        summarized_context.append(f"*Context from previous user's comment:* {summary}")

    final_context = "\n".join(summarized_context)
    return final_context

In [None]:
# Example of usage

comment_id_input = "mlk6gmm"  
context = get_context(comment_id_input)
print(context)

# Prepare processing loop

Function to convert JSON output to graph document and LLM invocation via Ollama

In [None]:
def json_to_graph_document(data: dict, input_text: str) -> GraphDocument:
    graph_documents = []

    nodes = []
    relationships = []

    for entry in data.get("arguments", []):
        argument_id = str(uuid.uuid4())

        # Extract descriptions of motivations
        motivations_descriptions = [
            motivation["description"] for motivation in entry.get("motivations", [])
        ]

        # Create the Argument node with the list of descriptions
        argument_node = Node(
            id=argument_id,
            type="Argument",
            properties={
                "description": entry["argument"],
                "motivations_descriptions": motivations_descriptions
            }
        )
        nodes.append(argument_node)

        # Create relationships with existing MaxNeefCategory nodes (using name as ID)
        for motivation in entry.get("motivations", []):
            for category_name in motivation.get("max_neef_category", []):
                category_node = Node(
                    id=category_name,
                    type="MaxNeefCategory",
                    properties={}  # Reference only — assumed to already exist
                )

                relationship = Relationship(
                    source=argument_node,
                    target=category_node,
                    type="reflects",
                    properties={}
                )
                relationships.append(relationship)

    # Source document
    doc = Document(page_content=input_text)

    graph_doc = GraphDocument(
        nodes=nodes,
        relationships=relationships,
        source=doc
    )

    graph_documents.append(graph_doc)
    return graph_documents


def parse_llm_output(output_str: str):
    """
    Extract and process a valid JSON from a string, even if it contains reasoning or additional text.
    """
    if not output_str or not isinstance(output_str, str):
        print("❌ LLM output is None or not a string.")
        return []

    # Regular expression to extract JSON block between brackets
    json_match = re.search(r'\[\s*{.*?}\s*\]', output_str, re.DOTALL)

    if json_match:
        json_str = json_match.group(0)

        try:
            return json.loads(json_str)
        except json.JSONDecodeError as e:
            print("❌ Extracted JSON is not valid:", e)
            print("📝 Extracted JSON:\n", json_str)
            return []
    else:
        print("❌ No JSON block found.")
        print("📝 Raw content:\n", output_str)
        return []

In [None]:

ARGUMENT_MODEL = ""
MOTIVATION_MODEL = ""

USE_OPENAI = False  # Set to True to use OpenAI, False for Ollama

if USE_OPENAI:
    # openAI's deepthinking models don't permit altering the temperature 
    ARGUMENT_MODEL = "llama3.1:8b"
    MOTIVATION_MODEL = "llama3.1:8b"
    argument_extractor = ChatOpenAI(
        model=ARGUMENT_MODEL,
        #temperature=0,
        openai_api_key=os.getenv("OPENAI_API_KEY"),
    )
    motivation_extractor = ChatOpenAI(
        model=MOTIVATION_MODEL,
        #temperature=0,
        openai_api_key=os.getenv("OPENAI_API_KEY"),
    )
    print("Using OpenAI models for extraction.")
else:
    ARGUMENT_MODEL = "deepseek-r1:8b"
    MOTIVATION_MODEL = "deepseek-r1:8b"
    argument_extractor = ChatOllama(
        model=ARGUMENT_MODEL,
        temperature=0,
    )
    motivation_extractor = ChatOllama(
        model=MOTIVATION_MODEL,
        temperature=0,
    )
    print("Using Ollama models for extraction.")



Using Ollama models for extraction.


In [37]:

argument_extraction_prompt = PromptTemplate(
    input_variables=["comment", "context"],
    template= """
    Your task is to extract **complete arguments** expressed **explicitly in the current comment** below.

    An **argument** consists of:
    - A clear opinion, preference, or claim (the *thesis*), and
    - A reason, explanation, or consequence that supports it (*justification*).

    Do **not** split one argument into several — combine related parts into a single argument.
    Do **not** extract vague or generic observations that are not reasoned claims.
    Only extract what is clearly expressed in the comment, even if context is provided.  

    Do **not** extract anything from the context — use it only to understand ambiguous terms **in the current comment**.
    **If there are no arguments, return an empty list.**

    Format:
    {{
        "arguments": [
            {{"argument": "<argument text>"}}
        ]
    }}

    ### CONTEXT (for reference only, do not extract arguments from here): {context} 

    ### CURRENT COMMENT TO ANALYZE:
    \"\"\"{comment}\"\"\"
    """
    )
    
    
motivation_extraction_prompt = PromptTemplate(
    input_variables=["argument", "context"],
    template="""
        You are tasked with extracting underlying motivations for the given argument, based on Max-Neef's Fundamental Human Needs theory.

        Each motivation should be:
        1. A concise explanation of *why* the author may have made the argument.
        2. Clearly linked to one or more Max-Neef categories.
        3. Based only on the content of the argument — do not assume things not stated.

        Each motivation must include one or more of the following Max-Neef categories:

        1. **Subsistence** - e.g. health, food, physical needs.
        2. **Protection** - e.g. safety, stability, environmental concerns.
        3. **Affection** - e.g. family, love, empathy, community.
        4. **Understanding** - e.g. knowledge, curiosity, critical thinking.
        5. **Participation** - e.g. responsibility, involvement, civic engagement.
        6. **Leisure** - e.g. enjoyment, relaxation, hobbies.
        7. **Creativity** - e.g. design, innovation, artistic expression.
        8. **Identity** - e.g. cultural pride, belonging, values.
        9. **Freedom** - e.g. autonomy, fairness, personal choice.

        ### Examples:
        - Argument: "Schools should teach more practical life skills like taxes or cooking."
        → Motivation: "Wants education to be useful in real life." → Category: `Understanding`, `Subsistence`

        - Argument: "We need stricter laws to combat pollution."
        → Motivation: "Concern about public health and environmental impact." → Category: `Protection`

        Be careful not to confuse:
        - "Leisure" with appreciation of design or aesthetics — prefer "Creativity" in such cases.
        - "Identity" with general positive feelings — only use it when pride, culture, or sense of belonging are present.
        If unsure, **do not assign a category**.

        Please follow this JSON format exactly:
        [
            {{
                "description": "<motivation>",
                "max_neef_category": ["<category1>", ...]
            }}
        ]

        ### CONTEXT (for reference only, do not extract motivations from here): {context}

        Argument: "{argument}"
        """
)

argument_chain = argument_extraction_prompt | argument_extractor        
motivation_chain = motivation_extraction_prompt | motivation_extractor


In [None]:
def extract_arguments(comment: str, context = None):

    input = {
        "comment" : comment,
        "context" : context
    }

    response = argument_chain.invoke(input).content
    
    try:
        return parse_llm_output(response)
    except Exception as e:

        print("Error extracting arguments:", e)
        return []

def extract_motivations(argument: str, context = None):
    if argument is not None:
        
        input = {
            "argument" : argument,
            "context" : context
        }

        response = motivation_chain.invoke(input).content

        try:
            #print("Response:", response)
            return parse_llm_output(response)
        except Exception as e:
            print(f"Error extracting motivations for the argument '{argument}':", e)
            return []
    else:
        print("Invalid or null argument.")
        return []

def extract_nodes(comment: str, context = None):
    
    print("Extracting arguments ...")

    extracted_arguments = extract_arguments(comment, context)
    final_result = {"arguments": []}

    print("Extracting motivations ...")
    for arg in extracted_arguments:
        argument_text = arg["argument"]
        motivations = extract_motivations(argument_text, context=context)
        final_result["arguments"].append({
            "argument": argument_text,
            "motivations": motivations
        })
    print("Final: ", final_result)
    return final_result


comment_tree = scraper.comment_tree

Retrieve the topic title (useful for attaching arguments to posts later) and function to upload the post node

In [None]:
def get_original_comment_title(comment_tree):
    '''Helper function to find the original post's title, optimized for parent node being first'''
    # Check if the first node in comment_tree is the original post
    first_post = next(iter(comment_tree.values()), None)
    if first_post and first_post.get("parent_id") is None and "title" in first_post:
        return first_post["title"]

topic_title = get_original_comment_title(comment_tree=comment_tree)

def upload_comment(comment_info, topic_title):
    ''' Function to upload posts. The initial comment is represented as OriginalPost with title and tags. '''

    if "title" in comment_info:  # Original Post

        # Create/Update the OriginalPost node with the discussion topic properties included
        graph.query("""
            MERGE (p:OriginalPost {id: $id})
            SET p.text = $text, 
                p.author = $author, 
                p.topic_title = $topic_title,
                p.tags = $tags
            RETURN p
            """, 
            {
                "id": comment_info["id"],
                "text": comment_info["text"],
                "author": comment_info["author"],
                "topic_title": topic_title,
                "tags": comment_info["tags"]
            }
        )

    else: 
        
        # Create/Update the Comment node
        graph.query("""
            MERGE (p:Comment {id: $id})
            SET p.text = $text, 
                p.author = $author, 
                p.topic_title = $topic_title
            RETURN p
            """, 
            {
                "id": comment_info["id"],
                "text": comment_info["text"],
                "author": comment_info["author"],
                "topic_title": topic_title
            }
        )

        # Create the RESPONDS_TO relationship
        graph.query("""
            MATCH (child:Comment {id: $child_id}), 
                  (parent) 
            WHERE parent.id = $parent_id
            MERGE (child)-[:RESPONDS_TO]->(parent)
            """, 
            {
                "child_id": comment_info["id"],
                "parent_id": comment_info["parent_id"]
            }
        )

Prepare duplicate verification with embeddings

In [None]:
# Load the embedding model once
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

def filter_unique_arguments(raw_format: dict, processed_arguments: list[str], threshold: float = 0.90) -> dict:
    """
    Filters duplicate arguments based on embeddings. Updates raw_format.
    """
    valid_arguments = []

    for arg_data in raw_format.get("arguments", []):
        argument = arg_data.get("argument", "").strip()

        if not argument:
            continue

        if processed_arguments:
            new_embedding = embedding_model.encode(argument, convert_to_tensor=True)
            existing_embeddings = embedding_model.encode(processed_arguments, convert_to_tensor=True)
            similarities = util.cos_sim(new_embedding, existing_embeddings)
            if any(sim > threshold for sim in similarities[0]):
                print(f"⚠️ Argument ignored as duplicate: {argument[:80]}...")
                continue

        valid_arguments.append(arg_data)
        processed_arguments.append(argument)

    raw_format["arguments"] = valid_arguments
    return raw_format

Function to order comments depth-first (for demonstration purposes)

In [13]:
def get_comments_depth_first(comment_tree, root_id, max_comments=15):
    stack = [root_id]
    visited = set()
    result = []
    while stack and len(result) < max_comments:
        node_id = stack.pop()
        if node_id in visited:
            continue
        visited.add(node_id)
        node = comment_tree[node_id]
        result.append(node)
        # Add children in reverse to maintain order
        stack.extend(reversed(node.get("children", [])))
    return result

## Main processing loop

In [None]:
MAX_LOOP = 15 # Maximum number of comments to be examined

# For depth-first:
comments = get_comments_depth_first(scraper.comment_tree, scraper.root_id, MAX_LOOP)
# For breadth-first
#comments = scraper.comment_tree.values()

n_comments = len(comments)

In [None]:
graph_documents = []
count = 1
for comment_info in comments:
    processed_arguments = []
    print(f"Processing comment {count}/{n_comments}")
    count += 1
    
    # Upload comment node to the database
    upload_comment(comment_info, topic_title)
    
    # Context retrieval
    comment = comment_info.get("text")
    
    context = get_context(comment_info.get("id"))

    # Node extraction
    raw_format = extract_nodes(comment=comment, context=context)
    
    # Remove duplicates
    raw_format = filter_unique_arguments(raw_format, processed_arguments)

    # Convert JSON to GraphDocument (to facilitate upload)
    temp_doc = json_to_graph_document(raw_format, comment)
    
    # Add the comment ID to the nodes' metadata
    temp_doc[0].source.metadata['comment_id'] = comment_info.get("id") 

    graph_documents.extend(temp_doc)

    if count > MAX_LOOP:
        print(f"Reached {MAX_LOOP} comments, exiting loop.")
        break

Processing comment 1/15
Extracting arguments ...
Extracting motivations ...
Final:  {'arguments': [{'argument': 'Israel supposedly needs the U.S. to carry out the strike because it lacks the bunkerbuster bomb and other equipment necessary to destroy the facility on its own.', 'motivations': [{'description': 'Highlights Israel’s reliance on U.S. military resources to ensure the success of a critical strike, reflecting concern for security and stability.', 'max_neef_category': ['Protection']}, {'description': 'Points to the necessity of strategic cooperation between nations in defense operations, indicating a desire for collaborative involvement in policy decisions.', 'max_neef_category': ['Participation']}, {'description': 'Underscores Israel’s limited autonomy in carrying out the strike alone, suggesting a motivation to achieve greater self-reliance in defense capabilities.', 'max_neef_category': ['Freedom']}, {'description': 'Seeks to clarify technical constraints and equipment shortf

In [45]:
for doc in graph_documents:
    for node in doc.nodes:
        print(node)


id='0f125ca4-b2dc-4eab-aa54-a3a4ffad4461' type='Argument' properties={'description': "The recent trend of leftwing candidates winning in multiple countries, including Romania and Poland, suggests a pattern towards left-wing success. This is supported by Nicușor Dan's victory as an independent candidate with leftwing party support.", 'motivations_descriptions': ['Wants to understand political trends', 'Concerned about increasing left-wing success and its implications on society']}


In [13]:
with open("dummytext/dummytext_good.txt", "r", encoding="utf-8") as f:
    dummytext_content = f.read()

In [None]:
text = """Today, Nicușor Dan won the Romanian presidential election. Though he has registered as an Independent candidate, Dan was supported by leftwing parties. After the elections in Canada, Germany, and Australia, this makes the fourth time that a leftwing candidate emerged victorious in an election. And judging by the first round today, it seems that Poland will also have a left President. Many have said that Trumps victory has caused a surge of left victories. But is that true? Is there anything else at play thats causing this pattern?"""

#Node extraction
raw_format = extract_nodes(comment = text)

#Convert json to GraphDocument (to ease the upload)
temp_doc = json_to_graph_document(raw_format, text)

#Add the comment id to the node's metadata
temp_doc[0].source.metadata['comment_id'] = "test"




Extracting arguments ...
Extracting motivations ...
Final:  {'arguments': [{'argument': "The recent trend of leftwing candidates winning in multiple countries, including Romania and Poland, suggests a pattern towards left-wing success. This is supported by Nicușor Dan's victory as an independent candidate with leftwing party support.", 'motivations': [{'description': 'Wants to understand political trends', 'max_neef_category': ['Understanding']}, {'description': 'Concerned about increasing left-wing success and its implications on society', 'max_neef_category': ['Participation']}]}]}


In [40]:
graph_documents = temp_doc

# Export nodes and relationships

In [None]:
# Add source nodes
# Upload the graph with arguments and motivations
graph.add_graph_documents(graph_documents=graph_documents, include_source=False)

# Update each Argument node with the 'model' property
for doc in graph_documents:
    comment_id = doc.source.metadata.get('comment_id')
    
    for node in doc.nodes:
        if "Argument" in node.type:
            # Update the Argument node with the 'model' property
            graph.query(
                """
                MATCH (n:Argument {id: $node_id})
                SET n.argument_model = $modela
                SET n.motivation_model = $modelm
                """,
                {"node_id": node.id, "modela": ARGUMENT_MODEL, "modelm": MOTIVATION_MODEL}
            )
            # Connect arguments to the comment or original post (accepts both)
            graph.query(
                """
                MATCH (p) WHERE (p:Comment OR p:OriginalPost) AND p.id = $comment_id
                MATCH (n:Argument {id: $node_id})
                MERGE (p)-[:STATED]->(n)
                """,
                {"comment_id": comment_id, "node_id": node.id}
            )