In [1]:
import html
import os
import sys
import zlib
from pathlib import Path
from typing import Dict, Any

import pandas as pd

# make the project root importable so that the `src.*` imports below resolve
notebook_dir = Path.cwd()
project_root = notebook_dir.parent
src_path = project_root / 'src'  # the package directory itself is not added to sys.path
# guard the append so re-running this cell does not pile up duplicate sys.path entries
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

# import local modules
from src.process import FilesProcessor, process_files
from src.vectordb import ChromaStore
from src.embeddings import EmbeddingsManager
from src.process import KGProcessor
from src.cluster import KGClusterer
from langchain_openai import ChatOpenAI

In [2]:
from src.generator import MultiCollectionQA, QAGenerator

In [3]:
# directory layout: every location is derived from project_root
data_dir = project_root.joinpath('data')
notes_dir, docs_dir = data_dir.joinpath('notes'), data_dir.joinpath('docs')
chroma_dir = project_root.joinpath('chroma_db')

In [4]:
# step 1: File Processing
# NOTE(review): `processor` is not referenced again inside this cell; process_files()
# below is given the directories directly — confirm whether this object is still needed.
processor = FilesProcessor(md_path=str(notes_dir), pdf_path=str(docs_dir))

try:
    # hand both input directories to process_files explicitly
    input_dirs = [str(notes_dir), str(docs_dir)]
    processed_files = process_files(
        file_paths=input_dirs,
        display_output=True,
        chunk_size=1000,
    )

    # one summary line per key of the returned mapping
    print("\nProcessed Files Summary:")
    for kind, chunks in processed_files.items():
        print(f"{kind}: {len(chunks)} documents")

except Exception as err:
    # best-effort: report the failure and continue with empty results
    print(f"Processing failed: {err}")
    processed_files = dict(markdown=[], pdf=[])


=== Files Found ===
Markdown files: 3
PDF files: 1

Processing markdown files...


Markdown: 100%|██████████| 1/1 [00:00<00:00, 8208.03it/s]



Processing pdf files...


Pdf: 100%|██████████| 1/1 [00:12<00:00, 12.93s/it]


=== Processing Summary ===
Total files processed: 4
Total chunks generated: 1374

=== Processed Chunks ===

MARKDOWN Chunks:

Chunk 1:
Content: Brain Regions Showing Increased Activation:
- Parahippocampus
- Cerebellum
- Superior Lateral Occipital
- Fusiform and Lingual Gyri
- Precuneus
- Posterior Cingulate Gyrus
Metadata: {'source': '/Users/kat/Desktop/projects/nova-note-dev/backend/data/notes/lecture-01.md', 'word_count': 24, 'section': 'Cognitive Science Lecture Notes - Week 8 October 28, 2024 Prof. Martinez - COGS 301 ## Working Memory & Attention The multi-store model of memory proposes distinct components for processing and storing information. Working memory acts as a temporary storage system with limited capacity. There is broad consensus that working memory and attention are intimately linked. ### Key Components of Working Memory: - Central Executive - Phonological Loop - Visuospatial Sketchpad - Episodic Buffer Important: Central Executive Functions: Processing and coordina




In [5]:
# step 2: ChromaDB Setup
chroma_store = ChromaStore(persist_directory=str(chroma_dir))

try:
    # open the database, then load the three collections used by this notebook
    chroma_store.load_db()
    collections = ['notes', 'documents', 'notes_kg']
    chroma_store.load_collections(collections)

    # report what the store says about itself (skipped when no info is returned)
    db_info = chroma_store.get_db_info()
    if db_info:
        print(f"Database initialized at: {db_info.base_path}")
        print(f"Collections created: {db_info.collection_count}")
        print(f"Storage size: {db_info.size_mb:.2f} MB")

except Exception as err:
    # best-effort: print the problem instead of aborting the notebook
    print(f"ChromaDB initialization failed: {err}")

collection 'notes' ready
collection 'documents' ready
collection 'notes_kg' ready
Database initialized at: /Users/kat/Desktop/projects/nova-note-dev/backend/chroma_db
Collections created: 3
Storage size: 6.02 MB


In [None]:
# show the store's database info as this cell's output
chroma_store.get_db_info()

In [None]:

# step 3: Create Embeddings
embeddings_manager = EmbeddingsManager(
    chroma_store=chroma_store,
    batch_size=32
)

# One row per document source:
# (key in processed_files, label used in progress messages,
#  label used in the failure message, target collection name)
EMBEDDING_TARGETS = [
    ('markdown', 'markdown', 'Markdown', 'notes'),
    ('pdf', 'PDF', 'PDF', 'documents'),
]

# A single loop replaces the previously copy-pasted markdown / PDF branches.
for source_key, label, failure_label, target_collection in EMBEDDING_TARGETS:
    # .get() so that a file type absent from the processing result is skipped
    # rather than raising KeyError
    source_docs = processed_files.get(source_key)
    if not source_docs:
        continue

    print(f"\nEmbedding {label} documents...")
    try:
        embeddings_manager.embed_documents(
            docs=source_docs,
            collection_name=target_collection
        )
        print(f"Successfully embedded {len(source_docs)} {label} documents")
    except Exception as e:
        # best-effort: a failure for one source must not stop the other
        print(f"{failure_label} embedding failed: {e}")

In [7]:
# step 4: Load environment variables for API key
from dotenv import load_dotenv

# read the project's .env file, then pull the key from the process environment
env_file = project_root / '.env'
load_dotenv(env_file)

api_key = os.environ.get('OPENAI_API_KEY')
if api_key is None or api_key == '':
    # fail fast: every later LLM cell needs this key
    raise ValueError("OPENAI_API_KEY not found in environment variables")

In [None]:
# step 5: Initialize LLM and QA System
# NOTE(review): `llm` and `results` are bound again by later cells of this notebook
# (a different model / clustering output), so re-running this cell out of order
# changes what those later cells see.
llm_settings = {
    'api_key': api_key,
    'model': "gpt-4",
    'temperature': 0.1,
    'max_tokens': 100,
    'presence_penalty': 0.7,
    'frequency_penalty': 0.7,
    'top_p': 0.9,
}
llm = ChatOpenAI(**llm_settings)

# initialize MultiCollectionQA and run a single test query against it
try:
    # (collection name, number) pairs — presumably per-collection weights; TODO confirm
    collection_weights = [('notes', 1.0), ('documents', 1.8)]
    qa = MultiCollectionQA(
        llm=llm,
        chroma_store=chroma_store,
        collections=collection_weights,
        cache_size=1000,
    )

    test_query = "What are the primary cross-linguistic effects observed when applying multilingual analysis methods in cognitive linguistics?"
    results = qa.query(test_query)

    # answer first, then the metrics reported alongside it
    print(f"\nQuery: {test_query}")
    print(f"\nAnswer: {results['answer']}")
    print("\nMetrics:")
    metrics = results['metrics']
    print(f"Retrieval Time: {metrics['retrieval_time']:.2f}s")
    print(f"Average Score: {metrics['avg_score']:.3f}")
    print(f"Coherence: {metrics['coherence']:.3f}")

    # one entry per returned source; content is cut to 200 characters
    print("\nSources:")
    for hit in results['sources']:
        print(f"\nCollection: {hit['collection']}")
        print(f"Score: {hit['weighted_score']:.3f}")
        print(f"Content: {hit['content'][:200]}...")

except Exception as err:
    # best-effort: report and keep the notebook running
    print(f"MultiCollectionQA testing failed: {err}")

In [8]:

# step 6: Define the academic ontology configuration
# The dict lists the node types and relationship types that may appear in the
# knowledge graph; it is passed to KGProcessor as `ontology` in the cells below.
# TODO: think of allowed relationships that will make the nodes more connected with one another.
ACADEMIC_ONTOLOGY = {
    "allowed_nodes": [
        # Core Knowledge Nodes
        "Concept",        # Fundamental ideas (e.g., "Working Memory", "Neural Plasticity")
        "Definition",     # Explicit meanings (e.g., "temporary storage system", "Hebbian learning")
        "Topic",
        "Idea",

        # Structural Nodes
        "Architecture",   # High-level frameworks (e.g., "ACT-R", "SOAR", "Feed-forward Network")
        "System",         # Organized functional units (e.g., "Working Memory System", "Language Processing System")
        "Component",      # Discrete parts (e.g., "Central Executive", "Phonological Loop")
        "Module",         # Specialized subsystems (e.g., "Memory Module", "Attention Module")
        "Structure",      # Physical or conceptual arrangements (e.g., "Neural Circuit", "Processing Hierarchy")

        # Empirical Nodes
        "Evidence",       # Supporting data (e.g., "ERP Components", "fMRI studies")
        "Observation",    # Direct findings (e.g., "cocktail party effect", "garden path sentences")
        "Measurement",    # Quantified data (e.g., "memory span", "response time")

        # Methodological Nodes
        "Method",         # Procedures (e.g., "ERP Analysis", "Pattern Recognition")
        "Protocol",       # Standardized processes (e.g., "Experimental Design", "Lab Procedure")
        "Technique",      # Specific approaches (e.g., "Semantic Priming", "Cognitive Rehabilitation")
        "Task",

        # Theoretical Nodes
        "Theory",         # Explanatory frameworks (e.g., "Multi-store Model", "Universal Grammar")
        "Principle",      # Core rules (e.g., "Hebbian Learning", "Miller's Law")
        "Model",          # Representational systems (e.g., "Working Memory Model", "Neural Network")

        # Meta Nodes
        "Question",       # Inquiries (e.g., "cognitive load effects", "bilingual advantages")
        "Result",         # Results (e.g., "processing difficulty increases", "accuracy rate")
        "Note"            # Annotations (e.g., "review needed", "important concept")
    ],
    "allowed_relationships": [

        # Definitional Relations
        "IS_DEFINED_AS",          # Links Concept/Theory to Definition
        "IS_INSTANCE_OF",         # Links specific examples to general concepts
        "PART_OF",                # Shows category membership

        # Structural Relations
        "CONSISTS_OF",            # System-Component breakdown
        "INTEGRATES_WITH",        # Shows system interactions
        "IMPLEMENTS",             # Architecture-System implementation

        # Evidential Relations
        "IS_SUPPORTED_BY",        # Theory-Evidence connections
        "IS_MEASURED_BY",         # Concept-Measurement links
        "IS_OBSERVED_IN",         # Theory-Observation connections
        "VALIDATES",              # Evidence supporting Theory/Model

        # Methodological Relations
        "IS_STUDIED_THROUGH",     # Concept-Method relationships
        "IS_ANALYZED_BY",         # Evidence-Technique connections
        # spelling corrected from "PREREQUISTE_OF" so the label matches the English
        # word an LLM is likely to emit for this relation
        "PREREQUISITE_OF",        # Protocol-Method sequences

        # Theoretical Relations
        "EXTENDS",                # Theory building on Theory
        "CONTRADICTS",            # Opposing theoretical relationships
        "PREDICTS",               # Theory-Observation predictions
        "EXPLAINS",               # Theory-Mechanism explanations
        "PROVES",
        "EXEMPLIFIES",

        # Semantic Relations
        "ASSOCIATES_WITH",        # General conceptual connections
        "DERIVES_FROM",           # Origin relationships
        "INFLUENCES",             # Impact relationships

        # Meta Relations
        "RAISES_QUESTION_ABOUT",  # Question-Concept inquiries
        "NOTES_ON",               # Note-Topic annotations
        "DOCUMENTS",              # Finding documentation
        "REFERENCES"              # Citation/source connections
    ]
}



In [None]:
# step 7: initialize KGProcessor to create kg docs

# note do not set max token requirement
llm = ChatOpenAI(
    api_key=api_key,
    model="gpt-4o-mini",
    temperature=0
)


try:
    kg_processor = KGProcessor(
        chroma_store=chroma_store,
        ontology=ACADEMIC_ONTOLOGY,
        batch_size=16
    )
    print("KGProcessor initialized successfully")

    # generate Knowledge Graph Documents from the processed markdown documents
    print("\nGenerating knowledge graph documents...")
    kg_docs = kg_processor.generate_kg_docs(
        documents=processed_files['markdown'],
        llm=llm
    )

    # process and verify results
    if kg_docs:
        print(f"\nGenerated {len(kg_docs)} knowledge graph documents")
        print("\nSample nodes and relationships:")
        # Only a sample is printed: the slice makes the loop match its
        # "first 3 docs" intent (it previously walked every document), which
        # also keeps it consistent with the later KG cell.
        for i, doc in enumerate(kg_docs[:3]):  # Show first 3 docs
            print(f"\nDocument {i+1}:")
            print(f"Nodes: {len(doc.nodes)}")
            print(f"Relationships: {len(doc.relationships)}")
    else:
        print("No knowledge graph documents were generated")

except Exception as e:
    # best-effort: report the failure instead of aborting the notebook
    print(f"Error in knowledge graph processing: {e}")

In [None]:
# optional step: visualize kg 
from pyvis.network import Network
import networkx as nx
from IPython.display import HTML, display

def node_type_color(node_type):
    """Return a stable '#rrggbb' color for a node type.

    The built-in hash() of a string is salted per interpreter process, so colors
    derived from it change on every kernel restart. CRC32 of the UTF-8 text is
    the same on every run, which keeps saved graphs and legends comparable.
    """
    return f"#{zlib.crc32(str(node_type).encode('utf-8')) & 0xFFFFFF:06x}"


def build_legend_html(node_types, edge_types):
    """Build the HTML legend listing node types (with colors) and relationship types.

    Args:
        node_types: iterable of node type names.
        edge_types: iterable of relationship type names.

    Returns:
        An HTML fragment (str). Entries are sorted by name so the legend is
        reproducible, and names are HTML-escaped because they originate from
        generated graph content.
    """
    legend_html = """
    <div style="padding: 10px; background-color: white; border: 1px solid #ddd; margin-top: 10px;">
        <h3 style="margin: 0 0 10px 0;">Legend</h3>
        <div style="display: flex; gap: 20px;">
            <div>
                <h4>Node Types</h4>
                <ul style="list-style-type: none; padding: 0;">
    """

    for node_type in sorted(node_types, key=str):
        legend_html += f"""
            <li style="margin: 5px 0;">
                <span style="display: inline-block; width: 12px; height: 12px;
                background-color: {node_type_color(node_type)}; border-radius: 50%; margin-right: 5px;"></span>
                {html.escape(str(node_type))}
            </li>
        """

    legend_html += """
                </ul>
            </div>
            <div>
                <h4>Relationship Types</h4>
                <ul style="list-style-type: none; padding: 0;">
    """

    for edge_type in sorted(edge_types, key=str):
        legend_html += f"""
            <li style="margin: 5px 0;">
                <span style="display: inline-block; width: 20px; height: 2px;
                background-color: #848484; margin-right: 5px;"></span>
                {html.escape(str(edge_type))}
            </li>
        """

    legend_html += """
                </ul>
            </div>
        </div>
    </div>
    """
    return legend_html


def interactive_graph(graph_documents, height="750px", width="100%", bgcolor="#ffffff",
                           font_color="#000000"):
    """Build a styled pyvis Network from knowledge-graph documents.

    Args:
        graph_documents: iterable of objects exposing `.nodes` (each with `.id`
            and `.type`) and `.relationships` (each with `.source.id`,
            `.target.id` and `.type`).
        height, width, bgcolor, font_color: forwarded to the pyvis Network.

    Returns:
        The configured pyvis Network; nodes are colored by their type.
    """
    G = nx.DiGraph()

    node_types = set()

    for doc in graph_documents:
        for node in doc.nodes:
            G.add_node(node.id, title=f"Type: {node.type}", type=node.type)
            node_types.add(node.type)

        for rel in doc.relationships:
            G.add_edge(rel.source.id, rel.target.id, title=f"Relationship: {rel.type}", type=rel.type)

    net = Network(height=height, width=width, bgcolor=bgcolor, font_color=font_color,
                 notebook=True, cdn_resources='in_line')
    net.from_nx(G)

    # same helper as the legend, so graph and legend colors always agree
    color_palette = {node_type: node_type_color(node_type) for node_type in node_types}

    for node in net.nodes:
        node_type = node.get('type', 'default')
        node.update({
            # nodes without a known type fall back to a fixed light blue
            'color': color_palette.get(node_type, '#97c2fc'),
            'size': 25,
            'font': {'size': 12},
            'shape': 'dot',
            'borderWidth': 2,
            'borderWidthSelected': 4,
        })

    for edge in net.edges:
        edge.update({
            'arrows': 'to',
            'color': {'color': '#848484', 'opacity': 0.8},
            'width': 2,
            'smooth': {'type': 'continuous'}
        })

    net.set_options("""
    var options = {
        "physics": {
            "enabled": true,
            "forceAtlas2Based": {
                "gravitationalConstant": -50,
                "springLength": 100,
                "springConstant": 0.08
            },
            "minVelocity": 0.75,
            "solver": "forceAtlas2Based"
        },
        "interaction": {
            "dragNodes": true,
            "dragView": true,
            "hideEdgesOnDrag": false,
            "hideNodesOnDrag": false,
            "hover": true,
            "navigationButtons": true,
            "multiselect": true,
            "zoomView": true
        },
        "edges": {
            "smooth": {
                "type": "continuous",
                "forceDirection": "none"
            }
        }
    }
    """)

    return net

def display_legend(net, graph_documents, output_path="KG.html"):
    """Write the graph to an HTML file and display it followed by a legend.

    Args:
        net: pyvis Network, e.g. the value returned by interactive_graph().
        graph_documents: the documents the network was built from; their node
            and relationship types populate the legend.
        output_path: file the graph HTML is written to (default "KG.html").
    """
    net.show(output_path)

    with open(output_path, "r", encoding="utf-8") as f:
        graph_html = f.read()

    # Create legend
    node_types = set(node.type for doc in graph_documents for node in doc.nodes)
    edge_types = set(rel.type for doc in graph_documents for rel in doc.relationships)
    legend_html = build_legend_html(node_types, edge_types)

    # the legend is appended after the graph markup
    final_html = f"{graph_html}{legend_html}"
    display(HTML(final_html))

# build the interactive network from the generated KG documents, then display it with its legend
net = interactive_graph(kg_docs)
display_legend(net, kg_docs)

In [9]:
# step 8: generate the knowledge graph, embed it into ChromaDB and cluster its nodes
llm = ChatOpenAI(api_key=api_key, model="gpt-4o-mini", temperature=0)


try:
    kg_processor = KGProcessor(
        chroma_store=chroma_store,
        ontology=ACADEMIC_ONTOLOGY,
        batch_size=16,
    )
    print("KGProcessor initialized successfully")

    # build the knowledge graph documents from the processed markdown documents
    print("\nGenerating knowledge graph documents...")
    kg_docs = kg_processor.generate_kg_docs(documents=processed_files['markdown'], llm=llm)

    if not kg_docs:
        print("No knowledge graph documents were generated")
    else:
        # print a sample of at most three documents
        print(f"\nGenerated {len(kg_docs)} knowledge graph documents")
        print("\nSample nodes and relationships:")
        for doc_number, kg_doc in enumerate(kg_docs[:3], start=1):
            print(f"\nDocument {doc_number}:")
            print(f"Nodes: {len(kg_doc.nodes)}")
            print(f"Relationships: {len(kg_doc.relationships)}")

        # store the graph in the 'notes_kg' collection before clustering it
        print("\nEmbedding knowledge graph into ChromaDB...")
        kg_processor.embed_kg(
            kg_docs=kg_docs,
            collection_name='notes_kg',
            replace_existing=True,  # Set to False if you want to append
        )

        # cluster the nodes of the collection that was just written
        clusterer = KGClusterer(chroma_store)
        results = clusterer.cluster_nodes(
            'notes_kg',
            n_neighbors=25,
            min_cluster_size=3,
            min_samples=2,
            min_dist=0.1,
        )

        cluster_stats = results['stats']
        if cluster_stats:
            print("\nClustering Statistics:")
            for metric_name, metric_value in cluster_stats.items():
                print(f"{metric_name}: {metric_value}")

            # translate the silhouette score into a coarse quality verdict
            print("\nCluster Quality Analysis:")
            if 'silhouette' in cluster_stats:
                silhouette = cluster_stats['silhouette']
                print(f"Silhouette Score: {silhouette:.3f}")
                verdict = (
                    "Good cluster separation" if silhouette > 0.5
                    else "Moderate cluster separation" if silhouette > 0.3
                    else "Poor cluster separation"
                )
                print(verdict)

        # plot only when the clustering returned nodes
        clustered_nodes = results['nodes']
        if clustered_nodes:
            fig = clusterer.visualize_clusters(clustered_nodes)
            fig.show()

except Exception as err:
    # best-effort: report the failure instead of aborting the notebook
    print(f"Error in knowledge graph processing: {err}")

KGProcessor initialized successfully

Generating knowledge graph documents...


Add of existing embedding ID: node_0
Add of existing embedding ID: node_1
Add of existing embedding ID: node_2
Add of existing embedding ID: node_3
Add of existing embedding ID: node_4
Add of existing embedding ID: node_5
Add of existing embedding ID: node_6
Add of existing embedding ID: node_7
Add of existing embedding ID: node_8
Add of existing embedding ID: node_9
Add of existing embedding ID: node_10
Add of existing embedding ID: node_11
Add of existing embedding ID: node_12
Add of existing embedding ID: node_13
Add of existing embedding ID: node_14
Add of existing embedding ID: node_15
Add of existing embedding ID: node_0
Add of existing embedding ID: node_1
Add of existing embedding ID: node_2
Add of existing embedding ID: node_3
Add of existing embedding ID: node_4
Add of existing embedding ID: node_5
Add of existing embedding ID: node_6
Add of existing embedding ID: node_7
Add of existing embedding ID: node_8
Add of existing embedding ID: node_9
Add of existing embedding ID: no


Generated 10 knowledge graph documents

Sample nodes and relationships:

Document 1:
Nodes: 6
Relationships: 0

Document 2:
Nodes: 6
Relationships: 5

Document 3:
Nodes: 17
Relationships: 11

Embedding knowledge graph into ChromaDB...

Processing 87 total nodes...
Prepared 86 unique nodes for embedding
Processed batch 1, total nodes: 16
Processed batch 2, total nodes: 32
Processed batch 3, total nodes: 48
Processed batch 4, total nodes: 64
Processed batch 5, total nodes: 80
Processed batch 6, total nodes: 86
Successfully embedded 86 nodes in total

Final verification: 86 nodes in collection
performing UMAP reduction...


  warn(
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


performing HDBSCAN clustering...

Clustering Statistics:
total_nodes: 86
clusters: 16
noise_points: 9
silhouette: 0.4165956974029541
calinski_harabasz: 125.64975743429187
davies_bouldin: 0.6847939953794568

Cluster Quality Analysis:
Silhouette Score: 0.417
Moderate cluster separation


In [10]:
# step 9: initialize QAGenerator to generate qs based on node embeddings
try:
    print("Initializing question generation process...")
    qa_gen = QAGenerator(
        llm=llm,
        chroma_store=chroma_store,
        collection_name='notes_kg',
        max_workers=4,
    )

    # the clustering output of the previous step drives question generation
    print("\nGenerating questions from clusters...")
    questions_list, stats_list = qa_gen.generate_cluster_questions(
        clustering_results=results,
        show_tokens=True,  # show token usage statistics
    )

    if not (questions_list and stats_list):
        print("No questions or statistics generated")
    else:
        # flatten to one row per generated question
        questions_data = [
            {
                'Cluster ID': cluster_questions.cluster_id,
                'Theme': cluster_questions.theme,
                'Question': question.text,
                'Type': question.type,
                'Concepts': ', '.join(question.concepts),
            }
            for cluster_questions in questions_list
            for question in cluster_questions.questions
        ]

        # one row per cluster with its counts
        stats_data = []
        for stat_entry in stats_list:
            stats_data.append({
                'Cluster ID': stat_entry.cluster_id,
                'Concepts': stat_entry.concept_count,
                'Relationships': stat_entry.relationship_count,
                'Tokens': stat_entry.token_count,
            })

        questions_df = pd.DataFrame(questions_data)
        stats_df = pd.DataFrame(stats_data)

        print("\n=== Generated Questions ===")
        print(questions_df.to_string(index=False))

        print("\n=== Cluster Statistics ===")
        print(stats_df.to_string(index=False))

        # Additional Analysis: how many questions each cluster got, and of which type
        print("\n=== Question Analysis ===")
        print("\nQuestions per Cluster:")
        print(questions_df.groupby('Cluster ID').size())

        print("\nQuestion Types Distribution:")
        print(questions_df['Type'].value_counts())

except Exception as err:
    # best-effort: report the failure instead of aborting the notebook
    print(f"Error in question generation process: {err}")

Initializing question generation process...

Generating questions from clusters...


Processing clusters: 100%|██████████| 16/16 [00:23<00:00,  1.46s/it]



=== Generated Questions ===
 Cluster ID                                                                                                             Theme                                                                                                                                                      Question         Type                                                                                           Concepts
         12                                                         Parahippocampus & Cerebellum & Superior Lateral Occipital                                                                       How do the parahippocampus and cerebellum interact in the processing of spatial memory?  theoretical                                                                        Parahippocampus, Cerebellum
         12                                                         Parahippocampus & Cerebellum & Superior Lateral Occipital                                     What role does th