In [None]:
from loguru import logger

# Use offline data loader instead of API
from iqrah.quran_offline import load_quran_offline

# Remove the handlers registered on the loguru logger so the following cells
# do not emit log output (until a handler is added again).
logger.remove()

# Load the Quran data through the offline loader (no API calls).
print("Loading Quran data from offline sources...")
quran = load_quran_offline()
# Quick sanity check: number of chapters and total number of verses loaded.
print(f"‚úì Loaded {len(quran.chapters)} chapters, {quran.total_verses()} verses")

In [None]:
# Display a single word object: chapters[1] -> verses[1] -> words[0].
# NOTE(review): whether these containers are 0- or 1-based is not visible here — confirm.
quran.chapters[1].verses[1].words[0]

In [None]:
# --- Old method
import networkx as nx

from iqrah.graph.identifiers import NIG, NIP
from iqrah.graph.knowledge import Distribution, KnowledgeEdgeManager
from iqrah.graph.node_manager import NodeManager
from iqrah.quran_api.models import Chapter, Verse, Word


class KnowledgeExperiments:
    """Add learning-oriented "knowledge" edges on top of a Quran dependency graph.

    Every ``setup_*`` method registers edges through ``KnowledgeEdgeManager``.
    Edge endpoints are strings of the form ``"<node id>:<knowledge type>"``
    (``memorization``, ``translation``, ``tajweed``, ``tafsir``, ``meaning``).
    Call :meth:`compile` once after the desired ``setup_*`` methods, then
    optionally :meth:`save`.
    """

    def __init__(self, graph: nx.DiGraph, quran: "Quran"):
        """Attach edge and node managers to ``graph`` and keep ``quran`` for lookups.

        Args:
            graph: Graph the managers operate on; stored as ``self.G``.
            quran: Quran data object, indexed with the keys produced by ``NIP``.
        """
        # ``Quran`` is a string annotation on purpose: the surrounding cell only
        # imports Chapter, Verse and Word, so an unquoted ``Quran`` raises
        # NameError as soon as the class statement is executed.
        self.edge_manager = KnowledgeEdgeManager(graph)
        self.G = graph
        self.quran = quran
        self.node_manager = NodeManager(graph)
        self._is_compiled = False

    def get_nodes_by_type(self, node_type: str) -> set[str]:
        """Return the node manager's result for ``get_nodes_by_type(node_type)``."""
        return self.node_manager.get_nodes_by_type(node_type)

    def get_verse_words(self, verse_id: str) -> list[str]:
        """Return the node manager's result for ``get_verse_words(verse_id)``."""
        return self.node_manager.get_verse_words(verse_id)

    def get_chapter_verses(self, chapter_id: str) -> list[str]:
        """Return the node manager's result for ``get_chapter_verses(chapter_id)``."""
        return self.node_manager.get_chapter_verses(chapter_id)

    def has_tajweed_rules(self, word_id: str) -> bool:
        """Return the node's ``has_tajweed`` attribute (placeholder check).

        Unknown nodes and nodes without the attribute yield ``False``.
        """
        node = self.G.nodes.get(word_id)
        if not node:
            return False
        return node.get('has_tajweed', False)

    def get_word_root(self, word_id: str, cutoff: int = 3) -> str | None:
        """Return the first ``root`` node reachable from ``word_id``.

        Args:
            word_id: Node to start the search from.
            cutoff: Maximum path length handed to ``nx.all_simple_paths``.

        Returns:
            The last node of the first path found, or ``None`` if no path exists.
        """
        root_nodes = self.node_manager.get_nodes_by_type("root")
        for path in nx.all_simple_paths(self.G, word_id, root_nodes, cutoff=cutoff):
            return path[-1]
        return None

    def setup_standard_memorization(self):
        """Register the standard memorization edges.

        For every verse of every chapter node:
        - verse -> chapter, weighted by the verse's letter count
        - word -> verse, weighted by the word's letter count
        - gaussian context-window edges between the verse's words

        Words for which ``is_end_word()`` is true are left out of the word edges.
        """
        for chapter_id in self.get_nodes_by_type("chapter"):
            chapter: Chapter = self.quran[NIP.get_chapter_key(chapter_id)]
            chapter_node = f"{NIG.for_chapter(chapter)}:memorization"
            for verse in chapter.verses:
                verse_node = f"{NIG.for_verse(verse)}:memorization"
                words = [w for w in verse.words if not w.is_end_word()]

                # Verse to chapter memorization edge
                self.edge_manager.add_knowledge_edge(
                    verse_node,
                    chapter_node,
                    Distribution.auto(weight=verse.get_letters_count()),
                )

                # Word to verse memorization edges; the ids are collected so the
                # context window below uses exactly the same nodes.
                word_nodes = []
                for word in words:
                    word_node = f"{NIG.for_word_instance(word, verse)}:memorization"
                    word_nodes.append(word_node)
                    self.edge_manager.add_knowledge_edge(
                        word_node,
                        verse_node,
                        Distribution.auto(weight=word.get_letters_count()),
                    )

                # Contextual memorization for words
                self.edge_manager.add_gaussian_window_edges(
                    word_nodes,
                    window_size=3,
                    base_weight=0.5,
                    std_scale=0.15,
                )

    def setup_tajweed_learning(self):
        """Register tajweed edges for words flagged by :meth:`has_tajweed_rules`.

        - ``word:tajweed -> word:memorization`` with a strong normal distribution
        - ``word:tajweed -> next_word:tajweed`` with a weak one, for rules that
          span two words

        NOTE(review): unlike the other setup methods, words are not filtered
        with ``is_end_word()`` here — confirm this is intended.
        """
        from itertools import tee, zip_longest

        for verse_id in self.get_nodes_by_type("verse"):
            verse = self.quran[NIP.get_verse_key(verse_id)]

            # Pair every word with its successor; the last word is paired with None.
            iter_current, iter_next = tee(verse.words)
            next(iter_next, None)

            for current_word, next_word in zip_longest(iter_current, iter_next, fillvalue=None):
                word_instance_id = NIG.for_word_instance(current_word, verse)
                if not self.has_tajweed_rules(word_instance_id):
                    continue

                self.edge_manager.add_knowledge_edge(
                    f"{word_instance_id}:tajweed",
                    f"{word_instance_id}:memorization",
                    Distribution.normal(mean=0.7, std=0.1),  # strong impact
                )

                # Connect neighboring tajweed nodes (for rules spanning words)
                if next_word:
                    self.edge_manager.add_knowledge_edge(
                        f"{word_instance_id}:tajweed",
                        f"{NIG.for_word_instance(next_word, verse)}:tajweed",
                        Distribution.normal(mean=0.3, std=0.1),  # weak impact
                    )

    def get_all_translatable_nodes(self):
        """Return the union of all ``word_instance`` and ``verse`` nodes."""
        return self.node_manager.get_nodes_by_type("word_instance") | self.node_manager.get_nodes_by_type("verse")

    def get_duplicated_verses(self) -> list[tuple[str, list[str]]]:
        """Group verse keys by identical ``text_uthmani_simple``.

        Returns:
            ``(text, [verse_key, ...])`` pairs for every text shared by more than
            one verse, largest group first.

        Raises:
            AssertionError: If a verse has no ``text_uthmani_simple``.
        """
        keys_by_text: dict[str, list[str]] = {}
        for verse_id in self.get_nodes_by_type("verse"):
            verse = self.quran[NIP.get_verse_key(verse_id)]
            text = verse.text_uthmani_simple
            assert text is not None, "we need text_uthmani_simple to be defined"
            # Append in place; rebuilding the list per verse would be quadratic.
            keys_by_text.setdefault(text, []).append(verse.verse_key)

        duplicates = [(text, keys) for text, keys in keys_by_text.items() if len(keys) > 1]
        duplicates.sort(key=lambda item: len(item[1]), reverse=True)
        return duplicates

    def setup_translation_understanding(self):
        """Register the translation-related edges.

        - verse translation -> chapter translation (weighted by word count)
        - word-instance translation -> verse translation (weighted by letter count)
        - word-instance translation -> word translation (very high impact)
        - translation -> memorization for every translatable node
        - bidirectional translation edges between verses with identical text
        """
        from itertools import combinations

        for chapter_id in self.get_nodes_by_type("chapter"):
            chapter = self.quran[NIP.get_chapter_key(chapter_id)]
            for verse in chapter.verses:
                verse_node = f"{NIG.for_verse(verse)}:translation"

                # Verse translation to chapter translation
                self.edge_manager.add_knowledge_edge(
                    verse_node,
                    f"{chapter_id}:translation",
                    Distribution.auto(weight=verse.get_words_count()),
                )

                for word in (w for w in verse.words if not w.is_end_word()):
                    word_node = f"{NIG.for_word_instance(word, verse)}:translation"

                    # Word translation to verse translation
                    self.edge_manager.add_knowledge_edge(
                        word_node,
                        verse_node,
                        Distribution.auto(weight=word.get_letters_count()),
                    )

                    # Word instance to word translation
                    self.edge_manager.add_knowledge_edge(
                        word_node,
                        f"{NIG.for_word(word)}:translation",
                        Distribution.normal(mean=0.9, std=0.1),  # very high impact (basically same word)
                    )

        # Translation helps memorization
        for node_id in self.get_all_translatable_nodes():
            self.edge_manager.add_knowledge_edge(
                f"{node_id}:translation",
                f"{node_id}:memorization",
                Distribution.normal(mean=0.4, std=0.15),
            )

        # Strongly connect duplicated verses (every unordered pair once).
        # NOTE(review): ``NIG.for_verse`` receives verse-key strings here but
        # Verse objects elsewhere in this class — confirm it accepts both.
        for _text, verse_keys in self.get_duplicated_verses():
            for v1, v2 in combinations(verse_keys, 2):
                self.edge_manager.add_bidirectional_knowledge_edge(
                    f"{NIG.for_verse(v1)}:translation",
                    f"{NIG.for_verse(v2)}:translation",
                    Distribution.normal(mean=0.9, std=0.1),
                )

    def setup_deep_understanding(self):
        """Register the deeper-understanding edges.

        - ``verse:translation -> verse:tafsir`` for verse nodes that the node
          manager reports for the ``has_tafsir`` metadata
        - ``root:meaning -> lemma:translation`` for lemmas with a reachable root
        """
        # Translation aids tafsir
        for verse_id in (self.get_nodes_by_type("verse") & self.node_manager.get_nodes_by_metadata("has_tafsir")):
            self.edge_manager.add_knowledge_edge(
                f"{verse_id}:translation",
                f"{verse_id}:tafsir",
                Distribution.normal(mean=0.3, std=0.1),
            )

        # Root meanings impact word understanding
        for lemma in self.get_nodes_by_type("lemma"):
            root = self.get_word_root(lemma)
            if root:
                self.edge_manager.add_knowledge_edge(
                    f"{root}:meaning",
                    f"{lemma}:translation",
                    Distribution.beta(alpha=4, beta=2),  # Strong positive skew
                )

    def compile(self) -> None:
        """Finalize the knowledge graph.

        Delegates to ``edge_manager.compile()`` and then validates that every
        edge carries a distribution. Call it after all ``setup_*`` methods and
        before saving or using the graph.

        Raises:
            RuntimeError: If compile is called more than once.
            ValueError: If an edge is left without a distribution.
        """
        if self._is_compiled:
            raise RuntimeError("Knowledge graph has already been compiled")

        # Attribute weights to all pending edges
        self.edge_manager.compile()

        # Validate final graph state
        self._validate_compiled_graph()

        self._is_compiled = True

    def _validate_compiled_graph(self) -> None:
        """Check that every edge has a ``dist`` attribute.

        Edges of type ``dependency`` are exempt: they are used for scheduling,
        not for learning.

        Raises:
            ValueError: On the first non-dependency edge without ``dist``.
        """
        for src, dst, data in self.G.edges(data=True):
            if "dist" in data or data.get("type") == "dependency":
                continue
            raise ValueError(f"Found edge missing weight distribution after compilation: {src} -> {dst} {{{data}}}")

    def save(self, filename: str) -> None:
        """Save the compiled knowledge graph as GraphML.

        Args:
            filename: Destination path; must end with ``.graphml``.

        Raises:
            RuntimeError: If called before :meth:`compile`.
            ValueError: If ``filename`` does not end with ``.graphml``.
        """
        if not self._is_compiled:
            raise RuntimeError(
                "Cannot save uncompiled knowledge graph. Call compile() first"
            )
        if not filename.endswith(".graphml"):
            # GraphML is the only writer implemented, so the message no longer
            # advertises a .json option that does not exist.
            raise ValueError("Unsupported file format. Use .graphml")
        nx.write_graphml(self.G, filename)

    def setup_experimental_learning(self):
        """Experimental learning strategies to test hypotheses.

        NOTE(review): the three nested helpers below are only defined and never
        called, so this method currently adds no edges. They also pass
        ``distribution=``/``m=``/``s=`` keywords, whereas every other call in
        this class passes a ``Distribution`` object — confirm the intended API
        before enabling them.
        """
        # Hypothesis: Memorization is strengthened by multiple knowledge types
        def setup_multimodal_learning():
            for verse_id in self.get_nodes_by_type("verse"):
                # Create composite effect from different knowledge types
                knowledge_types = ['translation', 'tajweed', 'contextual_memorization']
                for kt in knowledge_types:
                    self.edge_manager.add_knowledge_edge(
                        f"{verse_id}:{kt}",
                        f"{verse_id}:memorization",
                        distribution="normal",
                        m=0.3,  # Individual effects are moderate
                        s=0.1
                    )

        # Hypothesis: Learning is affected by position in chapter
        def setup_position_based_learning():
            for chapter_id in self.get_nodes_by_type("chapter"):
                verses = self.get_chapter_verses(chapter_id)
                for i, verse_id in enumerate(verses):
                    position_factor = 1 - (i / len(verses))  # Stronger at start
                    self.edge_manager.add_knowledge_edge(
                        f"{verse_id}:memorization",
                        f"{chapter_id}:memorization",
                        distribution="normal",
                        m=position_factor * 0.5,
                        s=0.1
                    )

        # Hypothesis: Understanding spreads differently than memorization
        def setup_differential_propagation():
            for verse_id in self.get_nodes_by_type("verse"):
                words = self.get_verse_words(verse_id)

                # Understanding propagates bottom-up
                self.edge_manager.add_cascading_edges(
                    [f"{w}:translation" for w in words],
                    f"{verse_id}:translation",
                    cascade_type="bottom_up",
                    base_weight=0.4
                )

                # Memorization propagates top-down
                self.edge_manager.add_cascading_edges(
                    [f"{w}:memorization" for w in words],
                    f"{verse_id}:memorization",
                    cascade_type="top_down",
                    base_weight=0.6
                )

    def setup_grammar_nodes(self):
        """Link each word's translation with its lemma(s) and, if present, its root.

        For every ``word`` node and each of its ``lemma`` successors:
        - bidirectional ``word:translation <-> lemma:translation`` edge, weighted
          by the length of the lemma text (the part after ``":"`` in the id)
        - bidirectional ``lemma:translation <-> root:meaning`` edge when the word
          has a ``root`` successor

        NOTE(review): the final assertion runs inside the lemma loop and fails
        whenever the word has a ``root`` successor, i.e. exactly when the
        lemma/root edge was just added; its message ("neither root nor lemma")
        does not describe that condition either. It was presumably meant for
        words without any lemma (see the FIXME). Behaviour is kept unchanged
        until the intended invariant is confirmed.
        """
        for word_id in self.get_nodes_by_type("word"):
            lemma_ids = self.node_manager.get_related_nodes(word_id, successor_type="lemma")
            # assert len(lemma_ids) <= 1, f"Word {word_id} has multiple lemmas: {lemma_ids}"
            for lemma_id in lemma_ids:
                _, lemma = lemma_id.split(":")

                # Add translation edge between word and lemma
                self.edge_manager.add_bidirectional_knowledge_edge(
                    f"{word_id}:translation",
                    f"{lemma_id}:translation",
                    Distribution.auto(weight=len(lemma))
                )

                # attempt to look for root
                root_ids = self.node_manager.get_related_nodes(word_id, successor_type="root")
                assert len(root_ids) <= 1, f"Word {word_id} has multiple roots: {root_ids}"
                if root_ids:
                    # Add translation edge between lemma and root
                    self.edge_manager.add_bidirectional_knowledge_edge(
                        f"{lemma_id}:translation",
                        f"{root_ids[0]}:meaning"
                    )

                # FIXME: can we have a root but no lemma?
                assert not root_ids, f"Word {word_id} has neither root nor lemma"


# Example usage in notebook: pick ONE dependency graph to load.
# The commented alternatives are other input files.
# graph = nx.read_graphml("quran_dependency_fatiha.graphml")
# graph = nx.read_graphml("quran_dependency_medium.graphml")
# graph = nx.read_graphml("quran_dependency_big.graphml")
graph = nx.read_graphml("quran_dependency_full.graphml")
# `exp` keeps a reference to `graph` (as exp.G); the setup calls below add edges to it.
exp = KnowledgeExperiments(graph, quran)

# Try different configurations (comment/uncomment to toggle a configuration)
exp.setup_standard_memorization()
exp.setup_tajweed_learning()
# exp.setup_translation_understanding()
exp.setup_deep_understanding()
exp.setup_grammar_nodes()

# # Test experimental hypotheses
# exp.setup_experimental_learning()

# Compile must run after the setup_* calls; exp.save() refuses to write before it.
# Save different configurations
# nx.write_graphml(graph, "quran_knowledge_medium.graphml")
# nx.write_graphml(graph, "quran_knowledge_fatiha.graphml")
exp.compile()
# exp.save("quran_knowledge_fatiha.graphml")
# exp.save("quran_knowledge_medium2.graphml")
# exp.save("quran_knowledge_big.graphml")

In [None]:
from iqrah.graph.identifiers import NodeIdentifierParser, NodeType
import networkx as nx
import numpy as np
from tqdm.notebook import tqdm

# ---------------------------
# Personalization by node type
# ---------------------------
# Relative (un-normalized) weight per node type. create_personalized_nstart
# looks node types up here, uses 1.0 for types that are missing, and then
# normalizes all weights so they sum to 1.
NODE_TYPE_WEIGHTS = {
    NodeType.ROOT: 3.0,
    NodeType.LEMMA: 2.5,
    NodeType.CHAPTER: 2.0,
    NodeType.VERSE: 1.5,
    NodeType.WORD: 1.0,
    NodeType.WORD_INSTANCE: 0.5,
}

def create_personalized_nstart(graph: nx.DiGraph) -> dict[str, float]:
    """Build a normalized per-node weight vector from ``NODE_TYPE_WEIGHTS``.

    Each node id is parsed with ``NodeIdentifierParser.parse``; the node's
    weight is the table entry for its type. Ids that fail to parse, and types
    missing from the table, get weight 1.0. Negative or non-finite weights are
    treated as 0.

    Returns:
        ``{node_id: weight}`` summing to 1, a uniform vector if every weight is
        zero, or ``{}`` for a graph without nodes.
    """
    if not graph.nodes():
        return {}

    raw_weights: dict[str, float] = {}
    mass = 0.0
    progress = tqdm(graph.nodes(), desc="Calculating nstart weights", leave=False)
    for node_id in progress:
        try:
            kind, _ = NodeIdentifierParser.parse(node_id)
            weight = NODE_TYPE_WEIGHTS.get(kind, 1.0)
        except Exception:
            # Best effort: anything that cannot be classified counts as neutral.
            weight = 1.0
        if not (weight >= 0 and np.isfinite(weight)):
            weight = 0.0
        raw_weights[node_id] = weight
        mass += weight

    if mass == 0.0:
        share = 1.0 / max(1, len(raw_weights))
        return dict.fromkeys(raw_weights, share)
    return {node_id: weight / mass for node_id, weight in raw_weights.items()}

# ---------------------------
# Edge weight expectation
# ---------------------------
def _expected_edge_weight(data: dict) -> float:
    """Return the analytic expectation of an edge's weight as a non-negative float.

    Handling per ``data["dist"]``:
    - ``"normal"``: mean ``m`` (default 0.0), clipped to [0, 1]
    - ``"beta"``: ``a / (a + b)`` (defaults 1.0 each), 0.0 when ``a + b <= 0``
    - ``"auto"`` / ``"constant"``: ``weight`` (default 1.0), clipped to [0, 1]
      unless ``probability_like`` is falsy, in which case only negatives are
      raised to 0
    - anything else (including a missing ``dist``): 1.0

    Non-finite or negative results collapse to 0.0.
    """
    kind = data.get("dist")
    if kind not in ("normal", "beta", "auto", "constant"):
        return 1.0

    if kind == "normal":
        # The mean is treated as a probability.
        expected = np.clip(float(data.get("m", 0.0)), 0.0, 1.0)
    elif kind == "beta":
        alpha = float(data.get("a", 1.0))
        beta = float(data.get("b", 1.0))
        total = alpha + beta
        expected = alpha / total if total > 0 else 0.0
    else:
        raw = float(data.get("weight", 1.0))
        if data.get("probability_like", True):
            expected = np.clip(raw, 0.0, 1.0)
        else:
            expected = max(0.0, raw)

    usable = np.isfinite(expected) and expected >= 0
    return float(expected) if usable else 0.0

def _normalize_dist(vec: dict[str, float] | None, universe: list[str]) -> dict[str, float]:
    """Ensure nonnegative and sum to 1 over the universe; fallback to uniform."""
    out = {}
    total = 0.0
    for n in universe:
        v = float(vec.get(n, 0.0)) if vec is not None else 0.0
        if not np.isfinite(v) or v < 0:
            v = 0.0
        out[n] = v
        total += v
    if total == 0.0:
        u = 1.0 / max(1, len(universe))
        return {n: u for n in universe}
    return {n: v / total for n, v in out.items()}

# ---------------------------
# Log01 normalization util
# ---------------------------
def _log01_array(arr: np.ndarray, scale: float | None = None) -> np.ndarray:
    """
    Clip to >=0, apply log1p(arr*scale), then min-max to [0,1].
    scale defaults to 1/median to avoid collapsing the bulk.
    """
    arr = np.clip(arr, 0.0, None)
    if arr.size == 0:
        return arr

    if scale is None:
        positives = arr[arr > 0]
        med = np.median(positives) if positives.size else 0.0
        scale = (1.0 / med) if med > 0 else 1e9

    x = np.log1p(arr * scale)
    xmin = np.min(x)
    denom = np.ptp(x)  # == x.max() - x.min()

    if not np.isfinite(xmin) or not np.isfinite(denom) or denom == 0:
        return np.zeros_like(arr)

    return (x - xmin) / denom

# ---------------------------
# Main scoring (stores only log01 scores)
# ---------------------------
def calculate_knowledge_scores(
    G: nx.DiGraph,
    alpha: float = 0.85,
    max_iter: int = 50000,
    nstart_foundational: dict[str, float] | None = None,
    nstart_influence: dict[str, float] | None = None,
) -> None:
    """Score every node of ``G`` with two log01-normalized PageRank vectors.

    A weighted copy of ``G`` is built from the expected weight of each edge
    (edges with a non-positive expectation are dropped). PageRank is run on
    that copy for the foundational score and on its reverse for the influence
    score. The personalization vector of each run is also used as its dangling
    distribution.

    Only two node attributes are written, in place:
      - G.nodes[n]["foundational_score"]
      - G.nodes[n]["influence_score"]

    Args:
        G: Graph to score; its node attributes are updated.
        alpha: PageRank damping factor.
        max_iter: Iteration limit for each PageRank run.
        nstart_foundational: Optional personalization for the forward run.
        nstart_influence: Optional personalization for the reverse run.
    """
    # Collect the edges that survive with a strictly positive expected weight.
    weighted_edges = []
    edge_progress = tqdm(G.edges(data=True), desc="Building knowledge graph", leave=False)
    for src, dst, attrs in edge_progress:
        expected = _expected_edge_weight(attrs)
        if expected > 0.0:
            weighted_edges.append((src, dst, {"weight": expected}))

    knowledge_graph = nx.DiGraph()
    knowledge_graph.add_nodes_from(G.nodes())
    knowledge_graph.add_edges_from(weighted_edges)

    # One fixed node order is used for personalization and for the score arrays.
    node_order = list(knowledge_graph.nodes())

    def _rank(graph, seed):
        """Run PageRank with ``seed`` as personalization and dangling vector."""
        prior = _normalize_dist(seed, node_order)
        return nx.pagerank(
            graph,
            alpha=alpha,
            personalization=prior,
            dangling=prior,
            weight="weight",
            max_iter=max_iter,
        )

    def _normalized(rank):
        """Turn a PageRank dict into a log01 array following ``node_order``."""
        values = np.array([float(rank[node]) for node in node_order], dtype=float)
        # Clip tiny numerical negatives (shouldn't appear) before normalizing.
        return _log01_array(np.clip(values, 0.0, None))

    foundational_rank = _rank(knowledge_graph, nstart_foundational)
    influence_rank = _rank(knowledge_graph.reverse(copy=False), nstart_influence)

    foundational_scores = _normalized(foundational_rank)
    influence_scores = _normalized(influence_rank)

    # Write ONLY the final normalized scores.
    for node, f_score, i_score in zip(node_order, foundational_scores, influence_scores):
        node_attrs = G.nodes[node]
        node_attrs["foundational_score"] = float(f_score)
        node_attrs["influence_score"] = float(i_score)

# ============================
# Usage
# ============================
# 1) Get your graph
# G is the graph held by the KnowledgeExperiments instance; scores are written onto its nodes.
G = exp.G

# 2) Personalization for foundational; keep influence unbiased
#    (None makes _normalize_dist fall back to a uniform vector).
nstart_foundational = create_personalized_nstart(G)
nstart_influence = None

# 3) Compute and store ONLY the two normalized scores
#    (max_iter here overrides the function default of 50000).
calculate_knowledge_scores(
    G,
    alpha=0.85,
    max_iter=100_000,
    nstart_foundational=nstart_foundational,
    nstart_influence=nstart_influence,
)

# (Optional) Quick peek at the scores of the first five nodes
some_nodes = list(G.nodes())[:5]
for n in some_nodes:
    print(n, "found:", G.nodes[n].get("foundational_score"),
             "infl:",  G.nodes[n].get("influence_score"))


In [None]:
# Display the first 100 (node id, attribute dict) pairs of the graph.
list(G.nodes(data=True))[:100]

In [None]:
# List the nodes that have an edge into the lemma node for the Arabic letter waw.
# The id is spelled with the real Arabic character: the previous literal was
# that character's UTF-8 bytes shown through a wrong single-byte codec, which
# cannot match a node id stored as proper Unicode.
list(G.predecessors("LEMMA:و"))

In [None]:
# Print the 1000 nodes with the highest foundational score
# (use "influence_score" in the sort key to rank by influence instead).
for node_id, node_data in sorted(
    G.nodes(data=True),
    key=lambda item: item[1].get("foundational_score", 0),
    reverse=True,
)[:1000]:
    foundational = node_data.get("foundational_score")
    influence = node_data.get("influence_score")
    # A missing score is shown as "N/A". The placeholder must not go through
    # the ":.6f" format spec, which raises ValueError for a string.
    foundational_text = "N/A" if foundational is None else f"{foundational:.6f}"
    influence_text = "N/A" if influence is None else f"{influence:.6f}"
    print(
        f"Node: {node_id}, Foundational: {foundational_text}, Influence: {influence_text}"
    )

In [None]:
import pandas as pd

# Collect both scores of every node that has them into a DataFrame and show
# summary statistics.
score_columns = ("foundational_score", "influence_score")
rows = [
    {column: attrs.get(column) for column in score_columns}
    for _, attrs in G.nodes(data=True)
    if all(column in attrs for column in score_columns)
]

df = pd.DataFrame(rows)
df.describe()

In [None]:
import matplotlib.pyplot as plt

# Overlay the histograms of both score columns on one figure.
plt.figure(figsize=(8, 5))
for column, legend_label in (
    ("foundational_score", "Foundational"),
    ("influence_score", "Influence"),
):
    df[column].hist(bins=500, alpha=0.7, label=legend_label)
plt.xlabel("log01 score")
plt.ylabel("Frequency")
plt.legend()
plt.title("Distribution of log01 scores")
plt.show()

In [None]:
import cbor2
from iqrah.quran_api.models import TranslationByWord, TransliterationByWord
import zstandard as zstd
import os

def export_graph_to_cbor(G, output_path="iqrah-graph-v1.0.0.cbor.zst"):
    """Export NetworkX graph to compressed CBOR sequence with progress info"""

    print(f"üöÄ Starting export of {len(G.nodes)} nodes, {len(G.edges)} edges...")

    with open(output_path, "wb") as f:
        cctx = zstd.ZstdCompressor(level=9).stream_writer(f)
        enc = cbor2.CBOREncoder(cctx)

        # Header with metadata
        header = {
            "v": 1,
            "graph": {
                "directed": G.is_directed(),
                "multi": G.is_multigraph(),
                "node_count": len(G.nodes),
                "edge_count": len(G.edges)
            }
        }
        enc.encode(header)
        print(f"üìã Header written: {len(G.nodes)} nodes, {len(G.edges)} edges")

        # Export nodes directly
        node_count = 0
        for node_id, attrs in G.nodes(data=True):
            match attrs.get('type', 'unknown'):
                case 'word_instance':
                    word_key = NodeIdentifierParser.get_word_instance_key(node_id)
                    word : Word = quran[word_key]
                    translation : TranslationByWord = word.translation
                    transliteration : TransliterationByWord = word.transliteration
                    attrs['audio_url'] = word.audio_url
                    attrs['arabic'] = word.text_uthmani
                    attrs['translation'] = translation.text
                    attrs['transliteration'] = transliteration.text

                case 'verse':
                    verse_key = NodeIdentifierParser.get_verse_key(node_id)
                    verse : Verse = quran[verse_key]
                    attrs['arabic'] = verse.text_uthmani
                    # attrs['translation'] = verse.translation

            enc.encode({
                "t": "node",
                "id": str(node_id),
                "a": dict(attrs)  # NetworkX attributes as-is
            })
            node_count += 1

            if node_count % 1000 == 0:
                print(f"üì¶ Exported {node_count} nodes...")

        print(f"‚úÖ All {node_count} nodes exported")

        # Export edges directly
        edge_count = 0
        for u, v, attrs in G.edges(data=True):
            # if 'type' in attrs:
                # attrs =

            enc.encode({
                "t": "edge",
                "u": str(u),
                "v": str(v),
                "a": dict(attrs)  # NetworkX attributes as-is
            })
            edge_count += 1

            if edge_count % 1000 == 0:
                print(f"üîó Exported {edge_count} edges...")

        print(f"‚úÖ All {edge_count} edges exported")
        cctx.flush()

    # File size report
    file_size = os.path.getsize(output_path)
    print(f"üéØ Export complete!")
    print(f"üìÅ File: {output_path}")
    print(f"üìä Size: {file_size / 1024 / 1024:.2f} MB ({file_size:,} bytes)")


def inspect_exported_graph(file_path, sample_size=10):
    """Print the header, the leading records and summary counts of an export.

    The whole file is decoded: node-type counts and edge-attribute-key counts
    cover every record, while only the first ``sample_size`` records are
    printed. Missing files and decoding errors are reported with a printed
    message instead of an exception.

    Args:
        file_path: Path of a zstd-compressed CBOR sequence as written by
            ``export_graph_to_cbor``.
        sample_size: How many leading records to print.
    """
    if not os.path.exists(file_path):
        print(f"‚ùå File not found: {file_path}")
        return

    file_size = os.path.getsize(file_path)
    print(f"\nüîç Inspecting: {file_path}")
    print(f"üìÅ Size: {file_size / 1024 / 1024:.2f} MB")

    try:
        with open(file_path, "rb") as f:
            dctx = zstd.ZstdDecompressor().stream_reader(f)
            decoder = cbor2.CBORDecoder(dctx)

            # Read header
            header = decoder.decode()
            print(f"\nüìã Header:")
            print(f"   Version: {header['v']}")
            print(f"   Nodes: {header['graph']['node_count']:,}")
            print(f"   Edges: {header['graph']['edge_count']:,}")

            # Sample records with manual iteration
            print(f"\nüî¨ Sample Records (first {sample_size}):")

            node_types = {}
            edge_attrs = {}
            count = 0

            try:
                # The decoder signals the end of the stream by raising EOFError.
                while record := decoder.decode():
                    count += 1
                    # Honour sample_size: printing every record floods the
                    # output for a full-size graph.
                    in_sample = count <= sample_size

                    if in_sample:
                        print(f"[{count}] {record}")
                    if record["t"] == "node":
                        attrs = record["a"]
                        node_type = attrs.get("type", "unknown")
                        node_types[node_type] = node_types.get(node_type, 0) + 1

                        if in_sample and count <= 3:
                            print(f"   Node: {record['id']}")
                            print(f"         attrs: {attrs}")

                    elif record["t"] == "edge":
                        attrs = record["a"]
                        for key in attrs.keys():
                            edge_attrs[key] = edge_attrs.get(key, 0) + 1

                        if in_sample and count <= 3:
                            print(f"   Edge: {record['u']} ‚Üí {record['v']}")
                            print(f"         attrs: {attrs}")

            except EOFError:
                print(f"‚úÖ Reached end of file after {count} records")

            print(f"\nüìä Sample Statistics:")
            print(f"   Node types: {dict(sorted(node_types.items()))}")
            print(f"   Edge attribute keys: {dict(sorted(edge_attrs.items()))}")

    except Exception as e:
        # Inspection is a diagnostic aid: report the problem instead of raising.
        print(f"‚ùå Inspection failed: {e}")


def run_export(G, output_file="iqrah-graph-v1.0.1.cbor.zst", sample_size=1000):
    """Export ``G`` and then inspect the written file.

    Args:
        G: Graph to export.
        output_file: Destination path handed to ``export_graph_to_cbor``. The
            default is the file name this function previously hard-coded.
        sample_size: Sample size handed to ``inspect_exported_graph``. The
            default is the value this function previously hard-coded.

    Returns:
        The path of the exported file.
    """
    print(f"üéØ NetworkX Graph Export")
    print(f"üìä Graph: {len(G.nodes):,} nodes, {len(G.edges):,} edges")
    print(f"üîÑ Directed: {G.is_directed()}, Multi: {G.is_multigraph()}")

    # Export
    export_graph_to_cbor(G, output_file)

    # Inspect results
    print("=" * 50)
    inspect_exported_graph(output_file, sample_size)

    print(f"\nüöÄ Ready for Sprint 4!")
    return output_file

# Run it: export the knowledge graph held by `exp` and keep the output path.
exported_file = run_export(exp.G)

In [None]:
# Check if edges have distribution parameters: print the first edge that
# carries attributes, looking only at the first 10 edges (as the fallback
# message states) instead of materializing every edge of the graph.
from itertools import islice

sample_edges = list(islice(exp.G.edges(data=True), 10))
for u, v, attrs in sample_edges:
    if attrs:
        print(f"Edge {u} -> {v}: {attrs}")
        break
else:
    print("No edge attributes found in first 10 edges")

In [None]:
# quran["1:2"]
# exp.G["LEMMA:ال:translation"]
# Number of edges in the graph held by `exp`.
exp.G.number_of_edges()

In [None]:
# `graph` is the object that was passed to KnowledgeExperiments and stored as
# `exp.G`, so this counts the edges of that same graph object.
graph.number_of_edges()

In [None]:
## version 2 (not working)

import networkx as nx
import logging
import json
from typing import Dict, List, Set, Tuple, Optional, Any
from pathlib import Path
from collections import Counter
import time

from iqrah.graph.identifiers import NIG, NIP
from iqrah.graph.knowledge import Distribution, KnowledgeEdgeManager
from iqrah.graph.node_manager import NodeManager
from iqrah.quran_api.models import Quran, Chapter, Verse, Word


class KnowledgeGraphBuilder:
    """
    Builds a knowledge graph for Quranic learning with focus on:
    - Memorization relationships
    - Translation understanding
    - Grammar connections

    This is a streamlined version focused on core functionality.

    Knowledge edges connect endpoints named ``"<node_id>:<dimension>"``, where
    the dimension is one of ``memorization``, ``translation``, ``grammar`` or
    ``meaning``. Every edge added by this class is counted in
    ``self.stats["edges_created"]``.

    Typical use is ``build_all()`` followed by ``save(path)``. ``build_all``
    already calls ``compile()``, and compiling twice raises ``RuntimeError``.
    """

    def __init__(self, graph: nx.DiGraph, quran: Quran):
        """
        Initialize the knowledge graph builder.

        Args:
            graph: Base dependency graph (should already contain nodes)
            quran: Quran data model
        """
        self.G = graph
        self.quran = quran
        self.edge_manager = KnowledgeEdgeManager(graph)
        self.node_manager = NodeManager(graph)
        self._is_compiled = False

        # Running counters; only the "edges_created" key is used by this class.
        self.stats = Counter()

        logger.info(f"Initialized KnowledgeGraphBuilder with graph of {self.G.number_of_nodes()} nodes and {self.G.number_of_edges()} edges")

    def get_nodes_by_type(self, node_type: str) -> Set[str]:
        """Get all nodes of a specific type (delegates to the node manager)."""
        return self.node_manager.get_nodes_by_type(node_type)

    def get_verse_words(self, verse_id: str) -> List[str]:
        """Get all word instance IDs for a verse (delegates to the node manager)."""
        return self.node_manager.get_verse_words(verse_id)

    def get_chapter_verses(self, chapter_id: str) -> List[str]:
        """Get all verse IDs for a chapter (delegates to the node manager)."""
        return self.node_manager.get_chapter_verses(chapter_id)

    def get_word_root(self, word_id: str, cutoff: int = 3) -> Optional[str]:
        """
        Get root of a word by traversing the graph.

        Args:
            word_id: Node to start the search from
            cutoff: Maximum path length passed to ``nx.all_simple_paths``

        Returns:
            The last node of the first simple path found from ``word_id`` to
            any node of type ``"root"``, or None when no such path exists.
        """
        root_nodes = self.node_manager.get_nodes_by_type("root")
        for path in nx.all_simple_paths(self.G, word_id, root_nodes, cutoff=cutoff):
            return path[-1]  # Return first found root
        return None

    def get_all_translatable_nodes(self) -> Set[str]:
        """Get all nodes that can have translation knowledge.

        Returns:
            The union of the ``word_instance``, ``verse`` and ``chapter`` nodes.
        """
        return (self.node_manager.get_nodes_by_type("word_instance") |
                self.node_manager.get_nodes_by_type("verse") |
                self.node_manager.get_nodes_by_type("chapter"))

    def get_duplicated_verses(self) -> List[Tuple[str, List[str]]]:
        """
        Find verses with identical text for connecting.

        Verses are grouped by ``text_uthmani_simple``; verses whose text is
        None are ignored.

        Returns:
            ``(text, verse_keys)`` pairs for every text shared by more than
            one verse, ordered by descending number of verses.
        """
        verse_map: Dict[str, List[str]] = {}
        for verse_id in self.get_nodes_by_type("verse"):
            verse = self.quran[NIP.get_verse_key(verse_id)]
            text = verse.text_uthmani_simple
            if text is None:
                continue
            # Append in place rather than rebuilding the list for every verse.
            verse_map.setdefault(text, []).append(verse.verse_key)

        duplicates = [(text, keys) for text, keys in verse_map.items() if len(keys) > 1]
        duplicates.sort(key=lambda item: len(item[1]), reverse=True)
        return duplicates

    def calculate_word_complexity(self, word: Word) -> float:
        """
        Calculate linguistic complexity of a word based on basic factors.

        The score is ``0.7 * length_complexity + 0.3 * position_complexity``
        where the length part saturates at 10 letters and the position part
        is currently a constant 0.5.

        Args:
            word: Word to analyze

        Returns:
            Complexity score (0.0-1.0); 0.5 when the word has no Uthmani text
        """
        if not word.text_uthmani:
            return 0.5

        # Length complexity (longer words are harder), capped at 10 letters
        char_count = word.get_letters_count()
        length_complexity = min(1.0, char_count / 10.0)

        # Position factor: placeholder constant, the word's position in the
        # verse is not taken into account yet.
        position_complexity = 0.5

        # Combine factors
        complexity = 0.7 * length_complexity + 0.3 * position_complexity

        return min(1.0, complexity)

    def build_all(self) -> None:
        """
        Build all knowledge connections.

        Runs the memorization, translation, grammar and cross-dimension steps
        in that order, then compiles the graph and logs timing and edge count.

        Raises:
            RuntimeError: If the graph was already compiled (from ``compile``).
        """
        start_time = time.time()
        logger.info("Building knowledge graph connections")

        # Step 1: Standard memorization
        logger.info("Building memorization connections")
        self.build_memorization_connections()

        # Step 2: Translation understanding
        logger.info("Building translation connections")
        self.build_translation_connections()

        # Step 3: Grammar connections
        logger.info("Building grammar connections")
        self.build_grammar_connections()

        # Step 4: Connect learning dimensions
        logger.info("Connecting learning dimensions")
        self.connect_learning_dimensions()

        # Finalize
        self.compile()

        # Report statistics
        elapsed_time = time.time() - start_time
        logger.info(f"Knowledge graph built in {elapsed_time:.2f}s")
        logger.info(f"Created {self.stats['edges_created']} knowledge edges")

    def build_memorization_connections(self) -> None:
        """
        Build memorization connections:
        - Words to verses
        - Verses to chapters
        - Context windows for words
        """
        edges_before = self.stats["edges_created"]

        # Process all chapters
        for chapter_id in self.get_nodes_by_type("chapter"):
            chapter = self.quran[NIP.get_chapter_key(chapter_id)]

            # Process verses
            for verse in chapter.verses:
                verse_id = NIG.for_verse(verse)

                # Verse to chapter memorization edges
                self.edge_manager.add_knowledge_edge(
                    f"{verse_id}:memorization",
                    f"{chapter_id}:memorization",
                    Distribution.auto(weight=verse.get_letters_count())
                )
                self.stats["edges_created"] += 1

                # Word specific
                word_nodes = []
                for word in verse.words:
                    if word.char_type_name == "end":  # Skip end markers
                        continue

                    word_id = NIG.for_word_instance(word, verse)
                    word_nodes.append(word_id)

                    # Word to verse memorization edges
                    self.edge_manager.add_knowledge_edge(
                        f"{word_id}:memorization",
                        f"{verse_id}:memorization",
                        Distribution.auto(weight=word.get_letters_count())
                    )
                    self.stats["edges_created"] += 1

                # Contextual memorization for words
                if word_nodes:
                    # assumes add_gaussian_window_edges returns the number of
                    # edges it added -- TODO confirm
                    window_edges = self.edge_manager.add_gaussian_window_edges(
                        [f"{w}:memorization" for w in word_nodes],
                        window_size=min(3, len(word_nodes)),
                        base_weight=0.5,
                        std_scale=0.15
                    )
                    self.stats["edges_created"] += window_edges

        edges_created = self.stats["edges_created"] - edges_before
        logger.info(f"Created {edges_created} memorization edges")

    def build_translation_connections(self) -> None:
        """
        Build translation understanding connections:
        - Words to verses
        - Verses to chapters
        - Word instances to word types
        - Duplicate verse connections
        """
        edges_before = self.stats["edges_created"]

        # Process chapters
        for chapter_id in self.get_nodes_by_type("chapter"):
            chapter = self.quran[NIP.get_chapter_key(chapter_id)]

            # Process verses
            for verse in chapter.verses:
                verse_id = NIG.for_verse(verse)

                # Verse to chapter translation
                self.edge_manager.add_knowledge_edge(
                    f"{verse_id}:translation",
                    f"{chapter_id}:translation",
                    Distribution.auto(weight=verse.get_words_count())
                )
                self.stats["edges_created"] += 1

                # Process words
                for word in verse.words:
                    if word.char_type_name == "end":
                        continue

                    word_instance_id = NIG.for_word_instance(word, verse)
                    word_type_id = NIG.for_word(word)

                    # Word to verse translation
                    self.edge_manager.add_knowledge_edge(
                        f"{word_instance_id}:translation",
                        f"{verse_id}:translation",
                        Distribution.auto(weight=word.get_letters_count())
                    )
                    self.stats["edges_created"] += 1

                    # Word instance to word type translation
                    self.edge_manager.add_knowledge_edge(
                        f"{word_instance_id}:translation",
                        f"{word_type_id}:translation",
                        Distribution.normal(mean=0.9, std=0.1)
                    )
                    self.stats["edges_created"] += 1

        # Connect duplicate verses
        duplicates = 0
        for _text, verse_keys in self.get_duplicated_verses():
            # Connect every unordered pair of verses sharing the same text
            for i, v1 in enumerate(verse_keys):
                for v2 in verse_keys[i + 1:]:
                    # NOTE(review): v1/v2 are verse keys (strings), while the
                    # loops above call NIG.for_verse with Verse objects --
                    # confirm NIG.for_verse accepts both.
                    self.edge_manager.add_bidirectional_knowledge_edge(
                        f"{NIG.for_verse(v1)}:translation",
                        f"{NIG.for_verse(v2)}:translation",
                        Distribution.normal(mean=0.9, std=0.1)
                    )
                    self.stats["edges_created"] += 2
                    duplicates += 1

        logger.info(f"Connected {duplicates} duplicate verse pairs")

        edges_created = self.stats["edges_created"] - edges_before
        logger.info(f"Created {edges_created} translation edges")

    def build_grammar_connections(self) -> None:
        """
        Build grammar connections:
        - Words to lemmas
        - Lemmas to roots

        Each lemma is linked to its roots only once, even when several words
        share that lemma, so the edge counter is not inflated by repeats.
        """
        edges_before = self.stats["edges_created"]

        # Lemmas whose root edges were already added
        linked_lemmas: Set[str] = set()

        # Process words
        for word_id in self.get_nodes_by_type("word"):
            # Get lemmas
            lemma_ids = self.node_manager.get_related_nodes(word_id, successor_type="lemma")

            for lemma_id in lemma_ids:
                # Lemma IDs look like "<prefix>:<lemma text>"; the text length
                # is used as the edge weight.
                _, lemma = lemma_id.split(":", 1)

                # Word to lemma translation (bidirectional)
                self.edge_manager.add_bidirectional_knowledge_edge(
                    f"{word_id}:translation",
                    f"{lemma_id}:translation",
                    Distribution.auto(weight=len(lemma))
                )
                self.stats["edges_created"] += 2

                # The lemma -> root edges do not depend on the word, so add
                # them the first time the lemma is seen and skip afterwards.
                if lemma_id in linked_lemmas:
                    continue
                linked_lemmas.add(lemma_id)

                # Get roots
                root_ids = self.node_manager.get_related_nodes(lemma_id, successor_type="root")

                for root_id in root_ids:
                    # Lemma to root meaning
                    self.edge_manager.add_bidirectional_knowledge_edge(
                        f"{lemma_id}:translation",
                        f"{root_id}:meaning",
                        Distribution.beta(alpha=4, beta=1.5)
                    )
                    self.stats["edges_created"] += 2

        edges_created = self.stats["edges_created"] - edges_before
        logger.info(f"Created {edges_created} grammar edges")

    def connect_learning_dimensions(self) -> None:
        """
        Connect different learning dimensions to model how they affect each other:
        - Translation helps memorization
        - Grammar helps translation
        """
        edges_before = self.stats["edges_created"]

        # Translation helps memorization
        logger.info("Connecting translation to memorization")
        for node_id in self.get_all_translatable_nodes():
            self.edge_manager.add_knowledge_edge(
                f"{node_id}:translation",
                f"{node_id}:memorization",
                Distribution.beta(alpha=3, beta=2)
            )
            self.stats["edges_created"] += 1

        # Grammar helps translation for words
        logger.info("Connecting grammar to translation")
        for word_id in self.get_nodes_by_type("word"):
            self.edge_manager.add_knowledge_edge(
                f"{word_id}:grammar",
                f"{word_id}:translation",
                Distribution.normal(mean=0.5, std=0.15)
            )
            self.stats["edges_created"] += 1

        edges_created = self.stats["edges_created"] - edges_before
        logger.info(f"Created {edges_created} cross-dimension learning edges")

    def compile(self) -> None:
        """
        Finalize the knowledge graph by computing all pending weights
        and performing any necessary validations.

        Also records ``knowledge_edges`` and ``knowledge_compiled`` in the
        graph-level attributes.

        Raises:
            RuntimeError: If the graph has already been compiled.
            ValueError: If validation finds a knowledge edge without "dist".
        """
        if self._is_compiled:
            raise RuntimeError("Knowledge graph has already been compiled")

        logger.info("Compiling knowledge graph")

        # Compile edge weights
        self.edge_manager.compile()

        # Validate final graph state
        self._validate_compiled_graph()

        # Update graph metadata
        self.G.graph['knowledge_edges'] = self.stats["edges_created"]
        self.G.graph['knowledge_compiled'] = True

        self._is_compiled = True
        logger.info("Knowledge graph compiled successfully")

    def _validate_compiled_graph(self) -> None:
        """
        Perform final validation checks on the compiled graph.

        Raises:
            ValueError: If an edge has a ``knowledge_type`` but no ``dist``
                attribute and is not of type ``"dependency"``.
        """
        # Ensure no edges are missing weights
        for src, dst, data in self.G.edges(data=True):
            if "dist" in data:
                continue
            if data.get("type") == "dependency":
                # Dependency edges don't need weights
                continue
            if data.get("knowledge_type") is not None:
                raise ValueError(
                    f"Found edge missing weight distribution after compilation: "
                    f"{src} -> {dst} {{{data}}}"
                )

    def save(self, filename: str) -> None:
        """
        Save the compiled knowledge graph to a file.

        Writes the graph as GraphML and a companion statistics file whose
        name is ``filename`` with its suffix replaced by ``.stats.json``.

        Args:
            filename: Path to save the graph

        Raises:
            RuntimeError: If the graph has not been compiled yet.
        """
        if not self._is_compiled:
            raise RuntimeError("Cannot save uncompiled knowledge graph. Call compile() first")

        logger.info(f"Saving knowledge graph to {filename}")

        # Add timestamp (local time, naive ISO-8601 string) and final sizes
        import datetime
        self.G.graph['created_at'] = datetime.datetime.now().isoformat()
        self.G.graph['node_count'] = self.G.number_of_nodes()
        self.G.graph['edge_count'] = self.G.number_of_edges()

        # Save graph
        nx.write_graphml(self.G, filename)

        # Save stats file
        stats_file = Path(filename).with_suffix('.stats.json')
        stats = {
            'nodes': self.G.number_of_nodes(),
            'edges': self.G.number_of_edges(),
            'knowledge_edges': self.stats["edges_created"],
            'node_types': {
                node_type: len(self.get_nodes_by_type(node_type))
                for node_type in ['word', 'word_instance', 'verse', 'chapter', 'lemma', 'root']
            }
        }

        with open(stats_file, 'w', encoding='utf-8') as f:
            json.dump(stats, f, indent=2)

        logger.info(f"Knowledge graph saved. Statistics written to {stats_file}")



def build_knowledge_graph(dependency_graph_path: str, output_path: str, quran: Quran) -> None:
    """
    Load a dependency graph, add the knowledge connections and save the result.

    The GraphML file at ``dependency_graph_path`` is read, handed to a
    ``KnowledgeGraphBuilder`` together with ``quran``, built with
    ``build_all()`` and written with ``save(output_path)``.

    Args:
        dependency_graph_path: Path to the dependency graph (GraphML)
        output_path: Path to save the resulting knowledge graph
        quran: Quran data model
    """
    logger.info(f"Loading dependency graph from {dependency_graph_path}")
    dependency_graph = nx.read_graphml(dependency_graph_path)

    node_total = dependency_graph.number_of_nodes()
    edge_total = dependency_graph.number_of_edges()
    logger.info(f"Loaded graph with {node_total} nodes and {edge_total} edges")

    # Build every knowledge connection on top of the loaded graph, then persist it
    knowledge_builder = KnowledgeGraphBuilder(dependency_graph, quran)
    knowledge_builder.build_all()
    knowledge_builder.save(output_path)

    logger.info(f"Knowledge graph saved to {output_path}")


# graph = nx.read_graphml("quran_dependency_big.graphml")
# exp = KnowledgeExperiments(graph, quran)

# # Try different configurations
# exp.setup_standard_memorization()
# exp.setup_tajweed_learning()
# exp.setup_translation_understanding()
# exp.setup_deep_understanding()
# exp.setup_grammar_nodes()

# # # Test experimental hypotheses
# # exp.setup_experimental_learning()

# # Save different configurations
# # nx.write_graphml(graph, "quran_knowledge_medium.graphml")
# # nx.write_graphml(graph, "quran_knowledge_fatiha.graphml")
# exp.compile()
# # exp.save("quran_knowledge_fatiha.graphml")
# # exp.save("quran_knowledge_medium.graphml")
# exp.save("quran_knowledge_big.graphml")

# Runs when the cell executes: build the knowledge graph from
# quran_dependency_big.graphml and save it as quran_knowledge_big.graphml.
build_knowledge_graph("quran_dependency_big.graphml", "quran_knowledge_big.graphml", quran)

In [None]:


def main():
    """Main entry point: parse CLI arguments and build the knowledge graph.

    Command-line options (all required):
        --input: Path to dependency graph (GraphML)
        --output: Path to save knowledge graph
        --quran-data: Path to Quran data; only ``.pickle`` files can be
            loaded, ``.json``/``.jsonl`` raise NotImplementedError

    Any error while loading the data or building the graph is logged with
    its traceback and the process exits with status 1. Argument errors are
    reported by argparse itself (exit status 2).
    """
    # Imported locally so the function does not depend on another cell
    # having imported these modules.
    import argparse
    import sys

    parser = argparse.ArgumentParser(description="Build a knowledge graph for the Iqrah app")
    parser.add_argument("--input", required=True, help="Path to dependency graph (GraphML)")
    parser.add_argument("--output", required=True, help="Path to save knowledge graph")
    parser.add_argument("--quran-data", required=True, help="Path to Quran data (pickle or JSON)")
    args = parser.parse_args()

    try:
        # Load Quran data
        logger.info(f"Loading Quran data from {args.quran_data}")

        quran_path = Path(args.quran_data)
        if quran_path.suffix == '.pickle':
            import pickle
            # NOTE(security): unpickling can execute arbitrary code; only
            # load pickle files that come from a trusted source.
            with open(quran_path, 'rb') as f:
                quran = pickle.load(f)
        elif quran_path.suffix in ['.json', '.jsonl']:
            # Implement JSON loading based on your Quran data format
            raise NotImplementedError("JSON loading not implemented in this example")
        else:
            raise ValueError(f"Unsupported Quran data format: {quran_path.suffix}")

        # Build graph
        build_knowledge_graph(args.input, args.output, quran)

    except Exception as e:
        # logger.exception attaches the active traceback for both loguru and
        # stdlib logging; the message is passed without extra arguments so
        # braces or percent signs in the error text are never interpreted as
        # format fields.
        logger.exception(f"Error building knowledge graph: {e}")
        sys.exit(1)
