In [None]:
!pip install googletrans==3.1.0a0

Collecting googletrans==3.1.0a0
  Downloading googletrans-3.1.0a0.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting httpx==0.13.3 (from googletrans==3.1.0a0)
  Downloading httpx-0.13.3-py3-none-any.whl.metadata (25 kB)
Collecting hstspreload (from httpx==0.13.3->googletrans==3.1.0a0)
  Downloading hstspreload-2025.1.1-py3-none-any.whl.metadata (2.1 kB)
Collecting chardet==3.* (from httpx==0.13.3->googletrans==3.1.0a0)
  Downloading chardet-3.0.4-py2.py3-none-any.whl.metadata (3.2 kB)
Collecting idna==2.* (from httpx==0.13.3->googletrans==3.1.0a0)
  Downloading idna-2.10-py2.py3-none-any.whl.metadata (9.1 kB)
Collecting rfc3986<2,>=1.3 (from httpx==0.13.3->googletrans==3.1.0a0)
  Downloading rfc3986-1.5.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting httpcore==0.9.* (from httpx==0.13.3->googletrans==3.1.0a0)
  Downloading httpcore-0.9.1-py3-none-any.whl.metadata (4.6 kB)
Collecting h11<0.10,>=0.8 (from httpcore==0.9.*->httpx==0.13.3->googletrans==3.1.0a0

In [None]:
import os
import requests
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from typing import List, Dict

def limit_characters(text, limit):
    """Return the first ``limit`` characters of ``text``.

    Any value that is not a ``str`` (for example ``None`` for a missing
    field) yields an empty string instead of raising.
    """
    if not isinstance(text, str):
        return ""
    return text[:limit]

def translate_text(text, target_lang):
    """Placeholder translation hook: hands ``text`` back unchanged.

    ``target_lang`` is accepted but not used by this placeholder.
    """
    untranslated = text
    return untranslated

def get_user_language():
    """Placeholder for a user-language lookup; always returns ``"en"``."""
    placeholder_language = "en"
    return placeholder_language

class NewsRAGSystem:
    """Fetch articles from NewsAPI and rank them against a query by embedding similarity."""

    def __init__(self, news_api_key):
        """Store the NewsAPI key and load the ``all-MiniLM-L6-v2`` sentence encoder."""
        self.news_api_key = news_api_key
        self.model = SentenceTransformer('all-MiniLM-L6-v2')

    def fetch_news(self, query, language='en', top_k=5):
        """Query the NewsAPI ``/v2/everything`` endpoint.

        Args:
            query: Free-text search string.
            language: Language code forwarded to the API.
            top_k: Maximum number of articles to return.

        Returns:
            A list with at most ``top_k`` article dicts. An empty list is
            returned (after printing the reason) on a network failure, a
            non-200 status, or a response body that is not valid JSON.
        """
        # Let requests build the query string so that spaces, '&', '#', ... in
        # the search text are percent-encoded instead of corrupting the URL.
        params = {'q': query, 'language': language, 'apiKey': self.news_api_key}
        try:
            # A timeout keeps a stalled connection from hanging the notebook cell.
            response = requests.get('https://newsapi.org/v2/everything', params=params, timeout=10)
        except requests.RequestException as exc:
            print(f"Error fetching news: {exc}")
            return []

        if response.status_code != 200:
            print(f"Error fetching news: {response.status_code}")
            return []

        try:
            articles = response.json().get('articles') or []
        except ValueError as exc:  # body was not valid JSON
            print(f"Error fetching news: {exc}")
            return []
        return articles[:top_k]

    def semantic_search(self, query, articles, top_k=5):
        """Rank ``articles`` by cosine similarity between ``query`` and each description.

        Only articles with a non-empty ``description`` can be embedded, so only
        those are ranked and returned.

        Args:
            query: Search text to embed.
            articles: Article dicts as returned by ``fetch_news``.
            top_k: Maximum number of articles to return.

        Returns:
            Up to ``top_k`` article dicts, most similar first; ``[]`` when
            nothing can be ranked.
        """
        if not articles or top_k <= 0:
            return []

        # Filter once and index into the SAME filtered list below: the
        # similarity scores are positional, so indexing the unfiltered list
        # would return the wrong articles whenever a description is missing.
        described = [article for article in articles if article.get('description')]
        if not described:
            return []

        query_embedding = self.model.encode([query])[0]
        article_embeddings = self.model.encode([article['description'] for article in described])
        similarities = cosine_similarity([query_embedding], article_embeddings)[0]
        top_indices = similarities.argsort()[-top_k:][::-1]
        return [described[i] for i in top_indices]

def create_news_dataframe(articles: List[Dict], target_lang: str, char_limit: int) -> pd.DataFrame:
    """Build a one-column DataFrame of translated "title content" strings.

    Args:
        articles: Article dicts; the ``title`` and ``content`` keys are used.
        target_lang: Language code forwarded to ``translate_text``.
        char_limit: Maximum number of content characters kept per article.

    Returns:
        A DataFrame with a single ``Combined`` column, one row per article.
        An empty ``articles`` list yields an empty frame with that column.
    """
    if not articles:
        # pd.DataFrame([]) has no columns, so selecting title/content would raise.
        return pd.DataFrame(columns=["Combined"])

    # reindex guarantees both columns exist even if every article lacks one of them.
    df = pd.DataFrame(articles).reindex(columns=["title", "content"])

    # A missing title (None/NaN) would otherwise turn the whole combined string
    # into NaN and break later string processing; treat it like missing content.
    titles = df["title"].apply(lambda x: x if isinstance(x, str) else "")
    contents = df["content"].apply(lambda x: limit_characters(x, char_limit))

    print("\nTranslating articles...")
    translated_titles = titles.apply(lambda x: translate_text(x, target_lang))
    translated_contents = contents.apply(lambda x: translate_text(x, target_lang))

    return pd.DataFrame({"Combined": translated_titles + " " + translated_contents})

if __name__ == "__main__":
    # The key is read from the environment; the fallback is only a placeholder
    # and will be rejected by the API.
    NEWS_API_KEY = os.environ.get("NEWS_API_KEY", "your-newsapi-key-here")
    # X = number of articles, Q = search query, Y = characters kept per article.
    X, Q, Y = 25, "Virat Kohli News", 500
    print(f"\nFetching {X} news articles on '{Q}' with {Y} characters per article.")
    target_lang = get_user_language()
    rag_system = NewsRAGSystem(NEWS_API_KEY)
    # Request X articles (not a second hard-coded 25) so the count announced
    # above and the count actually fetched cannot drift apart.
    articles = rag_system.fetch_news(Q, top_k=X)

    if articles:
        relevant_articles = rag_system.semantic_search(Q, articles, top_k=X)
        result_df = create_news_dataframe(relevant_articles, target_lang, Y)
        print(result_df)
    else:
        # Later cells read result_df unconditionally; give them an empty frame
        # with the expected column instead of leaving the name undefined.
        result_df = pd.DataFrame(columns=["Combined"])
        print("No articles retrieved or error occurred.")

In [None]:
import re

# Function to clean text
def clean_text(text):
    """Strip API/markup noise from one article string.

    Removes backslashes, square brackets, double quotes, line breaks and
    zero-width non-joiners, the "+N chars" / "+N అక్షరాలు" truncation markers,
    ``<ul>``/``<li>`` tags (opening and closing), and the phrase "remove it"
    (case-insensitive). Whitespace is collapsed to single spaces.

    Args:
        text: The string to clean. Non-string values (e.g. NaN from a missing
            row) are treated as empty.

    Returns:
        The cleaned, stripped string.
    """
    if not isinstance(text, str):
        return ""

    text = re.sub(r"[\\\[\]\r\n\"]+", " ", text)  # Remove \, [, ], \r, \n, "
    # Normalize before the marker patterns run: the Telugu pattern expects
    # exactly one whitespace character between the number and the word.
    text = re.sub(r"\s+", " ", text.replace("\u200c", ""))
    text = re.sub(r"\+\d+\sఅక్షరాలు", "", text)  # Remove patterns like +4487 అక్షరాలు
    text = re.sub(r"\+\d+\s*chars?", "", text)  # Remove patterns like +2820 chars or +2820 char
    text = re.sub(r"</?(?:li|ul)>", "", text)  # Remove <li>, </li>, <ul> and </ul>
    text = re.sub(r"\bremove it\b", "", text, flags=re.IGNORECASE)  # Remove "remove it" case-insensitively
    # The removals above can leave runs of spaces behind; collapse them again.
    text = re.sub(r"\s+", " ", text)
    return text.strip()

# Run the cleaner over every combined title/content string
result_df["cleaned_text"] = result_df["Combined"].apply(clean_text)

# Join the cleaned rows into one string, each prefixed with its 1-based
# "[Article n]" tag
text = " ".join(
    f"[Article {number}] {row}"
    for number, row in enumerate(result_df["cleaned_text"], start=1)
)

print(text)

[Article 1] Virat Does A Kohli : How Pak Media Reacted To India Star's Knockout Punch The master of chases in one-day cricket, Virat Kohli gave the world another reminder of his greatness in white-ball cricket and never-ending hunger for runs as he overcame a patchy run of form to sco… [Article 2] 50-Over Cricket’s Midlife Crisis, With India Twist Champions Trophy, an ODI tourney, will get real buzzy only with an Indo-Pak final. Indian selectors, however, have batted for the past and interrupted the future When the last ICC Champions Trophy … [Article 3] Mohammad Amir desires to play in IPL as part of Virat Kohli’s RCB Former Pakistan pacer Mohammad Amir, who is believed to have applied for UK passport as his wife Narjis is a citizen of the country, is eyeing the Indian Premier League (IPL). While Pakistani players… [Article 4] You tried to break my legs: Rohit hails Pakistan-born net bowler after 'fiery' session With his quick yorkers directed at Rohit Sharma's toes, Pakistan-born net

In [None]:
text

"[Article 1] Virat Does A Kohli : How Pak Media Reacted To India Star's Knockout Punch The master of chases in one-day cricket, Virat Kohli gave the world another reminder of his greatness in white-ball cricket and never-ending hunger for runs as he overcame a patchy run of form to sco… [Article 2] 50-Over Cricket’s Midlife Crisis, With India Twist Champions Trophy, an ODI tourney, will get real buzzy only with an Indo-Pak final. Indian selectors, however, have batted for the past and interrupted the future When the last ICC Champions Trophy … [Article 3] Mohammad Amir desires to play in IPL as part of Virat Kohli’s RCB Former Pakistan pacer Mohammad Amir, who is believed to have applied for UK passport as his wife Narjis is a citizen of the country, is eyeing the Indian Premier League (IPL). While Pakistani players… [Article 4] You tried to break my legs: Rohit hails Pakistan-born net bowler after 'fiery' session With his quick yorkers directed at Rohit Sharma's toes, Pakistan-born ne

In [None]:
from google.colab import files

# Write the concatenated article text to output.txt (UTF-8) in the current
# working directory.
with open("output.txt", "w", encoding="utf-8") as file:
    file.write(text)

# Hand the file to the Colab files helper so the browser downloads it.
files.download("output.txt")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
!pip install --upgrade "httpx<0.25"
!pip install --upgrade groq

Collecting httpx<0.25
  Downloading httpx-0.24.1-py3-none-any.whl.metadata (7.4 kB)
Collecting httpcore<0.18.0,>=0.15.0 (from httpx<0.25)
  Downloading httpcore-0.17.3-py3-none-any.whl.metadata (18 kB)
Collecting h11<0.15,>=0.13 (from httpcore<0.18.0,>=0.15.0->httpx<0.25)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Downloading httpx-0.24.1-py3-none-any.whl (75 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.4/75.4 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading httpcore-0.17.3-py3-none-any.whl (74 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m74.5/74.5 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading h11-0.14.0-py3-none-any.whl (58 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: h11, httpcore, httpx
  Attempting uninstall: h11
    Found existing installation: h11 0.9.0
    Uninstalling h11-

Collecting groq
  Downloading groq-0.19.0-py3-none-any.whl.metadata (15 kB)
Downloading groq-0.19.0-py3-none-any.whl (122 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/122.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━[0m [32m112.6/122.2 kB[0m [31m4.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.2/122.2 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: groq
Successfully installed groq-0.19.0


In [None]:
!pip install networkx
!pip install python-igraph
!pip install leidenalg
!pip install pyvis
!pip install ipython
!pip install groq
!pip install sentence-transformers
!pip install fuzzywuzzy
!pip install python-Levenshtein
!pip install langdetect

Collecting python-igraph
  Downloading python_igraph-0.11.8-py3-none-any.whl.metadata (2.8 kB)
Collecting igraph==0.11.8 (from python-igraph)
  Downloading igraph-0.11.8-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting texttable>=1.6.2 (from igraph==0.11.8->python-igraph)
  Downloading texttable-1.7.0-py2.py3-none-any.whl.metadata (9.8 kB)
Downloading python_igraph-0.11.8-py3-none-any.whl (9.1 kB)
Downloading igraph-0.11.8-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m27.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading texttable-1.7.0-py2.py3-none-any.whl (10 kB)
Installing collected packages: texttable, igraph, python-igraph
Successfully installed igraph-0.11.8 python-igraph-0.11.8 texttable-1.7.0
Collecting leidenalg
  Downloading leidenalg-0.10.2-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Downloading leidenalg-0.10

In [None]:
# Read the article text back into `text` from an absolute path.
# NOTE(review): presumably this is the output.txt written by the export cell,
# which uses a relative path — that only matches when the working directory
# is /content; confirm.
with open("/content/output.txt", "r", encoding="utf-8") as file:
    text = file.read()

In [None]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import networkx as nx
from pyvis.network import Network
from collections import Counter
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from langdetect import detect
import numpy as np
import webbrowser
from groq import Groq
from fuzzywuzzy import fuzz

class LSTMHybridMultilingualModel(nn.Module):
    """LSTM encoder conditioned on a learned per-language embedding.

    Each time step of the input is concatenated with the embedding of the
    sample's language before entering the LSTM. Two linear heads read the
    final hidden state: ``fc`` (vector output) and ``value_head`` (scalar).
    """

    def __init__(self, input_size, hidden_size, output_size, num_languages, lang_emb_size):
        """
        Args:
            input_size: Feature size of each input time step.
            hidden_size: LSTM hidden size.
            output_size: Size of the vector produced by ``fc``.
            num_languages: Number of rows in the language embedding table.
            lang_emb_size: Size of each language embedding.
        """
        super().__init__()
        self.lang_embedding = nn.Embedding(num_languages, lang_emb_size)
        self.lstm = nn.LSTM(input_size + lang_emb_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        self.value_head = nn.Linear(hidden_size, 1)

    def forward(self, x, lengths, lang_index):
        """
        Args:
            x: Padded batch of shape ``(batch, seq_len, input_size)``.
            lengths: Valid length of each sequence (tensor or list).
            lang_index: Long tensor of shape ``(batch,)`` with language ids.

        Returns:
            ``(output, value)`` with shapes ``(batch, output_size)`` and
            ``(batch, 1)``.
        """
        lang_emb = self.lang_embedding(lang_index)
        _, seq_len, _ = x.size()
        # expand() broadcasts the language vector over time without copying.
        lang_emb_expanded = lang_emb.unsqueeze(1).expand(-1, seq_len, -1)
        x_cat = torch.cat([x, lang_emb_expanded], dim=2)

        # pack_padded_sequence needs the lengths on the CPU.
        if torch.is_tensor(lengths):
            lengths = lengths.cpu()
        packed = pack_padded_sequence(x_cat, lengths, batch_first=True, enforce_sorted=False)

        # Use the final hidden state rather than the last padded time step:
        # for sequences shorter than the batch maximum, the last padded step is
        # padding (zeros), not the state after the sequence's last real token.
        # h_n is returned in the original batch order.
        _, (h_n, _) = self.lstm(packed)
        final_out = h_n[-1]

        output = self.fc(final_out)
        value = self.value_head(final_out)
        return output, value

class GraphRAGPipeline:
    def __init__(self, api_key):
        """Set up the Groq client, the LSTM scorer with its optimizer and loss,
        the multilingual sentence encoder, and empty language / RL bookkeeping.

        Also creates the ``graph_visualization`` directory if it does not exist.
        """
        # LLM client and model used for entity extraction
        self.client = Groq(api_key=api_key)
        self.model_name = "llama-3.3-70b-versatile"
        self.visualization_path = "graph_visualization"

        # Path-scoring network, its optimizer and the value-head loss.
        # NOTE(review): 512 presumably matches the width of the sentence
        # encoder loaded below — confirm.
        lstm_config = dict(
            input_size=512,
            hidden_size=128,
            output_size=512,
            num_languages=10,
            lang_emb_size=50,
        )
        self.lstm_model = LSTMHybridMultilingualModel(**lstm_config)
        self.optimizer = optim.Adam(self.lstm_model.parameters(), lr=1e-4)
        self.rl_criterion = nn.SmoothL1Loss()

        # Output directory for the rendered graph
        if not os.path.exists(self.visualization_path):
            os.makedirs(self.visualization_path)

        # Sentence encoder used for queries, paths and candidates
        self.multilingual_model = SentenceTransformer('distiluse-base-multilingual-cased-v2')

        # Language-index registry
        self.language_map = {}
        self.next_lang_index = 0
        self.current_lang_index = None

        # RL experience buffer and mode flag
        self.rl_memory = []
        self.training = True

    def train_mode(self):
        """Set the pipeline to training mode."""
        self.training = True

    def eval_mode(self):
        """Set the pipeline to evaluation mode."""
        self.training = False

    def get_language_index(self, lang):
        if lang in self.language_map:
            return self.language_map[lang]
        else:
            index = self.next_lang_index
            self.language_map[lang] = index
            self.next_lang_index += 1
            return index

    def _detect_language(self, text):
        try:
            lang = detect(text)
            print(f"[DEBUG] Detected language: {lang}")
            return lang
        except Exception as e:
            print(f"[ERROR] Language detection failed: {e}")
            return 'en'

    def _split_text(self, text, chunk_size=600, overlap=100):
        chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size - overlap)]
        print(f"[DEBUG] Split text into {len(chunks)} chunks.")
        return chunks

    def _extract_elements(self, chunks):
        elements = []
        for i, chunk in enumerate(chunks):
            try:
                response = self.client.chat.completions.create(
                    model=self.model_name,
                    messages=[
                        {"role": "system", "content": "Extract key entities and relationships:"},
                        {"role": "user", "content": chunk},
                    ]
                )
                print(f"[DEBUG] API response for chunk {i + 1}: {response.choices[0].message.content}")
                elements.append(response.choices[0].message.content)
            except Exception as e:
                print(f"[ERROR] Extraction error for chunk {i + 1}: {e}")
        return elements

    def _build_graph(self, elements):
        """Build a weighted co-occurrence graph from extracted element texts.

        Every non-blank line of an element becomes a node. All pairs of lines
        within the same element are linked, and the edge weight counts how
        often a pair co-occurred. Node ``size`` grows with how often the line
        was seen.

        Args:
            elements: Iterable of newline-separated strings.

        Returns:
            A ``networkx.Graph`` with ``weight``/``title`` on edges and
            ``size``/``title`` on nodes.
        """
        graph = nx.Graph()
        pair_counts = {}
        mention_counts = {}

        for element in elements:
            try:
                names = [line.strip() for line in element.split('\n') if line.strip()]

                # Count mentions and register unseen nodes
                for name in names:
                    mention_counts[name] = mention_counts.get(name, 0) + 1
                    if name not in graph:
                        graph.add_node(name)

                # Count every unordered pair within this element
                for left_pos, left in enumerate(names[:-1]):
                    for right in names[left_pos + 1:]:
                        pair = tuple(sorted([left, right]))
                        pair_counts[pair] = pair_counts.get(pair, 0) + 1
            except Exception as err:
                print(f"[ERROR] Graph building error: {err}")

        for (first, second), weight in pair_counts.items():
            graph.add_edge(first, second, weight=weight, title=f"Weight: {weight}")

        for name, freq in mention_counts.items():
            node_attrs = graph.nodes[name]
            node_attrs['size'] = 10 + (freq * 5)
            node_attrs['title'] = f"Frequency: {freq}"

        print(f"[DEBUG] Graph built with {len(graph.nodes)} nodes and {len(graph.edges)} edges.")
        return graph

    def _detect_communities_leiden(self, nx_graph):
        try:
            import igraph as ig
            import leidenalg as la

            print(f"[DEBUG] Starting Leiden community detection on graph with {nx_graph.number_of_nodes()} nodes and {nx_graph.number_of_edges()} edges.")

            ig_graph = ig.Graph.from_networkx(nx_graph)
            print(f"[DEBUG] Converted NetworkX graph to iGraph. Verifying: {ig_graph.vcount()} nodes, {ig_graph.ecount()} edges.")

            weights = nx_graph.edges(data='weight', default=1.0)
            weight_list = [w for _, _, w in weights]
            print(f"[DEBUG] Edge weight statistics: min={min(weight_list):.2f}, max={max(weight_list):.2f}, avg={sum(weight_list)/len(weight_list):.2f}")

            best_partition = None
            best_quality = float('-inf')
            resolutions = [0.1, 0.3, 0.5, 0.7, 1.0, 1.3, 1.5, 1.7, 2.0]

            print("[DEBUG] Starting resolution parameter sweep:")
            for resolution in resolutions:
                partition = la.find_partition(
                    ig_graph,
                    la.RBConfigurationVertexPartition,
                    weights=weight_list,
                    resolution_parameter=resolution,
                    n_iterations=200
                )
                quality = partition.quality()
                num_communities = len(partition)
                avg_community_size = ig_graph.vcount() / num_communities
                print(f"[DEBUG] Resolution: {resolution:.1f}, Quality: {quality:.4f}, Communities: {num_communities}, Avg Size: {avg_community_size:.2f}")

                if quality > best_quality:
                    best_quality = quality
                    best_partition = partition
                    print(f"[DEBUG] New best partition found at resolution {resolution:.1f}")

            print(f"[DEBUG] Best partition: Resolution={resolutions[resolutions.index(best_partition.resolution_parameter)]:.1f}, Quality={best_quality:.4f}")

            communities = [
                set(ig_graph.vs[node]["_nx_name"] for node in community)
                for community in best_partition
            ]

            community_sizes = [len(comm) for comm in communities]
            print(f"[DEBUG] Community size statistics: min={min(community_sizes)}, max={max(community_sizes)}, avg={sum(community_sizes)/len(community_sizes):.2f}")

            print(f"[DEBUG] Found {len(communities)} communities using optimized Leiden algorithm.")

            # Modularity calculation
            modularity = best_partition.modularity
            print(f"[DEBUG] Modularity of best partition: {modularity:.4f}")

            return communities

        except Exception as e:
            print(f"[ERROR] Leiden community detection error: {e}")
            print("[DEBUG] Falling back to Louvain method.")
            import community as community_louvain

            partition = community_louvain.best_partition(nx_graph)
            print(f"[DEBUG] Louvain method found {len(set(partition.values()))} communities.")

            communities = [
                set(node for node, part in partition.items() if part == i)
                for i in set(partition.values())
            ]

            community_sizes = [len(comm) for comm in communities]
            print(f"[DEBUG] Louvain community size statistics: min={min(community_sizes)}, max={max(community_sizes)}, avg={sum(community_sizes)/len(community_sizes):.2f}")

            # Modularity calculation for Louvain
            modularity = community_louvain.modularity(partition, nx_graph)
            print(f"[DEBUG] Modularity of Louvain partition: {modularity:.4f}")

            return communities

    def _visualize_graph(self, graph, communities=None):
        """Render ``graph`` to ``<visualization_path>/graph.html`` with pyvis.

        When ``communities`` is given, nodes are coloured by community (palette
        reused cyclically, "gray" for nodes in no community). Edge width grows
        with the edge weight. The saved file is then opened via ``webbrowser``.
        Any exception is printed rather than raised.

        Args:
            graph: Graph whose nodes may carry ``size``/``title`` and whose
                edges may carry ``weight``.
            communities: Optional iterable of node collections.
        """
        try:
            net = Network(height="750px", width="100%", bgcolor="#ffffff", font_color="black")
            net.force_atlas_2based()

            # Resolve a colour per node only when communities were supplied
            node_colors = {}
            if communities:
                color_palette = [
                    "#e41a1c", "#377eb8", "#4daf4a", "#984ea3", "#ff7f00",
                    "#ffff33", "#a65628", "#f781bf", "#999999"
                ]
                for position, members in enumerate(communities):
                    community_color = color_palette[position % len(color_palette)]
                    for member in members:
                        node_colors[member] = community_color

            for node, attrs in graph.nodes(data=True):
                node_options = {
                    "size": attrs.get('size', 25),
                    "title": f"{node}\n{attrs.get('title', '')}",
                }
                if communities:
                    node_options["color"] = node_colors.get(node, "gray")
                net.add_node(node, **node_options)

            for source, target, attrs in graph.edges(data=True):
                weight = attrs.get('weight', 1.0)
                net.add_edge(source, target, width=1 + (weight * 2), title=f"Weight: {weight}")

            net.set_options("""
            {
                "physics": {
                    "forceAtlas2Based": {
                        "gravitationalConstant": -100,
                        "centralGravity": 0.01,
                        "springLength": 100,
                        "springConstant": 0.08
                    },
                    "maxVelocity": 50,
                    "solver": "forceAtlas2Based",
                    "timestep": 0.35,
                    "stabilization": {"iterations": 150}
                },
                "interaction": {
                    "hover": true,
                    "navigationButtons": true,
                    "dragNodes": true
                }
            }
            """)

            output_path = os.path.join(self.visualization_path, "graph.html")
            net.save_graph(output_path)
            absolute_path = os.path.abspath(output_path)
            webbrowser.open(f'file://{absolute_path}')
            print(f"[INFO] Graph visualization saved to {output_path}")
        except Exception as err:
            print(f"[ERROR] Visualization error: {err}")

    def _rl_update(self):
        if len(self.rl_memory) == 0:
            print("[DEBUG] RL memory is empty. Skipping update.")
            return

        # Unpack experiences
        states, actions, rewards, next_states, values = zip(*self.rl_memory)

        print(f"[DEBUG] Updating RL model with {len(self.rl_memory)} experiences.")

        # Convert to tensors
        rewards_tensor = torch.stack(rewards)
        values_tensor = torch.stack(values).squeeze(-1)
        actions_tensor = torch.stack(actions)

        print(f"[DEBUG] Rewards tensor shape: {rewards_tensor.shape}")
        print(f"[DEBUG] Values tensor shape: {values_tensor.shape}")
        print(f"[DEBUG] Actions tensor shape: {actions_tensor.shape}")

        # Calculate advantages using simple reward-value difference
        with torch.no_grad():
            advantages = rewards_tensor - values_tensor

        print(f"[DEBUG] Advantages mean: {advantages.mean().item():.4f}, std: {advantages.std().item():.4f}")

        # Normalize advantages
        advantages_normalized = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

        print(f"[DEBUG] Normalized advantages mean: {advantages_normalized.mean().item():.4f}, std: {advantages_normalized.std().item():.4f}")

        # Calculate losses
        policy_loss = -(actions_tensor * advantages_normalized).mean()
        value_loss = self.rl_criterion(values_tensor, rewards_tensor.detach())
        total_loss = policy_loss + 0.5 * value_loss

        print(f"[RL DEBUG] Policy Loss: {policy_loss.item():.4f}, Value Loss: {value_loss.item():.4f}, Total Loss: {total_loss.item():.4f}")

        # Backpropagation
        self.optimizer.zero_grad()
        total_loss.backward()

        # Gradient norms before clipping
        grad_norms_before = [p.grad.norm().item() for p in self.lstm_model.parameters() if p.grad is not None]
        print(f"[DEBUG] Gradient norms before clipping: mean={np.mean(grad_norms_before):.4f}, max={max(grad_norms_before):.4f}")

        torch.nn.utils.clip_grad_norm_(self.lstm_model.parameters(), 0.5)

        # Gradient norms after clipping
        grad_norms_after = [p.grad.norm().item() for p in self.lstm_model.parameters() if p.grad is not None]
        print(f"[DEBUG] Gradient norms after clipping: mean={np.mean(grad_norms_after):.4f}, max={max(grad_norms_after):.4f}")

        self.optimizer.step()

        # Parameter update magnitudes
        with torch.no_grad():
            param_update_norms = [(p.data - p.old_data).norm().item() for p in self.lstm_model.parameters() if hasattr(p, 'old_data')]
            print(f"[DEBUG] Parameter update norms: mean={np.mean(param_update_norms):.4f}, max={max(param_update_norms):.4f}")

        # Store current parameters for next update comparison
        for p in self.lstm_model.parameters():
            p.old_data = p.data.clone()

        # Clear memory
        self.rl_memory = []
        print("[DEBUG] RL memory cleared after update.")
    def _multi_hop_reasoning(self, graph, query):
        candidates = []
        print(f"[DEBUG] Starting multi-hop reasoning for query: {query}")

        # Step 1: Identify Seed Nodes
        seed_nodes = [
            node for node in graph.nodes()
            if fuzz.partial_ratio(query.lower(), node.lower()) > 70
        ]
        if not seed_nodes:
            degrees = dict(graph.degree())
            seed_nodes = sorted(degrees, key=degrees.get, reverse=True)[:3] if degrees else []
        seed_nodes = list(set(seed_nodes))
        print(f"[DEBUG] Identified seed nodes: {seed_nodes}")

        # Step 2: Query Embedding
        query_embedding = torch.tensor(
            self.multilingual_model.encode([query])[0],
            dtype=torch.float32
        ).unsqueeze(0).reshape(1, -1)
        print(f"[DEBUG] Query embedding shape: {query_embedding.shape}")

        # Step 3: Process each seed node
        for seed_node in seed_nodes:
            print(f"\n[DEBUG] Processing seed node: {seed_node}")
            try:
                # Personalized PageRank
                personalized = {n: 1 if n == seed_node else 0 for n in graph.nodes()}
                ppr = nx.pagerank(graph, personalization=personalized, alpha=0.9)
                top_nodes = sorted(ppr, key=ppr.get, reverse=True)[:15]
                top_nodes = [
                    node for node in top_nodes
                    if not any(ignore_word in node.lower() for ignore_word in ["entities", "relationships", "extracted"])
                ]
                print(f"[DEBUG] Top nodes from PageRank: {top_nodes}")

                # Step 4: Extract Paths
                paths = [
                    nx.shortest_path(graph, seed_node, target)
                    for target in top_nodes
                    if target != seed_node and nx.has_path(graph, seed_node, target)
                ]
                print(f"[DEBUG] Number of paths: {len(paths)}")
                print(f"[DEBUG] Sample paths: {paths[:3]}")

                path_targets = [path[-1] for path in paths]

                # Step 5: Path Embeddings
                path_embeddings = np.array([
                    np.mean(self.multilingual_model.encode(path), axis=0)
                    for path in paths
                ])
                path_tensor = torch.tensor(path_embeddings, dtype=torch.float32)
                print(f"[DEBUG] Path embeddings shape: {path_tensor.shape}")

                # Step 6: LSTM Processing
                batch_size = path_tensor.size(0)
                lstm_input = path_tensor.unsqueeze(1)
                path_lengths = torch.full((batch_size,), 1, dtype=torch.long)
                lang_index = torch.full((batch_size,), self.current_lang_index, dtype=torch.long)
                outputs, values = self.lstm_model(lstm_input, path_lengths, lang_index)

                # Cosine Similarity Debug
                expanded_query = query_embedding.expand(outputs.size(0), -1)
                cosine_similarities = F.cosine_similarity(outputs, expanded_query)
                print("[DEBUG] Cosine similarities for paths:")
                for i, (path, similarity) in enumerate(zip(paths, cosine_similarities)):
                    print(f"Path {i}: {' -> '.join(path)} | Similarity: {similarity.item():.4f}")

                # PageRank scores
                ppr_scores = torch.tensor(
                    [ppr.get(target, 0) for target in path_targets],
                    dtype=torch.float32
                )
                print("[DEBUG] PageRank scores for path targets:")
                for i, (target, score) in enumerate(zip(path_targets, ppr_scores)):
                    print(f"Target {i}: {target} | PageRank Score: {score.item():.4f}")

                # Combined scores
                scores = 0.7 * cosine_similarities + 0.3 * ppr_scores
                print("[DEBUG] Combined scores (0.7 * Cosine + 0.3 * PageRank):")
                for i, score in enumerate(scores):
                    print(f"Path {i}: {score.item():.4f}")

                probs = F.softmax(scores / 0.2, dim=0)
                log_probs = torch.log(probs + 1e-9)
                rewards = scores.detach()

                # RL Debug
                value_loss = F.mse_loss(values.squeeze(-1), scores).item()
                policy_loss = log_probs.mean().item()
                print(f"[RL DEBUG] Policy Loss: {policy_loss:.6f}, Value Loss: {value_loss:.6f}")

                # Step 7: Store in RL Memory
                for inp, lp, rw, val in zip(lstm_input, log_probs, rewards, values):
                    self.rl_memory.append((inp, lp, rw, None, val))
                if len(self.rl_memory) >= 32:
                    print("[DEBUG] RL Memory reached threshold, triggering update.")
                    self._rl_update()

                # Step 8: Candidate Selection
                num_candidates = min(3, scores.shape[0])
                if num_candidates > 0:
                    selected = (
                        torch.multinomial(probs, num_candidates, replacement=False)
                        if np.random.rand() < 0.3
                        else torch.topk(scores, num_candidates).indices
                    )
                    selected = selected.tolist() if isinstance(selected, torch.Tensor) else selected
                    top_paths = [paths[i] for i in selected]
                    print("[DEBUG] Selected paths:")
                    for i, path in enumerate(top_paths):
                        print(f"Selected Path {i}: {' -> '.join(path)}")
                    candidates.extend([node for path in top_paths for node in path])
                else:
                    print("[WARNING] No valid candidates available, skipping selection.")

                # Step 9: Verification Score Improvement
                verification_scores = sorted(
                    [(path_targets[i], float(scores[i].item())) for i in range(len(path_targets))],
                    key=lambda x: -x[1]
                )
                print(f"[DEBUG] Top 5 Verification scores: {verification_scores[:5]}")

            except Exception as e:
                print(f"[ERROR] Reasoning error: {e}")
                import traceback
                traceback.print_exc()

        # Step 10: Sort candidates by frequency before returning
        candidate_counts = Counter(candidates)
        sorted_candidates = [k for k, v in sorted(candidate_counts.items(), key=lambda x: -x[1])]
        print(f"[DEBUG] Final sorted candidates: {sorted_candidates[:10]}")
        return sorted_candidates

    def _hallucination_detection(self, candidates, original_text):
        """Keep the candidates that are most semantically similar to the source text.

        The original text and every candidate (converted with ``str``) are
        embedded with ``self.multilingual_model``.  A candidate is retained when
        its cosine similarity to the text is at or above the 75th percentile of
        all candidate similarities.  Diagnostic statistics are printed on the way.

        Args:
            candidates: Sequence of candidate items to verify.
            original_text: Text the candidates are compared against.

        Returns:
            list: Retained candidates ordered by descending similarity; an
            empty list when ``candidates`` is empty.
        """
        print(f"[DEBUG] Starting hallucination detection with {len(candidates)} candidates.")
        if not candidates:
            print("[DEBUG] No candidates provided. Returning empty list.")
            return []

        print(f"[DEBUG] Original text length: {len(original_text)} characters")
        print(f"[DEBUG] First few candidates: {candidates[:5]}")

        text_embedding = self.multilingual_model.encode(original_text)
        print(f"[DEBUG] Original text embedding shape: {text_embedding.shape}")

        candidate_embeddings = self.multilingual_model.encode([str(c) for c in candidates])
        print(f"[DEBUG] Candidate embeddings shape: {candidate_embeddings.shape}")

        similarities = cosine_similarity([text_embedding], candidate_embeddings)[0]
        print(f"[DEBUG] Similarity scores shape: {similarities.shape}")
        print(f"[DEBUG] Similarity score range: min={similarities.min():.4f}, max={similarities.max():.4f}")

        threshold = np.percentile(similarities, 75)
        print(f"[DEBUG] Similarity threshold (75th percentile): {threshold:.4f}")

        # Inclusive comparison: with a single candidate (or when the top scores
        # tie) the 75th percentile equals the maximum score, so a strict ">"
        # would reject every candidate and always return an empty list.
        verified = [(cand, sim) for cand, sim in zip(candidates, similarities) if sim >= threshold]
        # Sort before logging so the "Top 10" line really shows the best scores.
        sorted_verified = sorted(verified, key=lambda x: -x[1])
        print(f"[DEBUG] Number of verified candidates: {len(sorted_verified)}")
        print(f"[DEBUG] Top 10 verification scores: {sorted_verified[:10]}")

        # Additional statistics
        verified_similarities = [sim for _, sim in sorted_verified]
        if verified_similarities:
            print(f"[DEBUG] Verified similarities stats: min={min(verified_similarities):.4f}, "
                  f"max={max(verified_similarities):.4f}, mean={np.mean(verified_similarities):.4f}, "
                  f"median={np.median(verified_similarities):.4f}")

        # Distribution of similarity scores
        hist, bin_edges = np.histogram(similarities, bins=10)
        print("[DEBUG] Distribution of similarity scores:")
        for i, (start, end) in enumerate(zip(bin_edges[:-1], bin_edges[1:])):
            print(f"  {start:.2f} - {end:.2f}: {hist[i]} candidates")

        # Check for potential outliers (1.5 * IQR rule)
        q1, q3 = np.percentile(similarities, [25, 75])
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        outliers = [sim for sim in similarities if sim < lower_bound or sim > upper_bound]
        print(f"[DEBUG] Number of potential outliers: {len(outliers)}")

        result = [cand for cand, _ in sorted_verified]
        print(f"[DEBUG] Returning {len(result)} verified candidates")

        return result

    def process_text(self, text, query):
        """Run the graph pipeline on ``text`` and produce an answer to ``query``.

        The text is language-detected, split, turned into elements and a graph,
        partitioned into communities and visualized.  Multi-hop reasoning yields
        candidates, the first 50 of which go through hallucination detection.
        Each community is then summarized by the chat model and a final answer
        is synthesized from those summaries.

        Side effects: sets ``self.original_text`` and ``self.current_lang_index``.

        Args:
            text: Source text to analyse.
            query: User question the final answer should address.

        Returns:
            str: The model's answer, or a fixed error message when a stage
            fails (exceptions are caught and printed, not propagated).
        """
        try:
            self.original_text = text

            detected_lang = self._detect_language(text)
            self.current_lang_index = self.get_language_index(detected_lang)

            chunks = self._split_text(text)
            elements = self._extract_elements(chunks)
            graph = self._build_graph(elements)
            communities = self._detect_communities_leiden(graph)

            self._visualize_graph(graph, communities)

            candidates = self._multi_hop_reasoning(graph, query)
            verified = self._hallucination_detection(candidates[:50], text)

            if not verified:
                print("[WARNING] Using community-based fallback")
                verified = [f"Community {i+1}" for i in range(min(3, len(communities)))]

            # Built once (it does not change per community). str() guards
            # against non-string candidates, which would make str.join raise.
            verified_concepts = ', '.join(str(concept) for concept in verified[:5])

            community_summaries = []
            for i, community in enumerate(communities):
                # Members may be non-string node ids; coerce so the join cannot
                # abort the whole pipeline with a TypeError.
                members = ', '.join(str(member) for member in community)
                try:
                    response = self.client.chat.completions.create(
                        model=self.model_name,
                        messages=[
                            {"role": "system", "content": f"Summarize with verification. Verified concepts: {verified_concepts}"},
                            {"role": "user", "content": f"Community {i+1} members: {members}\nQuery context: {query}"},
                        ]
                    )
                    community_summaries.append(response.choices[0].message.content)
                except Exception as e:
                    # Best effort: one failed summary must not sink the others.
                    print(f"[ERROR] Community summary error: {e}")
                    community_summaries.append(f"Community {i+1} summary unavailable")

            try:
                final_response = self.client.chat.completions.create(
                    model=self.model_name,
                    messages=[
                        {"role": "system", "content": f"Synthesize answer using verified concepts focus on user query to generate the answer: {verified_concepts}"},
                        {"role": "user", "content": f"Query: {query}\nVerified Data: {' '.join(community_summaries)}"},
                    ]
                )
                return final_response.choices[0].message.content
            except Exception as e:
                print(f"[ERROR] Final synthesis failed: {e}")
                return "Response generation failed. Please try again."

        except Exception as e:
            print(f"[CRITICAL] Pipeline failure: {e}")
            return "System error occurred during processing."

def main(text=None):
    """Run the GraphRAG pipeline once on a sample query and print the answer.

    Args:
        text: Source text to analyse.  When omitted, a module-level variable
            named ``text`` (for example one defined in an earlier notebook
            cell) is used if it exists.

    Returns:
        None.  The result, or an error message, is printed.
    """
    GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "your-groq-api-key-here")
    # NOTE(review): "Kholi" looks like a misspelling of "Kohli" - confirm before changing the prompt.
    query = "Tell me latest news of Virat Kholi in 10 lines"

    # The original body referenced a bare `text` name, which raised NameError
    # unless some earlier cell happened to define it.  Resolve it explicitly.
    if text is None:
        text = globals().get("text")
    if text is None:
        print("Error processing text: no input text provided. "
              "Pass `text` to main() or define a `text` variable first.")
        return

    try:
        pipeline = GraphRAGPipeline(GROQ_API_KEY)
        result = pipeline.process_text(text, query)
        print("\nResult:")
        print(result)
    except Exception as e:
        print(f"Error processing text: {e}")

# Entry point: run the demo when this cell/script is executed directly, not when imported.
if __name__ == "__main__":
    main()

In [None]:
from IPython.display import HTML, display
import os
from google.colab import output
import base64
import asyncio
def display_network_graph(file_path='/content/graph_visualization/graph.html'):
    """Render a saved HTML graph visualization inline in the notebook output.

    The HTML file is read, base64-encoded into a ``data:`` URI and shown in an
    iframe after the current cell output is cleared.  When the file is missing,
    or anything else fails, diagnostic directory listings are printed instead
    of raising.

    Args:
        file_path: Location of the HTML file to display.  Defaults to the
            path previously hard-coded in this function.

    Returns:
        None.
    """
    try:
        # Normalize to an absolute path so the error message is unambiguous.
        file_path = os.path.abspath(file_path)

        if not os.path.exists(file_path):
            print(f"Error: File not found at {file_path}")
            # List contents of the current directory to help debug
            print("\nContents of current directory:")
            print(os.listdir('.'))
            # Inspect the directory the file was actually looked up in, rather
            # than a same-named directory relative to the working directory.
            parent_dir = os.path.dirname(file_path)
            if os.path.isdir(parent_dir):
                print(f"\nContents of {os.path.basename(parent_dir)} directory:")
                print(os.listdir(parent_dir))
            return

        # Read the file content
        with open(file_path, 'r', encoding='utf-8') as f:
            html_content = f.read()

        # Embed the page as a data URI so the iframe needs no file server.
        encoded_html = base64.b64encode(html_content.encode('utf-8')).decode()
        temp_html = f"""
        <div style="width:100%; height:600px; overflow:hidden;">
            <iframe src="data:text/html;base64,{encoded_html}"
                    width="100%"
                    height="100%"
                    frameborder="0"
                    allowfullscreen>
            </iframe>
        </div>
        """

        # Clear the current output
        output.clear()

        # Display the visualization
        display(HTML(temp_html))

    except Exception as e:
        print(f"Error: {str(e)}")
        # Print additional debugging information
        print("\nDirectory structure:")
        for root, dirs, files in os.walk('.'):
            print(f"\nDirectory: {root}")
            print("Files:", files)

# Render the saved graph visualization inline in this cell's output.
display_network_graph()