**Important Installations**  

Before running the application, you need to install the required libraries. The following command installs all necessary packages:

In [None]:
# Quietly install/upgrade the packages listed below via a shell call.
# NOTE(review): both `pinecone-client` and `pinecone` are listed — presumably the same SDK under
# its old and new package names; confirm that only one of them is needed.
# NOTE(review): no versions are pinned, so a fresh runtime may resolve to different releases.
!pip install --upgrade --quiet  langchain langchain-community langchain-pinecone langchain-huggingface neo4j langchain-core tiktoken yfiles_jupyter_graphs newsapi-python requests huggingface_hub pinecone-client tqdm pinecone sentence_transformers py2neo gradio fastapi mistralai

In [None]:
# Filter warnings
import warnings
warnings.filterwarnings('ignore')

# Standard library imports
import os
import time
import json
import threading
from uuid import uuid4
from datetime import datetime, timedelta
from typing import Dict, List, Optional, Tuple

# Third-party imports
import pinecone
import requests
import torch
from google.colab import output, userdata
from neo4j import GraphDatabase
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from yfiles_jupyter_graphs import GraphWidget
from pinecone import Pinecone, ServerlessSpec

# LangChain imports
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.graphs import Neo4jGraph
from langchain_community.vectorstores import Neo4jVector
from langchain_core.messages import AIMessage, HumanMessage
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts.prompt import PromptTemplate
from langchain_core.runnables import (
    ConfigurableField,
    RunnableBranch,
    RunnableLambda,
    RunnableParallel,
    RunnablePassthrough,
)
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain import PromptTemplate, LLMChain
from langchain.chains import SimpleSequentialChain
from transformers import AutoTokenizer, AutoModelForCausalLM
from mistralai import Mistral

# Frontend - Backend libraries
from fastapi import FastAPI, HTTPException
from py2neo import Graph, Node, Relationship
from pydantic import BaseModel
import uvicorn
import threading
from typing import List, Dict
import gradio as gr
import requests

# Google Colab widget configuration
try:
    # Ask Colab to enable its custom widget manager (presumably required for
    # third-party widgets such as the yFiles graph widget — TODO confirm).
    output.enable_custom_widget_manager()
except Exception:
    # Best effort: the notebook still runs without the widget manager.
    # `Exception` (instead of a bare `except`) lets KeyboardInterrupt and
    # SystemExit propagate so the cell can still be interrupted.
    pass

In [None]:
# Read every credential through Colab's `userdata` store so that no secret is
# hard-coded in the notebook.
HUGGINGFACE_TOKEN = userdata.get('HUGGINGFACE_TOKEN')
NEWSAPI_KEY = userdata.get('NEWSAPI_KEY')
PINECONE_API_KEY = userdata.get('PINECONE_API_KEY')
NEO4J_PASSWORD = userdata.get('NEO4J_PASSWORD')
MISTRAL_API_KEY = userdata.get('MISTRAL_API_KEY')
NEO4J_URI = userdata.get('NEO4J_URI')
# The user name is the only setting given as a literal rather than a secret.
NEO4J_USERNAME = "neo4j"

In [None]:
# Log the Hugging Face CLI in with the token held in the Python variable
# HUGGINGFACE_TOKEN (IPython expands `$NAME` in `!` shell lines).
!huggingface-cli login --token $HUGGINGFACE_TOKEN

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
The token `giru-upgrad-news-agg-read-only` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `giru-upgrad-news-agg-read-only`


**Setup OS environment variables**

In [None]:
# Mirror the loaded credentials into the process environment in one call.
# Keys are written in the order listed.
os.environ.update({
    'HUGGINGFACE_TOKEN': HUGGINGFACE_TOKEN,
    'NEWSAPI_KEY': NEWSAPI_KEY,
    'PINECONE_API_KEY': PINECONE_API_KEY,
    'NEO4J_PASSWORD': NEO4J_PASSWORD,
    'NEO4J_URI': NEO4J_URI,
    'NEO4J_USERNAME': NEO4J_USERNAME,
})

**Fetching & storing news articles from `NEWSAPI`**

In [None]:
def count_words(text: str) -> int:
    """Return the number of whitespace-separated words in ``text``.

    A falsy value (``None`` or an empty string) counts as zero words.
    """
    return len(text.split()) if text else 0

In [None]:
def create_safe_filename(topic: str, title: str) -> str:
    """Build a ``<topic>_<title>.txt`` file name that is safe to create on disk.

    Characters that are invalid in file names (``< > : " / \\ | ? *``) are
    replaced by ``_`` in both the topic and the title. The topic must be
    sanitised too: a ``/`` in it would otherwise turn the result into a path
    with an extra directory component.

    Args:
        topic: Topic the article was fetched for.
        title: Article title; ``None`` is treated as an empty title.

    Returns:
        The file name, with the title part limited to 100 characters.
    """
    invalid_chars = '<>:"/\\|?*'

    def _sanitize(text) -> str:
        # `text or ''` tolerates None, e.g. a JSON null title from the API.
        return ''.join('_' if c in invalid_chars else c for c in (text or ''))

    safe_topic = _sanitize(topic)
    safe_title = _sanitize(title)[:100]  # Limit length
    return f"{safe_topic}_{safe_title}.txt"

In [None]:
def fetch_multiple_topics(api_key: str, topics: List[str], database_folder: str = "database") -> Dict[str, int]:
    """
    Fetch articles for multiple topics and save each article to a separate file.

    One request per topic is sent to the NewsAPI ``/v2/everything`` endpoint
    (up to 100 results, sorted by popularity). Each article whose combined
    content and description has at least 10 words is written to
    ``<database_folder>/<topic>_<title>.txt`` as a metadata header delimited
    by two lines of 50 ``=`` characters, followed by the article text.

    Args:
        api_key: NewsAPI key
        topics: List of topics to fetch articles for
        database_folder: Folder to store article files (created if missing)

    Returns:
        Dictionary with topics and their saved article counts. A topic whose
        request fails keeps whatever count it had reached (0 if the request
        itself failed); the error is printed and the loop continues.
    """
    # Ensure database folder exists
    os.makedirs(database_folder, exist_ok=True)
    article_counts = {topic: 0 for topic in topics}

    for topic in topics:
        try:
            # `params` lets requests URL-encode the topic. The key goes in a
            # header so it never appears in the URL or in a printed error.
            response = requests.get(
                "https://newsapi.org/v2/everything",
                params={"q": topic, "sortBy": "popularity", "pageSize": 100},
                headers={"X-Api-Key": api_key},
                timeout=30,  # do not let one stalled request hang the whole cell
            )
            response.raise_for_status()

            articles = response.json().get('articles') or []

            for article in articles:
                # The API returns JSON null for missing fields; `.get(key, default)`
                # would pass that None through, so fall back with `or` instead.
                title = article.get('title') or 'No title'
                content = article.get('content') or ''
                description = article.get('description') or ''

                # Combine content and description for word count
                full_text = f"{content}\n{description}".strip()
                word_count = count_words(full_text)

                # Skip if content is too short
                if word_count < 10:
                    continue

                # Create filename using topic and title
                filename = create_safe_filename(topic, title)
                filepath = os.path.join(database_folder, filename)

                published = article.get('publishedAt') or 'No date'
                source_name = (article.get('source') or {}).get('name') or 'Unknown'
                article_url = article.get('url') or 'No URL'

                # Write article to file
                with open(filepath, 'w', encoding='utf-8') as file:
                    # Write metadata header
                    file.write("=" * 50 + "\n")
                    file.write(f"Topic: {topic}\n")
                    file.write(f"Title: {title}\n")
                    file.write(f"Published: {published}\n")
                    file.write(f"Source: {source_name}\n")
                    file.write(f"URL: {article_url}\n")
                    file.write(f"Word Count: {word_count}\n")
                    file.write("=" * 50 + "\n\n")

                    # Write content
                    file.write(full_text)

                article_counts[topic] += 1

            # Sleep to respect API rate limits
            time.sleep(1)

        except requests.exceptions.RequestException as e:
            print(f"Error fetching articles for {topic}: {str(e)}")
            continue

    return article_counts

**Processing Articles with RAG (Retrieval-Augmented Generation)**

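The `ArticleRAG` class sets up the pipeline that chunks the saved articles and stores their embeddings.
- The constructor loads a Hugging Face embedding model.
- It then creates the Pinecone index if it does not exist yet and connects a vector store to it.
- The `process_articles` method reads each article file from the database folder, splits it into chunks, and uploads the chunks to Pinecone.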

In [None]:
class ArticleRAG:
    """Chunk saved article files, embed the chunks and store/query them in Pinecone."""

    def __init__(self, database_folder: str = "database", index_name: str = "articles-embeddings"):
        """
        Initialize RAG system using LangChain and HuggingFace embeddings with Pinecone integration.

        Creates the Pinecone index when it is not listed yet, then wraps it in
        a LangChain vector store and prepares the text splitter.

        Args:
            database_folder: Folder containing article files
            index_name: Name for the Pinecone index
        """
        self.database_folder = database_folder

        # Initialize HuggingFace Embeddings
        print("Initializing HuggingFace Embeddings...")
        self.embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

        # Initialize Pinecone client (key comes from the notebook-level constant)
        print("Initializing Pinecone...")
        self.pc = Pinecone(api_key=PINECONE_API_KEY)
        self.index_name = index_name

        # Check if index exists and create if needed
        if self.index_name not in self.pc.list_indexes().names():
            print(f"Creating new Pinecone index: {self.index_name}")
            # embed_query returns a plain list of floats, which has no `.shape`;
            # the vector dimension is simply the length of that list.
            dimension = len(self.embeddings.embed_query("dimension probe"))
            self.pc.create_index(
                name=self.index_name,
                dimension=dimension,
                metric='cosine',
                spec=ServerlessSpec(
                    cloud='aws',
                    region='us-east-1'
                )
            )

        # Initialize vector store on top of the index
        self.vector_store = PineconeVectorStore(
            index=self.pc.Index(self.index_name),
            embedding=self.embeddings
        )

        # Text splitter for chunking
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=500,  # Adjust for larger chunks
            chunk_overlap=100,
            length_function=len
        )

    def process_articles(self):
        """
        Read and process articles from the database folder, storing embeddings in Pinecone.

        Every ``.txt`` file is expected to hold a metadata header enclosed by
        two separator lines of 50 ``=`` characters, followed by the article
        body. Files without both separators are reported and skipped.

        Chunk ids are derived from the file name and chunk index, so running
        this method again overwrites the same vectors instead of adding
        duplicates to the index.
        """
        # Local import: the notebook's import cell only brings in uuid4.
        from uuid import NAMESPACE_URL, uuid5

        print("Processing articles from database folder...")
        separator = "=" * 50
        docs = []
        ids = []
        for filename in os.listdir(self.database_folder):
            if not filename.endswith(".txt"):
                continue

            filepath = os.path.join(self.database_folder, filename)
            with open(filepath, "r", encoding="utf-8") as file:
                content = file.read()

            # Split into [text before header, metadata, body]. maxsplit=2 keeps
            # the body intact even if it contains the separator itself.
            parts = content.split(separator, 2)
            if len(parts) < 3:
                print(f"Skipping malformed file: {filename}")
                continue

            metadata_text = parts[1].strip()
            article_content = parts[2].strip()

            # Chunk the article content
            chunks = self.text_splitter.split_text(article_content)

            # Create Document objects with metadata
            for i, chunk in enumerate(chunks):
                docs.append(
                    Document(
                        page_content=chunk,
                        metadata={
                            "source_file": filename,
                            "chunk_index": i,
                            "total_chunks": len(chunks),
                            "metadata_text": metadata_text
                        }
                    )
                )
                # Deterministic id: same file + chunk index -> same vector id.
                ids.append(str(uuid5(NAMESPACE_URL, f"{filename}#{i}")))

        if docs:
            # Add documents to Pinecone
            print("Adding documents to Pinecone...")
            self.vector_store.add_documents(documents=docs, ids=ids)
        else:
            print("No valid documents to process.")

    def query(self, query_text: str, k: int = 5):
        """
        Query the Pinecone vector database.

        Args:
            query_text: Query string
            k: Number of top results to return

        Returns:
            List of dicts, one per hit, with the chunk text under ``"content"``
            and the stored metadata under ``"metadata"``
        """
        print("Querying Pinecone...")
        results = self.vector_store.similarity_search(query_text, k=k)
        return [
            {
                "content": res.page_content,
                "metadata": res.metadata
            }
            for res in results
        ]

In [None]:
# Driver cell: download articles for every topic, then embed them into Pinecone.
# `rag` created here is the module-level object the backend cell queries later.
API_KEY = NEWSAPI_KEY  # NewsAPI key loaded from the Colab secrets above
topics = ["Artificial Intelligence", "Politics", "Business", "Technology", "Sports", "Entertainment", "Health"]

print("Fetching articles...")
# Writes one text file per article into ./database and returns per-topic counts.
results = fetch_multiple_topics(api_key=API_KEY, topics=topics, database_folder="database")

# Print summary
print("\nArticles saved:")
for topic, count in results.items():
    print(f"{topic}: {count} articles")

# Initialize and process articles with ArticleRAG
# (reads the files written above from the same "database" folder)
rag = ArticleRAG(database_folder="database", index_name="articles-embeddings")
rag.process_articles()

Fetching articles...

Articles saved:
Artificial Intelligence: 96 articles
Politics: 98 articles
Business: 90 articles
Technology: 86 articles
Sports: 93 articles
Entertainment: 83 articles
Health: 90 articles
Initializing HuggingFace Embeddings...
Initializing Pinecone...
Processing articles from database folder...
Adding documents to Pinecone...


**Backend**

In [None]:
# FastAPI application object; the `/fetch_news/` route below registers itself on it.
app = FastAPI()

class FetchNewsRequest(BaseModel):
    """Request body accepted by the ``/fetch_news/`` endpoint."""
    # Identifier of the requesting user; the endpoint handler in this cell does not read it.
    user_id: str
    # News category used to build the retrieval query and the prompt.
    category: str

def fetch_documents(query: str) -> str:
    """Retrieve the top 5 chunks for ``query`` and flatten them into one text block.

    Each hit contributes its metadata header, a newline, the chunk text and a
    trailing newline. The metadata returned by ``rag.query`` is a dict, so it
    cannot be concatenated to a string directly; the human-readable header
    stored under ``"metadata_text"`` is used, falling back to the string form
    of the metadata when that key is absent.

    Args:
        query: Natural-language search query.

    Returns:
        The concatenated text of all hits collected so far. Errors are printed
        and never raised, so the result is an empty string if the lookup fails.
    """
    sections = []
    try:
        for result in rag.query(query, k=5):
            metadata = result['metadata']
            if isinstance(metadata, dict):
                header = metadata.get('metadata_text', str(metadata))
            else:
                header = str(metadata)
            sections.append(f"{header}\n{result['content']}\n")
    except Exception as e:
        # Best effort by design: the caller still builds a prompt without documents.
        print(f"Error fetching documents: {str(e)}")
    return ''.join(sections)

def combine_preferences_and_documents(preference: str, docs_content: str) -> dict:
    """Wrap one preference and one document text in the structure the prompt builder reads.

    Returns:
        A dict with ``"user_preferences"`` (a one-element list holding
        ``preference``) and ``"documents"`` (a one-element list holding a
        dict titled ``"Document"`` whose content is ``docs_content``).
    """
    document_entry = dict(title="Document", content=docs_content)
    combined = {}
    combined["user_preferences"] = [preference]
    combined["documents"] = [document_entry]
    return combined

def generate_prompt(combined_data: dict) -> str:
    """Render the LLM prompt from the combined preferences and documents.

    Args:
        combined_data: Dict with a ``"user_preferences"`` list of strings and
            a ``"documents"`` list of dicts carrying ``"title"`` and ``"content"``.

    Returns:
        The filled-in prompt: preferences joined by ", " and one
        ``- <title>: <content>`` line per document.
    """
    template_text = """
        User Preferences: {user_preferences}
        Documents:
        {documents}
        Based on the above preferences and documents, please provide a summary or insights tailored to the user's interests. Don't add the information that it's generated by you.
        Notes:
        1. Write your content like it's reported by a TV reporter.
        2. Don't output in markdown or HTML document, just give a final output as single paragraph.
        """
    template = PromptTemplate(
        input_variables=["user_preferences", "documents"],
        template=template_text
    )

    # One bullet line per document.
    bullet_lines = []
    for doc in combined_data["documents"]:
        bullet_lines.append(f"- {doc['title']}: {doc['content']}")

    return template.format(
        user_preferences=", ".join(combined_data["user_preferences"]),
        documents="\n".join(bullet_lines)
    )

def llm_response(prompt: str) -> str:
    """Send ``prompt`` as a single user message to Mistral and return the reply text.

    A new client is created on every call with the notebook-level
    ``MISTRAL_API_KEY``; the model is ``mistral-large-latest``.

    Returns:
        The message content of the first choice in the completion.
    """
    user_message = {
        "role": "user",
        "content": prompt
    }
    mistral_client = Mistral(api_key=MISTRAL_API_KEY)
    completion = mistral_client.chat.complete(
        model="mistral-large-latest",
        messages=[user_message]
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

@app.post("/fetch_news/")
def fetch_news(request: FetchNewsRequest):
    """Return an LLM-written news summary for the requested category.

    Pipeline: build a retrieval query from the category, fetch matching
    document text, combine it with the category, render the prompt and ask
    the LLM for the final paragraph.

    Args:
        request: Request body; only ``request.category`` is used here.

    Returns:
        ``{"news": [{"title": ..., "content": ...}]}`` with a single entry.

    Raises:
        HTTPException: With status 500 and the error text as ``detail`` when
            any step of the pipeline fails.
    """
    try:
        query = f'Top news articles that match these {request.category}, including their titles, summaries, and publication dates and what is going on, nowadays, on the topic {request.category}. News around {request.category}. What is latest news in {request.category}?.'

        docs_content = fetch_documents(query)
        combined_data = combine_preferences_and_documents(request.category, docs_content)
        prompt = generate_prompt(combined_data)
        content = llm_response(prompt)

        news = [
            {"title": f"Latest in {request.category}", "content": content}
        ]
        return {"news": news}
    except Exception as e:
        # Returning `({...}, 500)` is a Flask idiom; FastAPI would serialise the
        # tuple as a JSON array and still answer 200. Raising HTTPException is
        # what actually sets the 500 status code.
        raise HTTPException(status_code=500, detail=str(e)) from e

# Run FastAPI server in a thread
def run_fastapi():
    """Serve `app` with uvicorn on port 8010, bound to all interfaces (blocking call)."""
    uvicorn.run(app, host="0.0.0.0", port=8010)

if __name__ == "__main__":
    # Start the blocking server in a daemon thread so the cell returns and the
    # thread does not keep the process alive on exit.
    threading.Thread(target=run_fastapi, daemon=True).start()

INFO:     Started server process [10418]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8010 (Press CTRL+C to quit)


**Frontend**

In [None]:
# Base URL of the backend started above. 0.0.0.0 is a bind address, not a
# portable destination, so the client talks to the loopback address instead.
backend_url = "http://127.0.0.1:8010"  # Ensure this matches your backend's actual URL

# Function to send data to backend and fetch news
def fetch_news(user_id, category):
    """Ask the backend for news in ``category`` and format it for the text box.

    Args:
        user_id: Selected user id (sent to the backend as-is).
        category: Selected news category; may be empty when nothing is chosen.

    Returns:
        The news items as ``title\\ncontent`` blocks separated by blank lines,
        or a human-readable error/notice string. Never raises.
    """
    if not category:
        # No selection would be posted as null and rejected by request validation.
        return "Please select a category first."

    payload = {
        "user_id": user_id,
        "category": category
    }
    try:
        # Strip a trailing slash from the base URL so the request never targets
        # '//fetch_news/', which need not match the '/fetch_news/' route.
        endpoint = f"{backend_url.rstrip('/')}/fetch_news/"
        # Generous timeout: the backend waits for an LLM completion.
        response = requests.post(endpoint, json=payload, timeout=120)
        if response.status_code == 200:
            news = response.json().get("news", [])
            return "\n\n".join(f"{n['title']}\n{n['content']}" for n in news)
        return f"Error fetching news! (Status Code: {response.status_code})"
    except Exception as e:
        return f"Error: {str(e)}"

# Gradio UI
# Choices offered by the two dropdowns.
users = ["1", "2", "3"]
categories = [
    "Artificial Intelligence", "Politics", "Business", "Technology",
    "Sports", "Entertainment", "Health"
]

# Components are created inside the Blocks context, in the order written here.
with gr.Blocks() as ui:
    # User selection (defaults to user "1")
    user_dropdown = gr.Dropdown(users, label="Select User", value="1")
    # Category selection (no default value is set)
    category_dropdown = gr.Dropdown(categories, label="Select Category")
    # Button to fetch news
    fetch_button = gr.Button("Fetch News")
    # Read-only news display area
    news_display = gr.Textbox(label="News", interactive=False, placeholder="News will appear here")

    # Button click event: pass both dropdown values to fetch_news and show
    # the returned string in the text box.
    fetch_button.click(
        fn=fetch_news,
        inputs=[user_dropdown, category_dropdown],
        outputs=news_display
    )

# Start the Gradio app.
ui.launch()


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://5ce228d719541dfbdb.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


