<a href="https://colab.research.google.com/github/hosein9574/My-agents/blob/main/Untitled3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install dependencies
!pip install transformers --upgrade
!pip install sentence-transformers --upgrade
!pip install chromadb --upgrade
!pip install gradio --upgrade

# Imports
import numpy as np
import pandas as pd
import transformers
import sentence_transformers
from sentence_transformers import SentenceTransformer
import chromadb
from datetime import datetime
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import gradio as gr

# Report the installed versions of the key libraries so a run can be
# reproduced or debugged later.
for _label, _module in (
    ("Transformers version:", transformers),
    ("Sentence-Transformers version:", sentence_transformers),
    ("ChromaDB version:", chromadb),
):
    print(_label, _module.__version__)

# Set up Kaggle API credentials.
# files.upload() is Colab's upload helper; the shell commands below expect a
# file named kaggle.json in the current working directory. The returned
# `uploaded` value is not used by the rest of this script.
from google.colab import files
uploaded = files.upload()

# Copy the credentials into ~/.kaggle and restrict them to owner read/write
# (mode 600).
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Download and unzip BBC News dataset (unzip -o overwrites existing files
# without prompting, so the cell can be re-run).
!kaggle datasets download -d gpreda/bbc-news
!unzip -o bbc-news.zip

# Dataset configuration: how many rows to index, which column holds the text
# that gets embedded, and which column is stored as metadata.
MAX_NEWS = 1000
DOCUMENT = "description"
TOPIC = "title"

# Read the extracted CSV, expose the row index as an explicit "id" column,
# and keep only the first MAX_NEWS rows for indexing.
news = pd.read_csv("./bbc_news.csv")
news["id"] = news.index
subset_news = news.iloc[:MAX_NEWS]

# Set up ChromaDB: open (or create) the on-disk store under ./chromadb.
chroma_client = chromadb.PersistentClient(path="./chromadb")

# Suffix the name with the current Unix time so every run gets its own
# collection. timestamp() is used instead of strftime("%s") because "%s" is a
# platform-specific directive that is not available everywhere.
collection_name = "news_collection_" + str(int(datetime.now().timestamp()))

# Drop a leftover collection with the same name, if any. Depending on the
# installed chromadb release, list_collections() yields either collection
# objects or plain name strings, so accept both.
existing_names = [
    col if isinstance(col, str) else col.name
    for col in chroma_client.list_collections()
]
if collection_name in existing_names:
    chroma_client.delete_collection(name=collection_name)

collection = chroma_client.create_collection(name=collection_name)

# Embed the article descriptions and add them to the ChromaDB collection.
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Materialise each column once so documents, metadata, ids and embeddings are
# all derived from the same rows.
documents = subset_news[DOCUMENT].tolist()
topics = subset_news[TOPIC].tolist()
embeddings = embedding_model.encode(documents, convert_to_numpy=True)

collection.add(
    documents=documents,
    metadatas=[{TOPIC: topic} for topic in topics],
    # One id per row actually loaded: the CSV may contain fewer than MAX_NEWS
    # rows, and collection.add() needs all lists to have the same length.
    ids=[f"id{x}" for x in range(len(documents))],
    embeddings=embeddings.tolist(),
)

# Load the language model and wrap it in a text-generation pipeline.
import torch

model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_id)
# trust_remote_code is intentionally not enabled: this checkpoint is expected
# to load with the Llama classes that ship with transformers, so there is no
# need to allow code from the Hub to run locally.
lm_model = AutoModelForCausalLM.from_pretrained(model_id)

pipe = pipeline(
    "text-generation",
    model=lm_model,
    tokenizer=tokenizer,
    max_new_tokens=256,
    # device_map only takes effect while pipeline() is loading a model from a
    # name; for an already-instantiated model the device must be given
    # explicitly. Use the first GPU if one is available, otherwise the CPU.
    device=0 if torch.cuda.is_available() else -1,
)

# QA function
def answer_question(user_question):
    """Answer a question using news snippets retrieved from ChromaDB.

    The question is embedded with the module-level ``embedding_model``, the
    five closest documents are fetched from ``collection``, and the joined
    snippets are passed as context to the text-generation ``pipe``.

    Args:
        user_question: The question text entered by the user.

    Returns:
        The generated answer as a string. A short prompt is returned for
        blank input, and ``"Error: <message>"`` is returned if retrieval or
        generation fails, so the chat UI always receives a displayable string.
    """
    # Nothing to retrieve or generate for a blank message.
    if not user_question or not user_question.strip():
        return "Please enter a question."

    try:
        # Embed the query with the same model that produced the stored
        # document vectors, so both live in the same embedding space
        # (query_texts would use the collection's own embedding function).
        query_embedding = embedding_model.encode(
            [user_question], convert_to_numpy=True
        )
        results = collection.query(
            query_embeddings=query_embedding.tolist(), n_results=5
        )
        context = "\n".join(results["documents"][0])
        context = context[:5120]  # Truncate if needed

        prompt = f"""
        Relevant context: {context}
        Considering the relevant context, answer the question.
        Question: {user_question}
        Answer: """

        # Ask for the continuation only. Splitting the full text on "Answer:"
        # would discard the real answer whenever the model emits another
        # "Answer:" of its own.
        response = pipe(prompt, return_full_text=False)
        return response[0]["generated_text"].strip()
    except Exception as e:
        # UI boundary: report the failure in the chat instead of crashing it.
        return f"Error: {str(e)}"

# Gradio Chat Interface
def chat_function(message, history):
    """Gradio chat callback that answers the latest user message.

    Args:
        message: The text the user just submitted.
        history: Earlier turns of the conversation; accepted to match the
            callback signature but not used.

    Returns:
        Whatever ``answer_question`` returns for ``message``.
    """
    reply = answer_question(message)
    return reply

# Build the chat UI around chat_function and start serving it.
gr.ChatInterface(
    fn=chat_function,
    title="📰 News QA Bot",
    description="Ask anything about recent news articles.",
).launch()