<a href="https://colab.research.google.com/github/harjeet88/LLM_experiemnts/blob/main/RAG/LLM_RAG_sql_lite_v3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import sqlite3
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel, pipeline
from bs4 import BeautifulSoup
import requests

In [2]:
# Define the model and tokenizer
model_name = "sentence-transformers/all-mpnet-base-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [3]:
# Define the text generation model and pipeline
generation_model_name = "EleutherAI/gpt-neo-1.3B"
generation_tokenizer = AutoTokenizer.from_pretrained(generation_model_name)
generation_model = pipeline("text-generation", model=generation_model_name, tokenizer=generation_tokenizer)


Device set to use cpu


In [4]:
# Setting up SQLite connection
conn = sqlite3.connect('rag_website_example.db')
cur = conn.cursor()

In [5]:
# Create table for storing documents and their embeddings
cur.execute("""
CREATE TABLE IF NOT EXISTS documents (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    content TEXT,
    embedding BLOB
);
""")
conn.commit()

In [6]:
# Function to compute embeddings
def compute_embedding(text):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()  # Use mean pooling to get a fixed-size embedding

In [7]:
# Function to scrape text content from a website
def scrape_website(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    paragraphs = soup.find_all('p')
    return "\n".join([para.get_text() for para in paragraphs])

# Function to insert document into the database
def insert_document(content):
    embedding = compute_embedding(content).tobytes()  # Convert numpy array to bytes
    cur.execute("INSERT INTO documents (content, embedding) VALUES (?, ?)", (content, embedding))
    conn.commit()

In [8]:
# Function to retrieve documents based on query
def retrieve_documents(query, top_k=3):
    query_embedding = compute_embedding(query)
    cur.execute("SELECT content, embedding FROM documents")
    rows = cur.fetchall()

    # Compute cosine similarity between query embedding and document embeddings
    similarities = []
    for content, embedding in rows:
        doc_embedding = np.frombuffer(embedding, dtype=np.float32)
        similarity = np.dot(query_embedding, doc_embedding) / (np.linalg.norm(query_embedding) * np.linalg.norm(doc_embedding))
        similarities.append((content, similarity))

    # Sort by similarity and return top_k results
    similarities.sort(key=lambda x: x[1], reverse=True)
    return [content for content, _ in similarities[:top_k]]

In [9]:
# Function to generate a response
# Function to generate a response
def generate_response(query):
    retrieved_docs = retrieve_documents(query)
    combined_text = " ".join(retrieved_docs)
    response = generation_model(f"Query: {query}\nDocuments: {combined_text}\nAnswer:", max_new_tokens=100)
    return response[0]['generated_text']

In [None]:
# Example website URL
url = "https://en.wikipedia.org/wiki/Harry_Potter"
content = scrape_website(url)
insert_document(content)

# Example query
query = "Tell me about the content on the website."
response = generate_response(query)
print(response)

Token indices sequence length is longer than the specified maximum sequence length for this model (39632 > 2048). Running this sequence through the model will result in indexing errors
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [None]:

# Close the database connection
cur.close()
conn.close()