In [20]:
import requests
from bs4 import BeautifulSoup

def scrape_data(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    title = soup.find('h1', {'id': 'firstHeading'}).text if soup.find('h1', {'id': 'firstHeading'}) else 'No Title'
    content_div = soup.find('div', {'class': 'mw-parser-output'})
    paragraphs = content_div.find_all('p') if content_div else []
    content = ' '.join([p.text for p in paragraphs])

    return {'title': title, 'content': content}

# Example Wikipedia URL
url = "https://en.wikipedia.org/wiki/Generative_artificial_intelligence"
data = scrape_data(url)
print(f"Scraped Title: {data['title']}\n")
print(f"Scraped Content:\n{data['content'][:500]}...\n") 


Scraped Title: Generative artificial intelligence

Scraped Content:

 Generative artificial intelligence (generative AI, GenAI,[1] or GAI) is a subset of artificial intelligence that uses generative models to produce text, images, videos, or other forms of data.[2][3][4] These models learn the underlying patterns and structures of their training data and use them to produce new data[5][6] based on the input, which often comes in the form of natural language prompts.[7][8]
 Improvements in transformer-based deep neural networks, particularly large language models...



In [21]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Load pre-trained embedding model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Convert Wikipedia content into embeddings
stored_texts = []  # Store text for retrieval
embeddings = []

# Split content into chunks (Faiss works better with small segments)
content_chunks = data['content'].split('. ')  # Split by sentence

for chunk in content_chunks:
    if len(chunk) > 20:  # Avoid storing very short texts
        stored_texts.append(chunk)
        embeddings.append(model.encode(chunk, convert_to_numpy=True))

# Convert to NumPy array
embeddings = np.array(embeddings)
print(f"Stored {len(embeddings)} text chunks as embeddings.")


INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: paraphrase-MiniLM-L6-v2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Stored 59 text chunks as embeddings.


In [23]:
import faiss

# Initialize Faiss index
embedding_dim = embeddings.shape[1]  # Dimension of embeddings
db = faiss.IndexFlatL2(embedding_dim)  # L2 (Euclidean) similarity index

# Add embeddings to Faiss
db.add(embeddings)
print(f"Stored {db.ntotal} vectors in Faiss.")

faiss.write_index(db,"faiss_index.bin")
np.save("text.npy",stored_texts)
print("faiss index saved to faiss_index.bin")

Stored 59 vectors in Faiss.
faiss index saved to faiss_index.bin


In [24]:
def retrieve_context(query, k=3):
    query_embedding = model.encode([query], convert_to_numpy=True)
    distances, indices = db.search(query_embedding, k)  # Retrieve top k matches
    
    retrieved_texts = [stored_texts[i] for i in indices[0]]  # Retrieve original text
    return retrieved_texts

# Test retrieval
query = "gen ai is what?"
retrieved_texts = retrieve_context(query)

print("\n🔹 Retrieved Context:")
for i, text in enumerate(retrieved_texts):
    print(f"{i+1}. {text}\n")


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


🔹 Retrieved Context:
1. A team from Microsoft Research controversially argued that it "could reasonably be viewed as an early (yet still incomplete) version of an artificial general intelligence (AGI) system."[49] However, this assessment was contested by other scholars who maintained that generative AI remained "still far from reaching the benchmark of 'general human intelligence'" as of 2023.[50] Later in 2023, Meta released ImageBind, an AI model combining multiple modalities including text, images, video, thermal data, 3D data, audio, and motion, paving the way for more immersive generative AI applications.[51]
 In December 2023, Google unveiled Gemini, a multimodal AI model available in four versions: Ultra, Pro, Flash, and Nano.[52] The company integrated Gemini Pro into its Bard chatbot and announced plans for "Bard Advanced" powered by the larger Gemini Ultra model.[53] In February 2024, Google unified Bard and Duet AI under the Gemini brand, launching a mobile app on Android 

In [49]:
from langchain.vectorstores import FAISS 
#from langchain.vectorstores import FAISSAdapter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_huggingface import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch
import numpy as np

# Load FAISS database
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
index = faiss.read_index("faiss_index.bin")
stored_texts = np.load("text.npy", allow_pickle=True)

# Create a mapping between FAISS index IDs and document IDs
index_to_docstore_id = list(range(len(stored_texts)))

# Wrap FAISS in a supported adapter (assuming FAISSAdapter exists)
faiss_adapter = FAISSAdapter(faiss_db=FAISS(embedding_model, index, stored_texts, index_to_docstore_id))

# Load LLaMA 2 model
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16, device_map="auto")
generation_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0)
llm = HuggingFacePipeline(pipeline=generation_pipeline)

# Define the RAG pipeline
prompt_template = """
Given the following context, answer the question:

Context:
{context}

Question:
{question}

Answer:
"""
prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])

# Create RetrievalQA chain with the adapter
qa_chain = RetrievalQA.from_llm(llm, retriever=faiss_adapter.as_retriever(search_kwargs={"k": 5}), prompt=prompt)

# Example Query
query = "What is RAG in NLP?"
response = qa_chain.run(query)
print(response)

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2


NameError: name 'FAISSAdapter' is not defined

In [28]:
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_huggingface import HuggingFacePipeline
from langchain.chains import create_retrieval_chain
from langchain.prompts import PromptTemplate
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch

# Load FAISS database
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
index = faiss.read_index("faiss_index.bin")
stored_texts=np.load("text.npy", allow_pickle=True)
faiss_db = FAISS(embedding_model,index)
retriever = faiss_db.as_retriever(search_kwargs={"k":5})

# Load LLaMA 2 model
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16, device_map="auto")
generation_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0)
llm = HuggingFacePipeline(pipeline=generation_pipeline)

# Define the RAG pipeline
prompt_template = PromptTemplate(
    template="""
    Given the retrieved context, answer the following question:
    Context: {context}
    Question: {question}
    Answer:
    """,
    input_variables=["context", "question"]
)
qa_chain = create_retrieval_chain(llm=llm, retriever=retriever, prompt=prompt_template)

# Example Query
query = "What is RAG in NLP?"
response = qa_chain.run(query)
print(response)


INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2


TypeError: FAISS.__init__() missing 2 required positional arguments: 'docstore' and 'index_to_docstore_id'

In [43]:
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_huggingface import HuggingFacePipeline
from langchain.chains import ConversationalRetrievalChain
from langchain.prompts import PromptTemplate
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import faiss
import numpy as np
import torch

# === 1. Load Stored FAISS Index & Text Data ===
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Load FAISS index
index = faiss.read_index("faiss_index.bin")

# Load corresponding text chunks (the texts themselves)
stored_texts = np.load("text.npy", allow_pickle=True)

# === 2. Create a Document Store and Map Indices to Texts ===
# Create docstore as a dictionary
docstore = {i: text for i, text in enumerate(stored_texts)}

# Map FAISS index to docstore IDs (corresponding text indices)
index_to_docstore_id = {i: str(i) for i in range(len(stored_texts))}

# === 3. Initialize FAISS Vector Store with Docstore ===
faiss_db = FAISS(embedding_function=embedding_model, index=index, docstore=docstore, index_to_docstore_id=index_to_docstore_id)

# Create retriever from FAISS vector store
retriever = faiss_db.as_retriever(search_kwargs={"k": 5})  # Top 5 results

# === 4. Load LLaMA 2 Model ===
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16, device_map="auto")

# Define the generation pipeline
generation_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)
llm = HuggingFacePipeline(pipeline=generation_pipeline)

# === 5. Define RAG Retrieval Chain ===
prompt_template = PromptTemplate(
    template="""
    You are an AI assistant. Given the retrieved context, answer the user's question as accurately as possible.

    Context: {context}
    Question: {question}
    Answer:
    """,
    input_variables=["context", "question"]
)

# Create RAG retrieval chain
qa_chain = ConversationalRetrievalChain(retriever)

# === 6. Retrieve Real-Time Query ===
query = "What is RAG in NLP?"
response = qa_chain.run(query)

# Show response
print("\n=== AI Response ===\n", response)


INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use mps


TypeError: BaseModel.__init__() takes 1 positional argument but 2 were given

In [40]:
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_huggingface import HuggingFacePipeline
from langchain.chains import create_retrieval_chain
from langchain.prompts import PromptTemplate
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

# Load pre-trained embedding model
embedding_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Load FAISS database
index = faiss.read_index("faiss_index.bin")
stored_texts = np.load("text.npy", allow_pickle=True)

# Create a custom retriever using FAISS and sentence-transformers
class CustomFAISSRetriever:
    def __init__(self, index, stored_texts, embedding_model, k=5):
        self.index = index
        self.stored_texts = stored_texts
        self.embedding_model = embedding_model
        self.k = k

    def retrieve(self, query):
        query_embedding = self.embedding_model.encode([query], convert_to_numpy=True)
        distances, indices = self.index.search(query_embedding, self.k)
        retrieved_texts = [self.stored_texts[i] for i in indices[0]]
        return retrieved_texts

retriever = CustomFAISSRetriever(index, stored_texts, embedding_model)

# Load LLaMA 2 model
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16, device_map="auto")
generation_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)
llm = HuggingFacePipeline(pipeline=generation_pipeline)

# Define the RAG pipeline
prompt_template = PromptTemplate(
    template="""
    Given the retrieved context, answer the following question:
    Context: {context}
    Question: {question}
    Answer:
    """,
    input_variables=["context", "question"]
)

def create_retrieval_chain(llm, retriever, prompt):
    def rag_pipeline(query):
        retrieved_texts = retriever.retrieve(query)
        context = " ".join(retrieved_texts)
        prompt_filled = prompt.format(context=context, question=query)
        response = llm(prompt_filled)
        return response
    return rag_pipeline

qa_chain = create_retrieval_chain(llm=llm, retriever=retriever, prompt=prompt_template)

# Example Query
query = "What is RAG in NLP?"
response = qa_chain(query)
print(response)


INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: paraphrase-MiniLM-L6-v2


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use mps


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  response = llm(prompt_filled)


KeyboardInterrupt: 

In [6]:
import requests
from bs4 import BeautifulSoup
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from typing import List
import logging

# Function to scrape data from Wikipedia
def scrape_data(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    title = soup.find('h1', {'id': 'firstHeading'}).text if soup.find('h1', {'id': 'firstHeading'}) else 'No Title'
    content_div = soup.find('div', {'class': 'mw-parser-output'})
    paragraphs = content_div.find_all('p') if content_div else []
    content = ' '.join([p.text for p in paragraphs])

    return {'title': title, 'content': content}

# Load pre-trained embedding model
sentence_transformer = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Initialize LLaMA model for RAG (Retrieval-Augmented Generation)
class LlamaRAGChatbot:
    def __init__(self, model_name: str = "meta-llama/Llama-2-7b-chat-hf", 
                 device: str = "cuda" if torch.cuda.is_available() else "cpu"):
        self.device = device
        # Initialize LLaMA tokenizer and model
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16 if device == "cuda" else torch.float32,
            device_map="auto"
        )
        
        # Initialize storage for texts and FAISS index
        self.stored_texts = []
        self.embeddings = []
        self.faiss_index = None
        
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)

    def initialize_knowledge_base(self, url: str):
        """Initialize the knowledge base from a single URL"""
        all_embeddings = []

        try:
            # Scrape data from the URL
            data = scrape_data(url)
            content_chunks = data['content'].split('. ')  # Split by sentence
            
            for chunk in content_chunks:
                if len(chunk) > 20:  # Avoid very short chunks
                    self.stored_texts.append(chunk)
                    embedding = sentence_transformer.encode(chunk, convert_to_numpy=True)
                    all_embeddings.append(embedding)

            self.logger.info(f"Processed URL: {url}")

            # Initialize FAISS index
            embeddings_array = np.array(all_embeddings)
            embedding_dim = embeddings_array.shape[1]
            self.faiss_index = faiss.IndexFlatL2(embedding_dim)
            self.faiss_index.add(embeddings_array)

            self.logger.info(f"Knowledge base initialized with {len(self.stored_texts)} chunks")

        except Exception as e:
            self.logger.error(f"Error processing URL {url}: {str(e)}")

    def generate_prompt(self, query: str, context: List[str]) -> str:
        """Generate a prompt combining the query and retrieved context"""
        context_str = "\n".join(context)
        prompt = f"""Based on the following context, please provide a detailed and accurate answer to the question.

Context:
{context_str}

Question: {query}

Answer:"""
        return prompt

    def get_response(self, query: str, max_length: int = 1024) -> str:
        """Generate a response using RAG and LLaMA"""
        try:
            # Retrieve relevant context
            retrieved_texts = self.retrieve_context(query)
            
            # Generate prompt
            prompt = self.generate_prompt(query, retrieved_texts)
            
            # Tokenize and generate response
            inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
            
            outputs = self.model.generate(
                inputs["input_ids"],
                max_length=max_length,
                temperature=0.7,
                top_p=0.95,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id
            )
            
            response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            
            # Extract only the answer part (after "Answer:")
            response = response.split("Answer:")[-1].strip()
            
            return response
            
        except Exception as e:
            self.logger.error(f"Error generating response: {str(e)}")
            return "I apologize, but I encountered an error while processing your question."

    def retrieve_context(self, query: str, k: int = 3) -> List[str]:
        """Retrieve relevant context using Faiss"""
        query_embedding = sentence_transformer.encode([query], convert_to_numpy=True)
        distances, indices = self.faiss_index.search(query_embedding, k)
        return [self.stored_texts[i] for i in indices[0]]

    def chat(self):
        """Interactive chat interface"""
        print("🤖 LLaMA RAG Chatbot initialized. Type 'exit' to end the conversation.")
        
        while True:
            user_input = input("\n👤 You: ").strip()
            
            if user_input.lower() == 'exit':
                print("🤖 Goodbye!")
                break
                
            response = self.get_response(user_input)
            print(f"\n🤖 Bot: {response}")

# Example of how to use
url = "https://en.wikipedia.org/wiki/Generative_artificial_intelligence"
chatbot = LlamaRAGChatbot()
chatbot.initialize_knowledge_base(url)
chatbot.chat()


INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: paraphrase-MiniLM-L6-v2


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:__main__:Processed URL: https://en.wikipedia.org/wiki/Generative_artificial_intelligence
INFO:__main__:Knowledge base initialized with 59 chunks


🤖 LLaMA RAG Chatbot initialized. Type 'exit' to end the conversation.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


🤖 Bot: Generative artificial intelligence (GenAI or GAI) is a subset of artificial intelligence that uses generative models to produce text, images, videos, or other forms of data. These models learn the underlying patterns and structures of their training data and use them to produce new data based on the input, which often comes in the form of natural language prompts.

GenAI is a rapidly developing field, with significant improvements in transformer-based deep neural networks, particularly large language models (LLMs), driving an AI boom in the early 2020s. While some have argued that generative AI systems could reasonably be viewed as an early version of an artificial general intelligence (AGI) system, others maintain that these systems are still far from reaching the benchmark of 'general human intelligence' as of 2023.

The construction of a GenAI system typically involves applying unsupervised machine learning techniques, such as neural network architectures like generative adv

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from accelerate import init_empty_weights, load_checkpoint_and_dispatch

class LlamaRAGChatbot:
    def __init__(self, model_name: str = "meta-llama/Llama-2-7b-chat-hf", 
                 device: str = "cuda" if torch.cuda.is_available() else "cpu"):
        self.device = device
        
        # Initialize LLaMA tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

        # Load model with disk offloading
        self.model = self.load_model_with_disk_offloading(model_name)

        # Initialize storage for texts and FAISS index
        self.stored_texts = []
        self.embeddings = []
        self.faiss_index = None

    def load_model_with_disk_offloading(self, model_name: str):
        """Load model with disk offloading to manage large model sizes."""
        # Initialize empty weights (so the model is not immediately loaded into memory)
        with init_empty_weights():
            model = AutoModelForCausalLM.from_pretrained(model_name, 
                                                        torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
                                                        device_map="auto")
        
        # Load model weights to device, with disk offloading
        model = load_checkpoint_and_dispatch(model, model_name, device_map="auto", offload_folder="offload")

        return model

    # Other methods remain the same (scraping, embedding, etc.)

