In [12]:
import requests
from bs4 import BeautifulSoup

def scrape_data(url, timeout=30):
    """Download a Wikipedia-style page and extract its title and paragraph text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout the request could block the notebook indefinitely.

    Returns:
        dict with two keys:
            'title'   -- text of the ``<h1 id="firstHeading">`` element, or
                         'No Title' when that element is absent.
            'content' -- text of every ``<p>`` inside the first
                         ``<div class="mw-parser-output">``, joined with single
                         spaces ('' when the div is absent).

    Raises:
        Whatever ``response.raise_for_status()`` raises for a 4xx/5xx reply, so
        an error page is never silently parsed as if it were the article.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Look the heading up once instead of searching the tree twice.
    heading = soup.find('h1', {'id': 'firstHeading'})
    title = heading.text if heading is not None else 'No Title'

    content_div = soup.find('div', {'class': 'mw-parser-output'})
    paragraphs = content_div.find_all('p') if content_div is not None else []
    content = ' '.join(p.text for p in paragraphs)

    return {'title': title, 'content': content}

# Example Wikipedia URL
url = "https://en.wikipedia.org/wiki/Generative_artificial_intelligence"
data = scrape_data(url)

# Upper bound on how much scraped text is echoed into the cell output.
PREVIEW_CHARS = 50000
preview = data['content'][:PREVIEW_CHARS]
# Only show the truncation marker when text was actually cut off.
suffix = "..." if len(data['content']) > PREVIEW_CHARS else ""

print(f"Scraped Title: {data['title']}\n")
print(f"Scraped Content:\n{preview}{suffix}\n")


Scraped Title: Generative artificial intelligence

Scraped Content:

 Generative artificial intelligence (generative AI, GenAI,[1] or GAI) is a subset of artificial intelligence that uses generative models to produce text, images, videos, or other forms of data.[2][3][4] These models learn the underlying patterns and structures of their training data and use them to produce new data[5][6] based on the input, which often comes in the form of natural language prompts.[7][8]
 Improvements in transformer-based deep neural networks, particularly large language models (LLMs), enabled an AI boom of generative AI systems in the early 2020s. These include chatbots such as ChatGPT, Copilot, Gemini, and LLaMA; text-to-image artificial intelligence image generation systems such as Stable Diffusion, Midjourney, and DALL-E; and text-to-video AI generators such as Sora.[9][10][11][12] Companies such as OpenAI, Anthropic, Microsoft, Google, and Baidu as well as numerous smaller firms have developed ge

In [1]:
import requests
from bs4 import BeautifulSoup

def scrape_data(url, timeout=30):
    """Download a Wikipedia-style page and extract its title and paragraph text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout the request could block the notebook indefinitely.

    Returns:
        dict with two keys:
            'title'   -- text of the ``<h1 id="firstHeading">`` element, or
                         'No Title' when that element is absent.
            'content' -- whitespace-stripped text of every non-blank ``<p>``
                         inside the first ``<div class="mw-parser-output">``,
                         one paragraph per line ('' when the div is absent).

    Raises:
        Whatever ``response.raise_for_status()`` raises for a 4xx/5xx reply, so
        an error page is never silently parsed as if it were the article.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Look the heading up once instead of searching the tree twice.
    heading = soup.find('h1', {'id': 'firstHeading'})
    title = heading.text if heading is not None else 'No Title'

    content_div = soup.find('div', {'class': 'mw-parser-output'})
    paragraphs = content_div.find_all('p') if content_div is not None else []

    # Strip each paragraph once, then drop the ones that were only whitespace.
    stripped = (p.text.strip() for p in paragraphs)
    content = '\n'.join(text for text in stripped if text)

    return {'title': title, 'content': content}

def save_to_txt(data, filename):
    """Write a scraped article to a UTF-8 text file.

    The file is created (or overwritten) with a ``Title: ...`` line, one blank
    line, and then the article content.

    Args:
        data: Mapping providing the 'title' and 'content' entries to write.
        filename: Path of the text file to write.
    """
    with open(filename, mode='w', encoding='utf-8') as out:
        header = f"Title: {data['title']}\n\n"
        out.write(header)
        body = data['content']
        out.write(body)

# Where the scraped article is written, and which page it comes from
filename = "scraped_data.txt"
url = "https://en.wikipedia.org/wiki/Generative_artificial_intelligence"

# Fetch the example page, then persist its title and text
data = scrape_data(url)
save_to_txt(data, filename)

print(f"Scraped data saved to {filename}")


Scraped data saved to scraped_data.txt


In [18]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_wikipedia(url, timeout=30):
    """Fetch a Wikipedia page and return its paragraph text as one string.

    Args:
        url: Address of the article to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout the request could block the notebook indefinitely.

    Returns:
        The text of every non-blank ``<p>`` found under the element with
        ``id="mw-content-text"``, joined with single spaces. Returns '' when
        the page has no such element.

    Raises:
        Whatever ``response.raise_for_status()`` raises for a 4xx/5xx reply, so
        an error page is never silently parsed as if it were the article.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Restrict the search to the main content container.
    content = soup.find(id="mw-content-text")
    if content is None:
        # Page layout not recognised: report "no text" instead of crashing
        # with an AttributeError on None.
        return ''

    # Keep only paragraphs that contain something other than whitespace.
    paragraphs = [p.text for p in content.find_all('p') if p.text.strip()]
    return ' '.join(paragraphs)

In [19]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

def create_faiss_index(texts, model_name='sentence-transformers/all-MiniLM-L6-v2'):
    """Embed ``texts`` and store the vectors in a flat L2 FAISS index.

    Args:
        texts: The strings to index. A single string is treated as a
            one-element collection.
        model_name: Name of the SentenceTransformer model used to embed the
            texts. Defaults to the model this notebook originally hard-coded.

    Returns:
        (index, model): the populated ``faiss.IndexFlatL2`` and the
        ``SentenceTransformer`` instance, so callers can embed queries with
        the same model.

    Raises:
        ValueError: if ``texts`` is empty; there would be no embedding whose
            width could define the index dimension.
    """
    # A bare string would otherwise be embedded as a single 1-D vector and
    # break the ``shape[1]`` lookup below.
    if isinstance(texts, str):
        texts = [texts]
    texts = list(texts)
    if not texts:
        raise ValueError("texts must contain at least one string to index")

    # Initialize the embedding model
    model = SentenceTransformer(model_name)

    # One conversion to a contiguous float32 matrix instead of copying twice.
    embeddings = np.ascontiguousarray(model.encode(texts), dtype='float32')

    # The index dimension is the width of one embedding vector.
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)

    # Add vectors to index
    index.add(embeddings)

    return index, model

In [20]:
from transformers import AutoTokenizer, AutoModelForCausalLM

def setup_rag_pipeline(model_name="meta-llama/Llama-2-7b-chat-hf"):
    """Load the causal language model and its tokenizer.

    Args:
        model_name: Checkpoint identifier passed to both ``from_pretrained``
            calls, so the tokenizer and the model always come from the same
            checkpoint. Defaults to the checkpoint this notebook originally
            hard-coded.

    Returns:
        (model, tokenizer) -- note the order: the model comes first.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)

    return model, tokenizer

def query_rag(question, context, model, tokenizer, max_new_tokens=200):
    """Ask the language model ``question``, grounded in ``context``.

    Args:
        question: The user's question.
        context: Retrieved text placed in the prompt ahead of the question.
        model: Object exposing ``generate(**inputs, max_new_tokens=...)``.
        tokenizer: Callable that encodes the prompt (called with
            ``return_tensors="pt"``) and exposes ``decode``.
        max_new_tokens: Upper bound on the number of tokens generated *after*
            the prompt.

    Returns:
        The first generated sequence decoded with special tokens skipped.
        NOTE(review): for decoder-only models this presumably still starts
        with the prompt text -- confirm before parsing the answer out of it.
    """
    # Combine context and question
    prompt = f"Context: {context}\n\nQuestion: {question}\n\nAnswer:"

    inputs = tokenizer(prompt, return_tensors="pt")
    # Budget the generated tokens only. A total-length cap (max_length) also
    # counts the prompt, so a long retrieved context would leave no room for
    # the answer.
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return response

In [21]:
def main():
    """Run the end-to-end demo: scrape, chunk, index, retrieve, and answer.

    Scrapes one Wikipedia article, cuts the text into fixed-size character
    chunks, indexes the chunks, retrieves the chunks nearest to an example
    question, and prints the language model's response.

    Raises:
        ValueError: if no text could be scraped, since there would be nothing
            to index or retrieve.
    """
    wiki_url = "https://en.wikipedia.org/wiki/Generative_artificial_intelligence"
    chunk_size = 512  # characters per chunk
    top_k = 3         # chunks retrieved per query

    # 1. Scrape Wikipedia
    content = scrape_wikipedia(wiki_url)

    # 2. Split content into fixed-width character chunks
    chunks = [content[i:i + chunk_size] for i in range(0, len(content), chunk_size)]
    if not chunks:
        raise ValueError(f"No text could be extracted from {wiki_url}")

    # 3. Create FAISS index
    faiss_index, embedding_model = create_faiss_index(chunks)

    # 4. Setup RAG pipeline
    llm_model, tokenizer = setup_rag_pipeline()

    # 5. Query handling
    def process_query(query):
        """Retrieve the chunks nearest to ``query`` and generate a response."""
        query_embedding = embedding_model.encode([query])

        # Never ask for more neighbours than there are chunks: the index pads
        # missing results with -1, and chunks[-1] would silently return the
        # last chunk instead of "no result".
        k = min(top_k, len(chunks))
        _, neighbor_ids = faiss_index.search(query_embedding, k)
        relevant_chunks = [chunks[i] for i in neighbor_ids[0] if i >= 0]

        # Generate response using RAG
        context = " ".join(relevant_chunks)
        return query_rag(query, context, llm_model, tokenizer)

    # Example usage
    query = "what is generative AI"
    answer = process_query(query)
    print(answer)