### Crawling the all the webpages URL's into a json file.

In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import json
from concurrent.futures import ThreadPoolExecutor, as_completed

def is_valid_url(url):
    """Check if the URL is well-formed."""
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)

def get_links_from_page(url, base_url, retries=3):
    """Fetch links from a single page with retry logic."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    for attempt in range(retries):
        try:
            response = requests.get(url.strip(), headers=headers, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            # Extract all links and resolve full URLs
            links = set()
            for link in soup.find_all('a', href=True):
                href = urljoin(base_url, link['href'])
                if is_valid_url(href) and href.startswith(base_url):
                    links.add(href)
            return links
        except requests.exceptions.RequestException as e:
            print(f"Attempt {attempt + 1} failed for {url}: {e}")
            if attempt == retries - 1:
                print(f"Skipping {url} after {retries} attempts.")
    return set()

def crawl_website(base_url, max_workers=20):
    """Crawl the website to get all unique URLs."""
    visited = set()
    to_visit = {base_url}
    all_links = set()

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        while to_visit:
            # Submit tasks for the current batch of URLs
            futures = {executor.submit(get_links_from_page, url, base_url): url for url in to_visit}
            to_visit = set()  # Clear the queue for the next iteration

            for future in as_completed(futures):
                url = futures[future]
                try:
                    links = future.result()
                    new_links = links - visited
                    to_visit.update(new_links)
                    all_links.update(new_links)
                    visited.add(url)
                except Exception as e:
                    print(f"Error processing {url}: {e}")

    return list(all_links)

def main():
    base_url = "https://botpenguin.com/"
    print("Starting fast crawl...")

    # Crawl the website
    all_links = crawl_website(base_url, max_workers=50)
    print(f"Found {len(all_links)} unique pages.")

    # Save to JSON
    with open("botpenguin_urls.json", "w") as json_file:
        json.dump(all_links, json_file, indent=4)

    print("Page URLs saved to 'botpenguin_urls.json'")

if __name__ == "__main__":
    main()


Starting fast crawl...
Attempt 1 failed for https://botpenguin.com/chatbot-templates/pizza-delivery: 404 Client Error: Not Found for url: https://botpenguin.com/chatbot-templates/pizza-delivery
Attempt 2 failed for https://botpenguin.com/chatbot-templates/pizza-delivery: 404 Client Error: Not Found for url: https://botpenguin.com/chatbot-templates/pizza-delivery
Attempt 3 failed for https://botpenguin.com/chatbot-templates/pizza-delivery: 404 Client Error: Not Found for url: https://botpenguin.com/chatbot-templates/pizza-delivery
Skipping https://botpenguin.com/chatbot-templates/pizza-delivery after 3 attempts.
Attempt 1 failed for https://botpenguin.com/blogs/chatbots-in-spotlight-top-white-label-chatbot-resellers-2023: HTTPSConnectionPool(host='botpenguin.com', port=443): Read timed out. (read timeout=10)
Attempt 1 failed for https://botpenguin.com/glossary/masked-language-modeling: 405 Client Error: Not Allowed for url: https://botpenguin.com/glossary/masked-language-modeling
Attemp

### Content Extraction from the URL's

In [2]:
import requests
from bs4 import BeautifulSoup
import json

def fetch_page_content(url):
    """Fetch the content of a page."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        response = requests.get(url.strip(), headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract main content (modify as per website structure)
        body_text = soup.get_text(separator='\n', strip=True)
        return body_text
    except requests.exceptions.RequestException as e:
        print(f"Failed to fetch {url}: {e}")
        return None

def process_urls_and_save(input_file, output_file):
    """Read URLs from JSON, fetch their content, and save to a text file."""
    try:
        # Load URLs from JSON file
        with open(input_file, 'r') as json_file:
            urls = json.load(json_file)

        with open(output_file, 'w', encoding='utf-8') as text_file:
            for url in urls:
                print(f"Processing: {url}")
                content = fetch_page_content(url)
                if content:
                    # Write content to text file
                    text_file.write(f"URL: {url}\n")
                    text_file.write(content + "\n\n")
                else:
                    text_file.write(f"URL: {url}\nFailed to fetch content.\n\n")
    except Exception as e:
        print(f"Error processing URLs: {e}")

def main():
    input_file = "botpenguin_urls.json"  # JSON file with URLs
    output_file = "botpenguin_content.txt"  # Output text file for content

    print("Fetching content from URLs...")
    process_urls_and_save(input_file, output_file)
    print(f"Content saved to '{output_file}'")

if __name__ == "__main__":
    main()


Fetching content from URLs...
Processing: https://botpenguin.com/glossary/aiml
Processing: https://botpenguin.com/chatbot-integrations/close-crm
Processing: https://botpenguin.com/chatbot-templates/saas
Processing: https://botpenguin.com/glossary/bullwhip-effect
Processing: https://botpenguin.com/blogs/chatbot-use-cases-to-generate-leads-for-b2b
Processing: https://botpenguin.com/glossary/business-process-outsourcing
Processing: https://botpenguin.com/glossary/machine-learning-algorithms
Processing: https://botpenguin.com/alternatives/freshchat
Processing: https://botpenguin.com/alternatives/flowxo
Processing: https://botpenguin.com/glossary/chatbot-avatar
Processing: https://botpenguin.com/blogs/how-to-get-started-with-customized-whitelabel-chatgpt
Processing: https://botpenguin.com/glossary/engagement-rate
Processing: https://botpenguin.com/glossary/in-app-support
Processing: https://botpenguin.com/alternatives/manychat
Processing: https://botpenguin.com/whatsapp-business-api-pricing

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from sentence_transformers import SentenceTransformer, util
import torch
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import pipeline
# tokenizer = AutoTokenizer.from_pretrained("nvidia/Llama-3.1-Nemotron-70B-Instruct-HF")
# model = AutoModelForCausalLM.from_pretrained("nvidia/Llama-3.1-Nemotron-70B-Instruct-HF")
# Load the Llama model and tokenizer
def load_llama_model():
    # model_name = "meta-llama/Llama-2-7b-chat-hf"  # Replace with "llama-3" when available
    tokenizer = AutoTokenizer.from_pretrained("nvidia/Llama-3.1-Nemotron-70B-Instruct-HF")
    model = AutoModelForCausalLM.from_pretrained("nvidia/Llama-3.1-Nemotron-70B-Instruct-HF")
    return model, tokenizer

# Preprocessing and embeddings
def preprocess_and_embed(data_file, model_name="sentence-transformers/all-MiniLM-L6-v2"):
    with open(data_file, 'r', encoding='utf-8') as file:
        data = file.readlines()

    # Chunk text
    chunks = []
    chunk_size = 500  # Limit chunk size for manageable context
    for line in data:
        for i in range(0, len(line), chunk_size):
            chunks.append(line[i:i + chunk_size])

    # Embed chunks
    embedding_model = SentenceTransformer(model_name)
    embeddings = embedding_model.encode(chunks, convert_to_tensor=True)
    return chunks, embeddings

# Search for relevant context
def search_query(query, chunks, embeddings, model_name="sentence-transformers/all-MiniLM-L6-v2"):
    embedding_model = SentenceTransformer(model_name)
    query_embedding = embedding_model.encode(query, convert_to_tensor=True)
    cosine_scores = util.pytorch_cos_sim(query_embedding, embeddings)

    # Retrieve the most relevant chunk
    top_idx = torch.argmax(cosine_scores).item()
    return chunks[top_idx]

# Generate a response using Llama
def generate_response(prompt, context, model, tokenizer):
    full_prompt = f"Context: {context}\n\nQuestion: {prompt}\n\nAnswer:"
    inputs = tokenizer(full_prompt, return_tensors="pt").to("cuda")  # Ensure tensors are on GPU
    outputs = model.generate(inputs["input_ids"], max_length=200, temperature=0.7)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

def main():
    # Load Llama model and tokenizer
    llama_model, llama_tokenizer = load_llama_model()

    # Preprocess and embed website data
    data_file = "botpenguin_content.txt"
    chunks, embeddings = preprocess_and_embed(data_file)

    print("Chatbot is ready! Ask me a question about the website:")
    while True:
        query = input("You: ")
        if query.lower() in {"exit", "quit"}:
            print("Goodbye!")
            break

        # Find the relevant context
        context = search_query(query, chunks, embeddings)

        # Generate a response
        response = generate_response(query, context, llama_model, llama_tokenizer)
        print(f"Bot: {response}")

if __name__ == "__main__":
    main()


  from .autonotebook import tqdm as notebook_tqdm





Downloading shards:   0%|          | 0/30 [00:35<?, ?it/s]


KeyboardInterrupt: 

In [4]:
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFaceEndpoint
from langchain.prompts import PromptTemplate
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import TextLoader
import os
os.environ['HUGGINGFACEHUB_API_TOKEN'] = "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"
# 1. Initialize the LLM
llm = HuggingFaceEndpoint(
    endpoint_url="https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.2",
    headers={"Authorization": f"Bearer YOUR_HUGGINGFACE_API_KEY"},
    max_length=3000,
    temperature=0.3,
)

# 2. Function to load and process text data
def load_and_embed_data(file_path, embedding_model="sentence-transformers/all-MiniLM-L6-v2"):
    """Load text data, split, and embed."""
    # Load text data
    loader = TextLoader(file_path)
    documents = loader.load()

    # Split text into chunks
    text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    texts = text_splitter.split_documents(documents)

    # Embed texts
    embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
    vector_store = FAISS.from_documents(texts, embeddings)

    # Save FAISS index
    vector_store.save_local("faiss_index")
    print("FAISS index created and saved.")
    return vector_store

# 3. Load FAISS index or create if not exists
def get_vector_store(file_path):
    """Load or create a FAISS vector store."""
    if os.path.exists("faiss_index"):
        vector_store = FAISS.load_local("faiss_index", HuggingFaceEmbeddings())
        print("FAISS index loaded.")
    else:
        vector_store = load_and_embed_data(file_path)
    return vector_store

# 4. Create RetrievalQA Chain
def create_retrieval_chain(vector_store, llm):
    """Create a RetrievalQA chain using FAISS and LLM."""
    retriever = vector_store.as_retriever()
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=retriever,
        return_source_documents=True,
        chain_type="stuff",  # Combine retrieved context into one prompt
    )
    return qa_chain

# 5. Process user input
def process_user_query(qa_chain, query):
    """Get an answer to a user's query."""
    response = qa_chain.run(query)
    print("\nAnswer:")
    print(response)

# Main Workflow
def main():
    # Path to text data
    data_file = r"C:\Users\rushabh.parikh\Desktop\Chatbotforwebsearch\botpenguin_content.txt"

    # Get vector store
    vector_store = get_vector_store(data_file)

    # Create QA chain
    qa_chain = create_retrieval_chain(vector_store, llm)

    print("Chatbot ready! Ask me anything about the website.")
    while True:
        query = input("\nYou: ")
        if query.lower() in {"exit", "quit"}:
            print("Goodbye!")
            break

        # Get answer
        process_user_query(qa_chain, query)

if __name__ == "__main__":
    main()


  warn_deprecated(
                    headers was transferred to model_kwargs.
                    Please make sure that headers is what you intended.
                    max_length was transferred to model_kwargs.
                    Please make sure that max_length is what you intended.


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to C:\Users\rushabh.parikh\.cache\huggingface\token
Login successful


RuntimeError: Error loading botpenguin_content.txt