In [37]:
from langchain.document_loaders import WebBaseLoader
from bs4 import BeautifulSoup
import re
import faiss
import numpy as np
from langchain_huggingface import HuggingFaceEmbeddings

# Function to clean extracted text
def clean_text(text):
    """Return the course blurbs found in *text*, one per line.

    The input is parsed as HTML and reduced to its text content, a fixed
    set of navigation labels is deleted, and every span that starts at a
    ``$<digits>`` price, contains ``LEARN`` and is followed by
    ``<digits> Lessons`` is captured (the lesson count itself is not part
    of the capture).

    Returns:
        The captured spans with whitespace runs collapsed to single
        spaces, joined by newlines, or the string
        ``"No relevant data found."`` when nothing matched.
    """
    plain = BeautifulSoup(text, "html.parser").get_text()
    plain = re.sub(r'(Sign In|FAQ|Contact Us|Home|Book a Free Demo Now)', '', plain)

    # DOTALL lets a single course blurb span several lines of page text.
    matches = re.findall(r'(\$\d+.*?LEARN.*?)\d+\s*Lessons', plain, re.DOTALL)
    if not matches:
        return "No relevant data found."

    blurbs = []
    for raw in matches:
        # Collapse internal newlines/indentation so each blurb is one line.
        blurbs.append(re.sub(r'\s+', ' ', raw).strip())
    return "\n".join(blurbs)

# Function to extract and clean data
def extract_data(url):
    """Load *url*, clean every non-blank document and save the text to data.txt.

    Each document returned by ``WebBaseLoader`` whose ``page_content`` is
    not blank is passed through ``clean_text``. When at least one document
    was processed, the results are joined with newlines and written to
    ``data.txt`` (UTF-8, overwriting any existing file); otherwise the file
    is left untouched. A status message is printed in both cases.
    """
    cleaned_data = []
    for doc in WebBaseLoader(url).load():
        if doc.page_content.strip():
            cleaned_data.append(clean_text(doc.page_content))

    # Guard clause: nothing usable came back, so do not touch data.txt.
    if not cleaned_data:
        print("No valid data extracted from the URL.")
        return

    with open("data.txt", "w", encoding="utf-8") as file:
        file.write("\n".join(cleaned_data))
    print("Data extracted and cleaned successfully.")

# Extract data from URL
url = "https://brainlox.com/courses/category/technical"
extract_data(url)

# Initialize embeddings and build FAISS index
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Each non-blank line of data.txt becomes one retrievable entry; its position
# in `data` is the row id of its vector in the FAISS index built below.
with open("data.txt", "r", encoding="utf-8") as file:
    data = [line.strip() for line in file if line.strip()]

# Fail with a clear message instead of an opaque IndexError on `.shape[1]`
# when there is nothing to index.
if not data:
    raise ValueError("data.txt contains no non-blank lines; nothing to index.")

# embed_documents encodes the whole corpus in one batched call rather than
# invoking the model once per line through embed_query.
data_embeddings = np.array(embeddings.embed_documents(data), dtype="float32")

# Create and save FAISS index (exact L2 search over the embedding dimension)
index = faiss.IndexFlatL2(data_embeddings.shape[1])
index.add(data_embeddings)
faiss.write_index(index, "faiss_index.bin")
print("FAISS index rebuilt successfully!")

# Function to retrieve answers from FAISS index
def get_answer(query):
    """Return the stored lines closest to *query*, joined by newlines.

    The query is embedded with the module-level ``embeddings`` model and
    the module-level FAISS ``index`` is searched for its 3 nearest
    neighbours. Each returned row id is mapped back to the corresponding
    entry of the module-level ``data`` list.

    Returns:
        Up to three matching lines separated by newlines, or
        ``"No relevant data found."`` when the search yields no usable id.
    """
    query_embedding = np.array([embeddings.embed_query(query)], dtype="float32")
    _, result_indices = index.search(query_embedding, k=3)

    # FAISS pads the result with -1 when the index holds fewer than k
    # vectors. Without the lower bound, -1 would pass the range check and
    # data[-1] would silently return the last line as a bogus extra answer.
    answers = [data[i] for i in result_indices[0] if 0 <= i < len(data)]
    return "\n".join(answers) if answers else "No relevant data found."

# Example usage: run one sample question through get_answer and print the
# retrieved lines after an "Answer:" label.
query = "What technical courses are available?"
answer = get_answer(query)
print("Answer:", answer)

Data extracted and cleaned successfully.
FAISS index rebuilt successfully!
Answer: $30per sessionLEARN ROBOTICS You can open all kinds of doors for advancement in so many careers with a basic understanding of el
$30per sessionLEARN SCRATCH PROGRAMING Scratch Course is the foundation of coding and is a building block of a coding journey. If you want
$30per sessionLEARN CORE JAVA PROGRAMMING ONLINE Java is a very popular high-level, class-based, object-oriented programming language that is design
