In [1]:
from langchain_community.document_loaders import UnstructuredURLLoader
import re
import string
import faiss
import numpy as np
import pickle

In [2]:
# Pages to ingest; currently a single course-listing page.
urls = [
    "https://brainlox.com/courses/category/technical",
]

In [3]:
# Wrap the URL list in an UnstructuredURLLoader.
loader = UnstructuredURLLoader(
    urls=urls,
)

In [4]:
# Fetch the configured URLs and parse them into documents.
data = loader.load()

# Every later cell indexes data[0], so an empty result must be caught here
# with a clear message rather than as an IndexError further down.
# NOTE(review): UnstructuredURLLoader appears to log and skip URLs that fail
# to load by default (continue_on_failure) -- confirm for the installed version.
if not data:
    raise RuntimeError(f"No documents were loaded from {urls}")

In [5]:
# Replace each newline with a space instead of deleting it. Deleting glues the
# end of one line to the start of the next ("30\n16" -> "3016",
# "basics\nlearn" -> "basicslearn"), and the letter/digit boundary regexes in
# the following cells cannot separate such pairs again.
data[0].page_content = data[0].page_content.replace('\n', ' ')

In [6]:
# Remove ASCII punctuation character by character, but keep "$": a later cell
# inserts a space in front of "$", which can only have an effect if the sign
# is still present at that point.
# NOTE(review): "." is still stripped, so a price written with decimals
# (e.g. 30.50) would read as 3050 -- confirm whether that matters for this page.
data[0].page_content = data[0].page_content.translate(
    str.maketrans('', '', string.punctuation.replace('$', ''))
)

In [7]:
# Drop the specific emoji characters listed in the character class.
data[0].page_content = re.sub(
    pattern=r'[🤖🌟💡🎓🚀📈]',
    repl='',
    string=data[0].page_content,
)

In [8]:
# Insert a space at every zero-width position between a lowercase ASCII letter
# and an uppercase ASCII letter, e.g. "fooBar" -> "foo Bar".
data[0].page_content = re.compile(r'(?<=[a-z])(?=[A-Z])').sub(
    ' ', data[0].page_content
)

In [9]:
# Insert a space wherever an ASCII letter is immediately followed by "$".
data[0].page_content = re.sub(
    pattern=r'(?<=[a-zA-Z])(?=\$)',
    repl=' ',
    string=data[0].page_content,
)

In [10]:
# Insert a space wherever an ASCII letter is immediately followed by a digit,
# e.g. "Lessons16" -> "Lessons 16".
data[0].page_content = re.compile(r'(?<=[a-zA-Z])(?=\d)').sub(
    ' ', data[0].page_content
)

In [11]:
# Insert a space wherever a digit is immediately followed by an ASCII letter,
# e.g. "16Lessons" -> "16 Lessons".
data[0].page_content = re.sub(
    pattern=r'(?<=\d)(?=[a-zA-Z])',
    repl=' ',
    string=data[0].page_content,
)

In [12]:
# Insert a space between an uppercase letter and a following
# "uppercase + lowercase" pair, which separates a run of capitals from the
# capitalised word after it, e.g. "AICourses" -> "AI Courses".
data[0].page_content = re.compile(r'(?<=[A-Z])(?=[A-Z][a-z])').sub(
    ' ', data[0].page_content
)

In [13]:
# Title-case the cleaned text: the first letter of each word is upper-cased and
# the remaining letters are lower-cased (so "AI" becomes "Ai").
data[0].page_content = str.title(data[0].page_content)

In [14]:
from sentence_transformers import SentenceTransformer
from langchain_text_splitters import RecursiveCharacterTextSplitter

  from .autonotebook import tqdm as notebook_tqdm


In [15]:
# Sentence-embedding model used to vectorise the text chunks.
model = SentenceTransformer(
    'all-MiniLM-L6-v2',
)

In [16]:
# Chunking parameters: chunks of size 500 with an overlap of 100 between
# consecutive chunks (sizes as measured by the splitter's length function).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100,
)

In [17]:
# Break the cleaned document(s) into chunks using the splitter configured above.
splits = text_splitter.split_documents(
    data
)

In [18]:
# Keep only the raw text of each chunk, in the same order as `splits`.
split_texts = list(map(lambda piece: piece.page_content, splits))

In [19]:
# With no chunks there is nothing to embed, and the resulting array would not
# be 2-D, which breaks the `.shape[1]` lookup in the next cell. Stop here with
# an explicit error instead.
if not split_texts:
    raise ValueError("No text chunks to embed; check that the page loaded and was split")

# Encode all chunks in a single batched call rather than one call per chunk;
# row i of the result is the embedding of split_texts[i].
# The dtype is pinned to float32 because that is what FAISS indexes accept.
split_embeddings = np.asarray(
    model.encode(split_texts, convert_to_tensor=False),
    dtype=np.float32,
)

In [20]:
# Width of one embedding vector (second axis of the embeddings array).
embedding_dim = np.shape(split_embeddings)[1]

In [21]:
# FAISS flat L2 index sized to the embedding dimension.
index = faiss.IndexFlatL2(
    embedding_dim,
)

In [22]:
# Add every chunk embedding to the index.
index.add(
    split_embeddings
)

In [23]:
# Persist the populated index to disk next to the notebook.
faiss.write_index(
    index,
    "faiss_index.bin",
)

In [24]:
# Store the chunk texts alongside the index. The list keeps the order in which
# the embeddings were built, so position i here matches row i of the embeddings.
with open("text_chunks.pkl", mode="wb") as f:
    f.write(pickle.dumps(split_texts))

In [25]:
# Confirmation that both artifacts (index file and pickled chunks) were written.
print(
    "FAISS index and text chunks saved successfully!",
    end="\n",
)

FAISS index and text chunks saved successfully!
