In [1]:
import requests
from bs4 import BeautifulSoup
from langchain.document_loaders import WebBaseLoader
from bs4 import SoupStrainer
from threading import Thread
import os
#import re

USER_AGENT environment variable not set, consider setting it to identify your requests.


## Extracting data from version 4.1 or higher

In [2]:
version = '4.5'
if float(version) <= 4: print('Warning, this method works only for blender verson 4.1 or higher')
base_url = f'https://docs.blender.org/manual/en/{version}/'

In [3]:
response = requests.get(base_url)
bs = BeautifulSoup(response.text,'lxml')

In [4]:
side_bar = bs.find('div',{'class':'sidebar-tree'})
bs_side_bar = BeautifulSoup(str(side_bar),'lxml')
side_bar_items = bs_side_bar.find_all('a')
len(side_bar_items)

2112

In [20]:
links = {}
for item in side_bar_items:
    link = item.get('href')
    if link and not link.endswith('index.html'):
        full_link = base_url + link
        if full_link not in links:
            links[full_link] = item.text

len(links)

1721

In [9]:
def extract_docs(links:dict,n_jobs=5):
    infos = list(links.keys())
    values = list(links.values())

    threads = []
    results = [0]*len(infos)
    n_jobs = n_jobs if n_jobs > 0 else os.cpu_count()
    for i in range(n_jobs):
        thread = Thread(target=worker, args=(infos,values,i,n_jobs,results ))
        threads.append(thread)
        thread.start()

    for p in threads:
        p.join()

    return [item for items in results for item in items ]


def worker(infos,values,start,inc,results):
    for i in range(start,len(infos),inc):
        web_loader = WebBaseLoader(values[i],bs_kwargs=dict( parse_only=SoupStrainer('article') ) )
        docs = web_loader.load()
        for doc in docs:
            doc.metadata['info'] = infos[i]
            doc.metadata['version'] = version #values[i]['version']
        results[i] = docs

In [10]:
docs = extract_docs(links,n_jobs=-1)

In [11]:
len(docs)

1292

In [12]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 2000,chunk_overlap = 300)
docs_split = text_splitter.split_documents(docs)

In [13]:
from langchain.vectorstores import FAISS
from langchain_ollama import OllamaEmbeddings

In [14]:
embedding = OllamaEmbeddings(base_url='localhost:11434',model='nomic-embed-text')

In [18]:
vectorstore = FAISS.from_documents(docs_split,embedding=embedding)
vectorstore.save_local(f'vectordb_{version}')