In [6]:
import requests
from lxml import html
import io, os
from PyPDF2 import PdfReader
from PyPDF2.errors import PdfReadError
import urllib
import pypdfium2 as pdfium
from pymongo import MongoClient
from openai import OpenAI
from urllib.error import HTTPError
from tenacity import retry
from tenacity import stop_after_delay
from tenacity import RetryError
from tenacity import stop_after_attempt
from tenacity import wait_exponential
from langchain.text_splitter import RecursiveCharacterTextSplitter
import tiktoken
import uuid 

# --- Configuration ---------------------------------------------------------
# Secrets are read from the environment so they are never saved with the
# notebook; the empty-string fallbacks keep the previous placeholder values
# when the variables are not set.
openai_key = os.environ.get("OPENAI_API_KEY", "")
# Index page that the scraping cell requests first.
web_url = "https://pakistancode.gov.pk/english/LGu0xVD.php"
mongo_db_uri = os.environ.get("MONGODB_URI", "")
# Names of the MongoDB database and collection that receive the embedded chunks.
database = "ai_assistant_builder"
collection = "data"
# OpenAI client handed to create_embedding().
client = OpenAI(api_key=openai_key)


def tiktoken_len(text):
    """Return how many ``cl100k_base`` tokens ``text`` encodes to.

    ``disallowed_special=()`` is passed to ``encode`` so that no special-token
    marker in the text is treated as disallowed.
    """
    encoding = tiktoken.get_encoding('cl100k_base')
    return len(encoding.encode(text, disallowed_special=()))
    
# Splitter used to cut extracted PDF text into chunks before embedding.
# Lengths are measured with tiktoken_len, so the sizes below are token counts.
splitter_options = {
    'chunk_size': 400,
    'chunk_overlap': 20,  # number of tokens shared by consecutive chunks
    'length_function': tiktoken_len,
    'separators': ['\n\n', '\n', ' ', ''],
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

def get_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        pdf_path: Input passed straight to ``pdfium.PdfDocument``.

    Returns:
        The page texts concatenated in page order, with no separator added
        between pages. An empty string if the document has no pages.

    Any exception raised while opening or reading the document propagates to
    the caller; the document is closed first.
    """
    pdf = pdfium.PdfDocument(pdf_path)
    try:
        # Collect the per-page text and join once instead of growing a string
        # with repeated ``+=``.
        page_texts = [
            pdf[page_index].get_textpage().get_text_bounded()
            for page_index in range(len(pdf))
        ]
        return "".join(page_texts)
    finally:
        # Always release the document handle, even when a page fails to parse.
        pdf.close()

@retry(wait=wait_exponential(multiplier=1, min=4, max=10), stop=stop_after_attempt(10))
def create_embedding(client,text):
    """Request a ``text-embedding-ada-002`` embedding for ``text``.

    Args:
        client: Object exposing ``embeddings.create`` (the OpenAI client).
        text: The text to embed.

    Returns:
        ``(response, 'success')`` when the request succeeds, or
        ``(exception, 'failed')`` for any error other than a server-overload
        error.

    Raises:
        The overload error is re-raised so the ``@retry`` decorator tries
        again: up to 10 attempts, with an exponential wait bounded to 4-10
        seconds. Once the attempts are exhausted tenacity raises
        ``RetryError``.
    """
    try:
        response = client.embeddings.create(
            model="text-embedding-ada-002",
            input=text
        )
    except Exception as e:
        # Compare against the message text: ``in`` applied to the exception
        # object itself raises TypeError, which made every failure look
        # retryable and left the 'failed' branch unreachable.
        if 'The server is currently overloaded with other requests' in str(e):
            raise  # keep the original exception so tenacity retries with full context
        return (e, 'failed')
    return (response, 'success')


# Create the MongoDB client from the configured URI and select the database.
client_mongo = MongoClient(mongo_db_uri)
db = client_mongo[database]
# NOTE: this rebinds `collection` from the collection-name string assigned
# earlier in this cell to the handle returned by `db[...]`; the later
# `collection.insert_many(...)` call relies on the rebound value.
collection = db[collection]

In [7]:
# Crawl the site in three hops: the index page lists categories, each category
# page lists law pages, and each law page holds the PDF download link.
# Result: all_pdfs maps category name -> list of PDF download URLs.
BASE_URL = "https://pakistancode.gov.pk/english/"

response = requests.get(web_url, timeout=60)
# Fail loudly if the index page itself cannot be fetched; otherwise an error
# page would be parsed and silently produce an empty crawl.
response.raise_for_status()
tree = html.fromstring(response.content)

# Take the name and the URL from the same <a> element so the two lists cannot
# drift out of step when an anchor lacks text or an href.
category_links = tree.xpath('//div[@id="category"]/ul/div/a[@href]')
urls = [BASE_URL + link.get("href") for link in category_links]
categories = ["".join(link.xpath("text()")) for link in category_links]

all_pdfs = {}
for category, url in zip(categories, urls):
    response = requests.get(url, timeout=60)
    tree = html.fromstring(response.content)
    # Categories with the same name share one list.
    category_pdfs = all_pdfs.setdefault(category, [])
    pdfs_urls = [BASE_URL + item for item in tree.xpath('//div[@class="accordion"]//a/@href')]
    for pdf_url in pdfs_urls:
        pdf_url_response = requests.get(pdf_url, timeout=60)
        pdf_url_tree = html.fromstring(pdf_url_response.content)
        pdf_download_url = pdf_url_tree.xpath('//div[@id="download"]/a/@href')
        if not pdf_download_url:
            # A law page without a download link used to raise IndexError and
            # abort the whole crawl; skip it instead.
            print(f"No download link found on {pdf_url}; skipping")
            continue
        category_pdfs.append(pdf_download_url[0])

In [28]:
# For every collected PDF: download it, extract and chunk its text, embed each
# chunk, and store the chunk documents in MongoDB.
os.makedirs("downloaded_pdfs", exist_ok=True)
for category, pdf_urls in all_pdfs.items():
    print(category)
    os.makedirs(f"downloaded_pdfs/{category}", exist_ok=True)
    for url in pdf_urls:
        print(url)
        file_url = f"./downloaded_pdfs/{category}/{url.split('/')[-1]}"
        try:
            urllib.request.urlretrieve(url, file_url)
        except HTTPError as download_error:
            print(f"Skipping {url}: download failed ({download_error})")
            continue
        pdf_content = get_text_from_pdf(file_url)
        splitted_content = text_splitter.split_text(pdf_content)
        # Collapse runs of whitespace inside each chunk and drop blank chunks.
        splitted_content = [' '.join(i.split()) for i in splitted_content if i.strip() != '']
        vector_data = []
        for chunk in splitted_content:
            try:
                embedding_resp = create_embedding(client, chunk)
            except RetryError:
                print("Some error occured!")
                continue  # no embedding for this chunk, so there is nothing to store
            if embedding_resp[1] == 'failed':
                print(embedding_resp[0])
                # Skip the chunk; falling through would reuse a stale
                # `response` from an earlier chunk or an earlier cell.
                continue
            response = embedding_resp[0]
            vector_data.append({
                'embedding': response.data[0].embedding,
                'content': chunk,
                '_id': str(uuid.uuid4()),
                'category': category,
                'url': url,
            })
        # insert_many rejects an empty document list, which happens for PDFs
        # with no extractable text or when every chunk failed to embed.
        if vector_data:
            collection.insert_many(vector_data, ordered=False)

Civil Laws
https://pakistancode.gov.pk/pdffiles/administrator0f037525851391debc0835ec00e84cbf.pdf
https://pakistancode.gov.pk/pdffiles/administratorb60fe60d8c74ce862aafd15e48a035af.pdf
https://pakistancode.gov.pk/pdffiles/administrator6598dabbad120033d4d42d717dcf9755.pdf


KeyboardInterrupt: 