In [None]:
import os
import sys

import nest_asyncio

# Patch asyncio so that nested event loops are permitted.
# NOTE(review): presumably required because Jupyter already runs an event loop
# and later cells use async workers — confirm.
nest_asyncio.apply()
# Make the helper modules under ../src importable from this notebook.
# The membership check keeps the cell idempotent: re-running it does not pile
# up duplicate entries on sys.path.
SRC_DIR = '../src'
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

# Step 0: Setup LLM and Embedding Model

In [None]:
import setup_utils

# Configure the LLM; the details live in the project's setup_utils module.
# NOTE(review): presumably registers it as the default for later steps — confirm.
setup_utils.setup_llm()

# Configure the embedding model in the same way.
setup_utils.setup_embed_model()

# Step 1: Clean PDF

In [None]:
from pathlib import Path
import random

import cleaning_utils

# Folder containing the solution-brief PDFs. The default is the original demo
# machine's location; set the SOLUTION_BRIEF_DIR environment variable to point
# at a different copy without editing the notebook.
pdf_directory = os.environ.get(
    "SOLUTION_BRIEF_DIR",
    "/home/demotime/DeciLM_RAG_Demo/SuperMicro_Solution_Brief",
)

pdf_files = Path(pdf_directory)

# iterdir() yields every directory entry (sub-directories, hidden files, ...)
# in arbitrary order, so keep only regular *.pdf files and sort them.
pdf_files_paths = sorted(
    path
    for path in pdf_files.iterdir()
    if path.is_file() and path.suffix.lower() == ".pdf"
)

if not pdf_files_paths:
    raise FileNotFoundError(f"No PDF files found in {pdf_files}")

# Select two random solution briefs. random.sample raises ValueError when asked
# for more items than exist, so cap the sample size at what is available.
random_paths = random.sample(pdf_files_paths, min(2, len(pdf_files_paths)))
two_pdfs = random_paths

In [None]:
# Display the sampled paths.
two_pdfs

In [None]:
# Build a list of (cleaned text, metadata dict) pairs, one per sampled PDF.
cleaned_texts = []

for pdf_file in two_pdfs:
    pdf_text = cleaning_utils.extract_and_process_text(str(pdf_file))

    # Each extract_and_remove_* helper returns a pair:
    # (value kept as metadata, text to carry forward).
    publication_date, pdf_text = cleaning_utils.extract_and_remove_all_dates(pdf_text)
    referenced_websites, pdf_text = cleaning_utils.extract_and_remove_all_websites(pdf_text)

    # Remaining passes are text -> text and are applied in this exact order.
    for scrub in (
        cleaning_utils.remove_dot_sequences,
        cleaning_utils.remove_copyright_notice,
        cleaning_utils.scrub_text,
        cleaning_utils.remove_text_after_phrases,
    ):
        pdf_text = scrub(pdf_text)

    file_metadata = {
        'file_name': pdf_file.name,
        'publication_date': publication_date,
        'referenced_websites': referenced_websites,
    }
    cleaned_texts.append((pdf_text, file_metadata))

In [None]:
# Display the (text, metadata) pairs produced by the cleaning loop.
cleaned_texts

# Step 2: Load the cleaned text into LlamaIndex `Document` objects

In [None]:
import ingest

# Hand the (text, metadata) pairs to the project's ingest helper.
# NOTE(review): presumably returns one LlamaIndex Document per pair — confirm in ingest.py.
documents = ingest.create_documents_from_clean_text(cleaned_texts)

In [None]:
# Display the attribute dictionary of the first document.
vars(documents[0])

# Step 3: Split `Document` objects into chunks (`Nodes`) and attach metadata using `DeciLM-7B`

In [None]:
from llama_index.core.extractors import QuestionsAnsweredExtractor, SummaryExtractor
from llama_index.core.node_parser import TokenTextSplitter

# Chunk documents into pieces of chunk_size=512 with chunk_overlap=32,
# splitting on spaces.
text_splitter = TokenTextSplitter(
    separator=" ",
    chunk_size=512,
    chunk_overlap=32,
)

# os.cpu_count() returns None when the CPU count cannot be determined;
# fall back to a single worker in that case.
num_workers = os.cpu_count() or 1

# Generation settings shared by both extractors.
# NOTE(review): these are passed under a keyword literally named `kwargs`;
# confirm the extractors actually forward them to the LLM rather than ignoring them.
generation_kwargs = {"max_length": 128, "temperature": 0.001, "do_sample": True}

# Metadata extractor configured with questions=3.
qa_extractor = QuestionsAnsweredExtractor(
    questions=3,
    num_workers=num_workers,
    kwargs=dict(generation_kwargs),  # separate copy per extractor
)

# Metadata extractor configured with summaries=["self"].
summary = SummaryExtractor(
    summaries=["self"],
    num_workers=num_workers,
    kwargs=dict(generation_kwargs),
)

# Order matters: split first, then run the extractors.
transforms = [text_splitter, qa_extractor, summary]

In [None]:
from llama_index.core.ingestion import IngestionPipeline

# Create the ingestion pipeline from the `transforms` list.
pipeline = IngestionPipeline(transformations=transforms)

In [None]:
# Run the pipeline over the documents with a progress display.
# NOTE(review): in_place=True presumably lets the pipeline modify the inputs
# instead of working on copies — confirm before re-running this cell on the same `documents`.
nodes = pipeline.run(documents=documents, in_place=True, show_progress=True)

In [None]:
# Number of nodes the pipeline returned.
len(nodes)

In [None]:
# Display the attribute dictionary of the first node.
vars(nodes[0])

# Step 4: Store `Nodes` in a vector database and instantiate a query engine

In [None]:
from llama_index.core import VectorStoreIndex

# Build the vector index from the nodes.
# NOTE(review): presumably embeds each node with the embedding model configured in Step 0 — confirm.
index = VectorStoreIndex(nodes)

In [None]:
# Create a query engine on the index with streaming enabled.
query_engine = index.as_query_engine(streaming=True)

# Step 5: Query vector database

In [None]:
# Adjacent string literals are concatenated at compile time. The earlier
# triple-quoted form used a backslash-newline, which deletes the line break
# and fused "may" and "face" into "mayface"; the explicit trailing space here
# keeps the words apart. The stray leading space and trailing newline are gone
# and the sentence now ends with a question mark.
question = (
    "What are the potential challenges and limitations that businesses may "
    "face when implementing Cloudera Data Flow on Supermicro GrandTwin systems?"
)

# Submit the question, then print the streamed response.
streaming_response = query_engine.query(question)
streaming_response.print_response_stream()