In [1]:
import os
import sys

import nest_asyncio

# Patch asyncio through nest_asyncio before any other cell runs.
# NOTE(review): presumably required so that async library code can run inside
# the notebook's already-running event loop — confirm.
nest_asyncio.apply()
# Add ../src to the module search path. The path is relative to the kernel's
# working directory, so the notebook must be started from its own folder.
# NOTE(review): presumably where setup_utils, cleaning_utils and ingest live — confirm.
sys.path.append('../src')

# Step 0: Set up the LLM and Embedding Model

In [2]:
import setup_utils

# Configure the LLM through the project's setup_utils helper.
# NOTE(review): no llm is passed explicitly to the extractors or the query
# engine later on, so this presumably registers a global default — confirm.
setup_utils.setup_llm()

# Configure the embedding model through the same helper module.
# NOTE(review): presumably also registered as a global default — confirm.
setup_utils.setup_embed_model()

  warn(


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

# Step 1: Clean PDF

In [3]:
from pathlib import Path
import random

import cleaning_utils

# NOTE(review): absolute, machine-specific location; it has to be edited before
# the notebook can run on any other machine.
pdf_directory = "/home/demotime/DeciLM_RAG_Demo/SuperMicro_Solution_Brief"

pdf_files = Path(pdf_directory)

# Keep only real PDF files: a directory listing can also contain
# sub-directories, hidden files or notebook checkpoints, which the extraction
# step cannot read. Sort the result because iterdir() yields entries in
# arbitrary order, and a seeded draw is only repeatable over a stable list.
pdf_files_paths = sorted(
    path
    for path in pdf_files.iterdir()
    if path.is_file() and path.suffix.lower() == ".pdf"
)

# Select two random solution briefs. A dedicated seeded generator makes a
# fresh "Restart & Run All" pick the same two files every time without
# touching the global `random` state. Change the seed (or set it to None for
# an entropy-seeded draw) to sample different briefs.
SAMPLE_SEED = 42
# `random_paths` is kept as an alias of `two_pdfs`, as in the original cell.
two_pdfs = random_paths = random.Random(SAMPLE_SEED).sample(pdf_files_paths, 2)

In [4]:
# Show which two briefs were sampled.
two_pdfs

[PosixPath('/home/demotime/DeciLM_RAG_Demo/SuperMicro_Solution_Brief/Solution-Brief_SuperCloud_Composer.pdf'),
 PosixPath('/home/demotime/DeciLM_RAG_Demo/SuperMicro_Solution_Brief/Solution-Brief_Workstations_Entertainment.pdf')]

In [5]:
# Run every sampled brief through the cleaning pipeline and collect
# (cleaned_text, metadata) pairs for the document-loading step.
cleaned_texts = []

# Text-in/text-out clean-up helpers, applied in exactly this order once the
# dates and websites have been pulled out of the text.
text_scrubbers = (
    cleaning_utils.remove_dot_sequences,
    cleaning_utils.remove_copyright_notice,
    cleaning_utils.scrub_text,
    cleaning_utils.remove_text_after_phrases,
)

for brief_path in two_pdfs:
    brief_text = cleaning_utils.extract_and_process_text(str(brief_path))

    # Both helpers hand back a pair: the extracted value and the remaining text.
    publication_date, brief_text = cleaning_utils.extract_and_remove_all_dates(brief_text)
    referenced_websites, brief_text = cleaning_utils.extract_and_remove_all_websites(brief_text)

    for scrub in text_scrubbers:
        brief_text = scrub(brief_text)

    brief_metadata = {
        'file_name': brief_path.name,
        'publication_date': publication_date,
        'referenced_websites': referenced_websites,
    }
    cleaned_texts.append((brief_text, brief_metadata))

Processing /home/demotime/DeciLM_RAG_Demo/SuperMicro_Solution_Brief/Solution-Brief_SuperCloud_Composer.pdf...
Processing /home/demotime/DeciLM_RAG_Demo/SuperMicro_Solution_Brief/Solution-Brief_Workstations_Entertainment.pdf...


In [6]:
# Inspect the (cleaned_text, metadata) pairs built by the cleaning loop.
# NOTE(review): this echoes the complete text of every brief into the output;
# consider displaying only a short slice of each text instead.
cleaned_texts

[('SUPERCLOUD COMPOSER Your Infrastructure Gateway SuperCloud Composer is a composable cloud management platform that provides a unified dashboard to administer software-defined data centers. Supermicro’s cloud infrastructure management software brings speed, agility, and simplicity to IT administration by integrating data center tasks into a single intelligent management solution. Our hybrid approach allows traditional paradigm data centers to continue to support their existing operations while allowing their current workloads to have the flexibility to move to a disaggregated infrastructure model. Our robust composer engine can orchestrate cloud workloads through a streamlined Redfish API. SuperCloud Composer also monitors and manages the broad portfolio of multi-generation Supermicro servers and third-party systems through its data center lifecycle management feature set from a single unified console. 1 Key Benefits 2 Features 3 Hardware and Software Requirements 11 Virtual Machine 

# Step 2: Load PDF to LlamaIndex `Document` Object

In [7]:
import ingest

# Hand the (cleaned_text, metadata) pairs to the project's ingest helper.
# NOTE(review): presumably returns one LlamaIndex Document per pair — confirm.
documents = ingest.create_documents_from_clean_text(cleaned_texts)

In [8]:
# Dump the first document's attributes to check its text, its metadata and
# which metadata keys are excluded from the LLM's view.
documents[0].__dict__

{'id_': '018ae5b6-83a6-4257-9d8e-e9ac32c3424e',
 'embedding': None,
 'metadata': {'file_name': 'Solution-Brief_SuperCloud_Composer.pdf',
  'publication_date': 'January 2021',
  'referenced_websites': ['https://www.networkworld.com/article/2959532/startup-says-it-has-solved-server-']},
 'excluded_embed_metadata_keys': [],
 'excluded_llm_metadata_keys': ['file_name',
  'publication_date',
  'referenced_websites',
  'section_summary',
  'excerpt_keywords',
  'questions_this_excerpt_can_answer'],
 'relationships': {},
 'text': 'SUPERCLOUD COMPOSER Your Infrastructure Gateway SuperCloud Composer is a composable cloud management platform that provides a unified dashboard to administer software-defined data centers. Supermicro’s cloud infrastructure management software brings speed, agility, and simplicity to IT administration by integrating data center tasks into a single intelligent management solution. Our hybrid approach allows traditional paradigm data centers to continue to support thei

# Step 3: Split `Document` objects into chunks (`Nodes`) and attach metadata using `DeciLM-7B`

In [9]:
from llama_index.core.extractors import QuestionsAnsweredExtractor, SummaryExtractor
from llama_index.core.node_parser import TokenTextSplitter

# Chunking stage: split on spaces with chunk_size=512 and chunk_overlap=32.
text_splitter = TokenTextSplitter(
    separator=" ",
    chunk_size=512,
    chunk_overlap=32,
)

# os.cpu_count() returns None when the number of CPUs cannot be determined;
# fall back to a single worker instead of passing None as num_workers.
extractor_workers = os.cpu_count() or 1

# Generation settings shared by both metadata extractors.
# NOTE(review): these are handed over as ONE keyword argument literally named
# `kwargs` (unchanged from the original cell); confirm that the extractors
# actually forward it to the LLM's generate call rather than ignoring it.
extractor_llm_kwargs = {"max_length": 128, "temperature": 0.001, "do_sample": True}

# Attaches up to three "questions this excerpt can answer" to every chunk.
# Each extractor receives its own copy of the settings dict.
qa_extractor = QuestionsAnsweredExtractor(
    questions=3,
    num_workers=extractor_workers,
    kwargs=dict(extractor_llm_kwargs),
)

# Attaches a summary of the chunk itself ("self") to every chunk.
summary = SummaryExtractor(
    summaries=["self"],
    num_workers=extractor_workers,
    kwargs=dict(extractor_llm_kwargs),
)

# Order matters: chunks must exist before the extractors can annotate them.
transforms = [text_splitter, qa_extractor, summary]

In [10]:
from llama_index.core.ingestion import IngestionPipeline

# Chain the splitter and the two metadata extractors into a single pipeline.
pipeline = IngestionPipeline(transformations=transforms)

In [11]:
# Run the documents through the pipeline, with progress bars, and keep the
# resulting nodes.
# NOTE(review): in_place=True presumably lets the transformations modify the
# inputs instead of working on copies — confirm against the pipeline docs.
nodes = pipeline.run(documents=documents, in_place=True, show_progress=True)

Parsing nodes:   0%|          | 0/2 [00:00<?, ?it/s]

  0%|                                                                                                                                                                             | 0/12 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open

In [12]:
# Number of nodes the pipeline produced from the sampled documents.
len(nodes)

12

In [13]:
# Dump the first node's attributes to check the metadata added by the
# extractors (generated questions and section summary).
nodes[0].__dict__

{'id_': 'f080231e-306a-4ad2-bcb9-49274c6625c5',
 'embedding': None,
 'metadata': {'file_name': 'Solution-Brief_SuperCloud_Composer.pdf',
  'publication_date': 'January 2021',
  'referenced_websites': ['https://www.networkworld.com/article/2959532/startup-says-it-has-solved-server-'],
  'questions_this_excerpt_can_answer': '1. What are the key benefits of SuperCloud Composer?\n2. What are the main features of SuperCloud Composer?\n3. What are the hardware and software requirements for SuperCloud Composer?',
  'section_summary': 'The SuperCloud Composer is a cloud management platform that provides a unified dashboard to administer software-defined data centers. It offers key benefits such as speed, agility, and simplicity in IT administration, and integrates data center tasks into a single intelligent management solution. The platform supports a wide range of hardware and software requirements, including virtual machine appliances, supported server platforms, and SuperCloud Composer lice

# Step 4: Store `Nodes` in a vector database and instantiate a query engine

In [None]:
from llama_index.core import VectorStoreIndex

# Build a vector index over the enriched nodes.
# NOTE(review): presumably embeds every node with the embedding model set up
# in Step 0 and keeps the vectors in the default store — confirm.
index = VectorStoreIndex(nodes)

In [None]:
# Create a query engine on top of the index; streaming=True is what allows
# the response to be printed incrementally in the next step.
query_engine = index.as_query_engine(streaming=True)

# Step 5: Query vector database

In [None]:
# A backslash at the end of a line inside a string literal joins that line to
# the next one with nothing in between, so the space before it is required.
# Without that space the prompt reads "... businesses mayface when ...".
question = """ What are the potential challenges and limitations that businesses may \
face when implementing Cloudera Data Flow on Supermicro GrandTwin systems.
"""

# Send the question to the query engine and print the answer as it streams in.
# NOTE(review): the index only contains the two briefs sampled in Step 1, so
# this question can only be answered if the relevant brief was among them.
query_engine.query(question).print_response_stream()