In [2]:
import sys
import os

# Add the sibling ../src directory to the import path so local modules can be
# imported below (presumably where `cleaning_utils` lives -- TODO confirm).
sys.path.append('../src')

In [3]:
import nest_asyncio

# Patch asyncio so event loops can be nested; the notebook kernel already runs
# one, and the async-based extractors used later need to run inside it.
nest_asyncio.apply()

In [4]:
import cleaning_utils
from pathlib import Path
from typing import List, Tuple, Dict
from llama_index.core import  Document

from llama_index.core.extractors import (
    QuestionsAnsweredExtractor,
    SummaryExtractor,
    KeywordExtractor
    
)

from llama_index.core.schema import MetadataMode
from llama_index.core.ingestion import IngestionPipeline
from llama_index.extractors.entity import EntityExtractor

from llama_index.core.node_parser import TokenTextSplitter, SentenceWindowNodeParser, SemanticSplitterNodeParser, SentenceSplitter
from llama_index.core.schema import BaseNode, TextNode

from transformers import BitsAndBytesConfig

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
def create_documents_from_clean_text(cleaned_texts: List[Tuple[str, Dict]]) -> List[Document]:
    """Wrap cleaned ``(text, metadata)`` pairs in ``Document`` objects.

    Args:
        cleaned_texts: One ``(text, metadata_dict)`` pair per source document.

    Returns:
        A list with one ``Document`` per input pair, in input order. Every
        document uses a blank line as its metadata separator and lists the
        same metadata keys as excluded from the LLM view.
    """
    documents = []
    for body, meta in cleaned_texts:
        # Built per document so no two documents share one list object.
        hidden_from_llm = [
            "file_name",
            "publication_date",
            "referenced_websites",
            "section_summary",
            "excerpt_keywords",
            "questions_this_excerpt_can_answer",
        ]
        documents.append(
            Document(
                text=body,
                metadata=meta,
                # Keyword spelled exactly as in the original call; presumably
                # the library's own field name -- do not "correct" it.
                metadata_seperator="\n\n",
                excluded_llm_metadata_keys=hidden_from_llm,
            )
        )
    return documents

In [7]:
%%capture
# Run the project's cleaning helper on the solution-brief folder. The result is
# fed to create_documents_from_clean_text, so it is presumably a list of
# (text, metadata) pairs -- TODO confirm against cleaning_utils.
cleaned_pdfs = cleaning_utils.clean_and_prepare_texts('../SuperMicro_Solution_Brief')

In [8]:
# One Document per cleaned brief, carrying its text and metadata.
documents = create_documents_from_clean_text(cleaned_pdfs)

In [9]:
documents[0].__dict__

{'id_': 'c87702b2-6ce7-4c10-8f40-dbdaf06c77c0',
 'embedding': None,
 'metadata': {'file_name': 'Solution-Brief_Workstations_Entertainment.pdf',
  'publication_date': 'December 2021',
  'referenced_websites': ['https://www.supermicro.com/en/products/superworkstation']},
 'excluded_embed_metadata_keys': [],
 'excluded_llm_metadata_keys': ['file_name',
  'publication_date',
  'referenced_websites',
  'section_summary',
  'excerpt_keywords',
  'questions_this_excerpt_can_answer'],
 'relationships': {},
 'text': "1 Supermicro Workstation Family Media and Entertainment is a broad and diverse industry where companies are required to work and collaborate seamlessly to succeed. The ability to accelerate production workflows and gain value faster are top goals for today’s media companies. To stay ahead of the competition, leaders in the industry are implementing cutting-edge workstations to modernize their work environments. Advancements in virtual production, rendering, simulation, and artifici

In [10]:
from llama_index.core import PromptTemplate
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.core import Settings

# Instruction-style wrapper prompt. It is defined here but not passed to the
# LLM below: the matching keyword argument is left commented out.
instruction_template = (
    "Below is an instruction that describes a task. "
    "Write a response that appropriately completes the request.\n\n"
    "### Instruction:\n{query_str}\n\n### Response:"
)
query_wrapper_prompt = PromptTemplate(instruction_template)

# The same checkpoint supplies both the model weights and the tokenizer.
llm_checkpoint = "Deci/DeciLM-7B-instruct"

# Sampling settings forwarded as `generate_kwargs`.
sampling_options = {
    "temperature": 0.25,
    "do_sample": True,
    "top_p": 0.80,
}

# Loader settings forwarded as `model_kwargs`.
loader_options = {
    "torch_dtype": "auto",
    "trust_remote_code": True,
}

llm = HuggingFaceLLM(
    model_name=llm_checkpoint,
    tokenizer_name=llm_checkpoint,
    device_map="xpu",
    context_window=4096,
    tokenizer_kwargs={"max_length": 4096},
    model_kwargs=loader_options,
    generate_kwargs=sampling_options,
    is_chat_model=True,
    system_prompt="You are an AI assistant that follows instructions extremely well. Help as much as you can.",
    # query_wrapper_prompt=query_wrapper_prompt,
)

  warn(
Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:13<00:00,  4.57s/it]


In [11]:
# Register the local model as the global default LLM; the extractors and query
# engine created later are not given an LLM explicitly.
Settings.llm = llm

In [26]:
# Prompt for the question-generation extractor.
# Placeholders: {context_str} (node text) and {num_questions}.
# A trailing backslash joins the next line with NO separator, so each continued
# line must end in a space before the backslash.
qa_prompt = """ Here is the contextual information from a solution brief by SuperMicro:
{context_str}

Given the contextual information, generate {num_questions} questions this context can provide \
specific answers about the products, software, hardware, and solutions discussed in this document \
which are unlikely to be found elsewhere.

Higher-level summaries of the surrounding context may be provided as well.  Try using these summaries to generate better questions that this context can answer."""

# Prompt for the section-summary extractor.
# Placeholder: {context_str} (node text). The prompt deliberately ends with
# "Summary:" so the model continues from there.
summary_prompt = """ Here is the content of the section, which is from a solution brief by SuperMicro:

{context_str}

Provide a Summary of the section. Also, identify the specific companies, technology products, software, hardware (GPUs, CPUs, memory, accelerators, etc), and solutions discussed in this section.

Summary: 


"""

# os.cpu_count() returns None when the CPU count cannot be determined; fall
# back to a single worker so the extractors always get a valid integer.
extractor_workers = os.cpu_count() or 1

# Split on spaces into 256-token chunks with an 8-token overlap.
text_splitter = TokenTextSplitter(
    separator=" ",
    chunk_size=256,
    chunk_overlap=8
)

# Generates 5 questions per node using the custom question prompt.
qa_extractor = QuestionsAnsweredExtractor(
    questions=5,
    prompt_template=qa_prompt,
    num_workers=extractor_workers
)

# Summarises each node itself ("self") using the custom summary prompt.
summary = SummaryExtractor(
    summaries=["self"],
    prompt_template=summary_prompt,
    num_workers=extractor_workers
)

# Extracts 5 keywords per node using the extractor's default prompt.
key_words = KeywordExtractor(
    keywords=5,
    num_workers=extractor_workers
)

In [27]:
# Work on the first document only, so just that one goes through the pipeline.
some_docs = documents[:1]

In [28]:
# Transformations in the original order: token splitter first, then the
# summary, keyword, and question extractors.
metadata_steps = [text_splitter, summary, key_words, qa_extractor]
pipeline = IngestionPipeline(transformations=metadata_steps)

# Run on the sample documents only; the pipeline-level worker count stays
# disabled, as in the original.
nodes = pipeline.run(
    show_progress=True,
    in_place=True,
    documents=some_docs,
    # num_workers=4
)

Parsing nodes: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 21.93it/s]
  0%|                                                                                                                                        | 0/10 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end gen

In [29]:
nodes[2].__dict__['metadata']

{'file_name': 'Solution-Brief_Workstations_Entertainment.pdf',
 'publication_date': 'December 2021',
 'referenced_websites': ['https://www.supermicro.com/en/products/superworkstation'],
 'section_summary': "The section discusses the partnership between Supermicro and NVIDIA to provide high-performance, high-efficiency technology solutions for media companies. Supermicro is a global leader in high-performance, high-efficiency technology, offering the broadest product portfolio for robust workstations. The goal is to enable the success of all customers. Supermicro achieves this through extensive engineering expertise and the industry's broadest product portfolio, which offers green computing technologies that reduce energy costs, effectively allocate resources to tackle complex media workflows and drive down operational costs. In partnership with NVIDIA, they offer a range of performance-boosting solutions to help media companies work better, smarter, and faster. They build IT environmen

In [23]:
# Show the first node's content rendered in LLM metadata mode (presumably this
# omits the keys listed in excluded_llm_metadata_keys -- TODO confirm).
print(nodes[0].get_content(metadata_mode=MetadataMode.LLM))

1 Supermicro Workstation Family Media and Entertainment is a broad and diverse industry where companies are required to work and collaborate seamlessly to succeed. The ability to accelerate production workflows and gain value faster are top goals for today’s media companies. To stay ahead of the competition, leaders in the industry are implementing cutting-edge workstations to modernize their work environments. Advancements in virtual production, rendering, simulation, and artificial intelligence (AI) continue to propel the future of entertainment. Next generation workstations are the ideal foundation to reinvent how content is created, distributed, and consumed. These platforms combine robust compute technology, CPU and GPU acceleration, more memory, increased storage, and comprehensive management software to unlock high performance levels. 1 Partnering for Success 2 Modernizing A State-of-the-Art Production Environment 3 Summary 5 As a global leader in high performance, high efficien

In [24]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.core import StorageContext
from llama_index.vector_stores.qdrant import QdrantVectorStore

import qdrant_client


In [25]:
type(nodes)

list

In [30]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
# Embedding model used to vectorise nodes and queries.
# No `tokenizer_name` is passed: the tokenizer must come from the embedding
# checkpoint itself. The original pointed it at the DeciLM LLM tokenizer, whose
# vocabulary does not belong to this model.
embed_model = HuggingFaceEmbedding(
    model_name="WhereIsAI/UAE-Large-V1",
    device="xpu",
    trust_remote_code=True
)

In [31]:
# Register the embedding model as the global default; the index built later is
# not given one explicitly.
Settings.embed_model = embed_model

In [53]:
# Qdrant client opened on a filesystem path rather than a server URL; its data
# lives under ../vector_store.
client = qdrant_client.QdrantClient(
    path="../vector_store"
)

In [43]:
# Directory shared by the Qdrant store and the storage context.
store_dir = "../vector_store"

# NOTE(review): `client` was already opened on this same path; confirm whether
# passing `path` here as well has any effect.
vector_store = QdrantVectorStore(
    collection_name="SuperMicro Solutions Briefs",
    client=client,
    path=store_dir,
)

# NOTE(review): `persist_dir` points at the Qdrant data directory; confirm that
# is the intended location for the storage context's own persisted files.
storage_context = StorageContext.from_defaults(
    persist_dir=store_dir,
    vector_store=vector_store,
)

In [44]:
vector_store

QdrantVectorStore(stores_text=True, is_embedding_query=True, flat_metadata=False, collection_name='test', path=None, url=None, api_key=None, batch_size=64, parallel=1, max_retries=3, client_kwargs={}, enable_hybrid=False)

In [45]:
# Build the vector index over the extracted nodes, backed by the Qdrant store
# wrapped in `storage_context`.
index = VectorStoreIndex(storage_context=storage_context, nodes=nodes)

In [47]:
# Default query engine over the index, used for a single smoke-test question.
# The question text is a runtime string and is left exactly as written.
query_engine = index.as_query_engine()
response = query_engine.query("How does SuperMicro and NVIDIA work well together?")

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [50]:
dir(query_engine)

['__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_apply_node_postprocessors',
 '_aquery',
 '_as_query_component',
 '_get_prompt_modules',
 '_get_prompts',
 '_node_postprocessors',
 '_query',
 '_response_synthesizer',
 '_retriever',
 '_update_prompts',
 '_validate_prompts',
 'aquery',
 'aretrieve',
 'as_query_component',
 'asynthesize',
 'callback_manager',
 'from_args',
 'get_prompts',
 'query',
 'retrieve',
 'retriever',
 'synthesize',
 'update_prompts',
 'with_retriever']

In [38]:
dir(index)

['__abstractmethods__',
 '__annotations__',
 '__class__',
 '__class_getitem__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__orig_bases__',
 '__parameters__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_add_nodes_to_index',
 '_aget_node_with_embedding',
 '_async_add_nodes_to_index',
 '_build_index_from_nodes',
 '_callback_manager',
 '_delete_node',
 '_docstore',
 '_embed_model',
 '_get_node_with_embedding',
 '_graph_store',
 '_index_struct',
 '_insert',
 '_insert_batch_size',
 '_is_protocol',
 '_object_map',
 '_service_context',
 '_show_progress',
 '_storage_context',
 '_store_nodes_override',
 '_transformations',
 '_use_async',
 '_vector_store',
 'as_chat_engine',
 'as_qu