In [2]:
import sys
import os

# Add the sibling ../src directory to the module search path.
# Presumably this is where the local `cleaning_utils` module imported below
# lives — confirm against the repository layout.
sys.path.append('../src')

In [3]:
import nest_asyncio

# Patch asyncio so an event loop can be re-entered while the notebook's own
# loop is already running. Presumably needed by the async extractor jobs run
# later in this notebook — confirm.
nest_asyncio.apply()

In [4]:
import cleaning_utils
from pathlib import Path
from typing import List, Tuple, Dict
from llama_index.core import  Document

from llama_index.core.extractors import (
    QuestionsAnsweredExtractor,
    SummaryExtractor,
    KeywordExtractor
    
)

from llama_index.core.schema import MetadataMode
from llama_index.core.ingestion import IngestionPipeline
from llama_index.extractors.entity import EntityExtractor

from llama_index.core.node_parser import TokenTextSplitter, SentenceWindowNodeParser, SemanticSplitterNodeParser, SentenceSplitter
from llama_index.core.schema import BaseNode, TextNode

from transformers import BitsAndBytesConfig

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
def create_documents_from_clean_text(cleaned_texts: List[Tuple[str, Dict]]) -> List[Document]:
    """Wrap each cleaned ``(text, metadata)`` pair in a ``Document``.

    Args:
        cleaned_texts: Pairs of cleaned text and the metadata dict that
            belongs to it.

    Returns:
        One ``Document`` per input pair, in input order. An empty input
        yields an empty list.
    """
    # Metadata keys listed in every document's `excluded_llm_metadata_keys`.
    hidden_from_llm = (
        "file_name",
        "publication_date",
        "referenced_websites",
        "section_summary",
        "excerpt_keywords",
        "questions_this_excerpt_can_answer",
    )

    documents = []
    for text, metadata in cleaned_texts:
        doc = Document(
            text=text,
            metadata=metadata,
            # Keyword spelling ("seperator") is kept exactly as the original call wrote it.
            metadata_seperator="\n\n",
            # Fresh list per document so no two documents share one list object.
            excluded_llm_metadata_keys=list(hidden_from_llm),
        )
        documents.append(doc)
    return documents

In [7]:
%%capture
# Run the project's cleaning helper over the solution-brief directory.
# The result is later handed to create_documents_from_clean_text, which
# unpacks each item as a (text, metadata) pair.
cleaned_pdfs = cleaning_utils.clean_and_prepare_texts('../SuperMicro_Solution_Brief')

In [8]:
# Turn every cleaned (text, metadata) pair into a Document.
documents = create_documents_from_clean_text(cleaned_pdfs)

In [9]:
# Sanity check: dump the first document's attributes (id, metadata, excluded keys, text).
documents[0].__dict__

{'id_': 'c87702b2-6ce7-4c10-8f40-dbdaf06c77c0',
 'embedding': None,
 'metadata': {'file_name': 'Solution-Brief_Workstations_Entertainment.pdf',
  'publication_date': 'December 2021',
  'referenced_websites': ['https://www.supermicro.com/en/products/superworkstation']},
 'excluded_embed_metadata_keys': [],
 'excluded_llm_metadata_keys': ['file_name',
  'publication_date',
  'referenced_websites',
  'section_summary',
  'excerpt_keywords',
  'questions_this_excerpt_can_answer'],
 'relationships': {},
 'text': "1 Supermicro Workstation Family Media and Entertainment is a broad and diverse industry where companies are required to work and collaborate seamlessly to succeed. The ability to accelerate production workflows and gain value faster are top goals for today’s media companies. To stay ahead of the competition, leaders in the industry are implementing cutting-edge workstations to modernize their work environments. Advancements in virtual production, rendering, simulation, and artifici

In [10]:
from llama_index.core import PromptTemplate
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.core import Settings

# The model id and context window are each used twice below, so name them once.
MODEL_ID = "Deci/DeciLM-7B-instruct"
CONTEXT_WINDOW = 4096

# Instruction-style wrapper around the user query. It is built here but not
# wired into the LLM: the corresponding argument below is commented out.
instruction_template = (
    "Below is an instruction that describes a task. "
    "Write a response that appropriately completes the request.\n\n"
    "### Instruction:\n{query_str}\n\n### Response:"
)
query_wrapper_prompt = PromptTemplate(instruction_template)

# Sampling settings forwarded to generation.
sampling_options = {
    "temperature": 0.25,
    "do_sample": True,
    "top_p": 0.80,
}

# Options forwarded to the model loader.
# NOTE(review): trust_remote_code allows code shipped with the model repo to run locally.
loader_options = {
    "torch_dtype": "auto",
    "trust_remote_code": True,
}

llm = HuggingFaceLLM(
    model_name=MODEL_ID,
    tokenizer_name=MODEL_ID,
    context_window=CONTEXT_WINDOW,
    tokenizer_kwargs={"max_length": CONTEXT_WINDOW},
    model_kwargs=loader_options,
    generate_kwargs=sampling_options,
    # "xpu" presumably targets an Intel GPU device — confirm for the host machine.
    device_map="xpu",
    is_chat_model=True,
    system_prompt="You are an AI assistant that follows instructions extremely well. Help as much as you can.",
    # query_wrapper_prompt=query_wrapper_prompt,
)

  warn(
Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:13<00:00,  4.57s/it]


In [11]:
# Register the local model on the global Settings object. Presumably the
# extractors created below pick it up from there, since none of them is
# handed an explicit llm — confirm.
Settings.llm = llm

In [12]:
# Prompt template for question generation. The {context_str} and
# {num_questions} placeholders are left unformatted for the consumer to fill.
# A trailing backslash joins the next line with no separator, so each
# continued line must end with a space of its own. The original omitted it
# after "in this document", which rendered as "documentwhich".
qa_prompt = """ Here is the context:
{context_str}

Given the contextual information, generate {num_questions} questions this context can provide \
specific answers about the products, software, hardware, and solutions mentioned in this document \
which are unlikely to be found elsewhere.

Higher-level summaries of the surrounding context may be provided as well.  Try using these summaries to generate better questions that this context can answer."""

# Prompt template for section summaries; {context_str} is filled by the consumer.
summary_prompt = """ Here is the content of the section:

{context_str}

Provide a Summary of key topics, entities, products, software, hardware, and solutions discussed in this section.

Summary: 

"""

# os.cpu_count() returns None when the CPU count cannot be determined.
# Fall back to a single worker rather than passing None as num_workers.
extractor_workers = os.cpu_count() or 1

# Split on spaces into 256-token chunks with an 8-token overlap.
text_splitter = TokenTextSplitter(
    separator=" ",
    chunk_size=256,
    chunk_overlap=8
)

# Generates 5 questions per chunk using the custom question prompt.
qa_extractor = QuestionsAnsweredExtractor(
    questions=5,
    prompt_template=qa_prompt,
    num_workers=extractor_workers
)

# Summarises each chunk itself ("self") using the custom summary prompt.
summary = SummaryExtractor(
    summaries=["self"],
    prompt_template=summary_prompt,
    num_workers=extractor_workers
)

# Extracts 5 keywords per chunk.
key_words = KeywordExtractor(
    keywords=5,
    num_workers=extractor_workers
)

In [13]:
# Restrict the run to the first document only (a one-element list).
some_docs = documents[:1]

In [None]:
# Transformations in the order they are handed to the pipeline:
# chunking first, then the three metadata extractors.
ingestion_steps = [text_splitter, summary, key_words, qa_extractor]
pipeline = IngestionPipeline(transformations=ingestion_steps)

# Run on the document subset only, with a progress bar.
# The pipeline-level num_workers option is left disabled, as in the original cell.
run_options = {
    "in_place": True,
    "show_progress": True,
    # "num_workers": 4,
}
nodes = pipeline.run(documents=some_docs, **run_options)

Parsing nodes: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 88.40it/s]
  0%|                                                                                                                                        | 0/10 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end gen

In [None]:
# Inspect the metadata attached to the third node produced by the pipeline.
nodes[2].__dict__['metadata']

In [None]:
# Print the first node's content rendered in LLM metadata mode.
# Presumably keys listed in excluded_llm_metadata_keys are omitted from this
# view — confirm in the output.
print(nodes[0].get_content(metadata_mode=MetadataMode.LLM))

In [None]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.core import StorageContext
from llama_index.vector_stores.qdrant import QdrantVectorStore

import qdrant_client


In [None]:
# Check what container type pipeline.run returned.
type(nodes)

In [None]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
# Embedding model used to vectorise nodes.
# The original call ended with a bare `tokenizer_name` after a keyword
# argument. That is a SyntaxError ("positional argument follows keyword
# argument"), and the name was never defined. It is dropped here, so no
# tokenizer name is passed explicitly.
embed_model = HuggingFaceEmbedding(
    model_name="WhereIsAI/UAE-Large-V1",
)

In [None]:
# ":memory:" mode suits fast, light-weight experiments: it does not require
# Qdrant to be deployed anywhere, but needs qdrant-client >= 1.1.1.
# To use a deployed instance instead, pass its address, e.g.
#   url="http://<host>:<port>"
# and, for Qdrant Cloud, also the API key:
#   api_key="<qdrant-api-key>"
qdrant_location = ":memory:"
client = qdrant_client.QdrantClient(location=qdrant_location)

In [None]:
# Vector store bound to the "test" collection on the client created above.
vector_store = QdrantVectorStore(
    client=client,
    collection_name="test",
)

# Storage context that uses this vector store; every other component is left at its default.
storage_context = StorageContext.from_defaults(
    vector_store=vector_store,
)

In [None]:
# Build the vector index over the extracted nodes, stored in the Qdrant-backed
# storage context.
# The embedding model is passed explicitly. The original call omitted it, so
# the `embed_model` built earlier in this notebook was never used and the
# index fell back to whatever embedding model is configured globally.
index = VectorStoreIndex(
    nodes=nodes,
    storage_context=storage_context,
    embed_model=embed_model,
)