In [1]:
import sys
import os

# Make the project's local modules under ../src importable from this notebook.
# The membership check keeps the cell idempotent: re-running it does not pile
# up duplicate '../src' entries on sys.path.
if '../src' not in sys.path:
    sys.path.append('../src')

In [2]:
import nest_asyncio

# Apply the nest_asyncio patch before any other cell runs.
# NOTE(review): presumably required because later cells (the ingestion
# pipeline and its extractors) run async code while the notebook kernel
# already has an event loop running — confirm.
nest_asyncio.apply()

In [3]:
import cleaning_utils
from pathlib import Path
from typing import List, Tuple, Dict
from llama_index.core import  Document

from llama_index.core.extractors import (
    QuestionsAnsweredExtractor,
    SummaryExtractor,
    KeywordExtractor
    
)

from llama_index.core.schema import MetadataMode
from llama_index.core.ingestion import IngestionPipeline
from llama_index.extractors.entity import EntityExtractor

from llama_index.core.node_parser import TokenTextSplitter, SentenceWindowNodeParser, SemanticSplitterNodeParser, SentenceSplitter
from llama_index.core.schema import BaseNode, TextNode

from transformers import BitsAndBytesConfig

FileNotFoundError: [Errno 2] No such file or directory

In [None]:
def create_documents_from_clean_text(cleaned_texts: List[Tuple[str, Dict]]) -> List[Document]:
    """Wrap cleaned ``(text, metadata)`` pairs in ``Document`` objects.

    Args:
        cleaned_texts: Pairs of cleaned text and the metadata dict that
            belongs to that text.

    Returns:
        One ``Document`` per input pair, in input order. Every document uses
        a blank line as its metadata separator and is given the same set of
        ``excluded_llm_metadata_keys``.
    """
    # Keys handed to ``excluded_llm_metadata_keys`` for every document.
    hidden_from_llm = (
        "file_name",
        "publication_date",
        "referenced_websites",
        "section_summary",
        "excerpt_keywords",
        "questions_this_excerpt_can_answer",
    )

    documents = []
    for text, metadata in cleaned_texts:
        doc = Document(
            text=text,
            metadata=metadata,
            # Keyword spelled exactly as in the original call.
            # NOTE(review): confirm it matches the installed Document field name.
            metadata_seperator="\n\n",
            # Fresh list per document so no two documents share one list object.
            excluded_llm_metadata_keys=list(hidden_from_llm),
        )
        documents.append(doc)
    return documents

In [None]:
%%capture
# The %%capture magic above must stay on the first line of the cell; it
# silences whatever the cleaning step prints.
# clean_and_prepare_texts comes from the project-local cleaning_utils module
# (implementation not shown here). Its result is passed to
# create_documents_from_clean_text in a later cell, so it is presumably a
# list of (text, metadata) pairs — verify against cleaning_utils.
cleaned_pdfs = cleaning_utils.clean_and_prepare_texts('../SuperMicro_Solution_Brief')

Exception ignored in atexit callback: <bound method InteractiveShell.atexit_operations of <ipykernel.zmqshell.ZMQInteractiveShell object at 0x7f0016e40090>>
Traceback (most recent call last):
  File "/home/demotime/miniconda3/envs/decilm_rag/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3920, in atexit_operations
    self._atexit_once()
  File "/home/demotime/miniconda3/envs/decilm_rag/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3899, in _atexit_once
    self.reset(new_session=False)
  File "/home/demotime/miniconda3/envs/decilm_rag/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 1399, in reset
    self.history_manager.reset(new_session)
  File "/home/demotime/miniconda3/envs/decilm_rag/lib/python3.11/site-packages/IPython/core/history.py", line 607, in reset
    self.dir_hist[:] = [Path.cwd()]
                        ^^^^^^^^^^
  File "/home/demotime/miniconda3/envs/decilm_rag/lib/python3.11/pathlib.py", line 907, in

In [None]:
# Turn the cleaned texts into Document objects for the ingestion pipeline.
documents = create_documents_from_clean_text(cleaned_pdfs)

In [None]:
# Inspection aid: show the raw attribute dict of the first Document.
documents[0].__dict__

In [None]:
from llama_index.core import PromptTemplate
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.core import Settings

# Instruction-style wrapper prompt. It is not wired into the LLM below (the
# matching argument is commented out) but is kept available for experiments.
instruction_template = (
    "Below is an instruction that describes a task. "
    "Write a response that appropriately completes the request.\n\n"
    "### Instruction:\n{query_str}\n\n### Response:"
)
query_wrapper_prompt = PromptTemplate(instruction_template)

# The same model id serves as both model and tokenizer, and the same token
# budget is used for the context window and the tokenizer's max_length.
deci_model_id = "Deci/DeciLM-7B-instruct"
max_context_tokens = 4096

# Sampling settings forwarded as generate_kwargs.
sampling_options = {
    "temperature": 0.25,
    "do_sample": True,
    "top_p": 0.80,
}

# Settings forwarded as model_kwargs.
loading_options = {
    "torch_dtype": "auto",
    "trust_remote_code": True,
}

llm = HuggingFaceLLM(
    model_name=deci_model_id,
    tokenizer_name=deci_model_id,
    context_window=max_context_tokens,
    tokenizer_kwargs={"max_length": max_context_tokens},
    model_kwargs=loading_options,
    generate_kwargs=sampling_options,
    device_map="xpu",
    is_chat_model=True,
    system_prompt="You are an AI assistant that follows instructions extremely well. Help as much as you can.",
    # query_wrapper_prompt=query_wrapper_prompt,
)

In [None]:
# Register the model on the global Settings object.
# NOTE(review): the extractors created later are not given an explicit llm,
# so they presumably pick this one up — confirm.
Settings.llm = llm

In [None]:
# Prompt template handed to QuestionsAnsweredExtractor. The {context_str} and
# {num_questions} placeholders are left in place for the extractor to fill.
# A trailing backslash joins a line with the next one WITHOUT inserting any
# whitespace, so every continued line must end with an explicit space before
# the backslash (otherwise the text reads "documentwhich").
qa_prompt = """ Here is the context:
{context_str}

Given the contextual information, generate {num_questions} questions this context can provide \
specific answers about the products, software, hardware, and solutions mentioned in this document \
which are unlikely to be found elsewhere.

Higher-level summaries of the surrounding context may be provided as well.  Try using these summaries to generate better questions that this context can answer."""

# Prompt template handed to SummaryExtractor. It ends with an open "Summary:"
# cue followed by blank lines.
summary_prompt = """ Here is the content of the section:

{context_str}

Provide a Summary of key topics, entities, products, software, hardware, and solutions discussed in this section.

Summary: 

"""

# os.cpu_count() returns None when the core count cannot be determined.
# Resolve it once and fall back to a fixed integer so every extractor below
# always receives a valid worker count.
extractor_workers = os.cpu_count() or 4

# Splits documents into 256-token chunks with an 8-token overlap, breaking
# on spaces.
text_splitter = TokenTextSplitter(
    separator=" ",
    chunk_size=256,
    chunk_overlap=8,
)

# Generates 5 questions per chunk using the custom qa_prompt.
qa_extractor = QuestionsAnsweredExtractor(
    questions=5,
    prompt_template=qa_prompt,
    num_workers=extractor_workers,
)

# Summarises each chunk itself ("self") using the custom summary_prompt.
summary = SummaryExtractor(
    summaries=["self"],
    prompt_template=summary_prompt,
    num_workers=extractor_workers,
)

# Extracts 5 keywords per chunk.
key_words = KeywordExtractor(
    keywords=5,
    num_workers=extractor_workers,
)

In [None]:
# Work on only the first two documents to keep the pipeline run short.
some_docs = documents[:2]

In [None]:
# Transformations run in list order: split into chunks first, then enrich
# each chunk with a summary, keywords and answerable questions.
enrichment_steps = [text_splitter, summary, key_words, qa_extractor]
pipeline = IngestionPipeline(transformations=enrichment_steps)

# Run over the small document sample with a progress bar.
nodes = pipeline.run(
    documents=some_docs,
    in_place=True,
    show_progress=True,
    # num_workers=4
)

In [None]:
# Inspection aid: metadata attached to the third node by the pipeline.
nodes[2].metadata

In [None]:
# Print the first node's content as rendered in LLM metadata mode, to check
# which metadata ends up in the text for that mode.
print(nodes[0].get_content(metadata_mode=MetadataMode.LLM))

In [None]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.core import StorageContext
from llama_index.vector_stores.qdrant import QdrantVectorStore

import qdrant_client


In [None]:
# Debugging probe: shows the container type returned by pipeline.run.
# NOTE(review): leftover exploration cell; consider removing it from the
# finished notebook.
type(nodes)

In [None]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
# Embedding model used to vectorise the nodes.
# The original call ended with a bare `tokenizer_name` after a keyword
# argument, which is a SyntaxError ("positional argument follows keyword
# argument"). No tokenizer override is passed here.
# NOTE(review): this assumes the library then uses the tokenizer that
# belongs to `model_name` — confirm for the installed version.
embed_model = HuggingFaceEmbedding(
    model_name="WhereIsAI/UAE-Large-V1",
)

In [None]:
# In-memory Qdrant instance: nothing is persisted, so the collection is gone
# once the kernel stops.
client = qdrant_client.QdrantClient(
    # you can use :memory: mode for fast and light-weight experiments,
    # it does not require to have Qdrant deployed anywhere
    # but requires qdrant-client >= 1.1.1
    location=":memory:"
    # otherwise set the Qdrant instance address with:
    # url="http://<host>:<port>"
    # NOTE(review): this comment previously said `uri=`; qdrant-client's
    # keyword is believed to be `url=` — confirm against the installed version.
    # set API KEY for Qdrant Cloud
    # api_key="<qdrant-api-key>",
)

In [None]:
# Back the index with a Qdrant collection named "test" on the client created
# earlier, and wrap it in a StorageContext for VectorStoreIndex.
vector_store = QdrantVectorStore(client=client, collection_name="test")
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [None]:
# Build the vector index over the enriched nodes, storing vectors in Qdrant.
# The HuggingFace embedding model created earlier is passed explicitly;
# without it the index falls back to the globally configured embedding model
# rather than the local `embed_model`, which was otherwise never used.
index = VectorStoreIndex(
    nodes=nodes,
    storage_context=storage_context,
    embed_model=embed_model,
)