# Notebook Showing How to Use Unstructured-Ingest to Pre-Process Local Documents

### Constants + Parameters

In [1]:
# Directory handed to unstructured-ingest as --output-dir and later scanned for JSON results.
output_dir = 'my-docs'
# URL used to construct the Weaviate client.
weaviate_url = "http://localhost:8080"
# Model name passed to SentenceTransformer to embed chunks and questions.
embedding_model_name = 'all-MiniLM-L6-v2'
# Device passed to SentenceTransformer; 'mps' is presumably the Apple-Silicon backend —
# TODO confirm and change for other hardware.
device = 'mps'

## Ingestion Pipeline for VectorDB (Weaviate)

### Helper Functions to Ingest Documents and Pre-Process Them

In [2]:
import subprocess
import os
from typing import List, Dict
from userpaths import get_my_documents


def process_local(output_dir: str, num_processes: int, input_path: str = get_my_documents()):
    """Run the ``unstructured-ingest local`` CLI over a directory tree.

    Args:
        output_dir: Directory passed to the CLI as ``--output-dir``.
        num_processes: Value passed to the CLI as ``--num-processes``.
        input_path: Directory passed to the CLI as ``--input-path``. The
            default is computed once, when this function is defined, by
            calling ``get_my_documents()``.

    Returns:
        None. The captured output of the command is printed, preceded by a
        success or failure banner chosen from the process exit code.
    """
    command = [
        "unstructured-ingest",
        "local",
        "--input-path", input_path,
        "--output-dir", output_dir,
        "--num-processes", str(num_processes),
        "--recursive",
        "--verbose",
    ]

    # Merge stderr into the captured stream. Piping only stdout left the
    # second value returned by communicate() as None, so the failure branch
    # crashed on `None.decode()` instead of reporting the error.
    process = subprocess.Popen(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
    )
    output, _ = process.communicate()
    # Decode tolerantly so unexpected bytes in the output cannot raise here.
    text = output.decode(errors="replace") if output else ""

    if process.returncode == 0:
        print('Command executed successfully. Output:')
    else:
        print('Command failed. Error:')
    print(text)

def get_result_files(folder_path) -> List[str]:
    """Collect the paths of every ``.json`` file below *folder_path*.

    The directory tree is walked recursively with ``os.walk``; each returned
    entry is the walked directory joined with the file name. A path that
    does not exist yields an empty list.

    Args:
        folder_path: Root directory to search.

    Returns:
        A list of path strings, in ``os.walk`` traversal order.
    """
    return [
        os.path.join(root, name)
        for root, _dirs, names in os.walk(folder_path)
        for name in names
        if name.endswith('.json')
    ]

### Run Desired Pipeline To Generate Clean JSON using Unstructured

In [3]:
# Run unstructured-ingest with two worker processes; input_path is left at its default.
process_local(output_dir=output_dir, num_processes=2)
# Collect the paths of the .json files found under the output directory.
files = get_result_files(output_dir)

2023-09-22 15:47:02,857 MainProcess DEBUG    options: {'input_path': '/Users/pravinsanthanam/Documents', 'output_dir': 'my-docs', 'num_processes': 2, 'recursive': True, 'verbose': True, 'file_glob': None, 'download_dir': None, 're_download': False, 'preserve_downloads': False, 'download_only': False, 'max_docs': None, 'pdf_infer_table_structure': False, 'strategy': 'auto', 'reprocess': False, 'ocr_languages': 'eng', 'encoding': None, 'fields_include': ['element_id', 'text', 'type', 'metadata'], 'flatten_metadata': False, 'metadata_include': [], 'metadata_exclude': [], 'partition_by_api': False, 'partition_endpoint': 'https://api.unstructured.io/general/v0/general', 'api_key': None}
2023-09-22 15:47:03,038 MainProcess INFO     All docs have structured outputs, nothing to do. Use --reprocess to process all.


Command executed successfully. Output:



### Helper Functions to Set Up Weaviate Schema and Client

In [4]:
import uuid
import weaviate
from weaviate.util import get_valid_uuid

def create_local_weaviate_client(db_url: str):
    """Return a ``weaviate.Client`` constructed with *db_url* as its URL."""
    client = weaviate.Client(url=db_url)
    return client

def get_schema(vectorizer: str = "none"):
    """Build the Weaviate schema dictionary for the single ``Doc`` class.

    Args:
        vectorizer: Value stored under the class's ``"vectorizer"`` key.

    Returns:
        A new dict with one ``"classes"`` entry describing ``Doc`` and its
        four text properties.
    """
    # (property name, description) pairs; every property has dataType ["text"].
    text_fields = (
        ("last_modified", "Last modified date for the document"),
        ("player", "Player related to the document"),
        ("position", "Player Position related to the document"),
        ("text", "Text content for the document"),
    )
    properties = [
        {"name": name, "dataType": ["text"], "description": description}
        for name, description in text_fields
    ]
    doc_class = {
        "class": "Doc",
        "description": "A generic document class",
        "vectorizer": vectorizer,
        "properties": properties,
    }
    return {"classes": [doc_class]}

def upload_schema(my_schema, weaviate):
    """Replace the schema on a Weaviate client with *my_schema*.

    Calls ``schema.delete_all()`` on the client first, then
    ``schema.create(my_schema)``.

    Args:
        my_schema: Schema dictionary to create.
        weaviate: Client object exposing a ``schema`` attribute. Note that
            inside this function the name shadows the ``weaviate`` module.
    """
    schema_api = weaviate.schema
    schema_api.delete_all()
    schema_api.create(my_schema)

def count_documents(client: weaviate.Client) -> Dict:
    """Run an aggregate query with a meta count on the ``Doc`` class.

    Args:
        client: Weaviate client used to issue the query.

    Returns:
        The response of the query exactly as returned by ``.do()``.
    """
    aggregate_query = client.query.aggregate("Doc").with_meta_count()
    return aggregate_query.do()

### Set Up Weaviate Client and Schema

In [5]:
# Connect to the Weaviate instance at weaviate_url.
client = create_local_weaviate_client(db_url=weaviate_url)
# Build the "Doc" schema with the default vectorizer setting ("none").
my_schema = get_schema()
# NOTE: upload_schema calls schema.delete_all() on the client before creating the new schema.
upload_schema(my_schema, weaviate=client)

### Helper Functions to Stage Unstructured Documents for Indexing

In [6]:
from unstructured.chunking.title import chunk_by_title
from unstructured.documents.elements import DataSourceMetadata
from unstructured.partition.json import partition_json
from sentence_transformers import SentenceTransformer

# Module-level embedding model shared by compute_embedding; created once when the cell runs.
embedding_model = SentenceTransformer(embedding_model_name, device=device)

def compute_embedding(chunk_text: List[str]):
    """Encode *chunk_text* with the module-level ``embedding_model``.

    Args:
        chunk_text: Text to embed, forwarded unchanged to ``encode``.

    Returns:
        Whatever ``embedding_model.encode`` returns for the input, computed
        on the module-level ``device``.
    """
    return embedding_model.encode(chunk_text, device=device)
    

def get_chunks(elements, chunk_under_n_chars=500, chunk_new_after_n_chars=1500):
    """Chunk partitioned elements by title and embed the chunk texts.

    Args:
        elements: Elements to chunk; their metadata is modified in place.
        chunk_under_n_chars: Passed to ``chunk_by_title`` as
            ``combine_under_n_chars``.
        chunk_new_after_n_chars: Passed to ``chunk_by_title`` as
            ``new_after_n_chars``.

    Returns:
        A ``(records, embeddings)`` pair: ``records`` is a list of dicts with
        the keys ``last_modified``, ``text``, ``position`` and ``player``;
        ``embeddings`` is the result of ``compute_embedding`` on the texts.

    Raises:
        ValueError: If a chunk's filename (up to the first ``.pdf``) has
            fewer than three ``_``-separated parts, since exactly three are
            unpacked.
    """
    # Strip metadata attributes before chunking: data_source unless it is
    # exactly a DataSourceMetadata instance, and coordinates whenever present.
    for el in elements:
        meta = el.metadata
        if type(meta.data_source) is not DataSourceMetadata:
            delattr(meta, "data_source")

        if hasattr(meta, "coordinates"):
            delattr(meta, "coordinates")

    title_chunks = chunk_by_title(
        elements,
        combine_under_n_chars=chunk_under_n_chars,
        new_after_n_chars=chunk_new_after_n_chars,
    )

    records = []
    for chunk in title_chunks:
        # The last three "_"-separated parts of the name are read as
        # position, last name, first name (in that order).
        stem = chunk.metadata.filename.split('.pdf')[0]
        position, last, first = stem.split('_')[-3:]
        records.append(
            {
                "last_modified": chunk.metadata.last_modified,
                "text": chunk.text,
                "position": position,
                "player": f"{first} {last}",
            }
        )

    embeddings = compute_embedding([record['text'] for record in records])
    return records, embeddings


def add_data_to_weaviate(files, client, chunk_under_n_chars=500, chunk_new_after_n_chars=1500):
    """Chunk, embed and upload each Unstructured JSON file to Weaviate.

    Args:
        files: Paths of JSON files, each loaded with ``partition_json``.
        client: Weaviate client whose ``batch`` API receives the objects.
        chunk_under_n_chars: Forwarded to ``get_chunks``.
        chunk_new_after_n_chars: Forwarded to ``get_chunks``.

    A file whose partitioning or chunking raises ``IndexError`` or
    ``ValueError`` is reported with ``print`` and skipped; the remaining
    files are still processed. The batch is flushed once, after all files.
    """
    for filename in files:
        try:
            elements = partition_json(filename=filename)
            chunks, embeddings = get_chunks(elements, chunk_under_n_chars, chunk_new_after_n_chars)
        except (IndexError, ValueError) as e:
            # get_chunks unpacks exactly three "_"-separated filename parts,
            # which raises ValueError (not IndexError) when there are fewer;
            # catching both keeps one badly named file from aborting the run.
            print(e)
            continue

        print(f"Uploading {len(chunks)} chunks for {str(filename)}.")
        for chunk, vector in zip(chunks, embeddings):
            client.batch.add_data_object(
                data_object=chunk,
                # Use the same spelling as the schema and the queries ("Doc").
                class_name="Doc",
                uuid=get_valid_uuid(uuid.uuid4()),
                vector=vector,
            )

    client.batch.flush()

  from .autonotebook import tqdm as notebook_tqdm


### Add Chunks to Weaviate

In [7]:
# Chunk, embed and upload every result file, overriding the default chunk
# thresholds (500 / 1500) with smaller values (75 / 200).
add_data_to_weaviate(
    files=files,
    client=client,
    chunk_under_n_chars=75,
    chunk_new_after_n_chars=200
)

# Print the aggregate meta-count entry for the "Doc" class from the query response.
print(count_documents(client=client)['data']['Aggregate']['Doc'])

[{'meta': {'count': 273}}]


## LangChain RAG Application

In [8]:
from langchain.llms import LlamaCpp
from langchain.vectorstores.weaviate import Weaviate
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.prompts import PromptTemplate

### Instantiate Local Llama 2 LLM

In [9]:
# Callback manager handed to the LLM; the stdout handler presumably streams
# generated tokens to the cell output as they are produced — TODO confirm.
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
n_gpu_layers = 1  # Metal set to 1 is enough.
n_batch = 100  # Should be between 1 and n_ctx, consider the amount of RAM of your Apple Silicon Chip.
# Make sure the model path is correct for your system!
llm = LlamaCpp(
    model_path="model_files/llama-2-7b-chat.Q4_K_S.gguf",
    n_gpu_layers=n_gpu_layers,
    n_batch=n_batch,
    f16_kv=True,  # MUST set to True, otherwise you will run into problem after a couple of calls
    callback_manager=callback_manager,
    verbose=True, # Verbose is required to pass to the callback manager
)

llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from model_files/llama-2-7b-chat.Q4_K_S.gguf (version GGUF V2 (latest))
llama_model_loader: - tensor    0:                token_embd.weight q4_K     [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    1:           blk.0.attn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    2:            blk.0.ffn_down.weight q5_K     [ 11008,  4096,     1,     1 ]
llama_model_loader: - tensor    3:            blk.0.ffn_gate.weight q4_K     [  4096, 11008,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.ffn_up.weight q4_K     [  4096, 11008,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.attn_k.weight q4_K     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    7:         blk.0.attn_output.weight q4_K     [  4096,  4096,   

### Helper Function to Run RAG Process

In [10]:
def question_answer(question: str, vectorstore: Weaviate):
    """Answer *question* using documents retrieved from *vectorstore*.

    The question is embedded with ``compute_embedding``, similar documents
    are fetched with a max-marginal-relevance search by vector, and their
    page contents are placed in a prompt sent to the module-level ``llm``.

    Args:
        question: The question to answer.
        vectorstore: Vector store queried for similar documents.

    Returns:
        A ``(answer, similar_docs)`` pair: the LLM's output and the
        retrieved documents.
    """
    query_vector = compute_embedding(question)
    similar_docs = vectorstore.max_marginal_relevance_search_by_vector(query_vector)

    template = PromptTemplate.from_template(
    """\
    Given context about the subject, answer the question based on the context provided to the best of your ability.
    Context: {context}
    Question:
    {question}
    Answer:
    """
    )

    # The context is passed as a Python list, so the prompt receives the
    # list's string representation rather than joined text.
    passages = [doc.page_content for doc in similar_docs]
    prompt = template.format(context=passages, question=question)
    return llm(prompt), similar_docs


### Run RAG on a Question

In [11]:
# Create a fresh client and wrap it for LangChain; "Doc" and "text" are
# presumably the index (class) name and the text property — TODO confirm
# against the LangChain Weaviate constructor.
client = weaviate.Client(weaviate_url)
vectorstore = Weaviate(client, "Doc", "text")

question = "What is a Power RB?"

# Retrieve similar documents and generate the answer with the local LLM.
answer, similar_docs = question_answer(question, vectorstore)

# Print the query, the answer, and then each retrieved document in turn.
print("\n\n\n-------------------------")
print(f"QUERY: {question}")
print("\n\n\n-------------------------")
print(f"Answer: {answer}")
print("\n\n\n-------------------------")
for index, result in enumerate(similar_docs):
    print(f"\n\n-- RESULT {index+1}:\n")
    print(result)




 A Power RB, according to the context provided, is a running back who is physically bigger and more capable of flourishing between the tackles. This type of running back is separated from the "speed RBs" group, which are smaller but faster and quicker. The term "Power RB" suggests that these players have the size and strength to handle a workhorse role in an NFL offense.


-------------------------
QUERY: What is a Power RB?



-------------------------
Answer:  A Power RB, according to the context provided, is a running back who is physically bigger and more capable of flourishing between the tackles. This type of running back is separated from the "speed RBs" group, which are smaller but faster and quicker. The term "Power RB" suggests that these players have the size and strength to handle a workhorse role in an NFL offense.



-------------------------


-- RESULT 1:

page_content='South Carolina\n\nOur RB grades can and will change as more information comes in from Pro Day workout


llama_print_timings:        load time =  2519.96 ms
llama_print_timings:      sample time =    66.74 ms /    88 runs   (    0.76 ms per token,  1318.47 tokens per second)
llama_print_timings: prompt eval time =  3748.92 ms /   393 tokens (    9.54 ms per token,   104.83 tokens per second)
llama_print_timings:        eval time =  2509.44 ms /    87 runs   (   28.84 ms per token,    34.67 tokens per second)
llama_print_timings:       total time =  6470.35 ms
