<a href="https://colab.research.google.com/github/khadkechetan/information_extraction/blob/main/LLM/llama_v2/Information_extraction_using_LLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Owner Name: **Chetan Khadke**

Email ID: khadkechetan@gmail.com

# 1. Install all the packages

In [None]:
!pip install -q transformers einops accelerate langchain bitsandbytes sentence_transformers llama-index pypdf python-dotenv

# 2. Packages imports

In [37]:
import logging
import sys
import torch
from pprint import pprint
from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.llms import HuggingFaceLLM
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index import LangchainEmbedding, ServiceContext
from llama_index.prompts.prompts import SimpleInputPrompt

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

# 3. Login to HuggingFace cli

In [3]:
!git config --global credential.helper store
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|
    
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) Y
Token is valid (permission: read).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


# 4. Load the Dataset

In [2]:
data_location = '/content/data' #@param
documents = SimpleDirectoryReader(data_location).load_data()

In [3]:
documents[0]

Document(id_='6c9a2aa1-c01d-473f-a13b-6d9e1a3d360c', embedding=None, metadata={'page_label': '1', 'file_name': 'invoice_107_charspace_108.pdf'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='4049e8f124ba189c6633d7ac95ff3b456fafd197c937f914e63e748b04a31007', text='Invoice no: 82545881\nDate of issue:\n09/25/2011\nSeller:\nCampbell, Callahan and Gomez\n2969 Todd Orchard Apt. 721\nPort James, FL 83598\nTax Id: 958-83-8233\nIBAN: GB86WKRJ04578791818338\nClient:\nKeller-Crosby\n280 Kim Valleys Suite 217\nAngelaburgh, DE 97356\nTax Id: 941-79-6209\nITEMS\nNo.\nDescription\nQty\nUM\nNet price\nNet worth\nVAT [%]\nGross\nworth\n \nUnder Armour UA Curry 3\nBlack/Black Size 6Y\n1.\n5,00\neach\n 4,90\n 24,50\n 10%\n 26,95\n \nSkechers Hypno Splash Light Up\nShoes Size 4 Medium Big Kid\nNew\n2.\n5,00\neach\n24,99\n 124,95\n 10%\n 137,44\n \nNative Charley White\nSandals~Size Infant/Child\n6~Boys/Girls~Very Good\nCondition\n3.\n5,00\neach\n13,00\n 65,00\n 1

In [4]:
system_prompt = "You are a data extractor. Extract the exact data from given document. If no information found please reply 'no information available'"
# This will wrap the default prompts that are internal to llama-index
query_wrapper_prompt = SimpleInputPrompt("<|USER|>{query_str}<|ASSISTANT|>")

# 5. Load the Embeddings

In [5]:
# hyperparameters
context_window = 4096 #@param
temperature = 0.0 #@param
model_name = 'meta-llama/Llama-2-7b-chat-hf' #@param

In [6]:
llm = HuggingFaceLLM(
    context_window=context_window,
    max_new_tokens=256,
    generate_kwargs={"temperature": temperature, "do_sample": False},
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    tokenizer_name=model_name,
    model_name=model_name,
    device_map="auto",
    # uncomment this if using CUDA to reduce memory usage
    model_kwargs={"torch_dtype": torch.float16 , "load_in_8bit":True, "use_auth_token":True},

)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
embed_model = LangchainEmbedding(
  HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
)

In [8]:
service_context = ServiceContext.from_defaults(
    chunk_size=1024,
    llm=llm,
    embed_model=embed_model
)

#6. Index the datastore

In [9]:
index = VectorStoreIndex.from_documents(documents, service_context=service_context)

In [39]:
user_queries = ['Seller address in the document?' , 'Client address in the document?', 'seller Tax Id in the document?' ] #@param

In [40]:
answer = dict()
for i, user_query in enumerate(user_queries):
    query_engine = index.as_query_engine()
    response = query_engine.query(user_query)
    answer.update({user_query: response.response})

In [41]:
pprint(answer)

{'Client address in the document?': 'The client address in the document is:\n'
                                    'Keller-Crosby\n'
                                    '280 Kim Valleys Suite 217\n'
                                    'Angelaburgh, DE 97356\n'
                                    '\n'
                                    '\n'
                                    '\n',
 'Seller address in the document?': "The seller's address in the document is:\n"
                                    '2969 Todd Orchard Apt. 721\n'
                                    'Port James, FL 83598\n'
                                    '\n'
                                    '\n'
                                    '\n'
                                    '\n',
 'seller Tax Id in the document?': "The seller's Tax Id is 958-83-8233."}
