In [7]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.prompts import ChatPromptTemplate
from huggingface_hub import InferenceClient
import numpy as np
import config
import json
import re
from receipt_OCR_processor import extract_text_from_file



In [8]:
# Define the prompt template.
# `{receipt_text}` is the only substitution field; the doubled braces `{{` / `}}`
# are str.format escapes that render as literal `{` / `}` around the JSON example.
# The JSON example must itself be valid JSON (no unescaped double quotes inside a
# value), because it is the format the model is asked to reproduce and the reply
# is later parsed with the json module.
prompt_template = """
The information you need to extract includes the following fields: Date, Vendor, Without tax total amount, Tax, Total amount, Country, Province. Sometimes, the country and province information may be missing, and you will need to infer the most likely country and province based on the other information in the text.

Here is an example of the receipt text:
---
Text from Original Image with Orientation Detection:
{receipt_text}
---

Please provide the extracted information in the following JSON format:
{{
    "Date": "2024 08 01 10:57am",
    "Vendor": "Church's Chicken",
    "Without tax total amount": "27.50",
    "Tax": "1.38",
    "Total amount": "28.88",
    "Country": "Canada",
    "Province": "British Columbia",
    "Comment":"I inferred the country as Canada and the province as British Columbia based on the vendor location 'Surrey, Bila' which is a city in the province of British Columbia, Canada."
}}

Remember, if the any information is not explicitly mentioned, you should infer it from the other details in the receipt or leave it empty, and explain it in the Comment(be succint).
"""

# Create a function to generate the prompt
def generate_prompt(receipt_text):
    """Fill the prompt template with the given receipt text.

    Args:
        receipt_text: text of the receipt to embed in the prompt. It is
            inserted as a format argument, so braces inside it are kept as-is.

    Returns:
        str: the complete prompt, including the JSON format example.
    """
    return prompt_template.format(receipt_text=receipt_text)

# Function to query HuggingFace
def query_huggingface(client, prompt, max_tokens=500):
    """Send the prompt to the chat model and return the streamed reply as one string.

    Args:
        client: object exposing ``chat_completion(messages=..., max_tokens=...,
            stream=True)`` that yields chunks supporting ``"choices" in chunk``
            and ``chunk["choices"][0]["delta"]``.
        prompt: user prompt text; sent as the single user message.
        max_tokens: upper bound on generated tokens passed to the client
            (defaults to 500, the value previously hard-coded).

    Returns:
        str: the ``content`` of every streamed delta, concatenated in order.
    """
    messages = [
        {"role": "system", "content": "You are an intelligent assistant tasked with extracting important information from receipt texts. "},
        {"role": "user", "content": f"{prompt}"}
    ]

    pieces = []
    for chunk in client.chat_completion(
        messages=messages,
        max_tokens=max_tokens,
        stream=True,
    ):
        # Skip chunks without a usable choices list.
        if "choices" in chunk and chunk["choices"]:
            # `.get("content", "")` only covers a missing key; when the key is
            # present with a None value it returns None and string
            # concatenation fails. `or ""` covers both cases.
            pieces.append(chunk["choices"][0]["delta"].get("content") or "")

    return "".join(pieces)

# Module-level Hugging Face inference client used by the receipt extraction
# functions. The access token is read from the local `config` module.
client = InferenceClient(
    token=config.HuggingFace_Token_KEY,
    model="meta-llama/Meta-Llama-3-8B-Instruct",
)

def extract_information_from_receipt(receipt_text):
    """Build the extraction prompt for `receipt_text` and return the model's raw reply.

    The prompt comes from `generate_prompt` and is sent through
    `query_huggingface` using the module-level `client`. The reply is returned
    unparsed.
    """
    return query_huggingface(client, generate_prompt(receipt_text))

def extract_json_from_result(result_string):
    """Return the first JSON object embedded in `result_string`, or None.

    Model replies usually wrap the JSON in free text. Instead of grabbing
    everything between the first `{` and the last `}` (which breaks as soon as
    the surrounding text contains another brace or a second object), decoding
    is attempted at each `{` in turn and the first object that parses is
    returned; any text after it is ignored.

    Args:
        result_string: raw text returned by the model. Empty or non-string
            input is treated as containing no JSON.

    Returns:
        dict | None: the decoded object, or None when no object could be
        decoded (a short message is printed in that case).
    """
    if not isinstance(result_string, str):
        print("No JSON found in the result string")
        return None

    start = result_string.find("{")
    # Same gate as before: there must be a `{` with a `}` somewhere after it.
    if start == -1 or result_string.rfind("}") < start:
        print("No JSON found in the result string")
        return None

    decoder = json.JSONDecoder()
    while start != -1:
        try:
            # raw_decode parses one JSON document at the start of the string
            # and tolerates trailing text after it.
            json_data, _ = decoder.raw_decode(result_string[start:])
            return json_data
        except json.JSONDecodeError:
            # Not valid JSON from this brace; try the next candidate.
            start = result_string.find("{", start + 1)

    print("Failed to decode JSON")
    return None

def main(file_path):
    """Run the receipt pipeline on a file and return the parsed result.

    Steps: get the text of `file_path` (a PDF or image) via
    `extract_text_from_file`, ask the model to extract the receipt fields from
    that text, then parse the JSON out of the model's reply.

    Returns:
        Whatever `extract_json_from_result` returns for the model's reply
        (the parsed JSON, or None when parsing fails).
    """
    ocr_text = extract_text_from_file(file_path)
    model_reply = extract_information_from_receipt(ocr_text)
    return extract_json_from_result(model_reply)



In [11]:
# Example usage
if __name__ == "__main__":
    sample_receipt_text = """
    OSE OE CO SELES!
ei oy ee
Sok aX é SE a ee

pee _ Zo a Z
cr Pie Hy th

   
  
    

Bee
Lg

at
ye See ee & ‘ees
Be _
RS

      
  
   

     

PPR a Se ASKS
RASA OSES ER DESI NSE

ey Te eh
“a

Store # 3154
Church's Chicken

120-9100 Blundell Road
Richmond BC

Tel. 604-244-0318

                          

Check : 688364
Table:
Server :server # 2
07/01/22 03:24pm
REE RAL ee a ee ee
30 $ 1.00 WING SPECIAL $30.00
1 LARGE FRENCH FRIES $9.25
1 3pc TENDERS $8.60
NESS EIT ORT ATEN Be EEN pr Se TTT peek TET og
Subtotal: $47.85
GST: $2.39
YE PERS sub w/Tax: $50.24
wy % | yD Total: $50.24
yy oD he Visa $60.24
UL [0 YOU LIKE FREE CHICKEN 2?
Cg TAKE OUR SURVEY AT
Ke, ww). Churchschickensurvey .com
ft Once Completed you will
ae receive a CODE

Bring it back on your next visit
and receive a free PIECE CHICKEN

GST #R104636592
    """
    # The sample above is already OCR text, not a file path. main() passes its
    # argument to extract_text_from_file, so it is the wrong entry point here;
    # feed the text straight to the text-level steps instead.
    raw_response = extract_information_from_receipt(sample_receipt_text)
    result = extract_json_from_result(raw_response)
    print(result)

{'Date': '07/01/22 03:24pm', 'Vendor': "Church's Chicken", 'Without tax total amount': '47.85', 'Tax': '2.39', 'Total amount': '50.24', 'Country': 'Canada', 'Province': 'British Columbia', 'Comment': "I used the location '120-9100 Blundell Road, Richmond BC' to infer the country as Canada and the province as British Columbia."}
