# Structure Reciept Data
The code cell below will parse all of the txt files in ./testset/txt and save the JSON results in ./testset/json with the same filename as the txt file except the .json extention instead

In [None]:
import os
from glob import glob
import json

#### MODEL ####
from langchain.chat_models import ChatOpenAI
# jnicolwathawiiAPIkey = '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'
os.environ['OPENAI_API_KEY'] = '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'
model = ChatOpenAI(model='gpt-3.5-turbo')

#### Prompt ####
from langchain.prompts import PromptTemplate
from langchain.prompts.few_shot import FewShotPromptTemplate
import time

# get examples
examples = []
jsonFiles = glob(os.path.join(os.path.join('data', 'receipts', 'json', 'actual'), '*.json'))
for jsonFile in jsonFiles:
    baseFn = os.path.basename(jsonFile.replace('.json', ''))
    txtFile = glob(os.path.join('data', 'receipts', 'text', f'{baseFn}.txt'))[0]
    with open(jsonFile, 'r') as f: JSONobj = f.read()
    with open(txtFile, 'r') as f: rawRecieptText = f.read()
    exampleDict = {
        "rawRecieptText": rawRecieptText,
        "JSONobj":JSONobj.replace('{', '{{{{').replace('}', '}}}}')
        }
    examples.append(exampleDict)


promptTemplateFile = os.path.join('receipt_parse', 'prompt_templates', 'prompt_template_2.txt')
with open(promptTemplateFile, 'r') as f: promptTemplate = f.read()
example_prompt = PromptTemplate(input_variables=["rawRecieptText", "JSONobj"], 
                            template=promptTemplate)

prompt = FewShotPromptTemplate(
    examples=[examples[0]],
    example_prompt=example_prompt,
    suffix="Get JSON for this:\n{input}",
    input_variables=["input"]
)

#   print(prompt.format(input="recieptTxt"))



#### Create Chain ####
chain = prompt | model # how to pass the prompt to the model (pipe prompt to model)

#### Run inference on reciepts ####
promptName = os.path.basename(promptTemplateFile).split('.txt')[0].replace('_', '')#.split('_')[-1] e.g. prompt_template_1
recieptFiles = glob(os.path.join('testset', 'txt' '*.txt'))
for recieptFn in recieptFiles:
    print(recieptFn)
    saveJson = os.path.join('testset', 'json', f'{os.path.basename(recieptFn).split(".tx")[0]}.json')
    if os.path.exists(saveJson):continue
    with open(recieptFn, 'r') as f: recieptTxt = f.read()
    try:
        response = chain.invoke({'input': recieptTxt})
    except:
        time.sleep(20) # assume its a RateLimitError
        response = chain.invoke({'input': recieptTxt})

    # print(response)
    try:
        data_dict = json.loads(response.content)
    except(json.decoder.JSONDecodeError):
        print(response.content)
    else:
        with open(saveJson, 'w') as f: json.dump(data_dict, f)


# Nervaluate

# Vender (merchant) Classification

In [1]:
import json
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

def try_read_file(file_path):
    encodings = ['utf-8', 'latin-1', 'windows-1252']  # Add more if needed
    for encoding in encodings:
        try:
            with open(file_path, 'r', encoding=encoding) as file:
                return json.load(file)
        except UnicodeDecodeError:
            continue
    raise ValueError(f"File {file_path} has an unknown encoding.")

def json_to_string(data):
    data_copy = data.copy()
    data_copy['ReceiptInfo'].pop('merchantCategory', None)
    return json.dumps(data_copy, sort_keys=True)

# Path to the JSON files
json_folder_path = 'data/receipts/json/prompt2'

# Collect data for training
data = {'file_name': [], 'vendor_name': [], 'json_string': [], 'category': []}

# Iterate over JSON files
for file_name in os.listdir(json_folder_path):
    file_path = os.path.join(json_folder_path, file_name)
    try:
        data_json = try_read_file(file_path)
    except ValueError as e:
        print(e)
        continue

    merchant_category = data_json['ReceiptInfo'].get('merchantCategory')
    
    if not merchant_category:
        continue

    merchant_name = data_json['ReceiptInfo'].get('merchant', 'Unknown')
    json_string = json_to_string(data_json)
    data['file_name'].append(file_name)
    data['vendor_name'].append(merchant_name)
    data['json_string'].append(json_string)
    data['category'].append(merchant_category)

df = pd.DataFrame(data)

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_df=0.7, min_df=3, ngram_range=(1, 3), stop_words='english')
X = vectorizer.fit_transform(df['json_string'])
y = df['category']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=10)

rf_model = RandomForestClassifier(n_estimators=100, random_state=42, max_depth=10, min_samples_leaf=1)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Random Forest Model Accuracy: {accuracy_rf}")

# Predict and save results for all data
all_predictions = rf_model.predict(X)
results = pd.DataFrame({'Category': all_predictions, 'Vendor Name': df['vendor_name'], 'File Name':  df['file_name']})
results.sort_values(by=['Category', 'Vendor Name'], inplace=True)
results.to_csv('vendor_classification_results.csv', index=False)


Expecting value: line 1 column 1 (char 0)
Random Forest Model Accuracy: 0.8181818181818182


# Item Classification
The code cell below will create a pkl file for each receipt JSON file which is a list of each of the item categories. The file has the same name and location as the JSON file but has the extention .pkl instead of .json.

In [None]:
import pickle
prompt_str="""
Can you return the item category of the items. Just return one word which is one of the categories below that best fits the item.
Food: example dairy, sports drinks, eggs
Medicine: example cough drops
House supplies: building material and tools as well as house hold supplies
Hobbies: Sports, video games etc

Here is the item info:
{itemdata}
"""

prompt = PromptTemplate.from_template(prompt_str)

chain = prompt | model # how to pass the prompt to the model (pipe prompt to model)


for jsonFile in glob(os.path.join('testset', 'json', '*.json')):
    pickle_file = os.path.join('testset', 'json', base_name + '_itemclasses.pkl')
    if os.path.exists(pickle_file): continue
    with open(jsonFile, 'r') as file:
        data = json.load(file)
    itemCats = []
    for item in data['ReceiptInfo']['ITEMS']:
        itemCats.append(chain.invoke({'itemdata':str(item)}).content)
        try:
            itemCats.append(chain.invoke({'itemdata':str(item)}).content)
        except:
            time.sleep(40) # assume its a RateLimitError
            itemCats.append(chain.invoke({'itemdata':str(item)}).content)

    base = os.path.basename(jsonFile)
    base_name = os.path.splitext(base)[0].split('_')[0]
    with open(pickle_file, 'wb') as f:
        pickle.dump(itemCats, f)
