In [10]:
import json
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

def try_read_file(file_path):
    encodings = ['utf-8', 'latin-1', 'windows-1252']  # Add more if needed
    for encoding in encodings:
        try:
            with open(file_path, 'r', encoding=encoding) as file:
                return json.load(file)
        except UnicodeDecodeError:
            continue
    raise ValueError(f"File {file_path} has an unknown encoding.")

def json_to_string(data):
    data_copy = data.copy()
    data_copy['ReceiptInfo'].pop('merchantCategory', None)
    return json.dumps(data_copy, sort_keys=True)

# Path to the JSON files
json_folder_path = 'data/receipts/json/prompt2'

# Collect data for training
data = {'json_string': [], 'category': []}

# Iterate over JSON files
for file_name in os.listdir(json_folder_path):
    file_path = os.path.join(json_folder_path, file_name)
    try:
        data_json = try_read_file(file_path)
    except ValueError as e:
        print(e)
        continue

    merchant_category = data_json['ReceiptInfo'].get('merchantCategory')
    
    if not merchant_category:
        continue

    json_string = json_to_string(data_json)
    data['json_string'].append(json_string)
    data['category'].append(merchant_category)

# Convert data to DataFrame
df = pd.DataFrame(data)

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_df=0.7, min_df=3, ngram_range=(1, 3), stop_words='english')
X = vectorizer.fit_transform(df['json_string'])
y = df['category']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Random Forest Model Accuracy: {accuracy_rf}")

# # Predict and save results for all data
# all_predictions = rf_model.predict(X)
# results = pd.DataFrame({'Category': all_predictions, 'Vendor Name': df['json_string'], 'File Name': df.index})
# results.sort_values(by=['Category', 'Vendor Name'], inplace=True)
# results.to_csv('vendor_classification_results.csv', index=False)


Expecting value: line 1 column 1 (char 0)
Model Accuracy: 0.5454545454545454
Random Forest Model Accuracy: 0.5909090909090909
