In [2]:
import json
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Path to the JSON files
json_folder_path = 'data/receipts/prompt2'

def extract_data_from_json(file_path):
    with open(file_path, 'r') as file:
        data = json.load(file)
    return data

def json_to_string(data):
    # Convert JSON to string, excluding the merchantCategory field
    data_copy = data.copy()
    data_copy['ReceiptInfo'].pop('merchantCategory', None)
    return json.dumps(data_copy, sort_keys=True)

# Collect data for training
data = {'json_string': [], 'category': []}

# Iterate over JSON files
for file_name in os.listdir(json_folder_path):
    file_path = os.path.join(json_folder_path, file_name)
    data_json = extract_data_from_json(file_path)
    
    merchant_category = data_json['ReceiptInfo'].get('merchantCategory')
    
    # Skip if category is missing
    if not merchant_category:
        continue

    json_string = json_to_string(data_json)
    data['json_string'].append(json_string)
    data['category'].append(merchant_category)

# Convert data to DataFrame
df = pd.DataFrame(data)

# TF-IDF Vectorization
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['json_string'])
y = df['category']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train KNN Model
knn_model = KNeighborsClassifier(n_neighbors=3)
knn_model.fit(X_train, y_train)

# Predict on the test set
y_pred = knn_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy}")

# # Predict and save results for all data
# all_predictions = knn_model.predict(X)
# results = pd.DataFrame({'Category': all_predictions, 'Vendor Name': df['json_string'], 'File Name': df.index})
# results.sort_values(by=['Category', 'Vendor Name'], inplace=True)
# results.to_csv('vendor_classification_results.csv', index=False)


FileNotFoundError: [WinError 3] The system cannot find the path specified: 'data/receipts/prompt2'