## Fetch Data Analysis

In [29]:
import polars as pl
import os

# Record and show the kernel's working directory: the relative "../data/..."
# paths used further down are resolved against it.
cwd = os.getcwd()
print(cwd)

d:\Data-Engineering\fetch-homework-assignment\notebooks


In [50]:
import os
import json
import pandas as pd

# Input: the raw receipts file, given relative to the working directory
file_path = "../data/receipts.json"
# Output: where preprocess_json writes the array-wrapped copy when load_data falls back to it
output_path = "../data/receipts_fixed.json"


def preprocess_json(file_path, output_path):
    """
    Rewrite a line-delimited JSON file as a single valid JSON array.

    Every non-blank line of ``file_path`` is parsed as one JSON document. The
    parsed documents are then written, in their original order, to
    ``output_path`` as one indented JSON array.

    Args:
        file_path: Path of the input file holding one JSON document per line.
        output_path: Path of the JSON array file to write.

    Returns:
        ``output_path`` on success. ``None`` if the input could not be read, a
        line could not be parsed, or the output could not be written; the
        reason is printed instead of raised.
    """
    try:
        json_objects = []
        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's default encoding.
        with open(file_path, "r", encoding="utf-8") as f:
            # Iterate the file lazily instead of materialising every line first
            for line_number, line in enumerate(f, start=1):
                # Blank lines (such as a trailing newline) carry no record;
                # skip them rather than failing the whole conversion.
                if not line.strip():
                    continue
                try:
                    json_objects.append(json.loads(line))
                except json.JSONDecodeError as e:
                    # Point at the offending line in the reported message
                    raise ValueError(f"line {line_number}: {e}") from e

        # Write the objects into a proper JSON array
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(json_objects, f, indent=4)

        print(f"File successfully converted to valid JSON: {output_path}")
        return output_path
    except Exception as e:
        # Deliberately best-effort: failure is reported through the None
        # return value, which load_data checks for.
        print(f"An error occurred during preprocessing: {e}")
        return None


def load_data(file_path, fixed_output_path=None):
    """
    Load a line-delimited JSON (NDJSON) file into a Pandas DataFrame.

    The file is first read with ``pd.read_json(..., lines=True)``. If that
    raises ``ValueError``, the file is passed through ``preprocess_json`` to
    produce a JSON array file, and that file is loaded instead.

    Args:
        file_path: Path of the file to load.
        fixed_output_path: Where the fallback writes the array-wrapped copy.
            Defaults to the module-level ``output_path``.

    Returns:
        The loaded DataFrame, or ``None`` on any failure (missing file,
        preprocessing failure, unreadable fixed file). The reason is printed.
    """
    try:
        try:
            # Attempt to load as NDJSON
            print("Attempting to load the file as NDJSON...")
            df = pd.read_json(file_path, lines=True)
            print("File successfully loaded as NDJSON.")
            return df
        except ValueError:
            print("File is not NDJSON. Attempting to preprocess...")

        # The fallback runs here, after the inner handler has finished and
        # still inside the outer try. An exception raised inside an `except`
        # block is not seen by its sibling `except` clauses, so running the
        # fallback in the handler would let failures escape instead of
        # returning None.
        target_path = output_path if fixed_output_path is None else fixed_output_path

        # Preprocess the file to fix formatting issues
        fixed_path = preprocess_json(file_path, target_path)
        if not fixed_path:
            raise RuntimeError("Preprocessing failed. Could not fix the file.")

        # Load the fixed JSON file
        print("Loading the fixed JSON file...")
        df = pd.read_json(fixed_path)
        print("File successfully loaded after preprocessing.")
        return df
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None


# Entry point: load the receipts file and export a CSV copy
if __name__ == "__main__":
    if os.path.exists(file_path):
        # Parse the receipts file into a DataFrame; None signals a failed load
        df = load_data(file_path)

        if df is not None:
            print("DataFrame loaded successfully!")
            # Preview the leading rows
            print(df.head())

            # Write a CSV copy alongside the source data, without the row index
            csv_output_path = "../data/receipts_cleaned.csv"
            df.to_csv(csv_output_path, index=False)
            print(f"Data successfully saved to: {csv_output_path}")
    else:
        # Nothing to load: report the missing input and stop
        print(f"File not found: {file_path}")

Attempting to load the file as NDJSON...
File successfully loaded as NDJSON.
DataFrame loaded successfully!
                                    _id  bonusPointsEarned  \
0  {'$oid': '5ff1e1eb0a720f0523000575'}              500.0   
1  {'$oid': '5ff1e1bb0a720f052300056b'}              150.0   
2  {'$oid': '5ff1e1f10a720f052300057a'}                5.0   
3  {'$oid': '5ff1e1ee0a7214ada100056f'}                5.0   
4  {'$oid': '5ff1e1d20a7214ada1000561'}                5.0   

                             bonusPointsEarnedReason  \
0  Receipt number 2 completed, bonus point schedu...   
1  Receipt number 5 completed, bonus point schedu...   
2                         All-receipts receipt bonus   
3                         All-receipts receipt bonus   
4                         All-receipts receipt bonus   

                 createDate               dateScanned  \
0  {'$date': 1609687531000}  {'$date': 1609687531000}   
1  {'$date': 1609687483000}  {'$date': 1609687483000}   
2  {'$date'

In [None]:
# What are the top 5 brands by receipts scanned for the most recent month?