In [1]:
import os
import json
import pandas as pd

def load_nested_json3(filepath, selected_columns=None):
    """Loads from dictionary containing a list of dictionaries nested json and flattens the data frame."""
    try:
        with open(filepath, 'r', encoding='utf-8') as f:  # Added encoding
            json_data = json.load(f)

        # Check for required keys before proceeding
        if 'status' not in json_data or 'data' not in json_data or 'results' not in json_data['data']:
            print(f"Warning: Missing key(s) in JSON file: {filepath}")
            return None  # Skip this file

        results = json_data['data']['results']  # access data by the key "results"
        df = pd.DataFrame(results)  # create a dataframe from the "results" value

        # --- Selective Extraction and Flattening ---
        # 1. Top-Level Columns (No flattening needed)
        df_top_level = df[['status', 'list_date', 'list_price', 'price_reduced_amount']].copy()

        # 2. Description (Extract all keys)
        if 'description' in df.columns:
          df_description = df['description'].apply(pd.Series)
          year_built = df_description['year_built']
          sold_date = df_description['sold_date']
          sold_price = df_description['sold_price']
          baths_3qtr = df_description['baths_3qtr']
          baths_full = df_description['baths_full']
          name = df_description['name']
          baths_half = df_description['baths_half']
          lot_sqft = df_description['lot_sqft']
          sqft = df_description['sqft']
          baths = df_description['baths']
          sub_type = df_description['sub_type']
          baths_1qtr = df_description['baths_1qtr']
          garage = df_description['garage']
          stories = df_description['stories']
          beds = df_description['beds']
          type = df_description['type']

        # 3. Branding (Extract name and type from the first entry in the list)
        if 'branding' in df.columns:
           def extract_branding(branding_list):
               if isinstance(branding_list, list) and len(branding_list) > 0:
                  branding_info = branding_list[0]
                  return pd.Series({'branding_name': branding_info.get('name', None), 'branding_type': branding_info.get('type', None)}) #Return name, type
               else:
                  return pd.Series({'branding_name': None, 'branding_type': None}) #Return None
           df_branding = df['branding'].apply(extract_branding)

        # 4. Location (State and City)
        if 'location' in df.columns:
           df_location = df['location'].apply(pd.Series)
           df_location_address = df_location['address'].apply(pd.Series) # select city, state from nested address
           city = df_location_address['city']
           state = df_location_address['state']

        # 5. Flags (take all flags)
        if 'flags' in df.columns:
           df_flags = df['flags'].apply(pd.Series)
        # 6. Tags (Take the whole column)
        if 'tags' in df.columns:
          df_tags = df['tags']

        # 7. source (Take the first agent)
        # Flatten 'branding' column (take the first branding entry if exists)
        if 'source' in df.columns:
           def extract_branding_name(source):
              if isinstance(source, list) and len(source) > 0:
                 agents = source[0]['agents']
                 return agents
              else:
                 return None
           df['agent'] = df['source'].apply(extract_branding_name)

        # 8. Concatenate
        concat = [df_top_level]
        columns = [year_built,sold_date,sold_price,baths_3qtr,baths_full,name,baths_half,lot_sqft,sqft,baths,sub_type,baths_1qtr,garage,stories,beds,type,
                  df_branding,city,state,
                  df_flags,
                  df_tags,
                  df['agent']]
        final_df = pd.concat(concat + [c for c in columns if c is not None], axis = 1)

        return final_df

    except FileNotFoundError:
        print(f"Error: File not found at {filepath}")
        return None  # Return None if the file isn't found
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON in file {filepath}: {e}")
        return None  # Return None if JSON decoding fails
    except KeyError as e:
        print(f"Error: Key '{e}' not found in JSON file {filepath}")
        return None  # Return None if a key is missing
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None


def process_folders(base_dir):
    """Iterates through folders, loads JSON files, and concatenates DataFrames."""
    all_data = []  # List to store DataFrames from each file

    for root, _, files in os.walk(base_dir):  # Walk through directory
        for file in files:
            if file.endswith(".json"):  # Process only JSON files
                filepath = os.path.join(root, file)  #  This line computes the full path
                df = load_nested_json3(filepath)  # Load data from file passing selected_columns

                if df is not None:  # Check if DataFrame was successfully created
                    cols = pd.Series(df.columns)
                    for dup in df.columns[df.columns.duplicated(keep=False)]:
                      cols[df.columns.get_loc(dup)] = f'{dup}_{file}' # Add the file name in the column name
                    df.columns = cols
                    all_data.append(df)  # Add to list of DataFrames

    if all_data:  # If any DataFrames were loaded
        combined_df = pd.concat(all_data, ignore_index=True)  # Combine all DataFrames
        return combined_df
    else:
        print("No JSON files found or loaded successfully.")
        return None


# Usage:
base_dir = r"C:\Users\johnk\DS-midterm_project"  # Replace with your base directory
#selected_columns = ['status', 'list_price', 'year_built', 'branding_name','name', 'city', 'lon', 'lat', 'is_new_construction', 'is_for_rent']  # Specify your desired columns
combined_df = process_folders(base_dir) # pass selected columns

if combined_df is not None:
    print(combined_df.head())
    print(combined_df.info())

Error: Key '"None of [Index(['status', 'list_date', 'list_price', 'price_reduced_amount'], dtype='object')] are in the [columns]"' not found in JSON file C:\Users\johnk\DS-midterm_project\data\HI_Honolulu_3.json
Error: Key '"None of [Index(['status', 'list_date', 'list_price', 'price_reduced_amount'], dtype='object')] are in the [columns]"' not found in JSON file C:\Users\johnk\DS-midterm_project\data\HI_Honolulu_4.json
Error: Key '"None of [Index(['status', 'list_date', 'list_price', 'price_reduced_amount'], dtype='object')] are in the [columns]"' not found in JSON file C:\Users\johnk\DS-midterm_project\data\ME_Augusta_0.json
Error: Key '"None of [Index(['status', 'list_date', 'list_price', 'price_reduced_amount'], dtype='object')] are in the [columns]"' not found in JSON file C:\Users\johnk\DS-midterm_project\data\ME_Augusta_1.json
Error: Key '"None of [Index(['status', 'list_date', 'list_price', 'price_reduced_amount'], dtype='object')] are in the [columns]"' not found in JSON file 

  combined_df = pd.concat(all_data, ignore_index=True)  # Combine all DataFrames
