Imports

In [8]:

from google.colab import widgets
from IPython.display import display, clear_output
import ipywidgets as ipy

# HTTP Requests
import requests

# Google API Client Libraries
from google.oauth2 import service_account
from googleapiclient.discovery import build
from google.colab import auth
from google.auth.transport.requests import Request

# File Handling
import os
import io
import zipfile
import json

# Data Handling
import pandas as pd

# Colab Utilities
from google.colab import files

# Miscellaneous Utilities
import time
from tqdm import tqdm
import logging


Upload jsons:

In [3]:
uploaded = files.upload()

for filename in uploaded.keys():
    if filename.endswith('.zip'):
        print(f"Unzipping {filename}...")
        with zipfile.ZipFile(io.BytesIO(uploaded[filename])) as zip_ref:
            zip_ref.extractall('./site_data')  # Extract to 'unzipped' folder
        print(f"Files extracted to ./site_data/")
    else:
        print(f"{filename} is not a zip file.")

Saving site_data_google.zip to site_data_google.zip
Unzipping site_data_google.zip...
Files extracted to ./site_data/


Json search

In [5]:
json_folder = 'site_data/site_data_google'

# Get all JSON filenames in the folder
json_files = [f for f in os.listdir(json_folder) if f.endswith('.json')]

print(f"Found {len(json_files)} JSON files.")

Found 88 JSON files.


update based on my preexisting excel ones

In [23]:
uploadedex = files.upload()
excel_path = list(uploadedex.keys())[0]  # Replace with actual path
df = pd.read_excel(excel_path,  dtype={'ID': str})

print(f"Loaded {len(df)} rows from Excel.")

Saving human_scores.xlsx to human_scores.xlsx
Loaded 30 rows from Excel.


In [24]:
json_folder = 'site_data/site_data_google'
updated_count = 0
missing_json = []

for idx, row in df.iterrows():
    json_filename = f"{row['ID']}.json"
    json_path = os.path.join(json_folder, json_filename)

    if os.path.exists(json_path):
        with open(json_path, 'r') as f:
            data = json.load(f)

        # Update human review fields
        data.setdefault('review', {}).setdefault('human', {})
        data['review']['human']['rating'] = str(row['Human Score']) if pd.notnull(row['Human Score']) else ''
        data['review']['human']['comments'] = str(row['Human Analysis']) if pd.notnull(row['Human Analysis']) else ''

        # Save updated JSON
        with open(json_path, 'w') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)

        updated_count += 1
    else:
        missing_json.append(json_filename)

print(f"✅ Updated {updated_count} JSON files based on Excel.")
if missing_json:
    print(f"⚠️ Missing JSON files for {len(missing_json)} IDs:")
    print(missing_json[:10])  # show first few missing ones

✅ Updated 30 JSON files based on Excel.


Edit json

In [25]:
all_json_files = [f for f in os.listdir(json_folder) if f.endswith('.json')]

# --- Step 1: Pre-filter files needing review ---
json_files = []  # Only files with empty/null human review

for json_file in all_json_files:
    file_path = os.path.join(json_folder, json_file)
    with open(file_path, 'r') as f:
        data = json.load(f)

    human_review = data.get('review', {}).get('human', {})
    rating = human_review.get('rating')
    comments = human_review.get('comments', '')

    # Check if both fields are empty/unfilled
    if (rating is None or rating == '') and (comments.strip() == ''):
        json_files.append(json_file)

# Track progress
file_index = 0
total_to_review = len(json_files)

print(f"📋 {total_to_review} files need review.")

📋 25 files need review.


In [26]:
def edit_json():
    global file_index

    if file_index >= total_to_review:
        clear_output()
        print("🎉 All files processed!")
        return

    json_file = json_files[file_index]
    file_path = os.path.join(json_folder, json_file)

    with open(file_path, 'r') as f:
        data = json.load(f)

    clear_output()
    print(f"--- Editing {json_file} ---")
    print(f"URL: {data.get('url', 'No URL')}\n")

    # Get existing (probably empty) human review data
    human_review = data.get('review', {}).get('human', {})
    existing_rating = human_review.get('rating', '')
    existing_comments = human_review.get('comments', '')

    # Input widgets
    rating_input = ipy.Text(
        value=str(existing_rating) if existing_rating is not None else '',
        placeholder='Enter rating',
        description='Rating:',
        disabled=False
    )
    comment_input = ipy.Textarea(
        value=existing_comments,
        placeholder='Enter comment',
        description='Comment:',
        layout={'width': '500px', 'height': '100px'},
        disabled=False
    )

    save_button = ipy.Button(description="💾 Save")
    next_button = ipy.Button(description="➡️ Next File")
    output = ipy.Output()
    counter_output = ipy.Output()

    # Save behavior
    def save_clicked(b):
        data.setdefault('review', {}).setdefault('human', {})
        data['review']['human']['rating'] = rating_input.value
        data['review']['human']['comments'] = comment_input.value

        with open(file_path, 'w') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)

        with output:
            clear_output()
            print(f"✅ Saved changes to {json_file}")

    # Next button behavior
    def next_clicked(b):
        global file_index
        file_index += 1
        edit_json()

    save_button.on_click(save_clicked)
    next_button.on_click(next_clicked)

    display(rating_input, comment_input, ipy.HBox([save_button, next_button]), output)

    # Show counter
    with counter_output:
        clear_output()
        print(f"🔢 Remaining files: {total_to_review - file_index - 1}")
    display(counter_output)

运行 Run

In [27]:
edit_json()

🎉 All files processed!


download existing so as to not lose

In [29]:
!zip -r final_site_data.zip site_data/site_data_google
from google.colab import files
files.download("final_site_data.zip")

  adding: site_data/site_data_google/ (stored 0%)
  adding: site_data/site_data_google/4261262117.json (deflated 49%)
  adding: site_data/site_data_google/6034589709.json (deflated 42%)
  adding: site_data/site_data_google/6057992841.json (deflated 47%)
  adding: site_data/site_data_google/3362896755.json (deflated 49%)
  adding: site_data/site_data_google/5622634388.json (deflated 50%)
  adding: site_data/site_data_google/6649034752.json (deflated 50%)
  adding: site_data/site_data_google/0042714234.json (deflated 52%)
  adding: site_data/site_data_google/3129068608.json (deflated 37%)
  adding: site_data/site_data_google/4903961193.json (deflated 45%)
  adding: site_data/site_data_google/7892482169.json (deflated 49%)
  adding: site_data/site_data_google/0287215483.json (deflated 48%)
  adding: site_data/site_data_google/8140771831.json (deflated 51%)
  adding: site_data/site_data_google/6274604123.json (deflated 42%)
  adding: site_data/site_data_google/4572251083.json (deflated 49%

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>