In [4]:
import re
import json

# Read the file as raw text
with open("endomondoHR.json", "r") as file:
    raw_content = file.read()

# Step 1: Replace single quotes with double quotes
fixed_content = raw_content.replace("'", '"')

# Step 2: Remove trailing commas if present
fixed_content = re.sub(r",\s*]", "]", fixed_content)  # Trailing commas in lists
fixed_content = re.sub(r",\s*}", "}", fixed_content)  # Trailing commas in dicts

# Optional: Validate fixed content
try:
    data = json.loads(fixed_content)
    print("JSON loaded successfully!")
except json.JSONDecodeError as e:
    print(f"Error fixing JSON: {e}")
    with open("debug_output.json", "w") as debug_file:
        debug_file.write(fixed_content)  # Save for debugging


Error fixing JSON: Extra data: line 2 column 1 (char 35542)


In [9]:
import re
import json

def preprocess_json(file_path, output_path):
    with open(file_path, "r") as file:
        content = file.read()

    # Step 1: Replace single quotes with double quotes
    content = content.replace("'", '"')

    # Step 2: Fix missing commas between objects
    # Add commas between closing and opening braces if they are missing
    content = re.sub(r'}\s*{', r'},{', content)

    # Step 3: Wrap the objects in a list if they are not already in one
    if not content.strip().startswith("["):
        content = f"[{content}]"

    # Step 4: Remove trailing commas inside lists and objects
    content = re.sub(r",\s*]", "]", content)  # Trailing commas in lists
    content = re.sub(r",\s*}", "}", content)  # Trailing commas in dicts

    # Step 5: Save the fixed JSON for validation
    with open(output_path, "w") as outfile:
        outfile.write(content)

    # Step 6: Validate the fixed JSON
    try:
        data = json.loads(content)
        print(f"Successfully fixed and loaded JSON! Contains {len(data)} objects.")
    except json.JSONDecodeError as e:
        print(f"JSON validation failed: {e}")
        with open("debug_output.json", "w") as debug_file:
            debug_file.write(content)  # Save for manual debugging

# Run the function
preprocess_json("endomondoHR.json", "fixed_endomondoHR.json")


JSON validation failed: Expecting ',' delimiter: line 1 column 1829765123 (char 1829765122)


In [10]:
import re
import json

def extract_first_object(file_path, output_path):
    with open(file_path, "r") as file:
        content = file.read()
    
    # Regular expression to match the first JSON object
    match = re.search(r'\{.*?\}', content, re.DOTALL)
    if match:
        first_object = match.group(0)
        try:
            # Validate the extracted object
            parsed_object = json.loads(first_object.replace("'", '"'))  # Replace single quotes if present
            print("Successfully extracted the first JSON object!")
            with open(output_path, "w") as outfile:
                json.dump(parsed_object, outfile, indent=4)
        except json.JSONDecodeError as e:
            print(f"Error parsing the extracted JSON object: {e}")
    else:
        print("No valid JSON object found.")

# Example usage
extract_first_object("endomondoHR.json", "first_object.json")


Successfully extracted the first JSON object!


In [4]:
import re
import json

def extract_first_5_users_with_run(file_path, output_prefix, max_users=20, max_instances_per_user=10, min_runs_per_user=10):
    # Open the file and read its content
    with open(file_path, "r") as file:
        content = file.read()
    
    # Regular expression to match any {} block containing the word "run"
    pattern = r'\{[^{}]*\brun\b[^{}]*\}'
    
    # Find all matches
    matches = re.findall(pattern, content, re.DOTALL)
    
    if not matches:
        print("No matches containing the word 'run' found in the file.")
        return
    
    print(f"Found {len(matches)} matches containing the word 'run'.")

    # Dictionary to track runs for each user
    user_run_count = {}

    for match in matches:
        try:
            # Parse JSON object
            json_object = json.loads(match.replace("'", '"'))  # Replace single quotes if present
            
            # Extract userId
            user_id = json_object.get("userId")
            if not user_id:
                print("No userId found in match.")
                continue
            
            # Initialize or update user run count
            if user_id not in user_run_count:
                user_run_count[user_id] = []
            
            user_run_count[user_id].append(json_object)
        except json.JSONDecodeError as e:
            print(f"Error parsing JSON: {e}")
            continue

    # Filter users with at least min_runs_per_user
    eligible_users = {user_id: runs for user_id, runs in user_run_count.items() if len(runs) >= min_runs_per_user}
    
    if not eligible_users:
        print(f"No users found with at least {min_runs_per_user} runs.")
        return
    
    # Restrict to max_users
    selected_users = list(eligible_users.items())[:max_users]

    # Export runs for each user
    for user_id, runs in selected_users:
        for i, run in enumerate(runs[:max_instances_per_user]):
            output_path = f"user_{user_id}_run_{i + 1}.json"
            with open(output_path, "w") as outfile:
                json.dump(run, outfile, indent=4)
            print(f"Exported instance {i + 1} containing 'run' for user {user_id} to {output_path}")

# Specify the path to your JSON file
file_path = "endomondoHR.json"
output_prefix = "run_keyword_activity"
extract_first_5_users_with_run(file_path, output_prefix, max_users=20, max_instances_per_user=10, min_runs_per_user=10)



Found 33314 matches containing the word 'run'.
Exported instance 1 containing 'run' for user 4969375 to user_4969375_run_1.json
Exported instance 2 containing 'run' for user 4969375 to user_4969375_run_2.json
Exported instance 3 containing 'run' for user 4969375 to user_4969375_run_3.json
Exported instance 4 containing 'run' for user 4969375 to user_4969375_run_4.json
Exported instance 5 containing 'run' for user 4969375 to user_4969375_run_5.json
Exported instance 6 containing 'run' for user 4969375 to user_4969375_run_6.json
Exported instance 7 containing 'run' for user 4969375 to user_4969375_run_7.json
Exported instance 8 containing 'run' for user 4969375 to user_4969375_run_8.json
Exported instance 9 containing 'run' for user 4969375 to user_4969375_run_9.json
Exported instance 10 containing 'run' for user 4969375 to user_4969375_run_10.json
Exported instance 1 containing 'run' for user 279317 to user_279317_run_1.json
Exported instance 2 containing 'run' for user 279317 to user_2

In [1]:
import re
import json
import math
from datetime import datetime, timedelta
from collections import defaultdict

# Haversine formula to calculate distance between two GPS points
def haversine(lat1, lon1, lat2, lon2):
    R = 6371000  # Earth radius in meters
    lat1, lon1, lat2, lon2 = map(math.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = math.sin(dlat / 2)**2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2)**2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    return R * c

# Calculate pace (min/mile) and total distance (miles) for a single run
def calculate_pace_and_distance(latitudes, longitudes, timestamps):
    total_distance = 0
    total_time = 0

    for i in range(1, len(latitudes)):
        distance = haversine(latitudes[i - 1], longitudes[i - 1], latitudes[i], longitudes[i])
        total_distance += distance
        time_diff = timestamps[i] - timestamps[i - 1]
        total_time += time_diff

    if total_time == 0 or total_distance == 0:
        return float('inf'), 0  # Infinite pace, no movement

    pace_minutes_per_mile = (total_time / 60) / (total_distance / 1609.34)
    return pace_minutes_per_mile, total_distance / 1609.34

# Process endomondoHR.json to calculate metrics for each user with runs
def process_endomondoHR(file_path, min_runs_per_user=10):
    # Open the file and read its content
    with open(file_path, "r") as file:
        content = file.read()
    
    # Regular expression to match any {} block containing the word "run"
    pattern = r'\{[^{}]*\brun\b[^{}]*\}'
    
    # Find all matches
    matches = re.findall(pattern, content, re.DOTALL)
    
    if not matches:
        print("No matches containing the word 'run' found in the file.")
        return
    
    print(f"Found {len(matches)} matches containing the word 'run'.")

    # Dictionary to track runs for each user
    user_data = defaultdict(lambda: {
        "total_heart_rate": 0,
        "heart_rate_count": 0,
        "paces": [],
        "distances": [],
        "weekly_distances": defaultdict(float)  # Keyed by week (year-week number)
    })

    for match in matches:
        try:
            # Parse JSON object
            json_object = json.loads(match.replace("'", '"'))  # Replace single quotes if present
            
            # Extract userId and validate
            user_id = json_object.get("userId")
            if not user_id or json_object.get("sport") != "run":
                continue
            
            # Extract heart rate
            if "heart_rate" in json_object:
                user_data[user_id]["total_heart_rate"] += sum(json_object["heart_rate"])
                user_data[user_id]["heart_rate_count"] += len(json_object["heart_rate"])

            # Calculate pace and distance
            if "latitude" in json_object and "longitude" in json_object and "timestamp" in json_object:
                latitudes = json_object["latitude"]
                longitudes = json_object["longitude"]
                timestamps = json_object["timestamp"]

                # Calculate pace and distance for this run
                pace, distance = calculate_pace_and_distance(latitudes, longitudes, timestamps)
                user_data[user_id]["paces"].append(pace)
                user_data[user_id]["distances"].append(distance)

                # Calculate weekly distance
                start_date = datetime.utcfromtimestamp(timestamps[0])
                year_week = (start_date.year, start_date.isocalendar()[1])
                user_data[user_id]["weekly_distances"][year_week] += distance
        except json.JSONDecodeError as e:
            print(f"Error parsing JSON: {e}")
            continue

    # Summarize metrics for each user with sufficient runs
    for user_id, metrics in user_data.items():
        if len(metrics["distances"]) < min_runs_per_user:
            continue

        average_heart_rate = (
            metrics["total_heart_rate"] / metrics["heart_rate_count"]
            if metrics["heart_rate_count"] > 0 else None
        )
        average_pace = (
            sum(metrics["paces"]) / len(metrics["paces"])
            if metrics["paces"] else None
        )
        fastest_pace = min(metrics["paces"]) if metrics["paces"] else None
        max_distance = max(metrics["distances"]) if metrics["distances"] else None
        max_weekly_distance = max(metrics["weekly_distances"].values()) if metrics["weekly_distances"] else None

        print(f"User ID: {user_id}")
        print(f"  - Average Heart Rate: {average_heart_rate}")
        print(f"  - Average Pace: {average_pace} min/mile")
        print(f"  - Fastest Pace: {fastest_pace} min/mile")
        print(f"  - Maximum Distance in One Run: {max_distance} miles")
        print(f"  - Maximum Distance in One Week: {max_weekly_distance} miles\n")

# Specify the path to the endomondoHR.json file
file_path = "endomondoHR.json"

# Process the file and calculate metrics
process_endomondoHR(file_path, min_runs_per_user=10)


Found 33314 matches containing the word 'run'.
User ID: 4969375
  - Average Heart Rate: 138.447718744416
  - Average Pace: 8.625013203218181 min/mile
  - Fastest Pace: 6.255239308832042 min/mile
  - Maximum Distance in One Run: 28.258671562824528 miles
  - Maximum Distance in One Week: 58.62512136140417 miles

User ID: 279317
  - Average Heart Rate: 144.4903110130336
  - Average Pace: 12.236860526429103 min/mile
  - Fastest Pace: 8.003499745196315 min/mile
  - Maximum Distance in One Run: 9.794875663192567 miles
  - Maximum Distance in One Week: 24.428693389867163 miles

User ID: 3905196
  - Average Heart Rate: 143.41849721771794
  - Average Pace: 8.11220294652807 min/mile
  - Fastest Pace: 6.40066368950364 min/mile
  - Maximum Distance in One Run: 20.27813823704967 miles
  - Maximum Distance in One Week: 31.407363889329183 miles

User ID: 854368
  - Average Heart Rate: 157.70769673073764
  - Average Pace: inf min/mile
  - Fastest Pace: 3.367712005176652 min/mile
  - Maximum Distance i