In [2]:
import os
import gpxpy
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
from haversine import haversine, Unit

In [3]:
# Directory containing GPX files
gpx_dir = "data/gpx_files"
output_dir = "data/gpx_parsed"

# Ensure the output directory exists
os.makedirs(output_dir, exist_ok=True)

# Function to calculate haversine distance
def haversine_distance(lat1, lon1, lat2, lon2):
    return haversine((lat1, lon1), (lat2, lon2), unit=Unit.METERS)

# Function to parse metadata using ElementTree
def parse_metadata(file_path):
    tree = ET.parse(file_path)
    root = tree.getroot()
    namespace = {"default": "http://www.topografix.com/GPX/1/1"}

    # Extract link
    link = root.find("default:metadata/default:author/default:link", namespace)
    link = link.attrib["href"] if link is not None else "No link"

    return link

#Avoids haversine package
def fast_haversine_np(df):
    lat = np.radians(df["latitude"].to_numpy())
    lon = np.radians(df["longitude"].to_numpy())
    
    dlat = lat[1:] - lat[:-1]
    dlon = lon[1:] - lon[:-1]

    a = np.sin(dlat / 2.0) ** 2 + np.cos(lat[:-1]) * np.cos(lat[1:]) * np.sin(dlon / 2.0) ** 2
    c = 2 * np.arcsin(np.sqrt(a))
    R = 6371000  # Earth radius in meters
    dist = R * c

    return np.insert(dist, 0, 0)

# Function to parse a single GPX file and calculate derived fields
def parse_gpx_file(file_path):
    
    # Extract the file name (without extension) as the name
    name = os.path.splitext(os.path.basename(file_path))[0]
    # Parse metadata
    link = parse_metadata(file_path)

    # Parse track points using gpxpy
    with open(file_path, 'r') as gpx_file:
        gpx = gpxpy.parse(gpx_file)

    route_info = []
    for track in gpx.tracks:
        for segment in track.segments:
            for point in segment.points:
                route_info.append({
                    "latitude": point.latitude,
                    "longitude": point.longitude,
                    "elevation": point.elevation,
                    "time": point.time,
                    "name": name,
                    "link": link
                })

    # Convert to DataFrame
    route_df = pd.DataFrame(route_info)

    
    route_df["distance"] = fast_haversine_np(route_df)
    # Calculate elevation difference
    route_df['elevation_diff'] = route_df['elevation'].diff().fillna(0)

    # Calculate cumulative elevation and cumulative distance
    route_df['cum_elevation'] = route_df['elevation_diff'].cumsum()
    route_df['cum_distance'] = route_df['distance'].cumsum()

    return route_df




In [28]:
# Iterate over all GPX files in the directory
for file_name in os.listdir(gpx_dir):
    if file_name.endswith("2019 SKODA Tour de Luxembourg Stage 4.gpx"):
        file_path = os.path.join(gpx_dir, file_name)
        print(f"Processing {file_name}...")

        # Parse the GPX file and calculate derived fields
        df = parse_gpx_file(file_path)

        # Export to CSV
        output_file = os.path.join(output_dir, f"{os.path.splitext(file_name)[0]}.csv")
        df.to_csv(output_file, index=False)
        print(f"Saved to {output_file}")

print("All GPX files have been processed.")

Processing 2019 SKODA Tour de Luxembourg Stage 4.gpx...
Saved to data/gpx_parsed\2019 SKODA Tour de Luxembourg Stage 4.csv
All GPX files have been processed.


In [4]:
# Iterate over all GPX files in the directory
for file_name in os.listdir(gpx_dir):
    if file_name.endswith(".gpx"):
        file_path = os.path.join(gpx_dir, file_name)

        # Check if the corresponding CSV file already exists
        output_file = os.path.join(output_dir, f"{os.path.splitext(file_name)[0]}.csv")
        if os.path.exists(output_file):
            print(f"Skipping {file_name}, already processed.")
            continue  # Skip the file if it's already processed

        print(f"Processing {file_name}...")

        # Parse the GPX file and calculate derived fields
        df = parse_gpx_file(file_path)

        # Export to CSV
        output_file = os.path.join(output_dir, f"{os.path.splitext(file_name)[0]}.csv")
        df.to_csv(output_file, index=False)
        print(f"Saved to {output_file}")

print("All GPX files have been processed.")

Skipping 2017 4 Jours de Dunkerque .gpx, already processed.
Skipping 2017 99a Coppa Bernocchi - 42o GP BPM.gpx, already processed.
Skipping 2017 Abu Dhabi Tour Stage 1.gpx, already processed.
Skipping 2017 Abu Dhabi Tour Stage 2.gpx, already processed.
Skipping 2017 Abu Dhabi Tour Stage 3.gpx, already processed.
Skipping 2017 Abu Dhabi Tour Stage 4.gpx, already processed.
Skipping 2017 ACC Asian Road Championships - ITT.gpx, already processed.
Skipping 2017 ACC Asian Road Championships - TTT.gpx, already processed.
Skipping 2017 ACC Asian Road Championships.gpx, already processed.
Skipping 2017 Amgen Tour of California Stage 1.gpx, already processed.
Skipping 2017 Amgen Tour of California Stage 2.gpx, already processed.
Skipping 2017 Amgen Tour of California Stage 3.gpx, already processed.
Skipping 2017 Amgen Tour of California Stage 4.gpx, already processed.
Skipping 2017 Amgen Tour of California Stage 5.gpx, already processed.
Skipping 2017 Amgen Tour of California Stage 6.gpx, alrea

In [None]:
import os
import time
import gpxpy
import pandas as pd
import numpy as np
from haversine import haversine, Unit
import cProfile
import pstats

# Directory containing GPX files
gpx_dir = "data/gpx_files"
output_dir = "data/gpx_parsed"

# Ensure the output directory exists
os.makedirs(output_dir, exist_ok=True)

# Function to calculate haversine distance
def haversine_distance(lat1, lon1, lat2, lon2):
    return haversine((lat1, lon1), (lat2, lon2), unit=Unit.METERS)

# Function to parse metadata using ElementTree
def parse_metadata(file_path):
    tree = ET.parse(file_path)
    root = tree.getroot()
    namespace = {"default": "http://www.topografix.com/GPX/1/1"}

    # Extract link
    link = root.find("default:metadata/default:author/default:link", namespace)
    link = link.attrib["href"] if link is not None else "No link"

    return link

# Avoids haversine package
def fast_haversine_np(df):
    lat = np.radians(df["latitude"].to_numpy())
    lon = np.radians(df["longitude"].to_numpy())
    
    dlat = lat[1:] - lat[:-1]
    dlon = lon[1:] - lon[:-1]

    a = np.sin(dlat / 2.0) ** 2 + np.cos(lat[:-1]) * np.cos(lat[1:]) * np.sin(dlon / 2.0) ** 2
    c = 2 * np.arcsin(np.sqrt(a))
    R = 6371000  # Earth radius in meters
    dist = R * c

    return np.insert(dist, 0, 0)

# Function to parse a single GPX file and calculate derived fields
def parse_gpx_file(file_path):
    
    # Extract the file name (without extension) as the name
    name = os.path.splitext(os.path.basename(file_path))[0]
    # Parse metadata
    link = parse_metadata(file_path)

    # Parse track points using gpxpy
    with open(file_path, 'r') as gpx_file:
        gpx = gpxpy.parse(gpx_file)

    route_info = []
    for track in gpx.tracks:
        for segment in track.segments:
            for point in segment.points:
                route_info.append({
                    "latitude": point.latitude,
                    "longitude": point.longitude,
                    "elevation": point.elevation,
                    "time": point.time,
                    "name": name,
                    "link": link
                })

    # Convert to DataFrame
    route_df = pd.DataFrame(route_info)

    # Profile the Haversine calculation and cumsum
    start_time = time.time()
    route_df["distance"] = fast_haversine_np(route_df)
    route_df['cum_distance'] = route_df['distance'].cumsum()
    elapsed_time = time.time() - start_time
    print(f"Time for Haversine and cumsum: {elapsed_time:.4f} seconds")

    return route_df

# # Example of profiling function
# def profile_script():
#     for file_path in os.listdir(gpx_dir)[:10]:  # Process first 10 GPX files
#         file_path = os.path.join(gpx_dir, file_path)
#         if file_path.endswith('.gpx'):
#             print(f"Processing {file_path}...")
#             parse_gpx_file(file_path)

# # Start profiling
# cProfile.run('profile_script()', 'profile_output.stats')

# # Print profiling results
# stats = pstats.Stats('profile_output.stats')
# stats.strip_dirs()
# stats.sort_stats('cumtime')  # Sort by cumulative time
# stats.print_stats(10)  # Print top 10 functions by cumulative time


Processing data/gpx_files\2017 4 Jours de Dunkerque .gpx...
Time for Haversine and cumsum: 0.0048 seconds
Processing data/gpx_files\2017 99a Coppa Bernocchi - 42o GP BPM.gpx...
Time for Haversine and cumsum: 0.0136 seconds
Processing data/gpx_files\2017 Abu Dhabi Tour Stage 1.gpx...
Time for Haversine and cumsum: 0.0000 seconds
Processing data/gpx_files\2017 Abu Dhabi Tour Stage 2.gpx...
Time for Haversine and cumsum: 0.0000 seconds
Processing data/gpx_files\2017 Abu Dhabi Tour Stage 3.gpx...
Time for Haversine and cumsum: 0.0000 seconds
Processing data/gpx_files\2017 Abu Dhabi Tour Stage 4.gpx...
Time for Haversine and cumsum: 0.0000 seconds
Processing data/gpx_files\2017 ACC Asian Road Championships - ITT.gpx...
Time for Haversine and cumsum: 0.0000 seconds
Processing data/gpx_files\2017 ACC Asian Road Championships - TTT.gpx...
Time for Haversine and cumsum: 0.0000 seconds
Processing data/gpx_files\2017 ACC Asian Road Championships.gpx...
Time for Haversine and cumsum: 0.0000 second

<pstats.Stats at 0x277813e9160>