In [1]:
import os
import pandas as pd
import gpxpy
import xml.etree.ElementTree as ET
from datetime import datetime, timezone

In [2]:
def standardize_activity_type(activity_type):
    """Map a raw activity/sport label onto a canonical lower-case name.

    Known aliases are collapsed to 'cycling', 'running' or 'walking'.
    Any other label is returned stripped and lower-cased but otherwise
    unchanged (for example 'Swim' becomes 'swim').

    Args:
        activity_type: Label read from a GPX ``<type>`` element or a TCX
            ``Sport`` attribute. May be ``None`` or padded with whitespace.

    Returns:
        str: The canonical label, or 'unknown' when the input is ``None``
        or blank.
    """
    if activity_type is None:
        return 'unknown'

    # Labels taken from XML text nodes can carry stray whitespace/newlines.
    label = str(activity_type).strip().lower()
    if not label:
        return 'unknown'

    aliases = {
        'ride': 'cycling',
        'biking': 'cycling',
        'cycling': 'cycling',
        'run': 'running',
        'running': 'running',
        'walk': 'walking',
        'walking': 'walking',
    }
    return aliases.get(label, label)

In [3]:
def _gpx_time_to_utc(moment):
    """Return ``moment`` as a timezone-aware UTC datetime (``None`` passes through)."""
    if moment is None:
        return None
    if moment.tzinfo is None or moment.utcoffset() is None:
        # Naive value: label it as UTC, which is what this parser always did.
        return moment.replace(tzinfo=timezone.utc)
    # Aware value: convert instead of relabelling so a non-UTC offset is honoured.
    return moment.astimezone(timezone.utc)


def _gpx_point_heart_rate(point):
    """Return the heart rate found in a track point's extensions, or ``None``.

    The extensions are searched recursively for an element whose local tag
    name (namespace removed) is ``hr``; its text is returned as an int.
    """
    for extension in point.extensions or []:
        # iter() yields the extension element itself and all of its descendants,
        # so both <hr> directly under <extensions> and a nested
        # <TrackPointExtension><hr> layout are covered.
        for element in extension.iter():
            tag = element.tag
            if not isinstance(tag, str):
                # Comments / processing instructions have non-string tags.
                continue
            if tag.rsplit('}', 1)[-1] != 'hr' or not element.text:
                continue
            try:
                return int(float(element.text))
            except ValueError:
                continue
    return None


def parse_gpx(file_path):
    """Parse a GPX file into a DataFrame with one row per track point.

    The activity type is taken from the first track that declares a type.
    If no track does, the file is re-read with ElementTree and the first
    GPX 1.1 ``<trk><type>`` element is used. If that also fails the type
    stays 'unknown'.

    Args:
        file_path: Path of the GPX file to read.

    Returns:
        pandas.DataFrame: Columns 'timestamp' (UTC-aware datetime or None),
        'latitude', 'longitude', 'elevation', 'heart_rate' (int or None),
        'file_name' (base name of ``file_path``) and 'activity_type'.
        The frame is empty when the file has no track points.
    """
    with open(file_path, 'r') as gpx_file:
        gpx = gpxpy.parse(gpx_file)

    activity_type = 'unknown'

    # Try to extract the activity type from the parsed tracks first.
    for track in gpx.tracks:
        if track.type:
            activity_type = standardize_activity_type(track.type)
            break

    # If activity_type is still unknown, try parsing the XML directly.
    if activity_type == 'unknown':
        try:
            root = ET.parse(file_path).getroot()
            namespace = {'gpx': 'http://www.topografix.com/GPX/1/1'}
            type_elem = root.find('.//gpx:trk/gpx:type', namespace)
            if type_elem is not None and type_elem.text:
                activity_type = standardize_activity_type(type_elem.text)
        except ET.ParseError:
            print(f"Warning: Unable to parse XML in {file_path}")

    file_name = os.path.basename(file_path)
    data = []
    for track in gpx.tracks:
        for segment in track.segments:
            for point in segment.points:
                data.append({
                    'timestamp': _gpx_time_to_utc(point.time),
                    'latitude': point.latitude,
                    'longitude': point.longitude,
                    'elevation': point.elevation,
                    'heart_rate': _gpx_point_heart_rate(point),
                    'file_name': file_name,
                    'activity_type': activity_type
                })

    return pd.DataFrame(data)

In [33]:
def _tcx_time_to_utc(text):
    """Parse the text of a TCX ``<Time>`` element into a UTC-aware datetime."""
    cleaned = text.strip()
    # datetime.fromisoformat() only understands a trailing 'Z' from Python 3.11
    # on; spell the offset out so older interpreters accept it too.
    if cleaned.endswith(('Z', 'z')):
        cleaned = cleaned[:-1] + '+00:00'
    moment = datetime.fromisoformat(cleaned)
    if moment.tzinfo is None:
        # No offset given: label the value as UTC, as this parser always did.
        return moment.replace(tzinfo=timezone.utc)
    # Offset given: convert rather than relabel so the instant is preserved.
    return moment.astimezone(timezone.utc)


def parse_tcx(file_path):
    """Parse a TCX file into a DataFrame with one row per timed track point.

    Anything before the ``<?xml`` declaration (for example leading
    whitespace) is discarded before parsing. The activity type comes from
    the ``Sport`` attribute of the first ``Activity`` element, or stays
    'unknown' when there is none.

    Args:
        file_path: Path of the TCX file to read.

    Returns:
        pandas.DataFrame: Always has 'timestamp' (UTC-aware), 'file_name'
        and 'activity_type'. 'latitude', 'longitude', 'elevation' and
        'heart_rate' are present only for points that provide them.
        Track points without a usable ``<Time>`` are skipped.

    Raises:
        ValueError: If the file contains no XML declaration, or a
            timestamp/number cannot be parsed.
        xml.etree.ElementTree.ParseError: If the XML is malformed.
    """
    with open(file_path, 'r') as file:
        content = file.read()

    xml_start = content.find('<?xml')
    if xml_start == -1:
        raise ValueError("No XML declaration found in the file")

    root = ET.fromstring(content[xml_start:])

    namespace = {'ns': 'http://www.garmin.com/xmlschemas/TrainingCenterDatabase/v2'}

    activity_type = 'unknown'

    # Try to extract the activity type from the first Activity element.
    activity_elem = root.find('.//ns:Activity', namespace)
    if activity_elem is not None:
        activity_type = standardize_activity_type(activity_elem.get('Sport', 'unknown'))

    file_name = os.path.basename(file_path)
    data = []
    for trackpoint in root.findall('.//ns:Trackpoint', namespace):
        time = trackpoint.find('ns:Time', namespace)
        if time is None or not (time.text and time.text.strip()):
            continue

        point_data = {
            'timestamp': _tcx_time_to_utc(time.text),
            'file_name': file_name,
            'activity_type': activity_type
        }

        position = trackpoint.find('ns:Position', namespace)
        if position is not None:
            latitude = position.find('ns:LatitudeDegrees', namespace)
            longitude = position.find('ns:LongitudeDegrees', namespace)
            # Only record a position when both coordinates are actually present;
            # a half-filled <Position> must not abort the whole file.
            if (latitude is not None and longitude is not None
                    and latitude.text and longitude.text):
                point_data['latitude'] = float(latitude.text)
                point_data['longitude'] = float(longitude.text)

        elevation = trackpoint.find('ns:AltitudeMeters', namespace)
        if elevation is not None and elevation.text:
            point_data['elevation'] = float(elevation.text)

        heart_rate = trackpoint.find('.//ns:HeartRateBpm/ns:Value', namespace)
        if heart_rate is not None and heart_rate.text:
            # Go through float() so values written as '120.0' are accepted.
            point_data['heart_rate'] = int(float(heart_rate.text))

        data.append(point_data)

    return pd.DataFrame(data)

In [34]:
def process_all_files(directory='.'):
    """Parse every GPX and TCX file in a directory into one combined DataFrame.

    Files are visited in sorted name order and matched by extension
    (case-insensitive). A file that fails to parse is reported and skipped,
    so one bad file does not stop the run.

    Args:
        directory: Folder to scan. Defaults to the current working directory.

    Returns:
        pandas.DataFrame | None: All track points sorted by 'timestamp',
        or ``None`` when no file produced any rows.
    """
    all_data = []

    for file in sorted(os.listdir(directory)):
        extension = os.path.splitext(file)[1].lower()
        if extension == '.gpx':
            parser, label = parse_gpx, 'GPX'
        elif extension == '.tcx':
            parser, label = parse_tcx, 'TCX'
        else:
            continue

        try:
            df = parser(os.path.join(directory, file))
        except Exception as e:
            # Best effort by design: report the failure and move on.
            print(f"Error processing {label} file {file}: {e}")
            continue

        if df.empty:
            # An empty frame adds no rows; leaving it out keeps pd.concat from
            # having to reconcile a column-less frame with the real ones.
            print(f"No track points found in {label} file: {file}")
            continue

        all_data.append(df)
        print(f"Successfully processed {label} file: {file}")

    if not all_data:
        print(f"No usable GPX or TCX data found in directory: {directory}")
        return None

    combined_data = pd.concat(all_data, ignore_index=True)
    combined_data = combined_data.sort_values('timestamp')

    return combined_data

In [35]:
# Parse every activity file, print a short overview of the combined table,
# and export it to CSV.
all_activities_data = process_all_files()

if all_activities_data is None:
    print("No data to process.")
else:
    # Numeric overview of the combined table.
    print("\nCombined Data Summary:")
    print(all_activities_data.describe())

    # Each source file counts as one activity.
    activity_count = all_activities_data['file_name'].nunique()
    print("\nNumber of activities:", activity_count)

    timestamps = all_activities_data['timestamp']
    print("Date range:", timestamps.min().date(), "to", timestamps.max().date())

    # Row counts per activity type.
    print("\nActivity Types:")
    print(all_activities_data['activity_type'].value_counts())

    # Write the combined table to disk without the index column.
    all_activities_data.to_csv('all_activities_data.csv', index=False)
    print("\nAll data has been saved to 'all_activities_data.csv'")

Successfully processed TCX file: 10006096920.tcx
Successfully processed TCX file: 10071074570.tcx
Successfully processed TCX file: 10072554088.tcx
Successfully processed TCX file: 10075275124.tcx
Successfully processed TCX file: 10081995809.tcx
Successfully processed TCX file: 10092773149.tcx
Successfully processed TCX file: 10092773210.tcx
Successfully processed TCX file: 10118979612.tcx
Successfully processed TCX file: 10118979697.tcx
Successfully processed TCX file: 10118979773.tcx
Successfully processed TCX file: 10124686569.tcx
Successfully processed TCX file: 10124686608.tcx
Successfully processed TCX file: 10129599077.tcx
Successfully processed TCX file: 10131224403.tcx
Successfully processed TCX file: 10132062265.tcx
Successfully processed TCX file: 10140384350.tcx
Successfully processed TCX file: 10143559171.tcx
Successfully processed TCX file: 10155282539.tcx
Successfully processed TCX file: 10163154485.tcx
Successfully processed TCX file: 10165231865.tcx
Successfully process

  combined_data = pd.concat(all_data, ignore_index=True)



Combined Data Summary:
           latitude     longitude     heart_rate      elevation
count  1.507683e+06  1.507683e+06  435072.000000  581215.000000
mean   2.027047e+01 -7.632772e+01     114.594948    2577.884147
std    1.894437e+01  2.740995e+00      35.106919     245.260014
min    4.583551e+00 -7.976345e+01      43.000000    -149.000000
25%    4.699268e+00 -7.968901e+01      83.000000    2566.000000
50%    4.747527e+00 -7.409178e+01     114.000000    2581.000000
75%    4.354257e+01 -7.405813e+01     143.000000    2608.000000
max    4.363600e+01 -7.376197e+01     199.000000    3041.000000

Number of activities: 578
Date range: 2021-03-07 to 2024-07-26

Activity Types:
activity_type
cycling    1492528
walking      14785
running       1522
other          589
swim           186
Name: count, dtype: int64

All data has been saved to 'all_activities_data.csv'


In [36]:
# Report the files whose activity type could not be determined ('unknown').
# Each file is listed once, showing only its name and activity type, followed
# by the number of such files. The result is also written to its own CSV.

# Reload the combined table from the exported CSV.
all_activities_data = pd.read_csv('all_activities_data.csv')

# Keep the 'unknown' rows, then collapse them to the first row per file.
unknown_activities = (
    all_activities_data
    .loc[all_activities_data['activity_type'] == 'unknown']
    .drop_duplicates(subset='file_name', keep='first')
)

# Show the listing and the count.
print("\nUnknown Activities:")
print(unknown_activities.loc[:, ['file_name', 'activity_type']])
print("\nNumber of unknown activities:", len(unknown_activities))

unknown_activities.to_csv('unknown_activities.csv', index=False)




Unknown Activities:
Empty DataFrame
Columns: [file_name, activity_type]
Index: []

Number of unknown activities: 0
