In [1]:
import os
import glob
import numpy as np
import pandas as pd
from scipy.interpolate import interp1d
from tqdm import tqdm


In [3]:
def smooth_elevation(df, spacing=50):
    """Resample an elevation profile onto a uniform distance grid.

    No filtering is applied: elevation is linearly interpolated every
    ``spacing`` units of cumulative distance, so that tracks recorded with
    different point densities become comparable.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain numeric ``'cum_distance'`` and ``'elevation'`` columns.
    spacing : float, default 50
        Step of the output grid, in the units of ``cum_distance``. Must be > 0.

    Returns
    -------
    pandas.DataFrame
        Columns ``'cum_distance'`` (uniform grid starting at the first kept
        distance; the last kept distance itself is excluded) and
        ``'elevation'`` (interpolated values). Both columns are float.

    Raises
    ------
    ValueError
        If ``spacing`` is not positive, if fewer than two usable points remain,
        if the usable points span less than ``spacing``, or if the resulting
        grid has fewer than two points.
    """
    if spacing <= 0:
        raise ValueError("spacing must be positive")

    cum_distance = np.asarray(df['cum_distance'], dtype=float)
    elevation = np.asarray(df['elevation'], dtype=float)

    # Step 0: drop samples with a missing/infinite coordinate; a single NaN
    # elevation would otherwise leak into every grid point next to it.
    finite = np.isfinite(cum_distance) & np.isfinite(elevation)
    cum_distance = cum_distance[finite]
    elevation = elevation[finite]

    # Step 1: keep only points that lie strictly beyond every earlier point.
    # Comparing against the running maximum (rather than just the previous
    # sample) removes repeated distances *and* backtracking, so the x-values
    # handed to the interpolator are guaranteed to be strictly increasing.
    if cum_distance.size:
        running_max = np.maximum.accumulate(cum_distance)
        keep = np.ones(cum_distance.size, dtype=bool)
        keep[1:] = cum_distance[1:] > running_max[:-1]
        cum_distance = cum_distance[keep]
        elevation = elevation[keep]

    if len(cum_distance) < 2 or cum_distance[-1] - cum_distance[0] < spacing:
        raise ValueError("Not enough cumulative distance to resample")

    # Step 2: interpolate elevation at uniform spacing
    new_distances = np.arange(cum_distance[0], cum_distance[-1], spacing)
    if len(new_distances) < 2:
        raise ValueError("Resampling resulted in too few points")

    interp_func = interp1d(cum_distance, elevation, kind='linear', fill_value="extrapolate")
    new_elevation = interp_func(new_distances)

    return pd.DataFrame({'cum_distance': new_distances, 'elevation': new_elevation})



In [4]:
def compute_profile_features(df, gradient_threshold_pct=3.0):
    """Summarise an elevation profile as a flat dict of scalar features.

    Parameters
    ----------
    df : pandas.DataFrame
        Profile with ``'cum_distance'`` and ``'elevation'`` columns and at
        least two rows (typically the output of ``smooth_elevation``).
    gradient_threshold_pct : float, default 3.0
        A segment counts as climbing when its gradient is above this value
        and as descending when it is below the negated value. Expected to be
        non-negative.

    Returns
    -------
    dict
        ``total_distance_m`` (last cumulative distance), ``elevation_gain_m``,
        ``elevation_loss_m`` (reported as a positive number), mean / max / std
        of the per-segment gradient in percent, mean gradient of the climbing
        and descending segments (0 when there are none), and the share of
        segments that are climbing / descending.

    Raises
    ------
    ValueError
        If ``df`` is empty or has fewer than two rows.
    """
    if df.empty or len(df) < 2:
        raise ValueError("Insufficient data after smoothing")

    elevation = df['elevation'].values
    cum_distance = df['cum_distance'].values

    delta_elevation = np.diff(elevation)
    delta_distance = np.diff(cum_distance)

    # Guard against division by zero for repeated distances. With two or more
    # rows np.diff always yields at least one segment, so no emptiness check
    # is needed afterwards.
    delta_distance = np.where(delta_distance == 0, 1e-6, delta_distance)
    gradients = (delta_elevation / delta_distance) * 100  # %

    climbing_sections = gradients[gradients > gradient_threshold_pct]
    descending_sections = gradients[gradients < -gradient_threshold_pct]
    n_segments = len(gradients)

    return {
        "total_distance_m": cum_distance[-1],
        "elevation_gain_m": np.sum(np.clip(delta_elevation, 0, None)),
        "elevation_loss_m": -np.sum(np.clip(delta_elevation, None, 0)),
        "avg_gradient_pct": np.mean(gradients),
        "max_gradient_pct": np.max(gradients),
        "std_gradient_pct": np.std(gradients),
        "avg_uphill_gradient_pct": np.mean(climbing_sections) if len(climbing_sections) else 0,
        "avg_downhill_gradient_pct": np.mean(descending_sections) if len(descending_sections) else 0,
        "climbing_proportion": len(climbing_sections) / n_segments,
        "descending_proportion": len(descending_sections) / n_segments,
    }



In [5]:
# Directory containing the parsed GPX tracks, one .csv per race
data_dir = "data/gpx_parsed"
# Columns every track file must provide
required_cols = {'elevation', 'cum_distance', 'name'}
results = []
skipped = []

# Sort the file list so the row order of the feature table is reproducible:
# os.listdir() returns entries in arbitrary order.
csv_files = sorted(f for f in os.listdir(data_dir) if f.endswith(".csv"))

for file in tqdm(csv_files):
    try:
        path = os.path.join(data_dir, file)
        df = pd.read_csv(path)

        if not required_cols.issubset(df.columns):
            raise ValueError("Missing required columns")

        if len(df) < 10:
            raise ValueError("Too few points")

        smoothed = smooth_elevation(df)
        features = compute_profile_features(smoothed)
        # Normalised race name; used later as the join key
        features['name'] = df['name'].iloc[0].strip().lower()
        results.append(features)

    except Exception as e:
        # Best-effort batch run: one unreadable or degenerate track must not
        # abort the whole pass, so record the reason and move on.
        skipped.append((file, str(e)))

# Convert to DataFrame
feature_df = pd.DataFrame(results)

# Save to the path the later `pd.read_csv('data/race_profile_features.csv')`
# cell loads, so that a top-to-bottom run picks up this run's output.
feature_df.to_csv("data/race_profile_features.csv", index=False)

# Print summary
print(f"\n✅ Processed {len(results)} races")
print(f"⚠️ Skipped {len(skipped)} files")
if skipped:
    print("\nSome skipped files:")
    for f, reason in skipped[:10]:
        print(f"- {f}: {reason}")

  9%|▉         | 757/8093 [00:09<01:34, 77.30it/s]


KeyboardInterrupt: 

In [6]:
# Load the per-race course table
structured_course_data = pd.read_csv('data/structured_course_data.csv')

# The raw file uses human-readable headers ('Race Name', 'Distance', 'Street',
# 'Cobblestones', ...) rather than snake_case feature names.

# Print the headers so the drop/rename cells below can be checked against the actual schema
print(structured_course_data.columns)



Index(['Unnamed: 0', 'Race Name', 'Distance', 'Street', 'Road', 'Paved',
       'Asphalt', 'Path', 'Cycleway', 'Unpaved', 'State Road', 'Cobblestones',
       'Unknown', 'Compacted Gravel', 'Off-grid (unknown)', 'Singletrack',
       'Access Road', 'Alpine', 'Net Gain', 'Lowest Elevation',
       'Highest Elevation', 'Vertical Gain', 'Downhill'],
      dtype='object')


In [7]:
# Discard the course table's own elevation summaries (presumably because the
# profile features computed from the tracks cover elevation - confirm).
# errors='ignore' makes this cell idempotent: re-running it once the columns
# are gone is a no-op instead of a KeyError.
structured_course_data = structured_course_data.drop(
    columns=['Net Gain', 'Lowest Elevation', 'Highest Elevation', 'Vertical Gain', 'Downhill'],
    errors='ignore',
)



In [23]:
# NOTE(review): the saved output of this cell lists the renamed pct_* columns, so it
# was last executed after the rename cell further down; on a top-to-bottom run it
# prints the original headers instead.
print(structured_course_data.columns)

Index(['Unnamed: 0', 'Race Name', 'pct_distance', 'pct_street', 'pct_road',
       'pct_paved', 'pct_asphalt', 'pct_path', 'pct_cycleway', 'pct_unpaved',
       'pct_state_road', 'pct_cobblestones', 'pct_unknown',
       'pct_compacted_gravel', 'pct_off_grid', 'pct_singletrack',
       'pct_access_road', 'pct_alpine'],
      dtype='object')


In [8]:
# Give the surface / way-type columns consistent snake_case names with a `pct_` prefix.
# Labels that are not present in the frame (e.g. 'Dirt') are skipped by rename().
# NOTE(review): the values shown by head() look like absolute lengths
# (street + road adds up to distance) rather than percentages - confirm the prefix.
structured_course_data = structured_course_data.rename(
    columns=dict([
        ('Distance', 'pct_distance'),
        ('Street', 'pct_street'),
        ('Road', 'pct_road'),
        ('Paved', 'pct_paved'),
        ('Asphalt', 'pct_asphalt'),
        ('Path', 'pct_path'),
        ('Cycleway', 'pct_cycleway'),
        ('Unpaved', 'pct_unpaved'),
        ('State Road', 'pct_state_road'),
        ('Cobblestones', 'pct_cobblestones'),
        ('Unknown', 'pct_unknown'),
        ('Dirt', 'pct_dirt'),
        ('Compacted Gravel', 'pct_compacted_gravel'),
        ('Off-grid (unknown)', 'pct_off_grid'),
        ('Singletrack', 'pct_singletrack'),
        ('Access Road', 'pct_access_road'),
        ('Alpine', 'pct_alpine'),
        # Extend with further (old, new) pairs as the schema grows
    ])
)


In [9]:
# Normalise race names exactly like the feature-extraction loop does
# (`.strip().lower()`), so both tables join on identical keys; lower-casing
# alone leaves names with stray surrounding whitespace unmatched.
structured_course_data['Race Name'] = (
    structured_course_data['Race Name'].str.strip().str.lower()
)
structured_course_data.head()

Unnamed: 0.1,Unnamed: 0,Race Name,pct_distance,pct_street,pct_road,pct_paved,pct_asphalt,pct_path,pct_cycleway,pct_unpaved,pct_state_road,pct_cobblestones,pct_unknown,pct_compacted_gravel,pct_off_grid,pct_singletrack,pct_access_road,pct_alpine
0,0,2022 a travers les hauts de france,117.44,2.44,115.0,11.5,106.0,,,,,,,,,,,
1,1,2022 ag tour de la semois stage 1,110.56,1.56,109.0,14.4,96.1,,,,,,,,,,,
2,2,2022 ag tour de la semois stage 2,113.26,5.55,101.0,11.6,101.0,2.68,4.03,1.02,,,,,,,,
3,3,2022 ain bugey valromey tour stage 1,104.283,,88.5,3.45,101.0,,0.983,,14.8,,,,,,,
4,4,2022 ain bugey valromey tour stage 2,98.06,,96.1,,98.0,,,,1.96,,,,,,,


In [10]:
# Reload the per-race profile features from disk; this rebinds `feature_df`
# regardless of what the extraction loop left in memory.
feature_df = pd.read_csv('data/race_profile_features.csv')

In [11]:
# Left-join the course table onto the profile features. The keys differ by
# frame: `name` on the feature side, `Race Name` on the course side. Races
# without a course-table match keep their row, with NaN in the course columns.
merged_df = feature_df.merge(
    structured_course_data,
    how='left',
    left_on='name',
    right_on='Race Name',
)

merged_df.head()

Unnamed: 0,total_distance_m,elevation_gain_m,elevation_loss_m,avg_gradient_pct,max_gradient_pct,std_gradient_pct,avg_uphill_gradient_pct,avg_downhill_gradient_pct,climbing_proportion,descending_proportion,...,pct_cycleway,pct_unpaved,pct_state_road,pct_cobblestones,pct_unknown,pct_compacted_gravel,pct_off_grid,pct_singletrack,pct_access_road,pct_alpine
0,192200.0,704.090843,662.090843,0.021852,10.135114,1.311374,4.119957,-4.275384,0.020552,0.01717,...,,,,,,,,,,
1,197500.0,2117.482644,2128.482644,-0.00557,23.214817,3.936218,7.88391,-6.331164,0.103291,0.115696,...,2.67,,5.58,0.122,0.366,,0.366,,,
2,146800.0,530.212581,536.212581,-0.004087,15.3428,1.469611,5.034483,-4.858354,0.026226,0.027929,...,,,122.0,,,,,,,
3,114850.0,129.460974,134.460974,-0.004354,6.701216,0.627047,3.93747,-3.853359,0.003483,0.004354,...,,,,,,,,,,
4,149800.0,1148.663969,399.059374,0.500404,20.583697,2.463394,8.250595,-5.135312,0.061749,0.008011,...,,,106.0,0.18,1.12,,1.08,,0.309,


In [12]:
# Inspect the course table's 'Unnamed: 0' column (presumably a saved row index);
# after the left join, NaN here marks a feature row with no course-table match.
merged_df['Unnamed: 0']

0          NaN
1       1862.0
2       1863.0
3       1864.0
4       1865.0
         ...  
8087    1856.0
8088    1857.0
8089    1858.0
8090    1859.0
8091    1860.0
Name: Unnamed: 0, Length: 8092, dtype: float64

In [13]:
# List the merged schema: profile features, the two join keys, then the course-table columns
print(merged_df.columns)

Index(['total_distance_m', 'elevation_gain_m', 'elevation_loss_m',
       'avg_gradient_pct', 'max_gradient_pct', 'std_gradient_pct',
       'avg_uphill_gradient_pct', 'avg_downhill_gradient_pct',
       'climbing_proportion', 'descending_proportion', 'name', 'Unnamed: 0',
       'Race Name', 'pct_distance', 'pct_street', 'pct_road', 'pct_paved',
       'pct_asphalt', 'pct_path', 'pct_cycleway', 'pct_unpaved',
       'pct_state_road', 'pct_cobblestones', 'pct_unknown',
       'pct_compacted_gravel', 'pct_off_grid', 'pct_singletrack',
       'pct_access_road', 'pct_alpine'],
      dtype='object')


In [14]:
from sklearn.preprocessing import StandardScaler

# Numeric inputs to standardise. The label/key columns ('name', 'Race Name',
# 'Unnamed: 0') are deliberately left out.
feature_columns = [
    # elevation-profile features
    'total_distance_m',
    'elevation_gain_m',
    'elevation_loss_m',
    'avg_gradient_pct',
    'max_gradient_pct',
    'std_gradient_pct',
    'avg_uphill_gradient_pct',
    'avg_downhill_gradient_pct',
    'climbing_proportion',
    'descending_proportion',
    # course-table columns
    'pct_distance',
    'pct_street',
    'pct_road',
    'pct_paved',
    'pct_asphalt',
    'pct_path',
    'pct_cycleway',
    'pct_unpaved',
    'pct_state_road',
    'pct_cobblestones',
    'pct_unknown',
    'pct_compacted_gravel',
    'pct_off_grid',
    'pct_singletrack',
    'pct_access_road',
    'pct_alpine',
]

# Standardise every feature column
scaler = StandardScaler()
scaled_features = scaler.fit_transform(merged_df[feature_columns])

# Wrap the scaled matrix in a frame and re-attach the race label (aligned on row index)
normalized_df = pd.DataFrame(scaled_features, columns=feature_columns).assign(
    name=merged_df['name']
)


In [15]:
# Replace the remaining NaNs with 0. The columns were standardised with the default
# StandardScaler just above, so 0 is each column's mean: this is mean imputation.
# NOTE(review): for the surface columns a missing value presumably means "none of this
# surface"; if so, filling with 0 *before* scaling would be the faithful encoding - confirm.
normalized_df_filled = normalized_df.fillna(0)

In [16]:
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Split the imputed, standardised table (normalized_df_filled) into labels and numeric matrix
race_names = normalized_df_filled['name'].values
feature_matrix = normalized_df_filled.drop(columns=['name']).values

# Race-by-race cosine similarity
cos_sim_matrix = cosine_similarity(feature_matrix)

# Label both axes with the race names so a row can be looked up by name
cos_sim_df = pd.DataFrame(data=cos_sim_matrix, columns=race_names, index=race_names)


In [17]:
def get_top_similar_races(race_name, top_n=5, similarity_df=None):
    """Return the races most similar to ``race_name``.

    Parameters
    ----------
    race_name : str
        Label to look up in the similarity matrix index.
    top_n : int, default 5
        Number of neighbours to return.
    similarity_df : pandas.DataFrame, optional
        Square similarity matrix labelled with race names on both axes.
        Defaults to the notebook-level ``cos_sim_df``.

    Returns
    -------
    pandas.Series or list
        Similarities sorted in descending order, with ``race_name`` itself
        removed, truncated to ``top_n`` entries. An empty list (after printing
        a message) when the race is unknown.
    """
    sim = cos_sim_df if similarity_df is None else similarity_df

    if race_name not in sim.index:
        print("Race not found.")
        return []

    row = sim.loc[race_name]
    if isinstance(row, pd.DataFrame):
        # The label occurs more than once, so .loc returned one row per
        # occurrence; use the first so the result is still a single Series.
        row = row.iloc[0]

    # Drop the self-similarity entry (all entries carrying this label)
    similarities = row.drop(race_name)
    return similarities.sort_values(ascending=False).head(top_n)

# Example usage: the five nearest neighbours (default top_n) of one cobbled classic
get_top_similar_races("2023 paris-roubaix")


2019 paris-roubaix                              0.998301
2018 paris - roubaix                            0.990967
2018 paris - roubaix espoirs                    0.960066
2023 paris-roubaix espoirs                      0.919108
2023 grand prix de denain - porte du hainaut    0.913616
Name: 2023 paris-roubaix, dtype: float64

In [18]:
# Second example: nearest neighbours of a Tour de France stage
get_top_similar_races("2022 tour de france stage 1")

2021 tour de suisse stage 1            0.995163
2019 uae tour stage 1                  0.991430
2018 boels ladies tour stage 5         0.976560
2020 visegrad 4 juniors stage 2-1      0.961724
2022 epz omloop van borsele stage 3    0.958771
Name: 2022 tour de france stage 1, dtype: float64

In [19]:
import umap
from sklearn.decomposition import PCA
import plotly.express as px
import pandas as pd
from sklearn.cluster import KMeans

# Numeric design matrix: every column of normalized_df_filled except the 'name' label
feature_cols = normalized_df_filled.columns.drop('name')
X = normalized_df_filled[feature_cols].values

# 1. PCA: Reduce to 2D
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

# 2. UMAP: Reduce to 2D (seeded for a repeatable layout)
umap_model = umap.UMAP(n_components=2, random_state=42)
X_umap = umap_model.fit_transform(X)

# 3. Clustering (e.g., 5 clusters)
# Note: k-means is fit on the full feature matrix X, not on either 2-D embedding;
# the embeddings are only used to draw the clusters.
kmeans = KMeans(n_clusters=5, random_state=42)
clusters = kmeans.fit_predict(X)




  from .autonotebook import tqdm as notebook_tqdm
  warn(


In [20]:
# 4. Build DataFrame for plotting
hover_feature_cols = ['total_distance_m', 'elevation_gain_m', 'avg_gradient_pct', 'climbing_proportion']

plot_df = pd.DataFrame({
    'race_name': normalized_df_filled['name'],
    'cluster': clusters,
    'PCA_1': X_pca[:, 0],
    'PCA_2': X_pca[:, 1],
    'UMAP_1': X_umap[:, 0],
    'UMAP_2': X_umap[:, 1]
})

# Add features to hover info. Take them from merged_df (original units) rather
# than normalized_df_filled: the standardised table holds z-scores, which would
# be shown under names such as 'total_distance_m'. normalized_df was built
# row-by-row from merged_df, so the two are aligned positionally.
assert len(merged_df) == len(plot_df), "merged_df and plot_df must have the same rows"
for col in hover_feature_cols:
    if col in merged_df.columns:
        plot_df[col] = merged_df[col].to_numpy()

# Only request hover columns that were actually added above, so a missing
# feature degrades the tooltip instead of making px.scatter raise.
hover_cols = ['race_name'] + [col for col in hover_feature_cols if col in plot_df.columns]


def _cluster_scatter(x_col, y_col, title):
    """Scatter one 2-D embedding of the races, coloured by cluster."""
    return px.scatter(
        plot_df,
        x=x_col, y=y_col,
        color='cluster',
        hover_data=hover_cols,
        title=title,
        opacity=0.8
    )


# 5. Plot PCA
fig_pca = _cluster_scatter('PCA_1', 'PCA_2', "PCA (Race Clusters)")

# 6. Plot UMAP
fig_umap = _cluster_scatter('UMAP_1', 'UMAP_2', "UMAP (Race Clusters)")



In [None]:
# Make sure the target folder exists; writing into a missing directory fails
os.makedirs("outputs", exist_ok=True)

# Save PCA plot to HTML
fig_pca.write_html("outputs/pca_plot.html")

# Save UMAP plot to HTML
fig_umap.write_html("outputs/umap_plot.html")

In [21]:
import plotly.express as px
import matplotlib.pyplot as plt
import numpy as np
import io
import base64
import pandas as pd

# 1. Helper function to generate elevation profile plot
def generate_elevation_profile(distance, elevation):
    """Render an elevation profile and return it as a PNG data URI.

    Parameters
    ----------
    distance : array-like
        X-values of the profile (axis is labelled in metres).
    elevation : array-like
        Y-values of the profile, same length as ``distance``.

    Returns
    -------
    str
        ``"data:image/png;base64,<payload>"`` holding the rendered figure.
    """
    fig, ax = plt.subplots()
    try:
        ax.plot(distance, elevation, color='blue')
        ax.set_xlabel('Distance (m)')
        ax.set_ylabel('Elevation (m)')
        ax.set_title('Elevation Profile')

        # Render into an in-memory buffer instead of a file on disk
        buffer = io.BytesIO()
        fig.savefig(buffer, format='png')
    finally:
        # Always release the figure, even if plotting or saving raised;
        # otherwise figures pile up when this is applied row by row.
        plt.close(fig)

    img_str = base64.b64encode(buffer.getvalue()).decode('utf-8')
    return f"data:image/png;base64,{img_str}"

# 2. Example dataframe with race names, distance, and elevation
# Replace this with your actual DataFrame
# Each 'distance' / 'elevation' cell holds a whole array (one profile per row).
# Note: this rebinds the notebook-level name `df`.
data = {
    'race_name': ['Race 1', 'Race 2', 'Race 3'],
    'distance': [
        np.array([0, 5000, 10000, 15000, 20000]),
        np.array([0, 4000, 8000, 12000, 16000]),
        np.array([0, 6000, 12000, 18000, 24000]),
    ],
    'elevation': [
        np.array([0, 50, 100, 200, 250]),
        np.array([0, 30, 70, 120, 150]),
        np.array([0, 60, 120, 180, 240]),
    ]
}
df = pd.DataFrame(data)

# 3. Generate elevation profiles for each race and encode them
# (one base64 PNG data URI per row, stored as a string column)
df['elevation_profile'] = df.apply(lambda row: generate_elevation_profile(row['distance'], row['elevation']), axis=1)

# 4. Create plotly scatter plot
# NOTE(review): x and y point at array-valued columns, whereas a scatter normally
# takes one scalar per row - confirm this renders the intended one-point-per-race plot.
fig = px.scatter(
    df,
    x='distance', y='elevation',
    hover_name='race_name',
    hover_data={'distance': False, 'elevation': False},  # Don't display these columns in the hover
    title="Race Elevation Profiles"
)

# 5. Add the elevation profile images to the hover tooltips
# NOTE(review): Plotly hover labels presumably render only a small subset of HTML
# tags, so the <img> tag may not display as an image - confirm in the browser.
fig.update_traces(
    hovertemplate=(
        "<b>%{hovertext}</b><br>"  # Race name
        "<img src='%{customdata}' width='200' height='150'><br>"  # Elevation profile image
        "<extra></extra>"  # Hides the extra information (like trace name)
    ),
    customdata=df['elevation_profile']
)

# 6. Show the plot
fig.show()
