<a href="https://colab.research.google.com/github/jfmalloy1/UltraMarathon_Prediction/blob/main/UltraRunning_Predictions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Cleaning Data

In [1]:
# prompt: Read in a csv file in the same google folder as this notebook

import pandas as pd
from google.colab import drive
drive.mount('/content/drive')

# Assuming the CSV file is named 'your_file.csv'
# Replace 'your_file.csv' with the actual filename if different
try:
  df = pd.read_csv('/content/drive/MyDrive/Tech Learning/ML/Ultrarunning Predictions/TWO_CENTURIES_OF_UM_RACES.csv')
  print(df.head()) # Display the first few rows to confirm
except FileNotFoundError:
  print("Error: 'TWO_CENTURIES_OF_UM_RACES.csv' not found in the current directory.")
except pd.errors.ParserError:
  print("Error: Could not parse the CSV file. Please check its format.")
except Exception as e:
  print(f"An unexpected error occurred: {e}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


  df = pd.read_csv('/content/drive/MyDrive/Tech Learning/ML/Ultrarunning Predictions/TWO_CENTURIES_OF_UM_RACES.csv')


   Year of event Event dates           Event name Event distance/length  \
0           2018  06.01.2018  Selva Costera (CHI)                  50km   
1           2018  06.01.2018  Selva Costera (CHI)                  50km   
2           2018  06.01.2018  Selva Costera (CHI)                  50km   
3           2018  06.01.2018  Selva Costera (CHI)                  50km   
4           2018  06.01.2018  Selva Costera (CHI)                  50km   

   Event number of finishers Athlete performance        Athlete club  \
0                         22           4:51:39 h               Tnfrc   
1                         22           5:15:45 h  Roberto Echeverría   
2                         22           5:16:44 h   Puro Trail Osorno   
3                         22           5:34:13 h            Columbia   
4                         22           5:54:14 h      Baguales Trail   

  Athlete country  Athlete year of birth Athlete gender Athlete age category  \
0             CHI                 19

In [2]:
# Only predicting on hours times, not distances
df = df[df['Athlete performance'].str.endswith('h', na=False)]

### Do some conversions
# Convert date fields (TODO: replace/drop NaT dates)
df['Event dates'] = pd.to_datetime(df['Event dates'], format='%m.%d.%Y', errors='coerce')

# Convert distance to numeric (removing 'km' and converting to float)
df['Event distance/length'] = df['Event distance/length'].str.replace(r'\D', '', regex=True).astype(float)

# Convert 'Athlete performance' to minutes
df['Performance_minutes'] = pd.to_timedelta(
    df['Athlete performance'].str.extract(r'(\d+:\d+:\d+)')[0]
).dt.total_seconds() / 60

In [3]:
### Add some new features

# Athlete age at start of race, not just category
df['Athlete age'] = df['Year of event'] - df['Athlete year of birth']

# TODO: Any others?

In [4]:
df.fillna({'Athlete club': 'Unknown'}, inplace=True)
df.dropna(subset=["Athlete country", "Athlete gender", "Athlete average speed"], inplace=True)

# TODO: check this, drop/fill others

In [5]:
### Encoding & transformations
from sklearn.preprocessing import LabelEncoder, StandardScaler

label_encoder = LabelEncoder()
df['Athlete gender'] = label_encoder.fit_transform(df['Athlete gender'])
df['Athlete country'] = label_encoder.fit_transform(df['Athlete country'])
df['Athlete age category'] = label_encoder.fit_transform(df['Athlete age category'])

In [6]:
### More new features
df['Day'] = df['Event dates'].dt.day
df['Month'] = df['Event dates'].dt.month

In [7]:
### Drop useless columns

columns_to_drop = ['Athlete performance', 'Event name', 'Athlete club', 'Athlete ID', 'Event dates'] #
df.drop(columns=columns_to_drop, inplace=True)

In [8]:
### TODO: be smarter about dropping missing values
df.dropna(inplace=True)
print(len(df))

1795570


In [9]:
# prompt: convert all unique values of the "Athlete average speed" column into floats

# Convert 'Athlete average speed' to numeric, coercing errors to NaN
df['Athlete average speed'] = pd.to_numeric(df['Athlete average speed'], errors='coerce')

# Display unique values after conversion
print(df['Athlete average speed'].unique())

[  10.286    9.501    9.472 ... 6690.    5954.    5767.   ]


## Random Forest Model

Testing some random forest regressions

In [10]:
from sklearn.model_selection import train_test_split

# Define features and target
X = df.drop(columns=['Performance_minutes'])  # Replace 'Target_Column' with your target variable
y = df['Performance_minutes']

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')



In [11]:
from sklearn.ensemble import RandomForestRegressor

# Initialize the model
model = RandomForestRegressor(n_estimators=100, n_jobs=-1, max_depth=15, bootstrap=False, random_state=42)  # For regression

# Train the model
model.fit(X_train, y_train)

In [12]:
from sklearn.metrics import mean_squared_error, r2_score

# Predict on the test set
y_pred = model.predict(X_test)

# Regression metrics
print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
print("R² Score:", r2_score(y_test, y_pred))

Mean Squared Error: 2510.447338475981
R² Score: 0.9493306590313311


In [None]:
import shap
import matplotlib.pyplot as plt

# Initialize SHAP explainer
explainer = shap.TreeExplainer(model)

# Calculate SHAP values for the test set
shap_values = explainer.shap_values(X_test)

# Summary plot of feature importance
shap.summary_plot(shap_values, X_test)

# Force plot for a single prediction
shap.force_plot(explainer.expected_value, shap_values[0], X_test.iloc[0])

In [None]:
import numpy as np

# Get feature importance
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]

# Plot feature importance
plt.figure(figsize=(10, 6))
plt.title("Feature Importances")
plt.bar(range(X.shape[1]), importances[indices], align="center")
plt.xticks(range(X.shape[1]), X.columns[indices], rotation=90)
plt.tight_layout()
plt.show()