In [36]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import seaborn as sns

In [37]:
# =========== Load & Preprocess Data ===========
data_path = "../feature_engineering/processed_swim_features.csv"
df = pd.read_csv(data_path)

# Filter only pre-18 data
df_pre18 = df[df["Age_at_time_of_Swim"] < 18].copy()

# Remove non-standard, rarely swum events
events_to_remove = ["50 BR SCY", "50 BK SCY", "50 FL SCY", "100 IM SCY", "400 FR LCM"]
df_pre18 = df_pre18[~df_pre18["Event"].isin(events_to_remove)]

print(f"Total unique events after removal: {df_pre18['Event'].nunique()}")


Total unique events after removal: 14


In [38]:
# =========== Feature Engineering: Determine Best Event Per Swimmer ===========
# Ensure there are no missing specialization scores
df_pre18 = df_pre18.dropna(subset=["Specialization_Score"])

# Identify the event with the highest specialization score per swimmer
best_event_per_swimmer = df_pre18.loc[df_pre18.groupby("Name")["Specialization_Score"].idxmax(), ["Name", "Event"]]

# Rename column to "Best_Event"
best_event_per_swimmer.rename(columns={"Event": "Best_Event"}, inplace=True)

# Ensure df_pre18 does not already contain "Best_Event" before merging
if "Best_Event" in df_pre18.columns:
    df_pre18 = df_pre18.drop(columns=["Best_Event"])

print("Merging Best_Event into df_pre18...")
# Merge "Best_Event" back into df_pre18
df_pre18 = df_pre18.merge(best_event_per_swimmer, on="Name", how="left")


Merging Best_Event into df_pre18...


In [39]:
# Ensure "Best_Event" has no missing values by assigning Specialty_1 as a fallback
df_pre18["Best_Event"] = df_pre18["Best_Event"].fillna(df_pre18["Specialty_1"])

# Avoid redundant merging—this dataset already has "Best_Event"
swimmer_df = df_pre18.copy()  # This now contains the correct "Best_Event" values

print(f"Updated dataset shape: {swimmer_df.shape}")
print("Sample of Best_Event assignments:\n", swimmer_df[["Name", "Best_Event", "Specialty_1"]].head())


Updated dataset shape: (38453, 12)
Sample of Best_Event assignments:
          Name Best_Event Specialty_1
0  Alex Walsh  50 FR SCY      200 IM
1  Alex Walsh  50 FR SCY      200 IM
2  Alex Walsh  50 FR SCY      200 IM
3  Alex Walsh  50 FR SCY      200 IM
4  Alex Walsh  50 FR SCY      200 IM


In [40]:
# =========== Feature & Target Selection ===========
# Define feature columns (exclude swimmer name and event columns)
exclude_cols = ["Name", "Best_Event", "Event", "Specialty_1", "Specialty_2"]
feature_cols = [col for col in swimmer_df.columns if col not in exclude_cols]
target_col = "Best_Event"

X = swimmer_df[feature_cols]
y = swimmer_df[target_col]

# Encode target variable (convert events into numerical labels)
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


ValueError: could not convert string to float: '2018 Summer Nationals (LCM)'

In [None]:
# =========== Train & Evaluate Model Using Cross-Validation ===========
# Define k-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Initialize classifier
rf_classifier = RandomForestClassifier(n_estimators=200, random_state=42)

# Perform cross-validation
cv_scores = cross_val_score(rf_classifier, X_scaled, y_encoded, cv=kf, scoring="accuracy")

# Train final model on full dataset
rf_classifier.fit(X_scaled, y_encoded)

# Display cross-validation accuracy
print(f"\nCross-Validation Accuracy: {np.mean(cv_scores):.4f}")
