In [48]:
import re

import numpy as np
import pandas as pd

# Read the cleaned swimmer results into a DataFrame
data_file = "../usaa_swim_data/swimmers_cleaned.csv"
df_cleaned = pd.read_csv(filepath_or_buffer=data_file)

In [49]:
# Keep only SCY events.
# na=False counts a missing Event as "not SCY", so the boolean mask never contains
# NaN (a NaN in the mask makes the row selection fail); regex=False matches the
# text "SCY" literally.
df_cleaned = df_cleaned[df_cleaned['Event'].str.contains("SCY", na=False, regex=False)]

# Remove Stroke_1, Stroke_2, Stroke_3 since only focusing on best event here.
# errors="ignore" lets this cell be re-run after the columns are already gone.
df_cleaned = df_cleaned.drop(columns=["Stroke_1", "Stroke_2", "Stroke_3"], errors="ignore")


In [50]:
# **1. Time Cut Achievements**
# Convert time cuts to ordinal values
time_cut_mapping = {
    "Olympic Trials": 24,
    "Summer Nationals": 19,
    "Winter Nationals": 19,
    "Winter US Open": 15,
    "Summer Juniors": 12,
    "Winter Juniors": 10,
    "Futures": 8,
    "AAAA": 7,
    "AAA": 6,
    "AA": 5,
    "A": 4,
    "BB": 3,
    "B": 2,
    "Slower than B": 1  # Default for those without a recognized time cut
}

# Category returned when no recognized time cut label is found
_DEFAULT_TIME_CUT_CATEGORY = "Slower than B"

# Matches any category name as a whole label: it may not be glued to another letter
# on either side, so the "A" in "Arena" or the first "A" of "AAA" is not an "A" cut.
# Alternatives are listed longest first so "Slower than B" is consumed as one label
# instead of being read as a "B" cut.
_TIME_CUT_PATTERN = re.compile(
    r"(?<![A-Za-z])(?:"
    + "|".join(re.escape(name) for name in sorted(time_cut_mapping, key=len, reverse=True))
    + r")(?![A-Za-z])"
)

# Position of each category in time_cut_mapping; a lower rank wins when a string
# contains more than one label (same precedence as iterating the dict in order).
_TIME_CUT_PRIORITY = {name: rank for rank, name in enumerate(time_cut_mapping)}


def extract_time_cut_category(time_cut):
    """Extracts the time cut category from the full time cut string.

    Parameters
    ----------
    time_cut : str or missing value
        Full time cut description, e.g. "2018 Summer Nationals (LCM)".

    Returns
    -------
    str
        A key of ``time_cut_mapping``. When several category labels occur in the
        string, the one listed first in ``time_cut_mapping`` is returned.
        "Slower than B" is returned for missing values, non-string values, and
        strings that contain no category label.
    """
    # NaN, None and any other non-string value carry no label to inspect
    if not isinstance(time_cut, str):
        return _DEFAULT_TIME_CUT_CATEGORY

    labels_found = _TIME_CUT_PATTERN.findall(time_cut)
    if not labels_found:
        return _DEFAULT_TIME_CUT_CATEGORY
    return min(labels_found, key=_TIME_CUT_PRIORITY.__getitem__)

# Label every swim with its time cut category, then translate the label to its ordinal score
time_cut_categories = df_cleaned["Time_cut"].apply(extract_time_cut_category)
df_cleaned["Time_cut_Category"] = time_cut_categories
df_cleaned["Time_cut_Score"] = time_cut_categories.map(time_cut_mapping)


# Verify
time_cut_preview_columns = ["Name", "Event", "Time_cut", "Time_cut_Category", "Time_cut_Score"]
print(df_cleaned[time_cut_preview_columns].head())


         Name      Event                     Time_cut Time_cut_Category  \
0  Alex Walsh  50 FR SCY  2018 Summer Nationals (LCM)  Summer Nationals   
1  Alex Walsh  50 FR SCY  2021 Summer Nationals (LCM)  Summer Nationals   
2  Alex Walsh  50 FR SCY  2019 Summer Nationals (LCM)  Summer Nationals   
3  Alex Walsh  50 FR SCY  2018 Summer Nationals (LCM)  Summer Nationals   
4  Alex Walsh  50 FR SCY  2018 Winter Nationals (LCM)  Winter Nationals   

   Time_cut_Score  
0              19  
1              19  
2              19  
3              19  
4              19  


In [51]:
# **2. Consistency Metrics via Standard Deviation**

# Spread of swim times within each (swimmer, event) group, broadcast back onto every row.
# The grouping uses Name and Event only, so swims at every age feed the same value.
time_spread = df_cleaned.groupby(["Name", "Event"])["Time"].transform("std")

# A group with a single race has no standard deviation (NaN); substitute the median
# of the row-level spreads, assigning the result rather than filling in place
median_spread = time_spread.median()
df_cleaned["Time_Std"] = time_spread.fillna(median_spread)

# Verify
print(df_cleaned[["Name", "Event", "Time", "Time_Std"]].head())

         Name      Event   Time  Time_Std
0  Alex Walsh  50 FR SCY  22.08  3.449292
1  Alex Walsh  50 FR SCY  22.18  3.449292
2  Alex Walsh  50 FR SCY  22.24  3.449292
3  Alex Walsh  50 FR SCY  22.28  3.449292
4  Alex Walsh  50 FR SCY  22.35  3.449292


In [52]:
# Split each event name into a stroke code and a numeric distance
event_names = df_cleaned["Event"]
df_cleaned["Stroke"] = event_names.str.extract(r"(FR|BK|BR|FL|IM)", expand=False)
distance_text = event_names.str.extract(r"(\d+)", expand=False)
df_cleaned["Distance"] = distance_text.astype(float)

# Show every row where the stroke or the distance could not be extracted
unparsed_rows = df_cleaned[["Stroke", "Distance"]].isna().any(axis=1)
print(df_cleaned[unparsed_rows])

Empty DataFrame
Columns: [Name, Event, Time, Age_at_time_of_Swim, Time_cut, Time_cut_points, Swim_date, Specialty_1, Specialty_2, Time_cut_Category, Time_cut_Score, Time_Std, Stroke, Distance]
Index: []


In [53]:
# COMPUTE SPECIALIZATION SCORES

# Step 1: Highest time cut score the swimmer reached at this age, across all of their events
df_cleaned["Best_Time_Cut_at_Age"] = df_cleaned.groupby(["Name", "Age_at_time_of_Swim"])["Time_cut_Score"].transform("max")


# Step 2: Specialization score = this swim's score relative to the swimmer's best score at that age
df_cleaned["Specialization_Score"] = df_cleaned["Time_cut_Score"] / df_cleaned["Best_Time_Cut_at_Age"]

# Step 3: Replace undefined ratios (NaN, e.g. from 0 / 0 or a missing score) with 0
df_cleaned["Specialization_Score"] = df_cleaned["Specialization_Score"].fillna(0)

# Verify results.
# random_state pins the sample so a re-run shows the same rows; capping n at the
# frame length keeps the check from raising on frames with fewer than 10 rows.
specialization_preview_columns = ["Name", "Age_at_time_of_Swim", "Event", "Time_cut_Score", "Best_Time_Cut_at_Age", "Specialization_Score"]
print(df_cleaned[specialization_preview_columns].sample(n=min(10, len(df_cleaned)), random_state=0))



                    Name  Age_at_time_of_Swim       Event  Time_cut_Score  \
38756       Gabi Albiero                   12  200 IM SCY               6   
89845      Paige McKenna                   11   50 BK SCY               3   
55643     Emily Lundgren                   10  200 FR SCY               7   
631           Alex Walsh                   15  200 IM SCY               7   
11360        Isabel Ivey                   11   50 BK SCY               4   
65901  Katharine Berkoff                   10  200 IM SCY               5   
85094      Abby McCulloh                   12  400 IM SCY               6   
88256     Erica Sullivan                    9  200 FR SCY               3   
59779       Joleigh Crye                   10   50 BK SCY               4   
68996       Catie Choate                   19   50 BK SCY               2   

       Best_Time_Cut_at_Age  Specialization_Score  
38756                     7              0.857143  
89845                     6              0.50000

In [54]:
# Identify the Best Event for Each Swimmer at Each Age

swimmer_age_keys = ["Name", "Age_at_time_of_Swim"]

# Row label of the highest-scoring swim in every (swimmer, age) group
idx_best_event = df_cleaned.groupby(swimmer_age_keys)["Time_cut_Score"].idxmax()

# One row per (swimmer, age) holding the event of that swim; the column is exposed as
# "Best_Event" so the per-row "Event" column survives the merge unchanged
df_best_event = (
    df_cleaned.loc[idx_best_event, swimmer_age_keys + ["Event"]]
    .rename(columns={"Event": "Best_Event"})
)

# Attach Best_Event to every swim of the same swimmer at the same age
df_cleaned = df_cleaned.merge(df_best_event, how="left", on=swimmer_age_keys)

In [55]:
# **4. Career Length**
# Span between the lowest and highest recorded age of each swimmer
ages_by_swimmer = df_cleaned.groupby("Name")["Age_at_time_of_Swim"]
df_cleaned["Career_Length"] = ages_by_swimmer.transform("max") - ages_by_swimmer.transform("min")


In [56]:
# **Final Processing & Cleanup**

# Columns carried forward into the modeling dataset; everything else is left behind
feature_columns = [
    "Name",
    "Event",
    "Time",
    "Age_at_time_of_Swim",
    "Time_cut",
    "Time_cut_Score",
    "Best_Event",
    "Time_Std",
    "Specialization_Score",
    "Career_Length",
    "Specialty_1",
    "Specialty_2",
]
df_features = df_cleaned[feature_columns]

# Write the selected features to disk for modeling (row index omitted)
df_features.to_csv("processed_swim_features.csv", index=False)

# Summary statistics of the numeric feature columns
feature_summary = df_features.describe()
print(feature_summary)

               Time  Age_at_time_of_Swim  Time_cut_Score      Time_Std  \
count  59672.000000         59672.000000    59672.000000  59672.000000   
mean     110.547549            13.979756        7.750436     11.418234   
std      131.509100             3.813050        5.161538      8.357733   
min       20.370000             4.000000        1.000000      0.000000   
25%       50.420000            11.000000        4.000000      6.673417   
50%       67.200000            14.000000        6.000000      9.403317   
75%      125.300000            17.000000        8.000000     13.274142   
max     1681.600000            34.000000       19.000000    119.252015   

       Specialization_Score  Career_Length  
count          59672.000000   59672.000000  
mean               0.681622      13.821122  
std                0.260934       2.391696  
min                0.052632       1.000000  
25%                0.500000      13.000000  
50%                0.714286      14.000000  
75%               