In [114]:
import warnings
warnings.filterwarnings('ignore')

In [115]:
import pandas as pd
import numpy as np
# import seaborn as sns
# import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error

In [116]:
# Load data with specified file paths
fielding = pd.read_csv('Fielding.csv')
batting = pd.read_csv('Batting.csv')
pitching = pd.read_csv('Pitching.csv')
savant_pitch_data = pd.read_csv('savant_pitch_data.csv')
players = pd.read_csv('People.csv', encoding='latin1')

# Ensure 'playerID' is of the same type (string) across all dataframes
fielding['playerID'] = fielding['playerID'].astype(str)
batting['playerID'] = batting['playerID'].astype(str)
pitching['playerID'] = pitching['playerID'].astype(str)
savant_pitch_data['player_id'] = savant_pitch_data['player_id'].astype(str)

# Rename 'player_id' in Savant data to match 'playerID'
savant_pitch_data.rename(columns={'player_id': 'playerID'}, inplace=True)

# Filter relevant columns for each dataset
# Pitching-specific columns
pitching_cols = [
    'playerID', 'yearID', 'teamID', 'stint', 'W', 'L', 'G', 'GS', 'CG', 'SHO', 'SV', 
    'IPouts', 'H', 'ER', 'HR', 'BB', 'SO', 'BAOpp', 'ERA', 'WP', 'BK', 'BFP', 'GF'
]

# Combine batting and fielding columns for position players
batting_cols = [
    'playerID', 'yearID', 'teamID', 'stint', 'G', 'AB', 'R', 'H', '2B', '3B', 'HR', 
    'RBI', 'SB', 'CS', 'BB', 'SO', 'IBB', 'HBP', 'SH', 'SF', 'GIDP'
]
fielding_cols = [
    'playerID', 'yearID', 'teamID', 'stint', 'POS', 'G', 'GS', 'InnOuts', 'PO', 'A', 
    'E', 'DP', 'PB', 'WP', 'SB', 'CS', 'ZR'
]

# Filter each dataset to keep only relevant columns
pitching = pitching[pitching_cols]
batting = batting[[col for col in batting_cols if col in batting.columns]]
fielding = fielding[[col for col in fielding_cols if col in fielding.columns]]

# Merge batting and fielding data to form a combined position player dataset
position_players = pd.merge(batting, fielding, on=['playerID', 'yearID', 'teamID', 'stint', 'G'], how='outer')

# Add birth year for age calculation to both datasets
players = players[['playerID', 'birthYear']]

# Merge position player stats with player info to calculate age
position_players = pd.merge(position_players, players, on='playerID', how='left')
position_players['age'] = position_players['yearID'] - position_players['birthYear']
position_players.drop(columns=['birthYear'], inplace=True)

position_players.to_csv('position_players_dataframe.csv')

# Merge pitching stats with player info to calculate age
pitchers = pd.merge(pitching, players, on='playerID', how='left')
pitchers['age'] = pitchers['yearID'] - pitchers['birthYear']
pitchers.drop(columns=['birthYear'], inplace=True)

pitchers.to_csv('pitchers_dataframe.csv')

# Feature Selection for Position Players
if not position_players.empty:
    # Fill NaN values with 0 or mean to avoid dropping columns
    X_pos = position_players.select_dtypes(include=[np.number]).fillna(0)  # Fills NaNs with 0

    # Perform feature selection with a random target
    selector_pos = SelectKBest(score_func=f_regression, k='all')
    selector_pos.fit(X_pos, np.random.rand(len(X_pos)))  # Use random target as placeholder
    selected_features_pos = X_pos.columns[selector_pos.get_support(indices=True)]

    print("\nPosition Players - Selected Features:")
    print(selected_features_pos)

    # Plot correlation matrix of selected features for position players
    # plt.figure(figsize=(10, 8))
    # sns.heatmap(X_pos[selected_features_pos].corr(), annot=False, cmap="coolwarm", cbar=True)
    # plt.title("Correlation Matrix of Selected Features for Position Players")
    # plt.show()


# Feature Selection for Pitchers
if not pitchers.empty:
    # Fill NaN values with 0 to avoid dropping columns
    X_pitch = pitchers.select_dtypes(include=[np.number]).fillna(0)

    # Perform feature selection with a random target
    selector_pitch = SelectKBest(score_func=f_regression, k='all')
    selector_pitch.fit(X_pitch, np.random.rand(len(X_pitch)))  # Use random target as placeholder
    selected_features_pitch = X_pitch.columns[selector_pitch.get_support(indices=True)]

    print("\nPitchers - Selected Features:")
    print(selected_features_pitch)

    # Plot correlation matrix of selected features for pitchers
    # plt.figure(figsize=(10, 8))
    # sns.heatmap(X_pitch[selected_features_pitch].corr(), annot=False, cmap="coolwarm", cbar=True)
    # plt.title("Correlation Matrix of Selected Features for Pitchers")
    # plt.show()


Position Players - Selected Features:
Index(['yearID', 'stint', 'G', 'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI', 'SB_x',
       'CS_x', 'BB', 'SO', 'IBB', 'HBP', 'SH', 'SF', 'GIDP', 'GS', 'InnOuts',
       'PO', 'A', 'E', 'DP', 'PB', 'WP', 'SB_y', 'CS_y', 'ZR', 'age'],
      dtype='object')

Pitchers - Selected Features:
Index(['yearID', 'stint', 'W', 'L', 'G', 'GS', 'CG', 'SHO', 'SV', 'IPouts',
       'H', 'ER', 'HR', 'BB', 'SO', 'BAOpp', 'ERA', 'WP', 'BK', 'BFP', 'GF',
       'age'],
      dtype='object')


In [117]:
position_players.head(10)

Unnamed: 0,playerID,yearID,teamID,stint,G,AB,R,H,2B,3B,...,PO,A,E,DP,PB,WP,SB_y,CS_y,ZR,age
0,aardsda01,2004,SFN,1,11,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,,,,,,23.0
1,aardsda01,2006,CHN,1,45,2.0,0.0,0.0,0.0,0.0,...,1.0,5.0,0.0,1.0,,,,,,25.0
2,aardsda01,2007,CHA,1,25,0.0,0.0,0.0,0.0,0.0,...,2.0,4.0,1.0,0.0,,,,,,26.0
3,aardsda01,2008,BOS,1,47,1.0,0.0,0.0,0.0,0.0,...,3.0,6.0,0.0,0.0,,,,,,27.0
4,aardsda01,2009,SEA,1,73,0.0,0.0,0.0,0.0,0.0,...,2.0,5.0,0.0,1.0,,,,,,28.0
5,aardsda01,2010,SEA,1,53,0.0,0.0,0.0,0.0,0.0,...,2.0,3.0,1.0,0.0,,,,,,29.0
6,aardsda01,2012,NYA,1,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,,,,,,31.0
7,aardsda01,2013,NYN,1,43,0.0,0.0,0.0,0.0,0.0,...,1.0,5.0,0.0,0.0,,,,,,32.0
8,aardsda01,2015,ATL,1,33,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,,,,,,34.0
9,aaronha01,1954,ML1,1,116,,,,,,...,223.0,5.0,7.0,0.0,,,,,,20.0


In [118]:
#im taking out stint, yearid, G, and age from selected features
#we will use age later to set decline events
#selected_features_field = ['AB','R', 'H', 'RBI', 'BB', 'SO', 'SF', 'PO', 'A', 'E']

## remaking the datasets - i dont think we need this?

In [120]:
# s_data = pd.merge(fielding, batting, on=['playerID', 'yearID'], how='outer')
# s_data = pd.merge(s_data, pitching, on=['playerID', 'yearID'], how='outer')
# savant_pitch_data.rename(columns={'player_id': 'playerID'}, inplace=True)
# s_data = pd.merge(s_data, savant_pitch_data, on='playerID', how='left')
# s_data = pd.merge(s_data, players[['playerID', 'birthYear']], on='playerID', how='left')

# s_data['age'] = s_data['yearID'] - s_data['birthYear']

# print(s_data.columns)
# field_players = s_data[s_data.get('HBP', pd.Series(index=s_data.index)).notna()]
# pitchers = s_data[s_data.get('ERA', pd.Series(index=s_data.index)).notna()]
# field_players['OBP'] = (field_players['H_x'] + field_players['BB_x'] + field_players['HBP']) / (
#     field_players['AB'] + field_players['BB_x'] + field_players['HBP'] + field_players['SF'])
# field_players['OBP'] = field_players['OBP'].fillna(0)
# field_players['player_value'] = field_players['OBP']
# pitchers['ERA'] = pitchers['ERA'].fillna(0)
# pitchers['player_value'] = pitchers['ERA']

## Filtering the data sets

In [122]:
#drop columns that aren't selected features and playerid, yearid

field_required_columns = ['playerID', 'yearID', 'stint', 'G', 'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI', 'SB_x',
       'CS_x', 'BB', 'SO', 'IBB', 'HBP', 'SH', 'SF', 'GIDP', 'GS', 'InnOuts',
       'PO', 'A', 'E', 'DP', 'PB', 'WP', 'SB_y', 'CS_y', 'ZR', 'age']

filtered_field_players = position_players[field_required_columns]

#drop rows with players that didn't play in at least min_games
min_games = 115
filtered_field_players = filtered_field_players[filtered_field_players['G']>= min_games]

#impute missing values- use data from previous year
filtered_field_players = filtered_field_players.sort_values(by=['playerID', 'yearID'])
filtered_field_players = filtered_field_players.groupby('playerID').apply(lambda group: group.ffill()).reset_index(drop=True)
filtered_field_players = filtered_field_players.drop_duplicates()

In [123]:
filtered_field_players.head(10)

Unnamed: 0,playerID,yearID,stint,G,AB,R,H,2B,3B,HR,...,PO,A,E,DP,PB,WP,SB_y,CS_y,ZR,age
0,aaronha01,1954,1,116,,,,,,,...,223.0,5.0,7.0,0.0,,,,,,20.0
1,aaronha01,1954,1,122,468.0,58.0,131.0,27.0,6.0,13.0,...,223.0,5.0,7.0,0.0,,,,,,20.0
2,aaronha01,1955,1,126,468.0,58.0,131.0,27.0,6.0,13.0,...,254.0,9.0,9.0,2.0,,,,,,21.0
3,aaronha01,1955,1,153,602.0,105.0,189.0,37.0,9.0,27.0,...,254.0,9.0,9.0,2.0,,,,,,21.0
4,aaronha01,1956,1,152,602.0,105.0,189.0,37.0,9.0,27.0,...,316.0,17.0,13.0,4.0,,,,,,22.0
5,aaronha01,1956,1,153,609.0,106.0,200.0,34.0,14.0,26.0,...,316.0,17.0,13.0,4.0,,,,,,22.0
6,aaronha01,1957,1,150,609.0,106.0,200.0,34.0,14.0,26.0,...,346.0,9.0,6.0,0.0,,,,,,23.0
7,aaronha01,1957,1,151,615.0,118.0,198.0,27.0,6.0,44.0,...,346.0,9.0,6.0,0.0,,,,,,23.0
8,aaronha01,1958,1,153,601.0,109.0,196.0,34.0,4.0,30.0,...,305.0,12.0,5.0,0.0,,,,,,24.0
9,aaronha01,1959,1,152,601.0,109.0,196.0,34.0,4.0,30.0,...,261.0,12.0,5.0,3.0,,,,,,25.0


## Find Peak Years for each feature for each player

In [125]:
#i want to make sure that the peak it finds is the latest possible year

high_better_field_features= ['AB','R', 'H', 'RBI', 'BB', 'SF', 'PO', 'A']
low_better_field_features= ['SO', 'E']

field_peak_data = []

# Process high-better features (max value)
for feature in high_better_field_features:
    # Find the max value per player for the current feature
    peak_values = filtered_field_players.groupby('playerID')[feature].transform('max')
    threshold_values = 0.9 * peak_values
    
    # Determine the year when each player reached their peak value for this feature
    filtered_field_players[f'peak_threshold_met_{feature}'] = filtered_field_players[feature] >= threshold_values
    # Sort by yearID descending so latest years are prioritized
    filtered_field_players_sorted = filtered_field_players.sort_values(by='yearID', ascending=False)
    
    # Get the latest year where the peak threshold was met
    
    peak_years = filtered_field_players_sorted[
        filtered_field_players_sorted[f'peak_threshold_met_{feature}']
    ][['playerID', 'yearID']].drop_duplicates('playerID', keep='first')  # 'keep=first' ensures the latest year is retained
    
    peak_years = peak_years.rename(columns={'yearID': f'peak_year_{feature}'})
    
    # Store the peak year and value in a list
    field_peak_data.append(peak_years.set_index('playerID'))

# Process low-better features (min value)
for feature in low_better_field_features:
    # Find the min value per player for the current feature
    peak_values = filtered_field_players.groupby('playerID')[feature].transform('min')
    threshold_values = 1.1 * peak_values  # Increase threshold to 110% of the peak for low-better features
    
    # Determine the year when each player reached their peak value for this feature
    filtered_field_players[f'peak_threshold_met_{feature}'] = filtered_field_players[feature] <= threshold_values
    filtered_field_players_sorted = filtered_field_players.sort_values(by='yearID', ascending=False)
    
    # Get the latest year where the peak threshold was met
    
    peak_years = filtered_field_players_sorted[
        filtered_field_players_sorted[f'peak_threshold_met_{feature}']
    ][['playerID', 'yearID']].drop_duplicates('playerID', keep='first')  # 'keep=first' ensures the latest year is retained
    
    peak_years = peak_years.rename(columns={'yearID': f'peak_year_{feature}'})
    
    # Store the peak year and value in a list
    field_peak_data.append(peak_years.set_index('playerID'))

# Merge all peak year data into one DataFrame
field_peak_info = pd.concat(field_peak_data, axis=1).reset_index()

# Merge the peak information back into the original DataFrame
filtered_field_players = filtered_field_players.merge(field_peak_info, on='playerID', how='left')

In [126]:
filtered_field_players.head(10)

Unnamed: 0,playerID,yearID,stint,G,AB,R,H,2B,3B,HR,...,peak_year_AB,peak_year_R,peak_year_H,peak_year_RBI,peak_year_BB,peak_year_SF,peak_year_PO,peak_year_A,peak_year_SO,peak_year_E
0,aaronha01,1954,1,116,,,,,,,...,1969,1967,1964,1967,1972,1961.0,1962.0,1956.0,1955.0,1968.0
1,aaronha01,1954,1,122,468.0,58.0,131.0,27.0,6.0,13.0,...,1969,1967,1964,1967,1972,1961.0,1962.0,1956.0,1955.0,1968.0
2,aaronha01,1955,1,126,468.0,58.0,131.0,27.0,6.0,13.0,...,1969,1967,1964,1967,1972,1961.0,1962.0,1956.0,1955.0,1968.0
3,aaronha01,1955,1,153,602.0,105.0,189.0,37.0,9.0,27.0,...,1969,1967,1964,1967,1972,1961.0,1962.0,1956.0,1955.0,1968.0
4,aaronha01,1956,1,152,602.0,105.0,189.0,37.0,9.0,27.0,...,1969,1967,1964,1967,1972,1961.0,1962.0,1956.0,1955.0,1968.0
5,aaronha01,1956,1,153,609.0,106.0,200.0,34.0,14.0,26.0,...,1969,1967,1964,1967,1972,1961.0,1962.0,1956.0,1955.0,1968.0
6,aaronha01,1957,1,150,609.0,106.0,200.0,34.0,14.0,26.0,...,1969,1967,1964,1967,1972,1961.0,1962.0,1956.0,1955.0,1968.0
7,aaronha01,1957,1,151,615.0,118.0,198.0,27.0,6.0,44.0,...,1969,1967,1964,1967,1972,1961.0,1962.0,1956.0,1955.0,1968.0
8,aaronha01,1958,1,153,601.0,109.0,196.0,34.0,4.0,30.0,...,1969,1967,1964,1967,1972,1961.0,1962.0,1956.0,1955.0,1968.0
9,aaronha01,1959,1,152,601.0,109.0,196.0,34.0,4.0,30.0,...,1969,1967,1964,1967,1972,1961.0,1962.0,1956.0,1955.0,1968.0


## Calculate league averages by season

In [128]:
league_averages = filtered_field_players.groupby('yearID')[selected_features_field].mean().reset_index()

# Rename columns to indicate they are league averages
league_averages = league_averages.rename(columns={feature: f'league_avg_{feature}' for feature in selected_features_field})
filtered_field_players = filtered_field_players.merge(league_averages, on='yearID')
filtered_field_players.head()

Unnamed: 0,playerID,yearID,stint,G,AB,R,H,2B,3B,HR,...,league_avg_AB,league_avg_R,league_avg_H,league_avg_RBI,league_avg_BB,league_avg_SO,league_avg_SF,league_avg_PO,league_avg_A,league_avg_E
0,aaronha01,1954,1,116,,,,,,,...,506.297872,74.609929,143.085106,68.964539,57.475177,51.0,4.95098,414.815068,158.869863,13.280822
1,aaronha01,1954,1,122,468.0,58.0,131.0,27.0,6.0,13.0,...,506.297872,74.609929,143.085106,68.964539,57.475177,51.0,4.95098,414.815068,158.869863,13.280822
2,aaronha01,1955,1,126,468.0,58.0,131.0,27.0,6.0,13.0,...,510.030075,75.902256,142.842105,71.052632,57.571429,54.045113,4.90625,421.488722,166.578947,12.736842
3,aaronha01,1955,1,153,602.0,105.0,189.0,37.0,9.0,27.0,...,510.030075,75.902256,142.842105,71.052632,57.571429,54.045113,4.90625,421.488722,166.578947,12.736842
4,aaronha01,1956,1,152,602.0,105.0,189.0,37.0,9.0,27.0,...,509.179856,74.985612,141.726619,71.323741,56.482014,59.309353,4.270073,424.719424,143.539568,12.28777


## Set Decline Threshold Based on Features Below Average

In [130]:
# Create the decline event column, but only trigger it after the player's peak year

decline_threshold = 0.5

filtered_field_players['decline_event'] = filtered_field_players.apply(
    lambda row: (
        # Count the number of high-better features below league average and after peak
        sum([
            (row['yearID'] > row[f'peak_year_{feature}']) and (row[feature] < row[f'league_avg_{feature}'])
            for feature in high_better_field_features
        ]) +
        # Count the number of low-better features above league average and after peak
        sum([
            (row['yearID'] > row[f'peak_year_{feature}']) and (row[feature] > row[f'league_avg_{feature}'])
            for feature in low_better_field_features
        ])
    ) / (len(high_better_field_features) + len(low_better_field_features))  # Normalize by total features
    >= decline_threshold  # Compare against the threshold
, axis=1)

# Check the decline event column for a few players
filtered_field_players[['playerID', 'yearID', 'decline_event']].head(20)

Unnamed: 0,playerID,yearID,decline_event
0,aaronha01,1954,False
1,aaronha01,1954,False
2,aaronha01,1955,False
3,aaronha01,1955,False
4,aaronha01,1956,False
5,aaronha01,1956,False
6,aaronha01,1957,False
7,aaronha01,1957,False
8,aaronha01,1958,False
9,aaronha01,1959,False


In [131]:
filtered_field_players['playerID'].nunique()

3762

## More Dataset filtering

In [133]:
#remove league average columns
columns_to_drop = [col for col in filtered_field_players.columns if col.startswith('league_avg_')]

# Drop the identified columns
filtered_field_players = filtered_field_players.drop(columns=columns_to_drop)

#create time_to_event column
first_years = filtered_field_players.groupby('playerID')['yearID'].min().rename('first_year')
decline_years = (
    filtered_field_players[filtered_field_players['decline_event']]
    .groupby('playerID')['yearID']
    .min()
    .rename('decline_year')
)

filtered_field_players = filtered_field_players.merge(first_years, on='playerID', how='left')
filtered_field_players = filtered_field_players.merge(decline_years, on='playerID', how='left')

# Step 4: Calculate time to event
filtered_field_players['time_to_event'] = (
    filtered_field_players['decline_year'] - filtered_field_players['first_year']
)

filtered_field_players['decline_year'] = filtered_field_players['decline_year'].where(
    filtered_field_players['decline_event'], np.nan
)

# Update time_to_event to NaN for rows where decline_event is False
filtered_field_players['time_to_event'] = filtered_field_players['time_to_event'].where(
    filtered_field_players['decline_event'], np.nan
)

# Optional: Drop intermediate columns if not needed
filtered_field_players.drop(columns=['first_year'], inplace=True)

#remove peak?

In [134]:
filtered_field_players.head(20)

Unnamed: 0,playerID,yearID,stint,G,AB,R,H,2B,3B,HR,...,peak_year_RBI,peak_year_BB,peak_year_SF,peak_year_PO,peak_year_A,peak_year_SO,peak_year_E,decline_event,decline_year,time_to_event
0,aaronha01,1954,1,116,,,,,,,...,1967,1972,1961.0,1962.0,1956.0,1955.0,1968.0,False,,
1,aaronha01,1954,1,122,468.0,58.0,131.0,27.0,6.0,13.0,...,1967,1972,1961.0,1962.0,1956.0,1955.0,1968.0,False,,
2,aaronha01,1955,1,126,468.0,58.0,131.0,27.0,6.0,13.0,...,1967,1972,1961.0,1962.0,1956.0,1955.0,1968.0,False,,
3,aaronha01,1955,1,153,602.0,105.0,189.0,37.0,9.0,27.0,...,1967,1972,1961.0,1962.0,1956.0,1955.0,1968.0,False,,
4,aaronha01,1956,1,152,602.0,105.0,189.0,37.0,9.0,27.0,...,1967,1972,1961.0,1962.0,1956.0,1955.0,1968.0,False,,
5,aaronha01,1956,1,153,609.0,106.0,200.0,34.0,14.0,26.0,...,1967,1972,1961.0,1962.0,1956.0,1955.0,1968.0,False,,
6,aaronha01,1957,1,150,609.0,106.0,200.0,34.0,14.0,26.0,...,1967,1972,1961.0,1962.0,1956.0,1955.0,1968.0,False,,
7,aaronha01,1957,1,151,615.0,118.0,198.0,27.0,6.0,44.0,...,1967,1972,1961.0,1962.0,1956.0,1955.0,1968.0,False,,
8,aaronha01,1958,1,153,601.0,109.0,196.0,34.0,4.0,30.0,...,1967,1972,1961.0,1962.0,1956.0,1955.0,1968.0,False,,
9,aaronha01,1959,1,152,601.0,109.0,196.0,34.0,4.0,30.0,...,1967,1972,1961.0,1962.0,1956.0,1955.0,1968.0,False,,


In [135]:
declined_players = filtered_field_players[filtered_field_players['decline_event']==True]
declined_players

Unnamed: 0,playerID,yearID,stint,G,AB,R,H,2B,3B,HR,...,peak_year_RBI,peak_year_BB,peak_year_SF,peak_year_PO,peak_year_A,peak_year_SO,peak_year_E,decline_event,decline_year,time_to_event
31,aaronha01,1972,1,129,449.0,75.0,119.0,10.0,0.0,34.0,...,1967,1972,1961.0,1962.0,1956.0,1955.0,1968.0,True,1972.0,18.0
32,aaronha01,1973,1,120,392.0,84.0,118.0,12.0,1.0,40.0,...,1967,1972,1961.0,1962.0,1956.0,1955.0,1968.0,True,1972.0,18.0
33,aaronha01,1975,1,137,465.0,45.0,109.0,16.0,2.0,12.0,...,1967,1972,1961.0,1962.0,1956.0,1955.0,1968.0,True,1972.0,18.0
40,abbated01,1907,1,147,496.0,63.0,130.0,14.0,7.0,2.0,...,1908,1908,,1905.0,1905.0,1903.0,1908.0,True,1907.0,4.0
44,abbeych01,1895,1,133,516.0,102.0,142.0,14.0,10.0,8.0,...,1894,1894,,1894.0,1895.0,1894.0,1895.0,True,1895.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26570,zimmehe01,1919,1,123,444.0,56.0,113.0,20.0,6.0,4.0,...,1917,1914,,1918.0,1918.0,1918.0,1919.0,True,1919.0,8.0
26582,zimmery01,2017,1,143,427.0,60.0,93.0,18.0,1.0,15.0,...,2017,2010,2010.0,2017.0,2009.0,2017.0,2017.0,True,2017.0,11.0
26594,ziskri01,1980,1,135,448.0,48.0,130.0,17.0,1.0,19.0,...,1977,1976,1977.0,1978.0,1982.0,1980.0,1978.0,True,1980.0,6.0
26595,ziskri01,1982,1,131,503.0,61.0,147.0,28.0,1.0,21.0,...,1977,1976,1977.0,1978.0,1982.0,1980.0,1978.0,True,1980.0,6.0


In [136]:
#filtered_field_players[filtered_field_players['playerID']=='gordosi01']

In [365]:
filtered_field_players

Index(['playerID', 'yearID', 'stint', 'G', 'AB', 'R', 'H', '2B', '3B', 'HR',
       'RBI', 'SB_x', 'CS_x', 'BB', 'SO', 'IBB', 'HBP', 'SH', 'SF', 'GIDP',
       'GS', 'InnOuts', 'PO', 'A', 'E', 'DP', 'PB', 'WP', 'SB_y', 'CS_y', 'ZR',
       'age', 'peak_threshold_met_AB', 'peak_threshold_met_R',
       'peak_threshold_met_H', 'peak_threshold_met_RBI',
       'peak_threshold_met_BB', 'peak_threshold_met_SF',
       'peak_threshold_met_PO', 'peak_threshold_met_A',
       'peak_threshold_met_SO', 'peak_threshold_met_E', 'peak_year_AB',
       'peak_year_R', 'peak_year_H', 'peak_year_RBI', 'peak_year_BB',
       'peak_year_SF', 'peak_year_PO', 'peak_year_A', 'peak_year_SO',
       'peak_year_E', 'decline_event', 'decline_year', 'time_to_event'],
      dtype='object')

# creating df for survival analysis 

In [399]:
survival_data = filtered_field_players.copy()
survival_data['decline_event'] = survival_data['decline_event'].astype(int)
survival_data = survival_data.drop(columns=['PB', 'WP', 'SB_y', 'CS_y', 'ZR', 'CS_x', 'peak_threshold_met_AB', 'peak_threshold_met_R',
       'peak_threshold_met_H', 'peak_threshold_met_RBI',
       'peak_threshold_met_BB', 'peak_threshold_met_SF',
       'peak_threshold_met_PO', 'peak_threshold_met_A',
       'peak_threshold_met_SO', 'peak_threshold_met_E', 'peak_year_AB',
       'peak_year_R', 'peak_year_H', 'peak_year_RBI', 'peak_year_BB',
       'peak_year_SF', 'peak_year_PO', 'peak_year_A', 'peak_year_SO',
       'peak_year_E',])
#survival_data

# random forest survival analysis 

In [231]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sksurv.ensemble import RandomSurvivalForest
from sksurv.util import Surv

In [None]:
# Step 1: Replace missing or invalid values in decline_event and time_to_event
# Fill missing 'decline_event' values with 0 (assuming NaN means no decline)
survival_data["decline_event"] = survival_data["decline_event"].fillna(0).astype(int)

# Ensure binary encoding (0 or 1) for decline_event
print("Unique values in decline_event before assertion:", survival_data["decline_event"].unique())
assert set(survival_data["decline_event"].unique()) <= {0, 1}, "decline_event must be binary (0 or 1)."

# Step 1: Fill time_to_event for censored players and adjust decline_year
def fill_time_and_decline_for_censored(data):
    # Identify censored players (decline_event == 0)
    censored_players = data[data["decline_event"] == 0]
    
    # Set decline_year for censored players as the max year they played
    censored_players["decline_year"] = censored_players.groupby("playerID")["yearID"].transform("max")
    
    # For censored players, fill time_to_event as the difference between max year and min year they played
    censored_players["time_to_event"] = censored_players.groupby("playerID")["yearID"].transform("max") - censored_players.groupby("playerID")["yearID"].transform("min")
    
    # For players who experienced a decline (decline_event == 1), we already have decline_year and time_to_event
    declined_players = data[data["decline_event"] == 1]

    # Combine the censored and declined players
    updated_data = pd.concat([censored_players, declined_players])

    # Sort by playerID and yearID to maintain order
    updated_data = updated_data.sort_values(by=["playerID", "yearID"])

    # Step 2: Fill all rows for each player with the same time_to_event and decline_year
    updated_data["time_to_event"] = updated_data.groupby("playerID")["time_to_event"].transform("first")
    updated_data["decline_year"] = updated_data.groupby("playerID")["decline_year"].transform("first")

    return updated_data

# Apply the function to fill time_to_event and decline_year
survival_data = fill_time_and_decline_for_censored(survival_data)

# Step 3: Verify that all rows for each player have the same time_to_event and decline_year
assert all(survival_data.groupby("playerID")["time_to_event"].nunique() == 1), "Not all rows for each player have the same time_to_event value."
assert all(survival_data.groupby("playerID")["decline_year"].nunique() == 1), "Not all rows for each player have the same decline_year value."

# Step 2: Impute missing feature values using custom player-specific logic
# Define feature columns
non_feature_cols = ["playerID", "yearID", "decline_year", "time_to_event", "decline_event",
                    ]
feature_cols = survival_data.drop(columns=non_feature_cols, errors='ignore').columns

# Function to impute missing values
def impute_with_ffill_bfill_and_league_avg(data):
    # Preserve decline_event separately
    decline_event_backup = data["decline_event"].copy()

    def impute_by_player(group):
        for feature in feature_cols:
            group[feature] = group[feature].fillna(method='ffill').fillna(method='bfill')
        return group

    # Apply the player-level imputation
    data = data.groupby("playerID", group_keys=False).apply(impute_by_player)

    # Impute remaining NaN values using league average grouped by year
    for feature in feature_cols:
        if data[feature].isnull().any():
            league_avg = data.groupby("yearID")[feature].transform('mean')
            data[feature] = data[feature].fillna(league_avg)

    # Impute any remaining NaNs with the global average
    for feature in feature_cols:
        if data[feature].isnull().any():
            global_avg = data[feature].mean()
            data[feature] = data[feature].fillna(global_avg)

    # Restore decline_event to ensure it is unchanged
    data["decline_event"] = decline_event_backup

    return data

# Apply the imputation function
survival_data = impute_with_ffill_bfill_and_league_avg(survival_data)

# Step 3: Validate and clean the dataset
# Check for any remaining NaNs in the dataset
print("Remaining NaNs in survival_data after imputation:")
print(survival_data.isnull().sum())


# Step 4: Prepare data for survival analysis
y = Surv.from_dataframe("decline_event", "time_to_event", survival_data)

# Extract features
features = survival_data.drop(columns=["playerID", "yearID",
                    ])
assert not features.isnull().any().any(), "NaNs remain in features!"

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(features)

# Step 5: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Step 6: Train the RSF model
rsf = RandomSurvivalForest(n_estimators=100, min_samples_split=10, min_samples_leaf=3, random_state=42)
rsf.fit(X_train, y_train)

# Step 7: Evaluate the model
train_score = rsf.score(X_train, y_train)
test_score = rsf.score(X_test, y_test)
print(f"Training Concordance Index: {train_score}")
print(f"Test Concordance Index: {test_score}")

# Step 8: Predict survival functions and decline years
survival_functions = rsf.predict_survival_function(X_test)

# Determine predicted decline years for each player
decline_years = []
for i, surv_func in enumerate(survival_functions):
    decline_year = next((t for t, s in zip(surv_func.x, surv_func.y) if s < 0.5), None)
    decline_years.append(2024 - decline_year if decline_year else "No Decline Detected")

# Extract actual time_to_event and event values from y_test
time_to_event_test = y_test["time_to_event"]
event_test = y_test["decline_event"]

# Step 9: Attach the predictions to the test results
test_results = pd.DataFrame(X_test, columns=features.columns)
test_results["predicted_decline_year"] = decline_years
test_results["actual_time_to_event"] = time_to_event_test
test_results["event"] = event_test

# Output the results
print(test_results)


Unique values in decline_event before assertion: [0 1]
Remaining NaNs in survival_data after imputation:
playerID         0
yearID           0
stint            0
G                0
AB               0
R                0
H                0
2B               0
3B               0
HR               0
RBI              0
SB_x             0
BB               0
SO               0
IBB              0
HBP              0
SH               0
SF               0
GIDP             0
GS               0
InnOuts          0
PO               0
A                0
E                0
DP               0
age              0
decline_event    0
decline_year     0
time_to_event    0
dtype: int64


In [407]:
from sklearn.metrics import mean_absolute_error

# Calculate the accuracy of predicted decline years by comparing them to the actual time to event
# First, we need to handle cases where "No Decline Detected" was returned as the prediction.
# We will assume that "No Decline Detected" should be treated as a failed prediction for evaluation.

# Convert predicted decline years to actual years
# If the predicted year is "No Decline Detected", we'll set it to None or NaN
predicted_decline_years = [
    2024 - year if year != "No Decline Detected" else None
    for year in decline_years
]

# Create a DataFrame with the predictions and actual time-to-event values
test_results["predicted_decline_years"] = predicted_decline_years

# Remove rows where predicted decline year is None (i.e., "No Decline Detected")
valid_predictions = test_results.dropna(subset=["predicted_decline_years"])

# Calculate Mean Absolute Error (MAE) between predicted and actual decline years
# Only for rows where we have valid predictions
mae = mean_absolute_error(valid_predictions["actual_time_to_event"], valid_predictions["predicted_decline_years"])
print(f"Mean Absolute Error (MAE): {mae}")

# Additionally, you can define a tolerance threshold for "correct" predictions
tolerance = 1  # Define the tolerance in years (e.g., within 1 year of the actual decline year)
correct_predictions = valid_predictions[
    abs(valid_predictions["actual_time_to_event"] - valid_predictions["predicted_decline_years"]) <= tolerance
]

# Calculate prediction accuracy within the tolerance
accuracy_within_tolerance = len(correct_predictions) / len(valid_predictions) * 100
print(f"Accuracy within {tolerance} years: {accuracy_within_tolerance}%")


Mean Absolute Error (MAE): 0.8498942917547568
Accuracy within 1 years: 81.18393234672304%
