<a href="https://colab.research.google.com/github/kbro4/Reliever-Value/blob/main/Individual_Starters.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import time
import warnings

In [None]:
warnings.filterwarnings("ignore", category=FutureWarning)

In [None]:
# Season -> shared Baseball-Reference table holding that year's pitching data.
# NOTE(review): 2020 has no entry here - presumably left out on purpose; confirm.
urls = {
    2013: 'http://bbref.com/pi/shareit/zTQRZ',
    2014: 'http://bbref.com/pi/shareit/BdM7N',
    2015: 'http://bbref.com/pi/shareit/JTlEk',
    2016: 'http://bbref.com/pi/shareit/d4JcY',
    2017: 'http://bbref.com/pi/shareit/ORBJM',
    2018: 'http://bbref.com/pi/shareit/ooBRY',
    2019: 'http://bbref.com/pi/shareit/sbzjf',
    2021: 'http://bbref.com/pi/shareit/hJVbP',
    2022: 'http://bbref.com/pi/shareit/H8igd',
    2023: 'http://bbref.com/pi/shareit/YeWWJ',
}

In [None]:
# Filters and cleans dataframe to include only starters
def get_top_starters(df):
  """Return the starting pitchers of one season table, highest WAR first.

  Args:
    df: Season table with at least the columns 'Rk', 'G', 'GS' and 'WAR'.
      Rows whose 'Rk' cell is the literal text 'Rk' (header rows repeated
      inside the table body) are discarded.

  Returns:
    A DataFrame holding only the rows where games started make up more than
    half of games played (GS / G > 0.5), ordered by WAR descending. Cell
    values are returned as they were received; only the ordering uses the
    numeric interpretation of WAR.
  """
  df = df[df['Rk'] != 'Rk']

  # Unparsable G/GS cells become NaN, which fails the comparison and drops the row.
  starts = pd.to_numeric(df['GS'], errors='coerce')
  games = pd.to_numeric(df['G'], errors='coerce')
  starters = df[starts / games > 0.5]

  # Sort on the numeric value of WAR. When the column holds text, a plain sort
  # is lexicographic and would rank '10.2' below '9.9'.
  return starters.sort_values(
      by='WAR',
      ascending=False,
      kind='stable',
      key=lambda war: pd.to_numeric(war, errors='coerce'),
  )

In [None]:
# Checks to see if player is already in dataframe, if not appends new row for that player
def update_database(new_df, total_df, year):
  """Merge one season's WAR values into the master player table.

  Args:
    new_df: Season table with 'Name' and 'WAR' columns. When a name occurs
      more than once, the WAR of its first row is used.
    total_df: Master table with a 'Name' column and one column per season
      already recorded.
    year: Column label under which this season's WAR is stored.

  Returns:
    A new DataFrame: players already present get their ``year`` cell set,
    and players not yet present are added as new rows (other seasons left
    missing), in the order they first appear in ``new_df``. ``total_df``
    itself is not modified.
  """
  # One WAR per name; the first row wins.
  first_war = new_df.drop_duplicates(subset='Name').set_index('Name')['WAR']
  result = total_df.copy()  # leave the caller's frame untouched
  if first_war.empty:
    return result

  # Fill the season column for players who already have a row.
  known = result['Name'].isin(first_war.index)
  season_war = result['Name'].map(first_war)
  if year in result.columns:
    # Keep whatever is already stored for players absent from this season.
    season_war = season_war.where(known, result[year])
  result[year] = season_war

  # Add all new players in one concat instead of growing the frame row by row
  # (DataFrame.append no longer exists in pandas 2.x).
  additions = first_war[~first_war.index.isin(result['Name'])]
  if additions.empty:
    return result
  new_rows = pd.DataFrame({'Name': additions.index, year: additions.to_numpy()})
  return pd.concat([result, new_rows], ignore_index=True)

In [None]:
# Gets the top thirty starting pitchers by bWAR in a given year
def get_top_thirty(df, count=30):
  """Return the highest-WAR starting pitchers of one season table.

  Args:
    df: Season table with at least the columns 'Rk', 'G', 'GS' and 'WAR'.
      Rows whose 'Rk' cell is the literal text 'Rk' (header rows repeated
      inside the table body) are discarded.
    count: How many pitchers to keep; defaults to 30.

  Returns:
    A DataFrame with at most ``count`` rows: pitchers whose games started
    exceed half of their games played (GS / G > 0.5), ordered by WAR
    descending. Cell values are returned as they were received.
  """
  df = df[df['Rk'] != 'Rk']

  # Unparsable G/GS cells become NaN, which fails the comparison and drops the row.
  starts = pd.to_numeric(df['GS'], errors='coerce')
  games = pd.to_numeric(df['G'], errors='coerce')
  starters = df[starts / games > 0.5]

  # Rank on the numeric value of WAR. A text sort would place '10.2' below
  # '9.9' and could push a 10+ WAR pitcher out of the top group.
  ranked = starters.sort_values(
      by='WAR',
      ascending=False,
      kind='stable',
      key=lambda war: pd.to_numeric(war, errors='coerce'),
  )
  return ranked.head(count)

In [None]:
# Gets how many of the top thirty starters in a year are worth at least a given value one and two years later
def elite_consistency(df, total_df, year, threshold=4.0):
    """Count how many pitchers in ``df`` reach ``threshold`` in later seasons.

    Args:
        df: Table of pitchers to follow; only its 'Name' column is read.
        total_df: Master table with a 'Name' column and one value column per
            season, labelled by the season number.
        year: Season the pitchers in ``df`` were selected from.
        threshold: Minimum value a pitcher must reach to be counted;
            defaults to 4.0.

    Returns:
        A tuple ``(one_later, two_later)`` of ints: the number of pitchers
        whose value in column ``year + 1`` (respectively ``year + 2``) is at
        least ``threshold``. A season with no column in ``total_df`` yields
        0, as do pitchers who are missing from ``total_df`` or have no value
        for that season.
    """
    # Index by name once instead of scanning total_df for every pitcher.
    # The first row is kept when a name appears more than once.
    war_by_name = total_df.drop_duplicates(subset='Name').set_index('Name')
    names = df['Name'].tolist()

    def count_elite(season):
        if season not in war_by_name.columns:
            return 0
        # Names absent from the master table come back as NaN, which never
        # satisfies the comparison.
        later_war = war_by_name[season].reindex(names).astype(float)
        return int((later_war >= threshold).sum())

    return count_elite(year + 1), count_elite(year + 2)


In [None]:
# Build the master table: one row per starting pitcher, one WAR column per season.
total_df = pd.DataFrame(columns=['Name'])

for key, url in urls.items():
  # read_html returns a list of tables; the first one is used.
  df = pd.read_html(url)[0]

  new_df = get_top_starters(df)
  total_df = update_database(new_df, total_df, key)
  # Wait two seconds before the next request - presumably to go easy on the site.
  time.sleep(2)


In [None]:
# Display the assembled master table (one row per pitcher, one column per season).
total_df

Unnamed: 0,Name,2013,2014,2015,2016,2017,2018,2019,2021,2022,2023
0,Clayton Kershaw*,8.1,7.7,7.3,5.8,4.9,3.3,3.3,2.4,3.8,3.7
1,Hisashi Iwakuma,7.0,2.4,2.6,2.4,0.3,,,,,
2,Cliff Lee*,6.6,0.8,,,,,,,,
3,Max Scherzer,6.5,5.8,6.9,6.2,7.2,7.9,5.2,6.1,5.2,3.2
4,Chris Sale*,6.5,6.1,3.4,4.9,5.9,6.5,2.1,0.9,0.0,1.7
...,...,...,...,...,...,...,...,...,...,...,...
789,Jackson Rutledge,,,,,,,,,,-0.2
790,Joey Estes,,,,,,,,,,-0.2
791,Matt Dermody*,,,,,,,,,,-0.1
792,Jared Shuster*,,,,,,,,,,-0.1


In [None]:
# Convert every season column to float; the 'Name' column is left as it is.
for column in total_df.columns:
  if column == 'Name':
    continue
  total_df[column] = total_df[column].astype(float)

In [None]:
# Average absolute WAR change between each season column and the column before it.
# Columns are compared by position, so 2021 is measured against 2019.
season_war = total_df.iloc[:, 1:]  # every column after 'Name'
war_change = season_war.diff(axis=1)
avg_change = war_change.abs().mean()
avg_change

2013         NaN
2014    1.691250
2015    1.518563
2016    1.706627
2017    1.658896
2018    1.724684
2019    1.908333
2021    1.836029
2022    1.642775
2023    1.700000
dtype: float64

In [None]:
# Gets average performance variation in two years
for column in total_df.columns:
  if column == 'Name':
    continue
  later = column + 2
  # Skip seasons with no column two years later (for example 2018, since
  # there is no 2020 column, and the last two seasons in the table). This
  # replaces hard-coded year checks, so adding seasons needs no change here.
  if later not in total_df.columns:
    continue
  change_two = total_df[later] - total_df[column]
  avg_change_two = change_two.abs().mean()
  print(column, avg_change_two)

2013 1.8420289855072465
2014 1.7830882352941178
2015 1.920588235294118
2016 1.9541353383458644
2017 1.8055118110236221
2019 1.8360294117647058
2021 1.7205673758865245


In [None]:
# For each season, print how many of that year's top thirty starters reach the
# value threshold one and two years later. A count of 0 is also printed when
# the later season has no column in total_df.
for key, url in urls.items():
  # read_html returns a list of tables; the first one is used.
  df = pd.read_html(url)[0]

  top_thirty = get_top_thirty(df)
  print(key, elite_consistency(top_thirty, total_df, key))
  time.sleep(2)

2013 (10, 9)
2014 (11, 12)
2015 (9, 7)
2016 (7, 4)
2017 (10, 6)
2018 (9, 0)
2019 (0, 6)
2021 (10, 4)
2022 (4, 0)
2023 (0, 0)
