In [170]:
import pandas as pd
import numpy as np
import pybaseball

In [171]:
from pybaseball import pitching_stats
# Fetch the 2022 pitching table, keep only the columns used later in this
# notebook, and lower every reported age by one year.
pitcher_columns = ['IDfg', 'Name', 'Age', 'G', 'IP', 'WAR']
pitcher_data = (
    pitching_stats(2022, qual=5)
    .loc[:, pitcher_columns]
    .assign(Age=lambda stats: stats['Age'] - 1)
)

In [172]:
from pybaseball import batting_stats
# Same treatment as the pitchers: 2022 batting table, trimmed to the columns
# used later, with every reported age lowered by one year.
batter_data = batting_stats(2022, qual=20)[['IDfg', 'Name', 'Age', 'G', 'AB', 'WAR']]
batter_data = batter_data.assign(Age=batter_data['Age'] - 1)

In [173]:
# Read the salary sheet (column names are on the second line of the file),
# discard the MLS column, and keep only rows that have a 2023 value.
salaries_raw = pd.read_csv("../Data/MLB-Salaries 2000-23 - 2023.csv", header=1)
salaries = salaries_raw.drop(columns="MLS")
has_2023_salary = salaries['2023'].notna()
salaries = salaries[has_2023_salary]

In [174]:
# Load the 2022 appearance counts, remove the columns that are not needed,
# and collapse to one row per playerID by summing the remaining columns.
appearances_raw = pd.read_csv("../Data/PositionAppearances2022.csv")
dropped_appearance_columns = ['yearID', 'teamID', 'lgID', 'G_batting', 'G_defense', "G_ph", "G_pr"]
appearances = (
    appearances_raw
    .drop(columns=dropped_appearance_columns)
    .groupby('playerID', as_index=False)
    .sum()
)

In [175]:
from pybaseball import playerid_reverse_lookup
# Translate the Baseball-Reference IDs found in the appearances table into a
# lookup table, then join it to both stat tables on the FanGraphs ID.
lookuptable = playerid_reverse_lookup(appearances['playerID'], key_type='bbref')

# Lookup columns that are dropped again right after each join.
dropped_lookup_columns = ['key_retro', 'key_mlbam', 'mlb_played_last']

batter_data = (
    batter_data
    .merge(lookuptable, left_on="IDfg", right_on="key_fangraphs")
    .drop(columns=dropped_lookup_columns)
)
pitcher_data = (
    pitcher_data
    .merge(lookuptable, left_on="IDfg", right_on="key_fangraphs")
    .drop(columns=dropped_lookup_columns)
)


In [176]:
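# This takes a while: it runs one fuzzy playerid_lookup call per salary row.
# The loop is kept commented out because its result is saved to and re-read from ../Data/SalaryLookup.csv.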
#from pybaseball import playerid_lookup
#df = pd.DataFrame()
#for id, row in salaries.iterrows():
#    last = f"{row['Last']}"
#    first = f"{row['First'].strip()}" 
#    a = playerid_lookup(last, first, fuzzy= True).iloc[[0]]
#    df = pd.concat([df, a], ignore_index=True)
    

In [177]:
#df.to_csv('../Data/SalaryLookup.csv')
# Re-use the saved name-to-ID lookup instead of rebuilding it.
salary_lookup_path = '../Data/SalaryLookup.csv'
df = pd.read_csv(salary_lookup_path)

In [178]:
# The cached lookup in `df` was built by walking `salaries` row by row (see the
# commented-out loop), so row i of `df` belongs to the i-th salary row. `df`
# has a fresh 0..n-1 index, whereas `salaries` kept its original row labels
# after the rows without a 2023 value were filtered out. Assigning the Series
# directly aligns on those labels, which can attach IDs to the wrong players
# whenever a dropped row was not at the very end. Line both up by position.
salaries = salaries.reset_index(drop=True)
salaries["key_fangraphs"] = df["key_fangraphs"].reset_index(drop=True)

In [179]:
# Attach the 2023 salary to each player; players without a matching
# key_fangraphs in the salary table are dropped by the default inner join.
salary_2023 = salaries[['2023', 'key_fangraphs']]
batter_data = batter_data.merge(salary_2023, on="key_fangraphs")
pitcher_data = pitcher_data.merge(salary_2023, on="key_fangraphs")

In [180]:
import statsapi
# Look up teams with an empty search string and activeStatus="Y".
# NOTE(review): presumably this returns every currently active club — confirm
# against the statsapi documentation. Later cells read 'id' and 'shortName'
# from each returned entry.
teams = statsapi.lookup_team("", activeStatus="Y") 

In [181]:
# Collect the numeric id of every team returned by the lookup.
team_ids = []
for team in teams:
    team_ids.append(team['id'])

In [182]:
# Pulling rosters from Opening Day 2023
# The regular-season start date is the same for every club, so it is requested
# once here instead of once per team inside the loop.
opening_day = statsapi.get('season', {'seasonId': 2023, 'sportId': 1})['seasons'][0]['regularSeasonStartDate']

# Collect one frame per team and concatenate once at the end; growing a
# DataFrame with concat inside the loop re-copies all earlier rows each time.
team_rosters = []
for team_id in team_ids:
    roster_string = statsapi.roster(team_id, '40Man', date=opening_day)
    # Each roster line is split into at most three fields: number, position, full name.
    rows = [line.split(maxsplit=2) for line in roster_string.strip().split('\n')]
    team_roster = pd.DataFrame(rows, columns=['Number', 'Position', 'Player'])
    # Everything after the first space is treated as the last name.
    team_roster[['First Name', 'Last Name']] = team_roster['Player'].str.split(n=1, expand=True)
    team_roster['teamid'] = team_id
    team_rosters.append(team_roster)

# pd.concat rejects an empty list, so fall back to an empty frame when no team was processed.
rosters_df = pd.concat(team_rosters, ignore_index=True) if team_rosters else pd.DataFrame()


In [183]:
# Attach each team's shortName to its roster rows, then drop the duplicate
# join key that the merge brings in from the teams table.
team_names = pd.DataFrame(teams)[['id', 'shortName']]
rosters_df = rosters_df.merge(team_names, left_on="teamid", right_on="id").drop(columns=["id"])

In [184]:
# This takes a while - mapping players with teams back to ids 
#from pybaseball import playerid_lookup
#mapped_rosters = pd.DataFrame()
#for id, row in rosters_df.iterrows():
#    last = f"{row['Last Name']}"
#    first = f"{row['First Name'].strip()}" 
#    a = playerid_lookup(last, first, fuzzy= True).iloc[[0]] 
#    a['Last Name'] = row['Last Name']
#    a['First Name'] = row['First Name']
#    a['shortName'] = row['shortName']
#    a['Number'] = row['Number']            
#    mapped_rosters = pd.concat([mapped_rosters, a], ignore_index=True)

In [185]:
#mapped_rosters.to_csv('../Data/RosterLookup.csv')
# Re-use the saved roster-to-ID mapping instead of rebuilding it.
roster_lookup_path = '../Data/RosterLookup.csv'
mapped_rosters = pd.read_csv(roster_lookup_path)

In [186]:
# Only the FanGraphs key and the team short name are needed for the join below.
roster_columns = ['key_fangraphs', 'shortName']
mapped_rosters = mapped_rosters.loc[:, roster_columns]

In [187]:
# Add the team short name to each player (inner join on the FanGraphs key).
batter_data = pd.merge(batter_data, mapped_rosters, on="key_fangraphs")
pitcher_data = pd.merge(pitcher_data, mapped_rosters, on="key_fangraphs")

In [188]:
import os 

folder_path = '../Data/2023 OD Contracts/'

# Season columns exactly as pandas labels them in these workbooks: the first
# three are read as integers, the last one as text.
season_columns = [2024, 2025, 2026, '2027']
# Position codes that mark a reliever (compared after trimming and lower-casing).
reliever_codes = ['rhp', 'lhp', 'rhp-c', 'lhp-c']

team_contracts = []
# Loop through each workbook in the folder. Sorting makes the row order
# reproducible (os.listdir returns entries in arbitrary order), and names that
# start with '~$' are the lock files Excel creates while a workbook is open.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.xlsx') or filename.startswith('~$'):
        continue

    sheet = pd.read_excel(os.path.join(folder_path, filename), skiprows=7)
    # Keep only rows that have a value in the second column (the position code).
    sheet = sheet.dropna(subset=['Unnamed: 1'])

    contracts = sheet[['Unnamed: 0', 'Unnamed: 1'] + season_columns].rename(
        columns={'Unnamed: 0': 'Name', 'Unnamed: 1': 'Position'}
    )

    # Reduce the raw position code to Starter / Reliever / Batter. Starters were
    # already matched case-insensitively; relievers now get the same tolerance
    # so a code such as 'RHP' is no longer misfiled as a batter.
    position_code = contracts['Position'].str.strip().str.lower()
    is_starter = position_code.str.contains('rhp-s|lhp-s', na=False)
    is_reliever = position_code.isin(reliever_codes)
    contracts['Position'] = 'Batter'
    contracts.loc[is_reliever, 'Position'] = 'Reliever'
    contracts.loc[is_starter, 'Position'] = 'Starter'

    # S2 is True when any of the listed seasons holds the code 'A4'.
    contracts['S2'] = contracts[season_columns].eq('A4').any(axis=1)

    team_contracts.append(contracts)

# Concatenate once at the end; pd.concat rejects an empty list, so fall back to
# an empty frame when the folder held no workbooks.
futurecontracts_df = pd.concat(team_contracts, ignore_index=True) if team_contracts else pd.DataFrame()


In [189]:
# Names are stored as "Last, First". Split on the first ", " only (n=1) so a
# name containing a second comma still produces exactly two columns; without
# the limit such a name yields a third column and the assignment fails.
futurecontracts_df[['Last', 'First']] = futurecontracts_df['Name'].str.split(', ', n=1, expand=True)

In [190]:
# This takes a while: it runs one fuzzy playerid_lookup call per contract row.
from pybaseball import playerid_lookup
# Resolve every contract row to a player-ID record by name and carry the 2024
# contract entry, the S2 flag and the simplified position along with it.
matched_rows = []
for _, row in futurecontracts_df.iterrows():
    last = str(row['Last'])
    first = str(row['First'])
    # With fuzzy=True the lookup returns the most similar names when there is
    # no exact match; only the first returned row is kept.
    match = playerid_lookup(last, first, fuzzy=True).iloc[[0]]
    match['2024'] = row[2024]
    match['S2'] = row['S2']
    match['SimplePosition'] = row['Position']
    matched_rows.append(match)

# Concatenate once at the end instead of re-copying a growing frame on every
# iteration; pd.concat rejects an empty list, hence the fallback.
future_mapped = pd.concat(matched_rows, ignore_index=True) if matched_rows else pd.DataFrame()
 

No identically matched names found! Returning the 5 most similar names.
No identically matched names found! Returning the 5 most similar names.
No identically matched names found! Returning the 5 most similar names.
No identically matched names found! Returning the 5 most similar names.
No identically matched names found! Returning the 5 most similar names.
No identically matched names found! Returning the 5 most similar names.
No identically matched names found! Returning the 5 most similar names.
No identically matched names found! Returning the 5 most similar names.
No identically matched names found! Returning the 5 most similar names.
No identically matched names found! Returning the 5 most similar names.
No identically matched names found! Returning the 5 most similar names.
No identically matched names found! Returning the 5 most similar names.
No identically matched names found! Returning the 5 most similar names.
No identically matched names found! Returning the 5 most similar

In [191]:
# Keep the join key plus the three contract fields used downstream.
contract_columns = ['key_fangraphs', '2024', 'SimplePosition', 'S2']
future_mapped = future_mapped.loc[:, contract_columns]

In [192]:
# Join the contract fields onto both stat tables by FanGraphs key
# (inner join, which is also the default).
batter_data = batter_data.merge(right=future_mapped, how="inner", on="key_fangraphs")
pitcher_data = pitcher_data.merge(right=future_mapped, how="inner", on="key_fangraphs")

In [193]:
# Bring in the per-position appearance counts, matching the batter's
# Baseball-Reference key against the appearances playerID.
batter_data = pd.merge(
    batter_data,
    appearances,
    left_on="key_bbref",
    right_on="playerID",
)

In [194]:
# From https://blogs.fangraphs.com/an-arbitration-compensation-update/
# Multipliers keyed first by (SimplePosition, S2 flag) and then by the code
# found in the '2024' column. Built once here instead of on every row.
ARBITRATION_MULTIPLIERS = {
    ('Batter', False): {'A1': 1.36, 'A2': 2.13, 'A3': 3.59},
    ('Batter', True): {'A1': 1.08, 'A2': 1.86, 'A3': 2.66, 'A4': 4.19},
    ('Starter', False): {'A1': 1.38, 'A2': 2.35, 'A3': 3.34},
    ('Starter', True): {'A1': 1.11, 'A2': 1.97, 'A3': 2.97, 'A4': 3.88},
    ('Reliever', False): {'A1': 1.79, 'A2': 3.98, 'A3': 5.61},
    ('Reliever', True): {'A1': 1.57, 'A2': 3.11, 'A3': 3.98, 'A4': 7.60},
}
# Floor added to every multiplier-based estimate.
# NOTE(review): presumably the league-minimum salary — confirm.
BASE_SALARY_2024 = 720000
# '$' amounts in the '2024' column are scaled by this factor, i.e. read as millions.
DOLLARS_PER_MILLION = 1000000


def calculate_2024Est(row):
    """Estimate a player's 2024 salary in dollars from one table row.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'SimplePosition', 'S2', '2024' and 'WAR'.

    Returns
    -------
    int or float
        * 0 when '2024' is missing or equals 'FA'.
        * The parsed amount times 1,000,000 when '2024' is a string containing '$'.
        * Otherwise 720000 + WAR * multiplier * 1,000,000 for non-negative WAR,
          and 720000 for negative (or missing) WAR. Codes without a multiplier,
          including non-string values, use a multiplier of 0.
    """
    contract = row['2024']
    if pd.isna(contract):
        return 0

    if isinstance(contract, str):
        # Tolerate stray whitespace from hand-edited spreadsheets ('FA ', ' A1').
        contract = contract.strip()
        if contract == 'FA':
            return 0
        if '$' in contract:
            return float(contract.replace('$', '').replace(',', '')) * DOLLARS_PER_MILLION
    # Non-string cells (e.g. a bare number) used to raise TypeError on the
    # "'$' in ..." test; they now fall through to the multiplier lookup, where
    # any unrecognised code gets the default multiplier of 0.

    multiplier_key = (row['SimplePosition'], row['S2'])
    multiplier = ARBITRATION_MULTIPLIERS.get(multiplier_key, {}).get(contract, 0)
    if row['WAR'] >= 0:
        return BASE_SALARY_2024 + row['WAR'] * multiplier * DOLLARS_PER_MILLION
    return BASE_SALARY_2024

# Apply the function row by row to create the '2024Est' column on both tables
for stats_table in (pitcher_data, batter_data):
    stats_table['2024Est'] = stats_table.apply(calculate_2024Est, axis=1)


In [195]:
# Strip '$' signs and commas from the 2023 salary strings and store the result
# as floats. The pattern is written as a raw string: in a plain literal '\$' is
# an invalid escape sequence, which Python reports with a warning.
salary_symbols = r'[\$,]'
pitcher_data['2023'] = pitcher_data['2023'].replace(salary_symbols, '', regex=True).astype(float)
batter_data['2023'] = batter_data['2023'].replace(salary_symbols, '', regex=True).astype(float)

In [198]:
# Mark a player as selectable when 'mlb_played_first' is 2019 or earlier.
for stats_table in (pitcher_data, batter_data):
    stats_table['Selectable'] = stats_table['mlb_played_first'].le(2019)


In [None]:
def create_position_column(position, min_games=5, frame=None):
    """Return a boolean Series flagging rows with more than `min_games` in a column.

    Parameters
    ----------
    position : str
        Name of the games-played column to test (callers pass e.g. 'G_c').
    min_games : int, optional
        Strict lower bound; a row is flagged only when its value is greater
        than this. Defaults to 5, the threshold that used to be hard-coded.
    frame : pandas.DataFrame, optional
        Table to read the column from. Defaults to the notebook-level
        `batter_data`, which is what the function always used before.

    Returns
    -------
    pandas.Series
        Boolean Series aligned with `frame`'s index.
    """
    if frame is None:
        frame = batter_data
    return frame[position] > min_games

# Create new columns for each position
positions = ['c', '1b', '2b', '3b', 'ss', 'lf', 'cf', 'rf', 'of' ]
games_columns = [f'G_{code}' for code in positions]
for position, games_column in zip(positions, games_columns):
    batter_data[f'Position_{position}'] = create_position_column(games_column)

# The raw games-played counts and the helper/join columns are no longer needed.
batter_data.drop(columns=games_columns, inplace=True)
batter_data.drop(columns=['G_dh', "G_p", "SimplePosition", "key_fangraphs", "playerID"], inplace=True)

# Pitchers get one boolean flag per simplified role instead of the text label.
for role in ("Starter", "Reliever"):
    pitcher_data[f'Position_{role}'] = pitcher_data["SimplePosition"] == role
pitcher_data.drop(columns=["SimplePosition", "key_fangraphs"], inplace=True)

In [210]:
# Write both finished tables to disk (the row index is written as the first column).
production_outputs = (
    (batter_data, '../Data/batterDataProduction.csv'),
    (pitcher_data, '../Data/pitcherDataProduction.csv'),
)
for table, output_path in production_outputs:
    table.to_csv(output_path)