In [219]:
import pandas as pd
import numpy as np
import pybaseball

In [220]:
from pybaseball import pitching_stats

# 2022 pitching stats (qual=5), trimmed to the id, name, age, games, innings
# and WAR columns used by the rest of the notebook.
pitcher_columns = ['IDfg', 'Name', 'Age', 'G', 'IP', 'WAR']
pitcher_data = pitching_stats(2022, qual=5).loc[:, pitcher_columns]

In [221]:
from pybaseball import batting_stats

# 2022 batting stats (qual=20), trimmed to the id, name, age, games, at-bats
# and WAR columns used by the rest of the notebook.
batter_columns = ['IDfg', 'Name', 'Age', 'G', 'AB', 'WAR']
batter_data = batting_stats(2022, qual=20).loc[:, batter_columns]

In [223]:
# Load the salary sheet (column names come from the second line of the file),
# discard the "MLS" column, and keep only rows that have a 2023 value.
# Note: the surviving rows keep their original index labels.
salaries = (
    pd.read_csv("../Data/MLB-Salaries 2000-23 - 2023.csv", header=1)
    .drop(columns="MLS")
    .dropna(subset=['2023'])
)

In [224]:
# Read the 2022 appearance records, drop the columns that are not needed,
# then sum what remains per playerID so each player ends up with one row.
dropped_appearance_columns = ['yearID', 'teamID', 'lgID', 'G_batting', 'G_defense', "G_ph", "G_pr"]
appearances = (
    pd.read_csv("../Data/PositionAppearances2022.csv")
    .drop(columns=dropped_appearance_columns)
    .groupby('playerID', as_index=False)
    .sum()
)

In [225]:
from pybaseball import playerid_reverse_lookup

# Resolve the bbref playerIDs from the appearances table into a lookup table
# that also carries the other id systems (including key_fangraphs).
lookuptable = playerid_reverse_lookup(appearances['playerID'], key_type='bbref')

# Attach the lookup columns to each stats table by FanGraphs id (inner join),
# then discard the lookup columns the notebook does not use.
unused_lookup_columns = ['key_retro', 'key_mlbam', 'mlb_played_last']
batter_data = pd.merge(
    batter_data, lookuptable, left_on="IDfg", right_on="key_fangraphs"
).drop(columns=unused_lookup_columns)
pitcher_data = pd.merge(
    pitcher_data, lookuptable, left_on="IDfg", right_on="key_fangraphs"
).drop(columns=unused_lookup_columns)


In [65]:
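# This takes a while: it makes one playerid_lookup call per salary row.
# It is left commented out; the following cell reads a saved copy of the result
# from ../Data/SalaryLookup.csv instead.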
#from pybaseball import playerid_lookup
#df = pd.DataFrame()
#for id, row in salaries.iterrows():
#    last = f"{row['Last']}"
#    first = f"{row['First'].strip()}" 
#    a = playerid_lookup(last, first, fuzzy= True).iloc[[0]]
#    df = pd.concat([df, a], ignore_index=True)
    

In [226]:
# Read the saved salary -> player-id lookup from disk rather than re-running
# the name search. To write the file again from a freshly built lookup:
#df.to_csv('../Data/SalaryLookup.csv')
df = pd.read_csv(
    '../Data/SalaryLookup.csv',
)

In [227]:
# Attach the FanGraphs id from the cached lookup (`df`) to each salary row.
#
# Assigning a Series to a column aligns on index labels. `salaries` still has
# the labels it had before the notna() filter, while `df` (read from CSV) has a
# fresh 0..n-1 index, so label alignment can give rows the wrong id or NaN.
# The lookup-building loop appends one row per salary row, in order, so when
# the row counts match the ids are attached by position instead.
# NOTE(review): if the counts differ, the cache was presumably built from a
# different version of `salaries`; label alignment (the previous behaviour) is
# kept for that case - confirm the cache is current.
lookup_ids = df["key_fangraphs"]
if len(lookup_ids) == len(salaries):
    lookup_ids = lookup_ids.to_numpy()  # positional: ignore index labels
# assign() returns a new frame, which also avoids a SettingWithCopyWarning on
# the filtered `salaries` frame.
salaries = salaries.assign(key_fangraphs=lookup_ids)

In [228]:
# Add each player's 2023 salary value, joining on the FanGraphs id.
# Inner join: players without a matching salary row are dropped.
salary_2023 = salaries.loc[:, ['2023', 'key_fangraphs']]
batter_data = pd.merge(batter_data, salary_2023, on="key_fangraphs")
pitcher_data = pd.merge(pitcher_data, salary_2023, on="key_fangraphs")

In [229]:
import statsapi

# Look up teams with an empty search string and activeStatus "Y".
# Each returned record is later read for its 'id' and 'shortName' fields.
teams = statsapi.lookup_team(
    "",
    activeStatus="Y",
)

In [230]:
# Collect the 'id' field of every team record, in the order returned.
team_ids = list(map(lambda team_record: team_record['id'], teams))

In [231]:
# Pulling rosters from Opening Day 2023
# The season start date does not depend on the team, so it is fetched once
# here instead of once per team inside the loop.
season_info = statsapi.get('season', {'seasonId': 2023, 'sportId': 1})
opening_day = season_info['seasons'][0]['regularSeasonStartDate']

# Build one frame per team and concatenate once at the end; growing a frame
# with pd.concat inside the loop re-copies all earlier rows on every pass.
roster_frames = []
for team_id in team_ids:
    roster_string = statsapi.roster(team_id, '40Man', date=opening_day)
    # Each roster line is split into at most three fields:
    # number, position, and the rest of the line (the player's name).
    rows = [
        line.split(maxsplit=2)
        for line in roster_string.strip().split('\n')
        if line.strip()  # ignore blank lines
    ]
    if not rows:
        # Nothing returned for this team; skip it rather than fail below.
        continue
    team_roster = pd.DataFrame(rows, columns=['Number', 'Position', 'Player'])
    # First token is the first name; everything after it is the last name.
    team_roster[['First Name', 'Last Name']] = team_roster['Player'].str.split(n=1, expand=True)
    team_roster['teamid'] = team_id
    roster_frames.append(team_roster)

rosters_df = pd.concat(roster_frames, ignore_index=True) if roster_frames else pd.DataFrame()


In [232]:
# Add each team's shortName to the roster rows by matching teamid to the
# team record's id; the duplicate 'id' column is dropped afterwards.
team_names = pd.DataFrame(teams).loc[:, ['id', 'shortName']]
rosters_df = pd.merge(
    rosters_df, team_names, left_on="teamid", right_on="id"
).drop(columns=["id"])

In [233]:
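# This takes a while: it maps each rostered player (with their team) back to
# player ids, one playerid_lookup call per roster row. It is left commented out;
# the following cell reads a saved copy from ../Data/RosterLookup.csv instead.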
#from pybaseball import playerid_lookup
#mapped_rosters = pd.DataFrame()
#for id, row in rosters_df.iterrows():
#    last = f"{row['Last Name']}"
#    first = f"{row['First Name'].strip()}" 
#    a = playerid_lookup(last, first, fuzzy= True).iloc[[0]] 
#    a['Last Name'] = row['Last Name']
#    a['First Name'] = row['First Name']
#    a['shortName'] = row['shortName']
#    a['Number'] = row['Number']            
#    mapped_rosters = pd.concat([mapped_rosters, a], ignore_index=True)

In [234]:
# Read the saved roster -> player-id lookup from disk rather than re-running
# the name search. To write the file again from a freshly built lookup:
#mapped_rosters.to_csv('../Data/RosterLookup.csv')
mapped_rosters = pd.read_csv(
    '../Data/RosterLookup.csv',
)

In [235]:
# Only the FanGraphs id and the team short name are needed for the join below.
roster_columns = ['key_fangraphs', 'shortName']
mapped_rosters = mapped_rosters.loc[:, roster_columns]

In [237]:
# Add the team short name to each player, joining on the FanGraphs id.
# Inner join: players with no roster match are dropped.
batter_data = pd.merge(batter_data, mapped_rosters, on="key_fangraphs")
pitcher_data = pd.merge(pitcher_data, mapped_rosters, on="key_fangraphs")

In [239]:
import os

folder_path = '../Data/2023 OD Contracts/'

# Position codes (compared in lower case) that mark a pitcher as a reliever.
reliever_codes = ['rhp', 'lhp', 'rhp-c', 'lhp-c']

# Build one frame per workbook and concatenate once at the end.
contract_frames = []

# sorted(): os.listdir returns names in arbitrary order, which would make the
# row order of the combined table differ between runs or machines.
for filename in sorted(os.listdir(folder_path)):
    # Only Excel workbooks; names starting with "~$" are the temporary lock
    # files Excel creates while a workbook is open and cannot be read.
    if not filename.endswith('.xlsx') or filename.startswith('~$'):
        continue

    sheet = pd.read_excel(os.path.join(folder_path, filename), skiprows=7)
    # Rows without a value in the first column carry no player name.
    sheet = sheet.dropna(subset=['Unnamed: 0'])

    # Keep name, position and the 2024 column for the first 40 named rows.
    df_subset = (
        sheet[['Unnamed: 0', 'Unnamed: 1', 2024]]
        .head(40)
        .rename(columns={'Unnamed: 0': 'Name', 'Unnamed: 1': 'Position'})
    )

    # Collapse the raw position codes into Starter / Reliever / Batter.
    # Codes are normalised (trimmed, lower-cased) so both pitcher checks treat
    # case the same way; missing positions become the text 'nan' and therefore
    # fall through to 'Batter'.
    position_code = df_subset['Position'].astype(str).str.strip().str.lower()
    is_starter = position_code.str.contains('rhp-s|lhp-s')
    is_reliever = ~is_starter & position_code.isin(reliever_codes)
    df_subset['Position'] = 'Batter'
    df_subset.loc[is_reliever, 'Position'] = 'Reliever'
    df_subset.loc[is_starter, 'Position'] = 'Starter'

    contract_frames.append(df_subset)

futurecontracts_df = pd.concat(contract_frames, ignore_index=True) if contract_frames else pd.DataFrame()


In [243]:
# Display the combined contract table (Name, simplified Position, 2024 value).
# NOTE(review): the saved output below already contains 'Last' and 'First',
# which are only created by the next cell (In [242]); this cell was executed
# after it, so a top-to-bottom run will not show those two columns here.
futurecontracts_df

Unnamed: 0,Name,Position,2024,Last,First
0,"Rendon, Anthony",Batter,$38.571,Rendon,Anthony
1,"Trout, Mike",Batter,$37.117,Trout,Mike
2,"Ohtani, Shohei",Starter,FA,Ohtani,Shohei
3,"Anderson, Tyler",Starter,$13.000,Anderson,Tyler
4,"Renfroe, Hunter",Batter,FA,Renfroe,Hunter
...,...,...,...,...,...
1195,"Saucedo, Tayler",Reliever,,Saucedo,Tayler
1196,"Speier, Gabe",Reliever,,Speier,Gabe
1197,"Then, Juan",Reliever,,Then,Juan
1198,"Topa, Justin",Reliever,A1,Topa,Justin


In [242]:
# Split "Last, First" names into separate columns.
# n=1 splits on the first ", " only, so a name containing a second ", " still
# produces exactly two pieces instead of three (which would make the
# two-column assignment fail). reindex guarantees both columns exist even if
# no name contains ", " (First is then missing for every row).
name_parts = (
    futurecontracts_df['Name']
    .str.split(', ', n=1, expand=True)
    .reindex(columns=[0, 1])
)
futurecontracts_df[['Last', 'First']] = name_parts


In [None]:
# This takes a while: one playerid_lookup call per contract row.
from pybaseball import playerid_lookup

# Collect one matched row per contract row and concatenate once at the end;
# growing a frame with pd.concat inside the loop re-copies all earlier rows.
matched_rows = []
for _, row in futurecontracts_df.iterrows():
    # Surrounding whitespace is stripped, matching how the salary and roster
    # lookup loops clean first names before searching.
    last = f"{row['Last']}".strip()
    first = f"{row['First']}".strip()
    # Keep only the first row that playerid_lookup returns.
    # NOTE(review): with fuzzy=True that row may be an approximate name match
    # rather than the intended player - spot-check the results.
    best_match = playerid_lookup(last, first, fuzzy=True).iloc[[0]]
    # assign() builds a new frame, avoiding column writes on an .iloc slice.
    matched_rows.append(
        best_match.assign(**{'2024': row[2024], 'SimplePosition': row['Position']})
    )

future_mapped = pd.concat(matched_rows, ignore_index=True) if matched_rows else pd.DataFrame()
 

In [246]:
# Display the lookup results: the matched player-id row for each contract row,
# plus the '2024' and 'SimplePosition' values carried over from the contracts.
future_mapped

Unnamed: 0,name_last,name_first,key_mlbam,key_retro,key_bbref,key_fangraphs,mlb_played_first,mlb_played_last,2024,SimplePosition
0,rendon,anthony,543685,renda001,rendoan01,12861,2013.0,2023.0,$38.571,Batter
1,trout,mike,545361,troum001,troutmi01,10155,2011.0,2023.0,$37.117,Batter
2,ohtani,shohei,660271,ohtas001,ohtansh01,19755,2018.0,2023.0,FA,Starter
3,anderson,tyler,542881,andet002,anderty01,12880,2016.0,2023.0,$13.000,Starter
4,renfroe,hunter,592669,renfh001,renfrhu01,15464,2016.0,2023.0,FA,Batter
...,...,...,...,...,...,...,...,...,...,...
1195,saucedo,tayler,642048,sauct001,sauceta01,17888,2021.0,2023.0,,Reliever
1196,speier,gabe,642100,speig001,speiega01,17170,2019.0,2023.0,,Reliever
1197,then,juan,672730,thenj001,thenju01,22828,2023.0,2023.0,,Reliever
1198,topa,justin,623437,topaj001,topaju01,15145,2020.0,2023.0,A1,Reliever


In [247]:
# Only the FanGraphs id, the 2024 value and the simplified position are
# needed for the join below.
contract_columns = ['key_fangraphs', '2024', 'SimplePosition']
future_mapped = future_mapped.loc[:, contract_columns]

In [248]:
# Add the 2024 contract value and simplified position to each player,
# joining on the FanGraphs id. Inner join: unmatched players are dropped.
batter_data = pd.merge(batter_data, future_mapped, on="key_fangraphs")
pitcher_data = pd.merge(pitcher_data, future_mapped, on="key_fangraphs")

In [249]:
# Display the final batter table: 2022 stats joined with the id lookup,
# 2023 salary, team short name, and 2024 contract value / position.
batter_data

Unnamed: 0,IDfg,Name,Age,G,AB,WAR,name_last,name_first,key_bbref,key_fangraphs,mlb_played_first,2023,shortName,2024,SimplePosition
0,11493,Manny Machado,29,150,578,7.5,machado,manny,machama01,11493,2012.0,"$21,090,909",San Diego,$17.091,Batter
1,9777,Nolan Arenado,31,148,557,7.2,arenado,nolan,arenano01,9777,2013.0,"$32,822,071",St. Louis,$35.000,Batter
2,5361,Freddie Freeman,32,159,612,7.1,freeman,freddie,freemfr01,5361,2010.0,"$25,290,674",LA Dodgers,$25.291,Batter
3,9218,Paul Goldschmidt,34,151,561,7.0,goldschmidt,paul,goldspa01,9218,2011.0,"$25,333,333",St. Louis,$25.333,Batter
4,19556,Yordan Alvarez,25,135,470,6.7,álvarez,yordan,alvaryo01,19556,2019.0,"$7,833,333",Houston,$10.833,Batter
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
387,12775,Brad Miller,32,81,222,-1.2,miller,brad,millebr02,12775,2013.0,"$4,000,000",Texas,FA,Batter
388,19864,Riley Adams,26,48,142,-1.2,adams,riley,adamsri03,19864,2021.0,"$728,800",Washington,,Batter
389,23378,Jose Barrero,24,48,165,-1.2,barrero,josé,garcijo02,23378,2020.0,"$730,000",Cincinnati,,Batter
390,11680,Elias Diaz,31,105,351,-1.5,díaz,elías,diazel01,11680,2015.0,"$5,500,000",Colorado,$6.000,Batter


In [250]:
# Display the final pitcher table: 2022 stats joined with the id lookup,
# 2023 salary, team short name, and 2024 contract value / position.
pitcher_data

Unnamed: 0,IDfg,Name,Age,G,IP,WAR,name_last,name_first,key_bbref,key_fangraphs,mlb_played_first,2023,shortName,2024,SimplePosition
0,16149,Aaron Nola,29,32,205.0,6.3,nola,aaron,nolaaa01,16149,2015.0,"$16,250,000",Philadelphia,FA,Starter
1,16137,Carlos Rodon,29,31,178.0,6.2,rodón,carlos,rodonca01,16137,2015.0,"$22,833,333",NY Yankees,$27.833,Starter
2,8700,Justin Verlander,39,28,175.0,6.0,verlander,justin,verlaju01,8700,2005.0,"$43,333,333",NY Mets,$43.333,Starter
3,18684,Sandy Alcantara,26,32,228.2,5.8,alcántara,sandy,alcansa01,18684,2017.0,"$6,300,000",Miami,$9.300,Starter
4,14107,Kevin Gausman,31,31,174.2,5.7,gausman,kevin,gausmke01,14107,2013.0,"$21,000,000",Toronto,$24.000,Starter
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
443,18694,Kolby Allard,24,10,21.0,-0.7,allard,kolby,allarko01,18694,2018.0,"$750,000",Atlanta,A1,Reliever
444,17490,Genesis Cabrera,25,39,44.2,-0.8,cabrera,génesis,cabrege01,17490,2019.0,"$950,000",St. Louis,A2,Reliever
445,16933,Elieser Hernandez,27,20,62.1,-0.9,hernández,elieser,hernael01,16933,2018.0,"$1,600,000",NY Mets,A3,Reliever
446,6986,Ian Kennedy,37,57,50.1,-1.0,kennedy,ian,kenneia01,6986,2007.0,"$2,250,000",Texas,FA,Reliever


In [252]:
# Write both finished tables to disk with to_csv's default options.
production_exports = (
    (batter_data, '../Data/batterDataProduction.csv'),
    (pitcher_data, '../Data/pitcherDataProduction.csv'),
)
for production_frame, destination in production_exports:
    production_frame.to_csv(destination)

In [245]:
# TODO (not implemented yet): arbitration $/WAR rates by player type.
# Source: https://blogs.fangraphs.com/an-arbitration-compensation-update/
#
# Player Type | $/WAR Arb1 | $/WAR Arb2 | $/WAR Arb3
# Batter      | $1.36      | $2.13      | $3.59
# Starter     | $1.38      | $2.35      | $3.34
# Reliever    | $1.79      | $3.98      | $5.61