# Imports

In [43]:
import pybaseball as pyb
import pandas as pd
import time  # Import the time module
import inspect
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch import optim



# Functions

In [18]:
pyb.cache.enable()
pyb.cache.config.cache_type='csv'
pyb.cache.config.save()

In [19]:

# get the register data
chad = pyb.chadwick_register()

# get the register data and save to disk
chad = pyb.chadwick_register(save=True)

In [20]:
print(inspect.getsource(pyb.pitching_stats_range))

@cache.df_cache()
def pitching_stats_range(start_dt: Optional[str]=None, end_dt: Optional[str]=None) -> pd.DataFrame:
    """
    Get all pitching stats for a set time range. This can be the past week, the
    month of August, anything. Just supply the start and end date in YYYY-MM-DD
    format.
    """
    # ensure valid date strings, perform necessary processing for query
    start_dt_date, end_dt_date = sanitize_date_range(start_dt, end_dt)
    if start_dt_date.year < 2008:
        raise ValueError("Year must be 2008 or later")
    if end_dt_date.year < 2008:
        raise ValueError("Year must be 2008 or later")
    # retrieve html from baseball reference
    soup = get_soup(start_dt_date, end_dt_date)
    table = get_table(soup)
    table = table.dropna(how='all') # drop if all columns are NA
    #fix some strange formatting for percentage columns
    table = table.replace('---%', np.nan)
    #make sure these are all numeric
    for column in ['Age', '#days', 'G', 'GS', 'W', 'L',

In [21]:

def bref_batter(date1,date2):
# Initialize a list to store all data
    all_data = []

# Loop over the years from the beginning year to the current year
    for year in range(date1, date2):  # Replace 2024 with the current year
        try:
            data = pyb.batting_stats_bref(year)
            print(f"Batter Data fetched for year: {year}")
            all_data.append(data)
        except Exception as e:
            print(f"An error occurred for year: {year}. Error: {str(e)}")
    

    time.sleep(4)
# Concatenate all dataframes in the list
    all_data_df = pd.concat(all_data)
    return all_data_df


def bref_pitcher(date1,date2):
# Initialize a list to store all data
    all_data = []

# Loop over the years from the beginning year to the current year
    for year in range(date1, date2):  # Replace 2024 with the current year
        try:
            data = pyb.pitching_stats_bref(year)
            print(f"Pitcher Data fetched for year: {year}")
            all_data.append(data)
        except Exception as e:
            print(f"An error occurred for year: {year}. Error: {str(e)}")
    

    time.sleep(4)
# Concatenate all dataframes in the list
    all_data_df = pd.concat(all_data)
    return all_data_df


def bref_fielder(date1,date2):
# Initialize a list to store all data
    all_data = []

# Loop over the years from the beginning year to the current year
    for year in range(date1, date2):  # Replace 2024 with the current year
        try:
            data = pyb.statcast_outs_above_average(year, "all", 0)
            print(f"Fielding Data fetched for year: {year}")
            all_data.append(data)
        except Exception as e:
            print(f"An error occurred for year: {year}. Error: {str(e)}")
    

    time.sleep(4)
# Concatenate all dataframes in the list
    all_data_df = pd.concat(all_data)
    return all_data_df

def per_game_data(date1,date2):
# Initialize a list to store all data
    all_data = []

# Loop over the years from the beginning year to the current year
    for year in range(date1, date2):  # Replace 2024 with the current year
        try:
            data = pyb.season_game_logs(year)
            print(f"Data fetched for year: {year}")
            all_data.append(data)
        except Exception as e:
            print(f"An error occurred for year: {year}. Error: {str(e)}")
# Concatenate all dataframes in the list
    all_data_df = pd.concat(all_data)
    return all_data_df


In [22]:
test_date='2022'
date1=2008
date2=2024
batter_data=bref_batter(date1,date2)

pitcher_data=bref_pitcher(date1,date2)

fielder_data=bref_fielder(date1,date2)
fielder_data = fielder_data.rename(columns={'player_id': 'mlbID'})

# import os
# os.environ['GH_TOKEN'] = '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'
# per_game_data=per_game_data(date1,date2)

per_game_path='./pybaseball/pybaseball/data/Lahman_MLB_per_game_data.csv'
per_game_data=pd.read_csv(per_game_path)

Batter Data fetched for year: 2008
Batter Data fetched for year: 2009
Batter Data fetched for year: 2010
Batter Data fetched for year: 2011
Batter Data fetched for year: 2012
Batter Data fetched for year: 2013
Batter Data fetched for year: 2014
Batter Data fetched for year: 2015
Batter Data fetched for year: 2016
Batter Data fetched for year: 2017
Batter Data fetched for year: 2018
Batter Data fetched for year: 2019
Batter Data fetched for year: 2020
Batter Data fetched for year: 2021
Batter Data fetched for year: 2022
Batter Data fetched for year: 2023


Pitcher Data fetched for year: 2008
Pitcher Data fetched for year: 2009
Pitcher Data fetched for year: 2010
Pitcher Data fetched for year: 2011
Pitcher Data fetched for year: 2012
Pitcher Data fetched for year: 2013
Pitcher Data fetched for year: 2014
Pitcher Data fetched for year: 2015
Pitcher Data fetched for year: 2016
Pitcher Data fetched for year: 2017
Pitcher Data fetched for year: 2018
Pitcher Data fetched for year: 2019
Pitcher Data fetched for year: 2020
Pitcher Data fetched for year: 2021
Pitcher Data fetched for year: 2022
Pitcher Data fetched for year: 2023
Fielding Data fetched for year: 2008
Fielding Data fetched for year: 2009
Fielding Data fetched for year: 2010
Fielding Data fetched for year: 2011
Fielding Data fetched for year: 2012
Fielding Data fetched for year: 2013
Fielding Data fetched for year: 2014
Fielding Data fetched for year: 2015
Fielding Data fetched for year: 2016
Fielding Data fetched for year: 2017
Fielding Data fetched for year: 2018
Fielding Data fet

  per_game_data=pd.read_csv(per_game_path)


In [46]:
# List of columns to keep
columns_to_keep = ['Game Index','Date', 'HmTm', 'VisTm', 'HmRuns', 'VisRuns'] + \
                 [f'VisBat{i}ID' for i in range(1, 10)] + \
                 [f'HmBat{i}ID' for i in range(1, 10)] + \
                 [f'VisBat{i}Pos' for i in range(1, 10)] + \
                 [f'HmBat{i}Pos' for i in range(1, 10)] + \
                 ['VisStPchID', 'HmStPchID']
                
                
                #  ['VisStPchNm', 'HmStPchNm']
                #  [f'VisBat{i}Nm' for i in range(1, 10)] + \
                #  [f'HmBat{i}Nm' for i in range(1, 10)] + \
# Drop other columns

formatted_per_game_data = per_game_data[columns_to_keep]
# Ensure the 'Date' column is in datetime format
formatted_per_game_data['Date'] = pd.to_datetime(formatted_per_game_data['Date'], format='%Y%m%d')

# Create a new column 'game_id' by combining 'Date', 'HmTm', and 'VisTm'
formatted_per_game_data['Game_Id'] = formatted_per_game_data['Date'].dt.strftime('%Y%m%d') + formatted_per_game_data['HmTm'] + formatted_per_game_data['VisTm']
# Set 'game_id' as the index
# Set 'Date' and 'game_id' as the index
formatted_per_game_data.set_index(['Game_Id'], inplace=True)
# all_player_names_per_game = pd.unique(pd.concat([formatted_per_game_data['VisStPchID'], formatted_per_game_data['HmStPchID']] + [formatted_per_game_data[f'VisBat{i}ID'] for i in range(1, 10)] + [formatted_per_game_data[f'HmBat{i}ID'] for i in range(1, 10)]))
by_year_game_data = formatted_per_game_data[formatted_per_game_data.index.get_level_values('Game_Id').str.contains(test_date)]

print(by_year_game_data)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  formatted_per_game_data['Date'] = pd.to_datetime(formatted_per_game_data['Date'], format='%Y%m%d')


                Game Index       Date HmTm VisTm  HmRuns  VisRuns VisBat1ID  \
Game_Id                                                                       
20220407ARISDN       51916 2022-04-07  ARI   SDN       4        2  nolaa002   
20220407ATLCIN       51917 2022-04-07  ATL   CIN       3        6  indij001   
20220407CHNMIL       51918 2022-04-07  CHN   MIL       5        4  wongk001   
20220407SLNPIT       51919 2022-04-07  SLN   PIT       9        0  voged001   
20220407WASNYN       51920 2022-04-07  WAS   NYN       1        5  marts002   
...                    ...        ...  ...   ...     ...      ...       ...   
20221005CLEKCA       54341 2022-10-05  CLE   KCA       9        2  melem001   
20221005HOUPHI       54342 2022-10-05  HOU   PHI       3        2  schwk001   
20221005OAKANA       54343 2022-10-05  OAK   ANA       3        2  rengl001   
20221005SEADET       54344 2022-10-05  SEA   DET       5        4  badda001   
20221005TEXNYA       54345 2022-10-05  TEX   NYA    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  formatted_per_game_data['Game_Id'] = formatted_per_game_data['Date'].dt.strftime('%Y%m%d') + formatted_per_game_data['HmTm'] + formatted_per_game_data['VisTm']


In [24]:
print(formatted_per_game_data.index.unique())

Index(['20000329NYNCHN', '20000330CHNNYN', '20000403ATLCOL', '20000403CINMIL',
       '20000403FLOSFN', '20000403MONLAN', '20000403NYNSDN', '20000403SLNCHN',
       '20000403ANANYA', '20000403BALCLE',
       ...
       '20231001PITMIA', '20231001SFNLAN', '20231001SLNCIN', '20231001ANAOAK',
       '20231001BALBOS', '20231001CHASDN', '20231001DETCLE', '20231001KCANYA',
       '20231001SEATEX', '20231001TORTBA'],
      dtype='object', name='Game_Id', length=56044)


In [25]:
player_names = pyb.playerid_reverse_lookup(by_year_game_data, key_type='retro')
print(player_names)

Empty DataFrame
Columns: [name_last, name_first, key_mlbam, key_retro, key_bbref, key_fangraphs, mlb_played_first, mlb_played_last]
Index: []


In [26]:
fielder_mlbids = set(fielder_data['mlbID'])
pitcher_mlbids = set(pitcher_data['mlbID'])
batter_mlbids = set(batter_data['mlbID'])

print(f"Fielder mlbids: {fielder_mlbids}")
print(f"Pitcher mlbids: {pitcher_mlbids}")
print(f"Batter mlbids: {batter_mlbids}")

common_mlbids = fielder_mlbids.intersection(pitcher_mlbids, batter_mlbids)
print(f"Common mlbids: {common_mlbids}")



Fielder mlbids: {606213, 663586, 475174, 466988, 663609, 663611, 663616, 663624, 622666, 622668, 663630, 450641, 622682, 606299, 663647, 663656, 475243, 663662, 475247, 467055, 475253, 622713, 606336, 647304, 663697, 491676, 458913, 622761, 491696, 663731, 680118, 647351, 573627, 663757, 663796, 663799, 598265, 606466, 598284, 663837, 663845, 663853, 516416, 663886, 663897, 663898, 663905, 663911, 516472, 434567, 500135, 434604, 663993, 475582, 664011, 434636, 664023, 664029, 434658, 664034, 664040, 664041, 434670, 500208, 664056, 664057, 664058, 664059, 664068, 451089, 672279, 672284, 664119, 623180, 623182, 434778, 672356, 623205, 664167, 680552, 623214, 451192, 680574, 516770, 459431, 516782, 664238, 664247, 516809, 623323, 672478, 606956, 664314, 680700, 672515, 680716, 664334, 606992, 606993, 656180, 680757, 656185, 607043, 672580, 680776, 582473, 680779, 607054, 467793, 516949, 680814, 467827, 435062, 435063, 656248, 656252, 672640, 435079, 607111, 623507, 623508, 623515, 623520,

In [27]:
print(formatted_per_game_data.head(1))

                Game Index       Date HmTm VisTm VisBat1ID VisBat2ID  \
Game_Id                                                                
20000329NYNCHN           1 2000-03-29  NYN   CHN  youne001  bufod001   

               VisBat3ID VisBat4ID VisBat5ID VisBat6ID  ... HmBat2Pos  \
Game_Id                                                 ...             
20000329NYNCHN  gracm001  sosas001  rodrh001  andrs001  ...         8   

               HmBat3Pos HmBat4Pos HmBat5Pos HmBat6Pos HmBat7Pos HmBat8Pos  \
Game_Id                                                                      
20000329NYNCHN         4         2         5         9         3         6   

               HmBat9Pos VisStPchID HmStPchID  
Game_Id                                        
20000329NYNCHN         1   liebj001  hampm001  

[1 rows x 42 columns]


In [28]:
print(fielder_data)
print(pitcher_data)
print(batter_data)

    last_name, first_name   mlbID display_team_name  year  \
0             Abreu, José  547989         White Sox  2016   
1          Ackley, Dustin  554429           Yankees  2016   
2       Adames, Cristhian  542436           Rockies  2016   
3             Adams, Matt  571431         Cardinals  2016   
4         Adrianza, Ehire  501303            Giants  2016   
..                    ...     ...               ...   ...   
553     Yelich, Christian  592885           Brewers  2023   
554           Yepez, Juan  660766         Cardinals  2023   
555     Yoshida, Masataka  807799           Red Sox  2023   
556          Young, Jacob  696285         Nationals  2023   
557          Young, Jared  676724              Cubs  2023   

    primary_pos_formatted fielding_runs_prevented outs_above_average  \
0                      1B                      -4                 -6   
1                      1B                       2                  3   
2                      SS                       2  

In [37]:
# Concatenate the dataframes
all_data = pd.concat([batter_data, pitcher_data, fielder_data], axis=0)

# Create an instance of SimpleImputer
imputer = SimpleImputer(strategy='constant', fill_value=0)

# Fit the imputer on the data
imputer.fit(all_data)

# Transform the data
imputed_data = imputer.transform(all_data)

# Convert the result back to a DataFrame
ml_data = pd.DataFrame(imputed_data, columns=all_data.columns)
ml_data.set_index('mlbID', inplace=True)  

# Create an instance of LabelEncoder
le = LabelEncoder()
from sklearn.preprocessing import LabelEncoder

# Create an instance of LabelEncoder
le = LabelEncoder()

# Iterate over all the values of each column and extract their dtypes
for col in ml_data.columns:
    # Compare if the dtype is object
    if ml_data[col].dtype=='object':
    # Use LabelEncoder to do the numeric transformation
        ml_data[col] = ml_data[col].astype(str)
        ml_data[col]=le.fit_transform(ml_data[col])

In [42]:
print(ml_data.head(1))

        Name  Age  #days  Lev   Tm  G  PA  AB  R  H  ...  outs_above_average  \
mlbID                                                ...                       
430911  1305    8   2338    1  109  1   2   2  1  1  ...                  24   

        outs_above_average_infront  outs_above_average_lateral_toward3bline  \
mlbID                                                                         
430911                          13                                       13   

        outs_above_average_lateral_toward1bline  outs_above_average_behind  \
mlbID                                                                        
430911                                       12                          5   

        outs_above_average_rhh  outs_above_average_lhh  \
mlbID                                                    
430911                      18                      14   

        actual_success_rate_formatted  adj_estimated_success_rate_formatted  \
mlbID                           

In [30]:

specific_player=ml_data.loc[430911]


In [31]:
print(ml_data)

                      Name   Age   #days     Lev         Tm      G     PA  \
mlbID                                                                       
430911       David Aardsma  26.0  5661.0  Maj-AL     Boston    1.0    1.0   
430631  Reggie Abercrombie  27.0  5569.0  Maj-NL    Houston   32.0   60.0   
110029         Bobby Abreu  34.0  5569.0  Maj-AL   New York  156.0  684.0   
407924        Manny Acosta  27.0  5653.0  Maj-NL    Atlanta    4.0    5.0   
430606          Mike Adams  29.0  5598.0  Maj-NL  San Diego    2.0    2.0   
...                    ...   ...     ...     ...        ...    ...    ...   
592885                   0     0       0       0          0      0      0   
660766                   0     0       0       0          0      0      0   
807799                   0     0       0       0          0      0      0   
696285                   0     0       0       0          0      0      0   
676724                   0     0       0       0          0      0      0   

In [32]:
draftResults = pyb.amateur_draft(2021, 1, True)


In [33]:
player_names = pyb.playerid_reverse_lookup(chad['key_mlbam'], key_type='mlbam')

In [34]:
"""
# game_id = '20000329NYNCHN'  # replace with your specific game_id
# game_data = formatted_per_game_data.loc[game_id]
# print(game_data)

########
# import pkg_resources

# def get_version(package_name):
#     try:
#         return pkg_resources.get_distribution(package_name).version
#     except pkg_resources.DistributionNotFound:
#         return "Package not found"

# print(get_version("pybaseball"))

"""

'\n# game_id = \'20000329NYNCHN\'  # replace with your specific game_id\n# game_data = formatted_per_game_data.loc[game_id]\n# print(game_data)\n\n########\n# import pkg_resources\n\n# def get_version(package_name):\n#     try:\n#         return pkg_resources.get_distribution(package_name).version\n#     except pkg_resources.DistributionNotFound:\n#         return "Package not found"\n\n# print(get_version("pybaseball"))\n\n'