In [35]:
import pybaseball as pyb
import pandas as pd
import time
import inspect
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch import optim
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
import os
from featureranker.utils import *
from featureranker.plots import *
from featureranker.rankers import *
import glob
import numpy as np
from tqdm.auto import tqdm
import pickle
from sklearn import svm
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, ConfusionMatrixDisplay
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
import warnings
from sklearn.model_selection import KFold
warnings.filterwarnings('ignore')

pyb.cache.enable()
pyb.cache.config.cache_type='csv'
pyb.cache.config.save()

# Imports

# Functions

# Importing Data

In [36]:

def bref_batter(date1,date2):
# Initialize a list to store all data
    all_data = []

# Loop over the years from the beginning year to the current year
    for year in range(date1, date2):  # Replace 2024 with the current year
        try:
            data = pyb.batting_stats_bref(year)
            print(f"Batter Data fetched for year: {year}")
            all_data.append(data)
        except Exception as e:
            print(f"An error occurred for year: {year}. Error: {str(e)}")
    

    time.sleep(4)
# Concatenate all dataframes in the list
    all_data_df = pd.concat(all_data)
    return all_data_df



def bref_pitcher(date1,date2):
# Initialize a list to store all data
    all_data = []

# Loop over the years from the beginning year to the current year
    for year in range(date1, date2):  # Replace 2024 with the current year
        try:
            data = pyb.pitching_stats_bref(year)
            print(f"Pitcher Data fetched for year: {year}")
            all_data.append(data)
        except Exception as e:
            print(f"An error occurred for year: {year}. Error: {str(e)}")
    

    time.sleep(4)
# Concatenate all dataframes in the list
    all_data_df = pd.concat(all_data)
    return all_data_df



def bref_fielder(date1,date2):
# Initialize a list to store all data
    all_data = []

# Loop over the years from the beginning year to the current year
    for year in range(date1, date2):  # Replace 2024 with the current year
        try:
            data = pyb.statcast_outs_above_average(year, "all", 0)
            print(f"Fielding Data fetched for year: {year}")
            all_data.append(data)
        except Exception as e:
            print(f"An error occurred for year: {year}. Error: {str(e)}")
    

    time.sleep(4)
# Concatenate all dataframes in the list
    all_data_df = pd.concat(all_data)
    return all_data_df



def per_game_data(date1,date2):
# Initialize a list to store all data
    all_data = []

# Loop over the years from the beginning year to the current year
    for year in range(date1, date2):  # Replace 2024 with the current year
        try:
            data = pyb.season_game_logs(year)
            print(f"Data fetched for year: {year}")
            all_data.append(data)
        except Exception as e:
            print(f"An error occurred for year: {year}. Error: {str(e)}")
# Concatenate all dataframes in the list
    all_data_df = pd.concat(all_data)
    return all_data_df




def Main_Scrape(date1,date2):

    batter_data=bref_batter(date1,date2)

    pitcher_data=bref_pitcher(date1,date2)

    fielder_data=bref_fielder(date1,date2)
    batter_data.to_csv('./Lahman_compiled_player_data/batter_data.csv', index=False)
    pitcher_data.to_csv('./Lahman_compiled_player_data/pitcher_data.csv', index=False)
    fielder_data.to_csv('./Lahman_compiled_player_data/fielder_data.csv', index=False)
    return batter_data,fielder_data,pitcher_data



def Main_Import(per_game_path,batter_path,pitcher_path,fielder_path):
    # Save the dataframes to csv files
    per_game_data=pd.read_csv(per_game_path, header=0)
    batter_data=pd.read_csv(batter_path, header=0)
    pitcher_data=pd.read_csv(pitcher_path, header=0)
    fielder_data=pd.read_csv(fielder_path, header=0)
    return batter_data,pitcher_data,fielder_data, per_game_data

# Formatting player data

In [37]:
#Remove columns with an NaN above a certain level
def remove_columns_with_nan(df, NaN_cutoff_percentage):
    NaN_cutoff = NaN_cutoff_percentage / 100.0
    return df.loc[:, df.isnull().mean() < NaN_cutoff]
from sklearn.preprocessing import LabelEncoder

def label_encode(df):
    le = LabelEncoder()
    for col in df.columns:
        if df[col].dtype == 'object':
            df[col] = le.fit_transform(df[col])
    return df
    

In [50]:
#Training and testing years for the machine learning model
test_year = '2022'
train_year = [str(year) for year in range(int(test_year) - 6, int(test_year))]


#Use this to update data from the websites (has data starting from 2008)
date1=2008
date2=2024

#Do not use this 
# batter_data,pitcher_data,fielder_data=Main_Scrape(2008,2024)
fielder_data=bref_fielder(date1,date2)
batter_bwar_data = pyb.bwar_bat()
pitcher_bwar_data = pyb.bwar_pitch()

# get the register data and save to disk
chad = pyb.chadwick_register(save=True)

#After scraping the files are saved locally. Also, the per_game_data is found manually and imported
per_game_path='./pybaseball/pybaseball/data/Lahman_MLB_per_game_data.csv'
batter_path='./Lahman_compiled_player_data/batter_data.csv'
pitcher_path='./Lahman_compiled_player_data/pitcher_data.csv'
fielder_path='./Lahman_compiled_player_data/fielder_data.csv'
_,_,fielder_data, per_game_data = Main_Import(per_game_path,batter_path,pitcher_path,fielder_path)

# Use the function to remove columns with more than 40% NaN values
# batter_data = remove_columns_with_nan(batter_data, 40)
# pitcher_data = remove_columns_with_nan(pitcher_data, 40)
# fielder_data = remove_columns_with_nan(fielder_data, 40)
batter_bwar_data
fielder_data
pitcher_bwar_data

Fielding Data fetched for year: 2008
Fielding Data fetched for year: 2009
Fielding Data fetched for year: 2010
Fielding Data fetched for year: 2011
Fielding Data fetched for year: 2012
Fielding Data fetched for year: 2013
Fielding Data fetched for year: 2014
Fielding Data fetched for year: 2015
Fielding Data fetched for year: 2016
Fielding Data fetched for year: 2017
Fielding Data fetched for year: 2018
Fielding Data fetched for year: 2019
Fielding Data fetched for year: 2020
Fielding Data fetched for year: 2021
Fielding Data fetched for year: 2022
Fielding Data fetched for year: 2023


Unnamed: 0,name_common,mlb_ID,player_ID,year_ID,team_ID,stint_ID,lg_ID,G,GS,RA,xRA,BIP,BIP_perc,salary,ERA_plus,WAR_rep,WAA,WAA_adj,WAR
0,George Bechtel,110756.0,bechtge01,1871,ATH,1,,3,3,42,29.183,134.0,0.1176,,52.682609,0.2455,-0.5940,-0.0153,-0.36
1,Asa Brainard,111373.0,brainas01,1871,OLY,1,,30,30,292,310.377,1237.0,0.9486,,93.906818,2.4889,-0.1470,-0.1556,2.19
2,Bob Ferguson,114069.0,fergubo01,1871,NYU,1,,1,0,9,1.185,14.0,0.0104,,19.733333,0.0102,-0.3177,-0.0006,-0.31
3,Cherokee Fisher,114181.0,fishech01,1871,ROK,1,,24,24,257,250.929,1031.0,0.9398,,97.129126,2.0067,0.2352,-0.1255,2.12
4,Frank Fleet,114224.0,fleetfr01,1871,NYU,1,,1,1,21,10.665,54.0,0.0400,,39.680000,0.0847,-0.3863,-0.0053,-0.31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54847,Alex Young,622065.0,youngal01,2023,CIN,1,NL,63,0,27,28.041,165.0,0.0390,1150000.0,118.926087,0.5202,0.3402,-0.1128,0.75
54848,Danny Young,664849.0,youngda02,2023,ATL,1,NL,8,0,1,4.550,19.0,0.0047,,426.900000,0.0810,0.3408,-0.1144,0.31
54849,Rob Zastryzny,642239.0,zastrro01,2023,PIT,1,NL,21,1,15,10.558,69.0,0.0165,750000.0,93.963636,0.2002,-0.4998,0.0269,-0.27
54850,Seby Zavala,664874.0,zavalse01,2023,ARI,2,NL,1,0,0,0.363,2.0,0.0005,,,0.0065,0.0302,-0.0170,0.02


In [40]:
test_year=2022
train_year=years = [str(year) for year in range(test_year-6, test_year)]
test_year=str(test_year)


date1=2008
date2=2024
batter_data=bref_batter(date1,date2)

pitcher_data=bref_pitcher(date1,date2)

fielder_data=bref_fielder(date1,date2)
fielder_data.drop(columns=['actual_success_rate_formatted', 'adj_estimated_success_rate_formatted', 'diff_success_rate_formatted'], inplace=True)
columns_to_convert = ['fielding_runs_prevented', 'outs_above_average', 'outs_above_average_infront', 
                      'outs_above_average_lateral_toward3bline', 'outs_above_average_lateral_toward1bline', 
                      'outs_above_average_behind', 'outs_above_average_rhh', 'outs_above_average_lhh']
position_dict = {
    'P': 1,
    'C': 2,
    '1B': 3,
    '2B': 4,
    '3B': 5,
    'SS': 6,
    'LF': 7,
    'CF': 8,
    'RF': 9
}


fielder_data['primary_pos_formatted'] = fielder_data['primary_pos_formatted'].replace(position_dict)

# fielder_data['mlbID'] = pd.to_numeric(fielder_data['mlbID'], errors='coerce')
# Replace 'nan' with np.nan


# Ensure the directory exists

directory = './Lahman_compiled_player_data'
if not os.path.exists(directory):
    os.makedirs(directory)

# Save the dataframes to csv files
batter_data.to_csv('./Lahman_compiled_player_data/batter_data.csv', index=False)
pitcher_data.to_csv('./Lahman_compiled_player_data/pitcher_data.csv', index=False)
fielder_data.to_csv('./Lahman_compiled_player_data/fielder_data.csv', index=False)

# import os
# os.environ['GH_TOKEN'] = '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'
# per_game_data=per_game_data(date1,date2)

per_game_path='





SyntaxError: EOL while scanning string literal (492782921.py, line 51)

In [None]:
def Impute(df, method):
    # Create an imputer instance
    imputer = SimpleImputer(strategy=method, fill_value=0)
    # Fit and transform all columns
    df[:] = imputer.fit_transform(df)
    return df

In [None]:
view_data(per_game_data)

The column Completion has 99.9% NaN values.
The column Forfeit has 100.0% NaN values.
The column Protest has 100.0% NaN values.
The column Attendance has 1.6% NaN values.
The column Ump2BID has 0.2% NaN values.
The column UmpLFID has 100.0% NaN values.
The column UmpRFID has 100.0% NaN values.
The column SavePID has 49.7% NaN values.
The column GWinRBIID has 4.7% NaN values.
The column Additional has 99.4% NaN values.
