In [72]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from pandas.plotting import scatter_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

from sklearn.model_selection import GridSearchCV

DATA_FOLDER = "../../../../Documents/MLBData/"
print(DATA_FOLDER)

def load_batting_data(fileName,data_path = DATA_FOLDER):
    csv_path = os.path.join(data_path,fileName)
    return pd.read_csv(csv_path)

# advanced, standard = pd.DataFrame(), pd.DataFrame()
total_stats = pd.DataFrame()
years = ['2015','2016','2017','2018','2019','2020','2021']


for year in years:
    # For each year I want to get the advanced and standard data
    advancedData = load_batting_data(f"{year}-BattingAdvanced(CSV).csv")
    standardData = load_batting_data(f"{year}-BattingStandard(CSV).csv")
    
    # Clean up the data
    advancedData = advancedData[advancedData['PA'] >=50]
    percentValues = ['HR%', 'SO%','BB%','HardH%', 'LD%', 'GB%','FB%', 'Pull%', 'Cent%', 'Oppo%', 'RS%', 'SB%', 'XBT%']
    for value in percentValues:
        advancedData[value] = advancedData[value].apply(lambda x: float(x.strip('%')) if isinstance(x, str) else x)
#     advancedData.drop(columns=['Tm', 'cWPA'], inplace = True)
        
    standardData = standardData[standardData['PA'] >= 50]
    standardData.drop(columns=['Pos Summary', 'Tm', 'Lg', 'Name', 'Age', 'PA', 'BA'], inplace=True)
    
    # combine the data together
    year_stats = pd.merge(advancedData, standardData, on='Name-additional', how='outer')
    year_stats.drop(columns=['Tm', 'cWPA'], inplace= True)
    
    # concat the data the our total data pd
    total_stats = pd.concat([total_stats, year_stats])
    
    
    
#     advanced = pd.concat([advanced, advancedData])
#     standard = pd.concat([standard, standardData])
#     advanced = advanced.append(advancedData)
#     standard = standard.append(standardData)

    

../../../../Documents/MLBData/


In [73]:
# percentValues = ['HR%', 'SO%','BB%','HardH%', 'LD%', 'GB%','FB%', 'Pull%', 'Cent%', 'Oppo%', 'RS%', 'SB%', 'XBT%']
# for value in percentValues:
#     advanced[value] = advanced[value].apply(lambda x: float(x.strip('%')) if isinstance(x, str) else x)

In [74]:
# total_stats = pd.merge(advanced, standard, on='Name-additional', how='outer')
total_stats.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3728 entries, 0 to 556
Data columns (total 47 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Name             3727 non-null   object 
 1   Age              3721 non-null   float64
 2   PA               3727 non-null   float64
 3   rOBA             3727 non-null   float64
 4   Rbat+            3727 non-null   float64
 5   BAbip            3727 non-null   float64
 6   ISO              3727 non-null   float64
 7   HR%              3727 non-null   float64
 8   SO%              3727 non-null   float64
 9   BB%              3727 non-null   float64
 10  EV               3727 non-null   float64
 11  HardH%           3727 non-null   float64
 12  LD%              3727 non-null   float64
 13  GB%              3727 non-null   float64
 14  FB%              3727 non-null   float64
 15  GB/FB            3727 non-null   float64
 16  Pull%            3727 non-null   float64
 17  Cent%          

In [75]:
league_avg = total_stats.iloc[-1]

train_set, test_set = train_test_split(total_stats, test_size=0.2, random_state=42) # Split into test and train

In [76]:
training_stats = train_set.drop(['H', 'R', 'HR', 'RBI','SB','BB', 'IBB','HBP','OPS'], axis=1) # Remove label from data set
training_labels = train_set[['H', 'R', 'HR', 'RBI','SB','BB', 'IBB','HBP','OPS']].copy() # Copy the label values
training_stats.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2982 entries, 195 to 3
Data columns (total 38 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Name             2982 non-null   object 
 1   Age              2977 non-null   float64
 2   PA               2982 non-null   float64
 3   rOBA             2982 non-null   float64
 4   Rbat+            2982 non-null   float64
 5   BAbip            2982 non-null   float64
 6   ISO              2982 non-null   float64
 7   HR%              2982 non-null   float64
 8   SO%              2982 non-null   float64
 9   BB%              2982 non-null   float64
 10  EV               2982 non-null   float64
 11  HardH%           2982 non-null   float64
 12  LD%              2982 non-null   float64
 13  GB%              2982 non-null   float64
 14  FB%              2982 non-null   float64
 15  GB/FB            2982 non-null   float64
 16  Pull%            2982 non-null   float64
 17  Cent%          

In [77]:
# Starting Data Cleaning:
training_stats['XBT%'].fillna(0,inplace=True)
training_stats['SB%'].fillna(0, inplace=True)

In [78]:
# fitting missing Data
imputer = SimpleImputer(strategy='median')
batting_num = training_stats.drop(['Name-additional', 'Name'], axis=1) #need to only have numeric values for fitting
imputer.fit(batting_num)
X = imputer.transform(batting_num)
batting_tr = pd.DataFrame(X, columns=batting_num.columns, index=batting_num.index)

In [79]:
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler',StandardScaler())
])

In [80]:
num_attribs = list(batting_num)

full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attribs)
])

batting_prepared = full_pipeline.fit_transform(batting_tr)

In [81]:
#### Selecting and Training a Model

In [82]:
## Linear Model
lin_reg = LinearRegression()
multioutput_reg = MultiOutputRegressor(lin_reg)
multioutput_reg.fit(batting_prepared, training_labels)

In [83]:
# Testing section used to see if our model is working

In [84]:
stat_predictions = multioutput_reg.predict(batting_prepared)

# for i, column in enumerate(training_labels.columns):
#     scores = cross_val_score(multioutput_reg, batting_prepared, training_labels[column].values.reshape(-1, 1), scoring='neg_mean_squared_error', cv=10)
#     tree_rmse_scores = np.sqrt(-scores)
#     print(f"For {column}:\nScores: {scores}, Mean: {scores.mean()}, Standard Deviation: {scores.std()}")
#     mse = mean_squared_error(training_labels[column], stat_predictions[:, i])
#     rmse = np.sqrt(mse)
#     print(f"RMSE for {column}: {rmse:.2f}")
#     RMSE for H: 3.27
#     RMSE for R: 4.64
#     RMSE for HR: 1.09
#     RMSE for RBI: 4.83
#     RMSE for SB: 3.13
#     RMSE for BB: 1.83
#     RMSE for IBB: 2.02
#     RMSE for HBP: 1.82
#     RMSE for OPS: 0.00

In [85]:
## Decision Tree
# tree_reg = DecisionTreeRegressor()
# tree_reg.fit(batting_prepared, training_labels)

# stat_predictions_tree = tree_reg.predict(batting_prepared)


# for i, column in enumerate(training_labels.columns):
#     scores = cross_val_score(tree_reg, batting_prepared, training_labels[column], scoring='neg_mean_squared_error', cv=10)
#     tree_rmse_scores = np.sqrt(-scores)
#     print(f"For {column}:\nScores: {scores}, Mean: {scores.mean()}, Standard Deviation: {scores.std()}")

    

In [86]:
## Random Forest
forest_reg = RandomForestRegressor()
forest_reg.fit(batting_prepared, training_labels)

stat_predictions_forest = forest_reg.predict(batting_prepared)
for i, column in enumerate(training_labels.columns):
#     scores = cross_val_score(forest_reg, batting_prepared, training_labels[column], scoring='neg_mean_squared_error', cv=10)
#     tree_rmse_scores = np.sqrt(-scores)
#     print(f"For {column}:\nScores: {scores}, Mean: {scores.mean()}, Standard Deviation: {scores.std()}")
    mse = mean_squared_error(training_labels[column], stat_predictions_forest[:, i])
    rmse = np.sqrt(mse)
    print(f"RMSE for {column}: {rmse:.2f}")
    #     RMSE for H: 2.55
    # RMSE for R: 2.29
    # RMSE for HR: 1.04
    # RMSE for RBI: 2.52
    # RMSE for SB: 1.99
    # RMSE for BB: 2.52
    # RMSE for IBB: 0.91
    # RMSE for HBP: 1.18
    # RMSE for OPS: 0.02

    
    

RMSE for H: 1.79
RMSE for R: 1.73
RMSE for HR: 0.66
RMSE for RBI: 2.44
RMSE for SB: 1.79
RMSE for BB: 1.40
RMSE for IBB: 0.71
RMSE for HBP: 1.16
RMSE for OPS: 0.02


In [87]:
# Fine Tuning the models

In [88]:
param_grid = [
    {'n_estimators': [100], 'max_features': [16]},
#     {'bootstrap':[False], 'n_estimators': [3, 10], 'max_features': [2,3,4]}
]

grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='neg_mean_squared_error', return_train_score=True)

grid_search.fit(batting_prepared, training_labels)

grid_search.best_params_

{'max_features': 16, 'n_estimators': 100}

In [89]:
grid_search.best_estimator_

In [90]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres['mean_test_score'], cvres['params']):
    print(np.sqrt(-mean_score), params)

4.061392121672558 {'max_features': 16, 'n_estimators': 100}


In [91]:
##Final Model testing
final_model = grid_search.best_estimator_

testing_stats = test_set.drop(['H', 'R', 'HR', 'RBI','SB','BB', 'IBB','HBP','OPS'], axis=1) # Remove label from data set
testing_labels = test_set[['H', 'R', 'HR', 'RBI','SB','BB', 'IBB','HBP','OPS']].copy() # Copy the label values

X_test_prepared = full_pipeline.transform(testing_stats)

final_predictions = final_model.predict(X_test_prepared)
for i, column in enumerate(testing_labels.columns):
    final_mse = mean_squared_error(testing_labels[column], final_predictions[:,i])
    final_rmse = np.sqrt(final_mse)
    print(f"RMSE for {column}: {final_rmse:.2f}")

# RMSE for H: 4.54
# RMSE for R: 5.01
# RMSE for HR: 1.85
# RMSE for RBI: 6.30
# RMSE for SB: 4.99
# RMSE for BB: 4.03
# RMSE for IBB: 1.83
# RMSE for HBP: 2.97
# RMSE for OPS: 0.03


RMSE for H: 4.54
RMSE for R: 5.01
RMSE for HR: 1.85
RMSE for RBI: 6.30
RMSE for SB: 4.99
RMSE for BB: 4.03
RMSE for IBB: 1.83
RMSE for HBP: 2.97
RMSE for OPS: 0.03
