In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from pandas.plotting import scatter_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

from sklearn.model_selection import GridSearchCV

# Folder holding the batting CSV exports, relative to the notebook's working directory.
DATA_FOLDER = "../../../../Documents/MLBData/"
print(DATA_FOLDER)


def load_batting_data(fileName, data_path=DATA_FOLDER):
    """Read one CSV file from the data folder into a DataFrame.

    Parameters
    ----------
    fileName : str
        Name of the CSV file, joined onto ``data_path``.
    data_path : str, optional
        Directory containing the file. Defaults to ``DATA_FOLDER``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file, using ``pd.read_csv`` defaults.
    """
    return pd.read_csv(os.path.join(data_path, fileName))


# Advanced batting table: keep rows with at least 50 plate appearances, then
# discard the 'Tm' and 'cWPA' columns.
advanced_raw = load_batting_data('2018-BattingAdvanced(CSV).csv')
advanced = advanced_raw.loc[advanced_raw['PA'] >= 50].drop(columns=['Tm', 'cWPA'])

# Standard batting table: same 50-PA threshold. The columns dropped here are
# removed before the later merge on 'Name-additional'.
# NOTE(review): presumably 'Name', 'Age' and 'PA' are dropped because the
# advanced table already carries them -- confirm.
standard_raw = load_batting_data('2018-BattingStandard(CSV).csv')
standard = standard_raw.loc[standard_raw['PA'] >= 50].drop(
    columns=['Pos Summary', 'Tm', 'Lg', 'Name', 'Age', 'PA']
)


../../../../Documents/MLBData/


In [2]:
# Columns of the advanced table that hold percentages.
percentValues = [
    'HR%', 'SO%', 'BB%', 'HardH%', 'LD%', 'GB%', 'FB%',
    'Pull%', 'Cent%', 'Oppo%', 'RS%', 'SB%', 'XBT%',
]


def _percent_to_float(cell):
    """Turn a string such as '12.5%' into 12.5; return non-string cells unchanged."""
    if isinstance(cell, str):
        return float(cell.strip('%'))
    return cell


# Convert each percentage column element by element; non-string cells pass
# through untouched.
for value in percentValues:
    advanced[value] = advanced[value].apply(_percent_to_float)

In [3]:
# Combine the advanced and standard tables on the shared 'Name-additional' key.
# The outer join keeps rows that appear in only one of the two tables.
# NOTE(review): if 'Name-additional' is not unique within either table, this
# join multiplies the matching rows -- confirm the key is unique.
total_stats = advanced.merge(standard, how='outer', on='Name-additional')

In [4]:
# Keep a reference to the last row of the merged table.
# NOTE(review): presumably this is a league-average summary row. It is not
# removed from total_stats, so it also takes part in the split below --
# confirm that is intended.
league_avg = total_stats.iloc[-1]

# Hold out 20% of the rows as the test set; the fixed seed keeps the split
# the same across runs.
train_set, test_set = train_test_split(
    total_stats,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Columns the models are asked to predict.
# NOTE(review): 'OPS' is a label here; if the remaining feature columns include
# its components, the features would leak the target -- confirm.
label_columns = ['H', 'R', 'HR', 'RBI', 'SB', 'BB', 'IBB', 'HBP', 'OPS']

# Labels: an independent copy of the target columns.
training_labels = train_set[label_columns].copy()

# Features: every remaining column of the training rows.
training_stats = train_set.drop(columns=label_columns)
training_stats.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 440 entries, 388 to 102
Data columns (total 39 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Name             440 non-null    object 
 1   Age              439 non-null    float64
 2   PA               440 non-null    int64  
 3   rOBA             440 non-null    float64
 4   Rbat+            440 non-null    float64
 5   BAbip            440 non-null    float64
 6   ISO              440 non-null    float64
 7   HR%              440 non-null    float64
 8   SO%              440 non-null    float64
 9   BB%              440 non-null    float64
 10  EV               440 non-null    float64
 11  HardH%           440 non-null    float64
 12  LD%              440 non-null    float64
 13  GB%              440 non-null    float64
 14  FB%              440 non-null    float64
 15  GB/FB            440 non-null    float64
 16  Pull%            440 non-null    float64
 17  Cent%         

In [6]:
# Starting Data Cleaning:
# Replace missing XBT% and SB% values with 0.
# The fill is written back through an explicit column assignment. Calling
# `.fillna(..., inplace=True)` on a selected column operates on an intermediate
# Series; recent pandas versions warn about this, and under Copy-on-Write the
# parent frame is left unchanged.
zero_fill_columns = ['XBT%', 'SB%']
training_stats[zero_fill_columns] = training_stats[zero_fill_columns].fillna(0)

In [7]:
# Fill the remaining gaps in the numeric features with each column's median.
# The two name columns are left out of the frame passed to the imputer.
batting_num = training_stats.drop(columns=['Name-additional', 'Name'])

imputer = SimpleImputer(strategy='median')
X = imputer.fit_transform(batting_num)  # learn the medians and apply them in one step

# Wrap the imputed values in a DataFrame carrying the original row index and
# column labels.
batting_tr = pd.DataFrame(X, index=batting_num.index, columns=batting_num.columns)

In [8]:
# Numeric preprocessing: median imputation followed by standardisation.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler()),
]
num_pipeline = Pipeline(steps=numeric_steps)

In [9]:
# Every column of the numeric feature frame goes through the numeric pipeline.
num_attribs = batting_num.columns.tolist()

full_pipeline = ColumnTransformer(
    transformers=[('num', num_pipeline, num_attribs)],
)

# Fit the preprocessing on the training features and transform them.
# Note that the input is the already-imputed frame, so the pipeline's own
# imputer sees no missing values here.
batting_prepared = full_pipeline.fit_transform(batting_tr)

In [10]:
#### Selecting and Training a Model

In [11]:
## Linear Model
# One LinearRegression is fitted per target column through MultiOutputRegressor.
lin_reg = LinearRegression()
multioutput_reg = MultiOutputRegressor(estimator=lin_reg).fit(
    batting_prepared,
    training_labels,
)
multioutput_reg  # show the fitted estimator as the cell output

In [12]:
# Testing section used to see if our model is working

In [13]:
# Predict every target column for the training rows. These are in-sample
# predictions: batting_prepared is the same data the model was fitted on.
stat_predictions = multioutput_reg.predict(batting_prepared)

# Disabled evaluation loop, kept for reference: per-target 10-fold
# cross-validation plus training-set RMSE.
# NOTE(review): the loop prints `scores` (negative MSE values) instead of the
# `tree_rmse_scores` it computes, and that name is a leftover -- the estimator
# scored here is the linear model, not a tree.
# for i, column in enumerate(training_labels.columns):
#     scores = cross_val_score(multioutput_reg, batting_prepared, training_labels[column].values.reshape(-1, 1), scoring='neg_mean_squared_error', cv=10)
#     tree_rmse_scores = np.sqrt(-scores)
#     print(f"For {column}:\nScores: {scores}, Mean: {scores.mean()}, Standard Deviation: {scores.std()}")
#     mse = mean_squared_error(training_labels[column], stat_predictions[:, i])
#     rmse = np.sqrt(mse)
#     print(f"RMSE for {column}: {rmse:.2f}")
#
# Presumably the output of an earlier run of the loop above (training-set RMSE
# per target):
#     RMSE for H: 3.27
#     RMSE for R: 4.64
#     RMSE for HR: 1.09
#     RMSE for RBI: 4.83
#     RMSE for SB: 3.13
#     RMSE for BB: 1.83
#     RMSE for IBB: 2.02
#     RMSE for HBP: 1.82
#     RMSE for OPS: 0.00

In [14]:
## Decision Tree
# tree_reg = DecisionTreeRegressor()
# tree_reg.fit(batting_prepared, training_labels)

# stat_predictions_tree = tree_reg.predict(batting_prepared)


# for i, column in enumerate(training_labels.columns):
#     scores = cross_val_score(tree_reg, batting_prepared, training_labels[column], scoring='neg_mean_squared_error', cv=10)
#     tree_rmse_scores = np.sqrt(-scores)
#     print(f"For {column}:\nScores: {scores}, Mean: {scores.mean()}, Standard Deviation: {scores.std()}")

    

In [15]:
## Random Forest
# The forest is seeded so that the fitted model, its predictions and any search
# that clones this estimator give the same results after a kernel restart.
# 42 matches the seed already used for the train/test split.
forest_reg = RandomForestRegressor(random_state=42)
forest_reg.fit(batting_prepared, training_labels)

# In-sample predictions: batting_prepared is the data the forest was fitted on.
stat_predictions_forest = forest_reg.predict(batting_prepared)

# Disabled evaluation loop, kept for reference: per-target 10-fold
# cross-validation plus training-set RMSE.
# NOTE(review): the loop prints `scores` (negative MSE values) instead of the
# `tree_rmse_scores` it computes.
# for i, column in enumerate(training_labels.columns):
#     scores = cross_val_score(forest_reg, batting_prepared, training_labels[column], scoring='neg_mean_squared_error', cv=10)
#     tree_rmse_scores = np.sqrt(-scores)
#     print(f"For {column}:\nScores: {scores}, Mean: {scores.mean()}, Standard Deviation: {scores.std()}")
#     mse = mean_squared_error(training_labels[column], stat_predictions_forest[:, i])
#     rmse = np.sqrt(mse)
#     print(f"RMSE for {column}: {rmse:.2f}")
#
# Presumably the output of an earlier, unseeded run of the loop above
# (training-set RMSE per target); a seeded run may give different numbers:
#     RMSE for H: 2.55
#     RMSE for R: 2.29
#     RMSE for HR: 1.04
#     RMSE for RBI: 2.52
#     RMSE for SB: 1.99
#     RMSE for BB: 2.52
#     RMSE for IBB: 0.91
#     RMSE for HBP: 1.18
#     RMSE for OPS: 0.02

    
    

In [16]:
# Fine Tuning the models

In [24]:
# Two candidate grids. The first leaves `bootstrap` at the estimator's own
# setting; the second explicitly turns bootstrapping off.
# NOTE(review): the recorded best max_features (16) is the largest value in
# the grid, so the optimum may lie beyond it -- consider widening the range.
default_bootstrap_grid = {'n_estimators': [3, 10, 30, 100], 'max_features': [2, 4, 6, 8, 16]}
no_bootstrap_grid = {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]}
param_grid = [default_bootstrap_grid, no_bootstrap_grid]

# 5-fold search scored by negative MSE; training scores are kept as well.
# NOTE(review): a single MSE is computed across all target columns, so targets
# on a larger numeric scale presumably dominate the model selection -- confirm
# this is acceptable.
grid_search = GridSearchCV(
    estimator=forest_reg,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    return_train_score=True,
)
grid_search.fit(batting_prepared, training_labels)

grid_search.best_params_

{'max_features': 16, 'n_estimators': 30}