In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from pandas.plotting import scatter_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor

DATA_FOLDER = "../../../../Documents/MLBData/"
print(DATA_FOLDER)

def load_batting_data(fileName,data_path = DATA_FOLDER):
    csv_path = os.path.join(data_path,fileName)
    return pd.read_csv(csv_path)


advanced = load_batting_data('2018-BattingAdvanced(CSV).csv')
advanced = advanced[advanced['PA'] >= 50]
advanced.drop(columns=['Tm', 'cWPA'], inplace=True)

standard = load_batting_data('2018-BattingStandard(CSV).csv')
standard = standard[standard['PA'] >= 50]
standard.drop(columns=['Pos Summary', 'Tm', 'Lg', 'Name', 'Age', 'PA'], inplace=True)


../../../../Documents/MLBData/


In [2]:
percentValues = ['HR%', 'SO%','BB%','HardH%', 'LD%', 'GB%','FB%', 'Pull%', 'Cent%', 'Oppo%', 'RS%', 'SB%', 'XBT%']
for value in percentValues:
    advanced[value] = advanced[value].apply(lambda x: float(x.strip('%')) if isinstance(x, str) else x)

In [3]:
total_stats = pd.merge(advanced, standard, on='Name-additional', how='outer')

In [4]:
league_avg = total_stats.iloc[-1]

train_set, test_set = train_test_split(total_stats, test_size=0.2, random_state=42) # Split into test and train

In [5]:
training_stats = train_set.drop(['H', 'R', 'HR', 'RBI','SB','BB', 'IBB','HBP','OPS'], axis=1) # Remove label from data set
training_labels = train_set[['H', 'R', 'HR', 'RBI','SB','BB', 'IBB','HBP','OPS']].copy() # Copy the label values
training_stats.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 440 entries, 388 to 102
Data columns (total 39 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Name             440 non-null    object 
 1   Age              439 non-null    float64
 2   PA               440 non-null    int64  
 3   rOBA             440 non-null    float64
 4   Rbat+            440 non-null    float64
 5   BAbip            440 non-null    float64
 6   ISO              440 non-null    float64
 7   HR%              440 non-null    float64
 8   SO%              440 non-null    float64
 9   BB%              440 non-null    float64
 10  EV               440 non-null    float64
 11  HardH%           440 non-null    float64
 12  LD%              440 non-null    float64
 13  GB%              440 non-null    float64
 14  FB%              440 non-null    float64
 15  GB/FB            440 non-null    float64
 16  Pull%            440 non-null    float64
 17  Cent%         

In [6]:
# Starting Data Cleaning:
training_stats['XBT%'].fillna(0,inplace=True)
training_stats['SB%'].fillna(0, inplace=True)

In [7]:
# fitting missing Data
imputer = SimpleImputer(strategy='median')
batting_num = training_stats.drop(['Name-additional', 'Name'], axis=1) #need to only have numeric values for fitting
imputer.fit(batting_num)
X = imputer.transform(batting_num)
batting_tr = pd.DataFrame(X, columns=batting_num.columns, index=batting_num.index)

In [8]:
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler',StandardScaler())
])

In [9]:
num_attribs = list(batting_num)

full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attribs)
])

batting_prepared = full_pipeline.fit_transform(batting_tr)

In [10]:
#### Selecting and Training a Model

In [11]:
lin_reg = LinearRegression()
multioutput_reg = MultiOutputRegressor(lin_reg)
multioutput_reg.fit(batting_prepared, training_labels)

In [14]:
some_data = training_stats.iloc[:5]
some_labels = training_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)
print("Predictions:", multioutput_reg.predict(some_data_prepared))
print("Labels:", list(some_labels.values))

Predictions: [[120.65130647  72.27432242  21.77888316  76.48627788   7.88128057
   61.68073767   4.57611807   5.14736534   0.83906861]
 [118.36743017  62.35658163  12.54132997  54.98692742  10.40193494
   42.57866875   2.55239221   3.48879112   0.69895163]
 [116.64237127  66.34971596   4.78873956  32.37185195  27.07659267
   43.75258023   0.9014709    3.14978108   0.62613858]
 [159.68362058  77.55084438  16.10839075  66.09268102  12.95432617
   54.37124241   3.87922688   6.40275328   0.83685465]
 [ 38.39480557  23.25163844   4.53523005  13.6845825    2.49292057
   17.49687592   0.44483841   1.48130906   0.67113584]]
Labels: [array([117.   ,  75.   ,  23.   ,  81.   ,  12.   ,  61.   ,   5.   ,
         3.   ,   0.839]), array([114.   ,  51.   ,  14.   ,  59.   ,  10.   ,  44.   ,  15.   ,
         2.   ,   0.699]), array([119.   ,  74.   ,   4.   ,  29.   ,  34.   ,  46.   ,   0.   ,
         1.   ,   0.626]), array([169.   ,  84.   ,  13.   ,  61.   ,  17.   ,  55.   ,   4.   ,
      