In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from pandas.plotting import scatter_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

# Folder holding the exported batting CSV files. The path is relative, so it is
# resolved against the notebook's current working directory.
DATA_FOLDER = "../../../../Documents/MLBData/"
print(DATA_FOLDER)


def load_batting_data(fileName, data_path=DATA_FOLDER):
    """Read one batting CSV file into a DataFrame.

    Parameters
    ----------
    fileName : str
        Name of the CSV file inside ``data_path``.
    data_path : str, optional
        Folder that contains the file; defaults to ``DATA_FOLDER``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV, exactly as ``pd.read_csv`` returns them.
    """
    return pd.read_csv(os.path.join(data_path, fileName))


# Minimum value of the 'PA' column a row needs in order to be kept. The same
# threshold is applied to both tables, so it is named once instead of being
# repeated as a literal.
MIN_PA = 50

# Filter and drop in a single non-mutating chain. Calling
# drop(..., inplace=True) on the result of a boolean filter mutates a slice of
# the loaded frame (a SettingWithCopy hazard) and makes the cell depend on how
# often it has been run; the chained form always rebuilds the frame from the CSV.
advanced = load_batting_data('2018-BattingAdvanced(CSV).csv')
advanced = advanced.loc[advanced['PA'] >= MIN_PA].drop(columns=['Tm', 'cWPA'])

# 'Name', 'Age' and 'PA' are removed here, presumably because the advanced
# table already carries them (the merged frame lists each of them once).
standard = load_batting_data('2018-BattingStandard(CSV).csv')
standard = standard.loc[standard['PA'] >= MIN_PA].drop(
    columns=['Pos Summary', 'Tm', 'Lg', 'Name', 'Age', 'PA']
)


../../../../Documents/MLBData/


In [2]:
# Columns whose string cells carry a '%' sign that must be removed before the
# values can be used as numbers.
percentValues = [
    'HR%', 'SO%', 'BB%', 'HardH%', 'LD%', 'GB%', 'FB%',
    'Pull%', 'Cent%', 'Oppo%', 'RS%', 'SB%', 'XBT%',
]


def _percent_to_float(cell):
    """Turn a string cell into a float after stripping '%'; pass anything else through."""
    if isinstance(cell, str):
        return float(cell.strip('%'))
    # Non-string cells (already numeric, or missing) are returned untouched,
    # which also makes re-running this cell harmless.
    return cell


for pct_col in percentValues:
    advanced[pct_col] = advanced[pct_col].apply(_percent_to_float)

In [3]:
# Combine the advanced and standard tables into one row per key. The outer join
# keeps keys that appear in only one of the two tables.
# NOTE(review): if 'Name-additional' is not unique within a table (e.g. several
# rows for the same player), this join multiplies those rows - confirm key
# uniqueness against the CSVs.
total_stats = advanced.merge(standard, on='Name-additional', how='outer')

In [4]:
# Keep the final row of the merged frame aside as a Series.
# NOTE(review): the name suggests this is a league-average summary row. It is
# only copied out here, not removed, so it is still part of total_stats when the
# frame is split below - confirm whether it should be dropped before splitting.
league_avg = total_stats.iloc[-1]

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
train_set, test_set = train_test_split(
    total_stats,
    test_size=0.2,
    random_state=42,
)

# print(league_avg)
print(len(train_set), len(test_set), sep='\n')

440
111


In [5]:
# Work on an independent (deep) copy so later cleaning steps never modify train_set.
training_stats = train_set.copy(deep=True)

# Summary of column names, non-null counts and dtypes for the training rows.
training_stats.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 440 entries, 388 to 102
Data columns (total 48 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Name             440 non-null    object 
 1   Age              439 non-null    float64
 2   PA               440 non-null    int64  
 3   rOBA             440 non-null    float64
 4   Rbat+            440 non-null    float64
 5   BAbip            440 non-null    float64
 6   ISO              440 non-null    float64
 7   HR%              440 non-null    float64
 8   SO%              440 non-null    float64
 9   BB%              440 non-null    float64
 10  EV               440 non-null    float64
 11  HardH%           440 non-null    float64
 12  LD%              440 non-null    float64
 13  GB%              440 non-null    float64
 14  FB%              440 non-null    float64
 15  GB/FB            440 non-null    float64
 16  Pull%            440 non-null    float64
 17  Cent%         

In [6]:
# Starting Data Cleaning:

# Missing 'XBT%' and 'SB%' values are replaced with 0.
#
# The fill is done on the frame and assigned back. The previous form,
# training_stats['XBT%'].fillna(0, inplace=True), is chained in-place
# assignment: it operates on the Series returned by the column lookup, which
# raises a FutureWarning in pandas >= 2.1 and leaves training_stats unchanged
# once Copy-on-Write is active.
#
# Other gaps (e.g. 'Age') are intentionally left alone here; they are filled
# with the column median by the SimpleImputer applied afterwards.
training_stats = training_stats.fillna({'XBT%': 0, 'SB%': 0})



In [7]:
# Fill remaining missing values with each column's median.
imputer = SimpleImputer(strategy='median')

# The imputer is fitted on everything except the two name columns, so every
# remaining column must be numeric.
batting_num = training_stats.drop(columns=['Name-additional', 'Name'])

# Learn the medians and apply them in one call; the result is a plain array.
X = imputer.fit_transform(batting_num)

# Wrap the array back into a DataFrame with the original labels and row index.
batting_tr = pd.DataFrame(
    X,
    columns=batting_num.columns,
    index=batting_num.index,
)

# batting_tr.info()

In [8]:
# Numeric preprocessing: median imputation followed by standardisation.
num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('std_scaler', StandardScaler()),
    ]
)

In [9]:
# Column labels the numeric pipeline is applied to (every column of batting_num).
num_attribs = batting_num.columns.tolist()

full_pipeline = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_attribs),
    ]
)

# batting_tr has already been median-imputed by the standalone imputer, so the
# pipeline's own imputer step finds nothing left to fill on this input.
batting_prepared = full_pipeline.fit_transform(batting_tr)