# Prediction the number of wins Oakland Ace required to win the 2002 American Baseball League.
## Regression Problem

In [2]:
import warnings                    # to ignore warnings in the output
warnings.filterwarnings('ignore')

import pandas as pd                # for dataframe operations
import numpy as np                 # for math 
import matplotlib.pyplot as plt    # for basic plots
import seaborn as sns              # for advance-level plots

# Null value Imuptation

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# ML Models
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

##### Importing the Basball data

In [3]:

df = pd.read_csv('baseball.csv')

# Since the prediction was made for the 2002 playoffs, let's use the data from 1962 to 2001.

df = df[df['Year']<2002]

##### Handling Null Values

In [5]:
df.isnull().sum()

Team              0
League            0
Year              0
RS                0
RA                0
W                 0
OBP               0
SLG               0
BA                0
Playoffs          0
RankSeason      748
RankPlayoffs    748
G                 0
OOBP            812
OSLG            812
dtype: int64

In [4]:
# Taking only the numerical features for imputation
df_1 = df.select_dtypes(include=['float64','int64'])

# Seting up the imputation knobs
IT = IterativeImputer(estimator = XGBRegressor(), random_state=20)

# Dataframe without NULL values
df_final = pd.DataFrame(IT.fit_transform(df_1), columns= df_1.columns)
df_final.isnull().sum()

Year            0
RS              0
RA              0
W               0
OBP             0
SLG             0
BA              0
Playoffs        0
RankSeason      0
RankPlayoffs    0
G               0
OOBP            0
OSLG            0
dtype: int64

##### Train_Test Split

In [61]:
# After removing the features that had multicollinearity, X:
X = df_final[['OBP','SLG','OOBP','OSLG']]
y = df_final['W']

### Non Linear Models

##### XGB - Extreme Gradient Boosting Model

In [63]:
# A variable to remember the model
XGB = XGBRegressor()

param_grid = {'n_estimators': np.arange(1,50), 'max_depth': np.arange(3,15)}

Grid = GridSearchCV(XGB, param_grid = param_grid, cv = 5, scoring = 'neg_mean_squared_error')

Grid.fit(X,y)

Grid.best_params_

{'max_depth': 4, 'n_estimators': 26}

##### Data resulting from other analysis that was provided. 

In [64]:
# Test data
X_test = pd.DataFrame({'OBP':[0.339], 'SLG':[0.430], 'OOBP':[0.307],'OSLG':[0.373]})

In [98]:

XGB = XGBRegressor(n_estimators = 16, max_depth = 17)

Model = XGB.fit(X,y)
y_pred = Model.predict(X_test)
print('Required Number of Wins:', round(float(y_pred),2))

Required Number of Wins: 101.06


# ADA BOOST Model

In [62]:
RF = RandomForestRegressor(n_estimators=26, max_depth=5)
boost = AdaBoostRegressor(RF)

grid = GridSearchCV(boost, param_grid={'n_estimators':np.arange(1,30)}, cv = 5, scoring = 'neg_mean_squared_error')
model = grid.fit(X,y)

grid.best_params_

{'n_estimators': 23}

In [64]:
RF = RandomForestRegressor(n_estimators=26, max_depth=5)
boost = AdaBoostRegressor(RF, n_estimators=23)

model = boost.fit(X,y)
model.predict(X_test)

array([99.80090735])

# Random Forest Model

In [36]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
It=IterativeImputer()
X=pd.DataFrame(It.fit_transform(X), columns = X.columns)

In [37]:
X = df_final[['OBP','SLG','OOBP','OSLG']]
y = df_final['W']

In [58]:
RF = RandomForestRegressor()

param = {'n_estimators':np.arange(1,30), 'max_depth':np.arange(1,15)}
grid = GridSearchCV(RF, param_grid = param, cv = 5, scoring = 'neg_mean_squared_error')

grid.fit(X,y)
grid.best_params_

{'max_depth': 11, 'n_estimators': 23}

In [59]:
import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import LinearRegression
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor

RF = RandomForestRegressor(n_estimators=23, max_depth = 11)
RF.fit(X,y)

RandomForestRegressor(max_depth=11, n_estimators=23)

In [9]:
X_test = pd.DataFrame({'OBP':[0.339], 'SLG':[0.430], 'OOBP':[0.307],'OSLG':[0.373]})

In [61]:
RF.predict(X_test)

array([101.62415459])

### Linear Model

##### Linear Regression Model

In [106]:
# Assigning a variable for the model
LR = LinearRegression()

# Training the model
LR.fit(X,y)

# Predicting with the test data
print('Required Number of Wins:', round(float(LR.predict(X_test)),2))

Required Number of Wins: 102.87
