In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.externals import joblib
from sklearn.ensemble import RandomForestRegressor



## Load data

In [2]:
# Importing the dataset
missing_values = ["n/a", "na", "--"]
dataset = pd.read_csv('final_df2.csv', )
X = dataset.iloc[:, 1:81]
y = dataset.iloc[:, 81]

# delete L column since this is repetetive and the inverse of Wins
del X['L']
del X['W-L%']
del X['RA/G_x']
# Replace missing values with 0.  
X = X.fillna(0)

In [3]:
# Import the data from 2019 we will use to predict, and then delete the Team column

test2019 = pd.read_excel('df2019.xlsx')
teams2019 = test2019['Tm']
del test2019['Tm']

## PreProcessing

In [5]:
def cor_selector(X, y, num_feats):
    ''' This is a filter-based method.
We check the absolute value of the Pearson’s correlation between the target 
and numerical features in our dataset. We keep the top n features based on this criterion.'''
    cor_list = []
    feature_name = X.columns.tolist()
    # calculate the correlation with y for each feature
    for i in X.columns.tolist():
        cor = np.corrcoef(X[i], y)[0, 1]
        cor_list.append(cor)
    # replace NaN with 0
    cor_list = [0 if np.isnan(i) else i for i in cor_list]
    # feature name
    cor_feature = X.iloc[:,np.argsort(np.abs(cor_list))[-num_feats:]].columns.tolist()
    # feature selection? 0 for not select, 1 for select
    cor_support = [True if i in cor_feature else False for i in feature_name]
    return cor_support, cor_feature
cor_support, cor_feature = cor_selector(X, y, 12)
print(str(len(cor_feature)), 'selected features')

12 selected features


* 'ERA' = 9 * ER/IP = 9 * Earned Runs Allowed / Innings PItched
* 'WHIP' = (BB + H)/IP = Bases on Balls + Hits allowed / Innings Pitched (lower is better)
* 'H9' = 9 * H/IP = 9 * Hits allowed / Innings Pitched (Lower numbers are better),
* 'RA/G_y' = Runs Allowed per Game (The y is because this column appeared on both the pitching and fielding tables.
* 'RA/G_x': Not suprisingly, this duplicate feature shows up again since it's the same values & strongly correlated to the target
* 'OPS+ = Too long and complicated to write out the formula and what each variable means but it is a batting statistic
* ERA+ = 100*(lgERA/ERA)
* W = Wins, a strong and simple variable correlated with Champions
* L = Losses.  A little bit redundant since teams play nearly the same # of reg. season games and its the inverse of Wins.
* W-L% = W / (W + L) = Wins/Loss percentage.  Also a bit redundant.  


In [6]:
# CHI Squared
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
X_norm = MinMaxScaler().fit_transform(X)
chi_selector = SelectKBest(chi2, k=12)
chi_selector.fit(X_norm, y)
chi_support = chi_selector.get_support()
chi_feature = X.loc[:,chi_support].columns.tolist()
print(str(len(chi_feature)), 'selected features')

12 selected features


In [7]:
# Recursive Feature Elimination
'''The goal of recursive feature elimination (RFE) is 
to select features by recursively considering smaller and smaller sets of features.'''
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
rfe_selector = RFE(estimator=LogisticRegression(), n_features_to_select=12, step=10, verbose=5)
rfe_selector.fit(X_norm, y)
rfe_support = rfe_selector.get_support()
rfe_feature = X.loc[:,rfe_support].columns.tolist()
print(str(len(rfe_feature)), 'selected features')

Fitting estimator with 77 features.
Fitting estimator with 67 features.
Fitting estimator with 57 features.
Fitting estimator with 47 features.
Fitting estimator with 37 features.
Fitting estimator with 27 features.
Fitting estimator with 17 features.
12 selected features




In [8]:
# Lasso: SelectFromModel
'''Lasso Regularizer forces a lot of feature weights to be zero.'''
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

embeded_lr_selector = SelectFromModel(LogisticRegression(penalty="l1"), max_features=12)
embeded_lr_selector.fit(X_norm, y)

embeded_lr_support = embeded_lr_selector.get_support()
embeded_lr_feature = X.loc[:,embeded_lr_support].columns.tolist()
print(str(len(embeded_lr_feature)), 'selected features')

12 selected features




In [9]:
# Tree Based
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

embeded_rf_selector = SelectFromModel(RandomForestClassifier(n_estimators=100), max_features=12)
embeded_rf_selector.fit(X, y)

embeded_rf_support = embeded_rf_selector.get_support()
embeded_rf_feature = X.loc[:,embeded_rf_support].columns.tolist()
print(str(len(embeded_rf_feature)), 'selected features')

12 selected features


In [10]:
# put all selection together
feature_selection_df = pd.DataFrame({'Feature':X, 'Pearson':cor_support, 'Chi-2':chi_support, 'RFE':rfe_support, 'Logistics':embeded_lr_support,
                                    'Random Forest':embeded_rf_support})
# count the selected times for each feature
feature_selection_df['Total'] = np.sum(feature_selection_df, axis=1)
# display the top 100
feature_selection_df = feature_selection_df.sort_values(['Total','Feature'] , ascending=False)
feature_selection_df.index = range(1, len(feature_selection_df)+1)
feature_selection_df.head(10)


Unnamed: 0,Feature,Pearson,Chi-2,RFE,Logistics,Random Forest,Total
1,"(W,)",True,True,True,True,True,5
2,"(R, A, /, G, _, y)",True,True,True,False,True,4
3,"(O, P, S, +)",True,True,False,True,True,4
4,"(H, 9)",True,True,True,True,False,4
5,"(t, S, h, o)",True,True,False,False,True,3
6,"(W, H, I, P)",True,True,False,False,True,3
7,"(R, /, G)",True,True,True,False,False,3
8,"(E, R, A, +)",True,True,False,False,True,3
9,"(E, R, A)",True,True,False,False,True,3
10,"(#, F, l, d)",False,True,True,True,False,3


In [11]:
core_features = ['W','RA/G_y', 'OPS+', 'H9', 'tSho', 'WHIP', 'R/G', 'ERA+', '#Fld', 'R_y', 'OBP', 'ERA']

In [13]:
#Filter the X_train and X_test data to 

X = X[cor_feature]
X2019 = test2019[cor_feature]

## Creating the Random Forest Regressor model

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2, random_state=123, stratify=y)

In [15]:
pipeline = make_pipeline(preprocessing.StandardScaler(),
RandomForestRegressor(n_estimators=100))

In [17]:
hyperparameters = { 'randomforestregressor__max_features' : ['auto',
'sqrt', 'log2'],
'randomforestregressor__max_depth':
[None, 5, 3, 1]}

In [18]:
clf = GridSearchCV(pipeline, hyperparameters, cv=10)

In [19]:
clf.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('standardscaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('randomforestregressor',
                                        RandomForestRegressor(bootstrap=True,
                                                              criterion='mse',
                                                              max_depth=None,
                                                              max_features='auto',
                                                              max_leaf_nodes=None,
                                                              min_impurity_decrease=0.0,
                                                              min_impurity_split

In [39]:
pred = clf.predict(X_test)

In [43]:
print(r2_score(y_test, pred))

0.054148981919752925


In [23]:
print(mean_squared_error(y_test, pred))

0.03373405551452184


In [25]:
pred2019 = clf.predict(X2019)

In [26]:
prediction = pd.Series(pred2019)

In [30]:
results = pd.DataFrame({"Team": teams2019, "Winner probability": prediction })


In [37]:
results.sort_values(by='Winner probability', axis=0, ascending=False, inplace=False, kind='quicksort').head(10).reset_index().drop('index', axis=1)

Unnamed: 0,Team,Winner probability
0,HOU,0.462176
1,NYY,0.326382
2,MIN,0.269182
3,LAD,0.239066
4,OAK,0.09416
5,WSN,0.083536
6,CLE,0.067353
7,TBR,0.056232
8,STL,0.053824
9,ATL,0.052205


# Model performance looking at past winners

## Save model for future use


In [32]:
joblib.dump(clf, 'rf_regressor.pkl')
# To load: clf2 = joblib.load(‘rf_regressor.pkl’)

['rf_regressor.pkl']

# Conclusions

This project could be modified into a regression model by including all teams in the playoffs, ranking them ordinally on how far they got, and them making this a new column