In [1]:
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from glob import glob

In [2]:
# Collect the training CSV paths in a deterministic (lexicographic) order.
# glob() returns matches in arbitrary, filesystem-dependent order, and the
# cells below select individual seasons from this list by position.
filenames = sorted(glob('./Training_Data/*.csv'))

In [3]:
# Read every CSV into its own DataFrame, using the parsed 'Date' column as the index.
dataframes = []
for csv_path in filenames:
    season_frame = pd.read_csv(
        csv_path,
        index_col='Date',
        parse_dates=True,
        infer_datetime_format=True,
    )
    dataframes.append(season_frame)

In [4]:
# Bind the first four loaded frames to per-season names by list position.
# NOTE(review): the year in each name assumes the files were loaded in
# season order — confirm against the contents of `filenames`.
baseball_data_2016, baseball_data_2017, baseball_data_2018, baseball_data_2019 = (
    dataframes[0],
    dataframes[1],
    dataframes[2],
    dataframes[3],
)

In [5]:
# Keep only rows whose visitor opening odds are not the literal string 'NL'.
has_visitor_odds = baseball_data_2019['visitor_open_odds'].ne('NL')
baseball_data_2019 = baseball_data_2019.loc[has_visitor_odds]

In [6]:
# Keep only rows whose home opening odds are not the literal string 'NL'.
has_home_odds = baseball_data_2019['home_open_odds'].ne('NL')
baseball_data_2019 = baseball_data_2019.loc[has_home_odds]

In [7]:
# Columns fed to the model: opening odds plus the pitching and hitting
# rate columns for the home and visitor sides (order matters downstream,
# e.g. when pairing feature importances with X.columns).
feature_columns = [
    'home_open_odds',
    'visitor_open_odds',
    'Home_PitchingK%',
    'Home_PitchingBB%',
    'Home_PitchingOBP_allowed',
    'Home_PitchingSLG%_allowed',
    'Visitor_PitchingK%',
    'Visitor_PitchingBB%',
    'Visitor_PitchingOBP_allowed',
    'Visitor_PitchingSLG%_allowed',
    'Home_HittingK%',
    'Home_HittingBB%',
    'Home_HittingOBP',
    'Home_HittingSLG%',
    'Visitor_HittingK%',
    'Visitor_HittingBB%',
    'Visitor_HittingOBP',
    'Visitor_HittingSLG%',
]
X = baseball_data_2019[feature_columns]

# Prediction target.
y = baseball_data_2019['home_win_loss']

In [8]:
# Standardise every feature column of X.
# NOTE(review): the scaler is fit on all rows of X, including the rows that
# are later sliced off as the test set — consider fitting on the training
# rows only if a scale-sensitive model is ever used here.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [9]:
# Display half of the row count, used to pick the train/test cut point.
len(baseball_data_2019) / 2

1198.5

In [10]:
# Split by row position: the first half of the rows (rounded up) trains the
# model, the remainder is held out for testing. The cut point is derived from
# the data instead of being hard-coded, so it stays correct if the number of
# rows changes (it evaluates to 1199 for the 2397 rows recorded above).
split_idx = (len(X_scaled) + 1) // 2

X_train, X_test = X_scaled[:split_idx], X_scaled[split_idx:]
# .iloc makes the positional slicing explicit; y carries a 'Date' index.
y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

In [11]:
# Random forest with 1000 trees and a fixed random_state.
rf_classifier = RandomForestClassifier(
    random_state=1,
    n_estimators=1000,
)

In [12]:
# Train the forest on the training split; the fitted estimator is the cell's displayed value.
rf_classifier.fit(X=X_train, y=y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=1, verbose=0,
                       warm_start=False)

In [14]:
# Predicted labels for the held-out rows.
predictions_rf = rf_classifier.predict(X=X_test)

In [15]:
# Fraction of held-out rows whose predicted label matches the actual one.
acc_score = accuracy_score(y_true=y_test, y_pred=predictions_rf)

In [16]:
# Report accuracy on the held-out second half. The model above is trained and
# evaluated on baseball_data_2019, so the message names the 2019 season.
print(f'The accuracy score for the 2nd half of the 2019 season: {acc_score}')

The accuracy score for the 2nd half of the 2016 season: 0.5759599332220368


In [17]:
# Per-feature importance scores from the fitted forest, in the same order as X's columns.
importances = rf_classifier.feature_importances_

In [18]:
# Pair each importance with its column name and show them from most to least important.
ranked_importances = list(zip(importances, X.columns))
ranked_importances.sort(reverse=True)
ranked_importances

[(0.06542768005411397, 'Home_PitchingBB%'),
 (0.06065061222794597, 'Visitor_HittingBB%'),
 (0.05733390604239558, 'Visitor_HittingSLG%'),
 (0.057125270315651455, 'Home_PitchingOBP_allowed'),
 (0.05641760016742564, 'Visitor_PitchingOBP_allowed'),
 (0.05608614336284373, 'Home_HittingSLG%'),
 (0.055995164419794095, 'Visitor_PitchingSLG%_allowed'),
 (0.055840407791590746, 'Home_HittingK%'),
 (0.05559143229702416, 'home_open_odds'),
 (0.054936514987132624, 'Home_HittingBB%'),
 (0.05443233101288947, 'visitor_open_odds'),
 (0.05439975016688641, 'Home_PitchingK%'),
 (0.05428429983126747, 'Visitor_HittingK%'),
 (0.05300437474851013, 'Home_PitchingSLG%_allowed'),
 (0.05249986832133361, 'Visitor_PitchingK%'),
 (0.05247327847865012, 'Visitor_HittingOBP'),
 (0.05212953090365691, 'Visitor_PitchingBB%'),
 (0.051371834870888027, 'Home_HittingOBP')]

In [19]:
# Per-class precision / recall / F1 on the held-out rows.
report_text = classification_report(y_true=y_test, y_pred=predictions_rf)
print(report_text)

              precision    recall  f1-score   support

           0       0.56      0.50      0.53       564
           1       0.59      0.64      0.62       634

    accuracy                           0.58      1198
   macro avg       0.57      0.57      0.57      1198
weighted avg       0.57      0.58      0.57      1198



In [20]:
# Build a per-game table of actual vs. predicted outcomes together with the
# teams and their opening odds, indexed by 'Date'.
#
# y_test is the tail of baseball_data_2019's rows, so the matching odds rows
# are located from len(y_test) rather than a second hard-coded row offset
# that would have to be kept in sync with the train/test split.
test_start = len(baseball_data_2019) - len(y_test)
odds_df_new = (
    baseball_data_2019[['home', 'visitor', 'home_open_odds', 'visitor_open_odds']]
    .iloc[test_start:]
    .reset_index(drop=True)  # positional index so the pieces align side by side
)

actual_df = pd.DataFrame(y_test).reset_index()  # 'Date' becomes a regular column for now
predict_df = pd.DataFrame(predictions_rf)

# All three pieces share a 0..n-1 positional index, so the column-wise concat
# lines them up row by row; 'Date' is then restored as the index.
df = pd.concat([actual_df, predict_df, odds_df_new], axis=1, join='inner').set_index('Date')
df.columns = ['Actual', 'Predicted', 'Home', 'Visitor', 'Home_Open_Odds', 'Visitor_Open_Odds']


In [21]:
# Persist the actual-vs-predicted table as CSV.
predictions_path = './Predictions_Vs_Actual/predictions_2019.csv'
df.to_csv(predictions_path)

In [None]:
# Preview the first rows of the actual-vs-predicted table.
df.head()