In [27]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn import tree

#set pandas to display all rows of a dataframe, create our playerData dataframe
pd.set_option("display.max_rows", None)
playerData = pd.read_csv("MLBStatScraperMVP_20210904.csv")

#if we have a model already saved load it, else handle exception by creating a new model
try:
    model = joblib.load("mvp_predictor_randomforest_model.joblib")
except FileNotFoundError:
    #process data to remove names that cannot be used in training the model, and seperate into input X, and output y sets
    namelessPlayerData = playerData.drop(columns=["player name"])
    X = namelessPlayerData.drop(columns=["mvp"])
    y = namelessPlayerData["mvp"]
    
    #split our data into training and testing sets, using 80% for training and 20% for testing
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    
    #create, train, and save the model
    model = RandomForestClassifier()
    model.fit(X_train, y_train)
    joblib.dump(model, "mvp_predictor_randomforest_model.joblib")
    
    #testing our model
    predictions = model.predict(X_test)

In [28]:
accuracyScore = accuracy_score(y_test, predictions)
accuracyScore

0.9543859649122807

In [29]:
#read and process our AL data, then make predictions
alPredictionData = pd.read_csv("MLBStatScraper_20210905_american_ops_2021.csv")
namelessALData = alPredictionData.drop(columns=["player name"])
alPredictions = model.predict(namelessALData)

#create a dataframe to easier visualize our AL predictions
alPredictionsArray = np.array(alPredictions)
alPredictionsDf = pd.DataFrame(alPredictionsArray)
alPredictionsDf.columns = ["mvp"]
alPredictionsDf

Unnamed: 0,mvp
0,No
1,No
2,No
3,No
4,No
5,No
6,No
7,No
8,No
9,No


In [30]:
alPredictionData

Unnamed: 0,player name,ops,avg,rbi,hr
0,Vladimir Guerrero,1.007,0.318,97,39
1,Shohei Ohtani,0.972,0.258,93,43
2,Aaron Judge,0.917,0.294,75,30
3,Matt Olson,0.917,0.274,89,32
4,Cedric Mullins,0.899,0.305,52,25
5,Jose Ramirez,0.898,0.26,86,32
6,Rafael Devers,0.891,0.274,100,33
7,Yordan Alvarez,0.881,0.282,89,28
8,Kyle Tucker,0.87,0.281,76,24
9,J.D. Martinez,0.868,0.286,88,25


In [31]:
#read and process our NL data, then make predictions
nlPredictionData = pd.read_csv("MLBStatScraper_20210905_national_ops_2021.csv")
namelessNLData = nlPredictionData.drop(columns=["player name"])
nlPredictions = model.predict(namelessNLData)

#create a dataframe to easier visualize our NL predictions
nlPredictionsArray = np.array(nlPredictions)
nlPredictionsDf = pd.DataFrame(nlPredictionsArray)
nlPredictionsDf.columns = ["mvp"]
nlPredictionsDf

Unnamed: 0,mvp
0,No
1,No
2,No
3,No
4,No
5,No
6,No
7,No
8,No
9,No


In [32]:
nlPredictionData

Unnamed: 0,player name,ops,avg,rbi,hr
0,Fernando Tatis,1.001,0.28,85,37
1,Bryce Harper,0.998,0.301,64,27
2,Juan Soto,0.963,0.3,79,24
3,Jesse Winker,0.954,0.307,71,24
4,Nick Castellanos,0.949,0.32,78,26
5,Max Muncy,0.924,0.256,78,30
6,Austin Riley,0.916,0.305,87,29
7,Joey Votto,0.912,0.269,85,28
8,C.J. Cron,0.911,0.273,79,26
9,Bryan Reynolds,0.902,0.301,81,22


In [34]:
tree.export_graphviz(model, out_file="mvp_predictor_randomforest.dot",
                    feature_names=["ops", "avg", "rbi", "hr"],
                    class_names=sorted(y.unique()),
                    label="all",
                    rounded=True,
                    filled=True)

AttributeError: 'RandomForestClassifier' object has no attribute 'tree_'