In [82]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

#set pandas to display all rows of a dataframe, create our playerData dataframe
pd.set_option("display.max_rows", None)
playerData = pd.read_csv("MLBStatScraperMVP_20210906_string.csv")

#process data to remove names that cannot be used in training the model, and separate into input X, and output y sets
#this is done unconditionally so the held-out set exists whether the model is loaded from disk or trained below
namelessPlayerData = playerData.drop(columns=["player name"])
X = namelessPlayerData.drop(columns=["mvp"])
y = namelessPlayerData["mvp"]

#split our data into training and testing sets, using 80% for training and 20% for testing
#random_state is fixed so the same rows land in the test set on every run; without it the
#accuracy reported further down changes from run to run
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#if we have a model already saved load it, else handle exception by creating a new model
#NOTE: joblib files are pickle-based, only load a model file that this notebook produced itself
#NOTE: a model file saved before the split was seeded may have been trained on rows that are now
#in the test set; delete the .joblib file to retrain if the test score needs to be trusted
try:
    model = joblib.load("mvp_predictor_logisticregression_model.joblib")
except FileNotFoundError:
    #create, train, and save the model
    #(LogisticRegression(class_weight="balanced") is an untested alternative for the rare "mvp" class)
    model = LogisticRegression()
    model.fit(X_train, y_train)
    joblib.dump(model, "mvp_predictor_logisticregression_model.joblib")

#testing our model; runs on both paths so later cells always find y_test and predictions
predictions = model.predict(X_test)

In [83]:
#share of test rows where the predicted label matches the true label
#NOTE(review): the prediction rows displayed in this notebook are all "Not MVP", so this number
#may mostly reflect the majority class - consider checking per-class metrics as well
accuracyScore = accuracy_score(y_true=y_test, y_pred=predictions)
accuracyScore

0.9617283950617284

In [84]:
#put the test-set predictions into a single-column dataframe so they render as a table
testedPredictionsArray = np.array(predictions)
testedPredictionsDf = pd.DataFrame(testedPredictionsArray, columns=["mvp"])
testedPredictionsDf

Unnamed: 0,mvp
0,Not MVP
1,Not MVP
2,Not MVP
3,Not MVP
4,Not MVP
5,Not MVP
6,Not MVP
7,Not MVP
8,Not MVP
9,Not MVP


In [85]:
#load the AL stats, drop the player name column (not a model input), and predict a label per row
alPredictionData = pd.read_csv("MLBStatScraper_20210905_american_ops_2021.csv")
namelessALData = alPredictionData.drop(columns="player name")
alPredictions = model.predict(namelessALData)

#wrap the predicted labels in a single-column dataframe to make them easier to read
alPredictionsArray = np.array(alPredictions)
alPredictionsDf = pd.DataFrame(alPredictionsArray, columns=["mvp"])
alPredictionsDf

Unnamed: 0,mvp
0,Not MVP
1,Not MVP
2,Not MVP
3,Not MVP
4,Not MVP
5,Not MVP
6,Not MVP
7,Not MVP
8,Not MVP
9,Not MVP


In [86]:
#calculate the class probabilities behind our model's ALMVP predictions, one row per AL player
#NOTE(review): predict_proba orders its columns like model.classes_; the two labels below
#assume the MVP class comes first - confirm against model.classes_
alPredictionProbabilities = model.predict_proba(namelessALData)
alProbabilitiesArray = np.array(alPredictionProbabilities)
alProbabilitiesDf = pd.DataFrame(alProbabilitiesArray, columns=["almvp", "not almvp"])
alProbabilitiesDf

Unnamed: 0,almvp,not almvp
0,0.052677,0.947323
1,0.036379,0.963621
2,0.015003,0.984997
3,0.024785,0.975215
4,0.005826,0.994174
5,0.019996,0.980004
6,0.033391,0.966609
7,0.021267,0.978733
8,0.012312,0.987688
9,0.019358,0.980642


In [87]:
#show the AL input stats; the prediction rows come from this frame with only "player name" dropped,
#so row i here is the player behind row i of the AL prediction and probability tables
alPredictionData

Unnamed: 0,player name,ops,avg,rbi,hr
0,Vladimir Guerrero,1.007,0.318,97,39
1,Shohei Ohtani,0.972,0.258,93,43
2,Aaron Judge,0.917,0.294,75,30
3,Matt Olson,0.917,0.274,89,32
4,Cedric Mullins,0.899,0.305,52,25
5,Jose Ramirez,0.898,0.26,86,32
6,Rafael Devers,0.891,0.274,100,33
7,Yordan Alvarez,0.881,0.282,89,28
8,Kyle Tucker,0.87,0.281,76,24
9,J.D. Martinez,0.868,0.286,88,25


In [88]:
#load the NL stats, drop the player name column (not a model input), and predict a label per row
nlPredictionData = pd.read_csv("MLBStatScraper_20210905_national_ops_2021.csv")
namelessNLData = nlPredictionData.drop(columns="player name")
nlPredictions = model.predict(namelessNLData)

#wrap the predicted labels in a single-column dataframe to make them easier to read
nlPredictionsArray = np.array(nlPredictions)
nlPredictionsDf = pd.DataFrame(nlPredictionsArray, columns=["mvp"])
nlPredictionsDf

Unnamed: 0,mvp
0,Not MVP
1,Not MVP
2,Not MVP
3,Not MVP
4,Not MVP
5,Not MVP
6,Not MVP
7,Not MVP
8,Not MVP
9,Not MVP


In [89]:
#calculate the class probabilities behind our model's NLMVP predictions, one row per NL player
#NOTE(review): predict_proba orders its columns like model.classes_; the two labels below
#assume the MVP class comes first - confirm against model.classes_
nlPredictionProbabilities = model.predict_proba(namelessNLData)
nlProbabilitiesArray = np.array(nlPredictionProbabilities)
nlProbabilitiesDf = pd.DataFrame(nlProbabilitiesArray, columns=["nlmvp", "not nlmvp"])
nlProbabilitiesDf

Unnamed: 0,nlmvp,not nlmvp
0,0.031285,0.968715
1,0.014223,0.985777
2,0.021314,0.978686
3,0.015308,0.984692
4,0.019909,0.980091
5,0.016444,0.983556
6,0.023772,0.976228
7,0.020596,0.979404
8,0.016398,0.983602
9,0.017546,0.982454


In [90]:
#show the NL input stats; the prediction rows come from this frame with only "player name" dropped,
#so row i here is the player behind row i of the NL prediction and probability tables
nlPredictionData

Unnamed: 0,player name,ops,avg,rbi,hr
0,Fernando Tatis,1.001,0.28,85,37
1,Bryce Harper,0.998,0.301,64,27
2,Juan Soto,0.963,0.3,79,24
3,Jesse Winker,0.954,0.307,71,24
4,Nick Castellanos,0.949,0.32,78,26
5,Max Muncy,0.924,0.256,78,30
6,Austin Riley,0.916,0.305,87,29
7,Joey Votto,0.912,0.269,85,28
8,C.J. Cron,0.911,0.273,79,26
9,Bryan Reynolds,0.902,0.301,81,22
