In [35]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

#show every row when a dataframe is displayed, then load the historical player data
pd.set_option("display.max_rows", None)
playerData = pd.read_csv("MLBStatScraperMVP_20210906_string.csv")

#player names cannot be used as a feature, so drop them and separate the data into input X and output y sets
namelessPlayerData = playerData.drop(columns=["player name"])
X = namelessPlayerData.drop(columns=["mvp"])
y = namelessPlayerData["mvp"]

#split into 80% training and 20% testing, stratified on the label
#this runs outside the try/except so X_test and y_test exist even when a saved model is loaded;
#the fixed random_state keeps the split identical between runs on the same csv
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

#if we have a model already saved load it, else handle the exception by creating, training, and saving a new one
#NOTE(review): a saved model fitted on a different split may have seen some of these test rows - confirm before trusting the score
try:
    model = joblib.load("mvp_predictor_logisticregression_model.joblib")
except FileNotFoundError:
    #class_weight="balanced" reweights the classes inversely to their frequency in y_train
    model = LogisticRegression(class_weight="balanced")
    model.fit(X_train, y_train)
    joblib.dump(model, "mvp_predictor_logisticregression_model.joblib")

#test the model on the held-out split in both cases so the accuracy cell always has its inputs
predictions = model.predict(X_test)

In [36]:
#score the held-out predictions against the true test labels
accuracyScore = accuracy_score(y_true=y_test, y_pred=predictions)
accuracyScore

0.808641975308642

In [37]:
#load the AL candidates, drop the name column as was done for the training data, and predict
alPredictionData = pd.read_csv("MLBStatScraper_20211004_american_ops_2021.csv")
namelessALData = alPredictionData.drop("player name", axis=1)
alPredictions = model.predict(namelessALData)

#wrap the predicted labels in a single-column dataframe so they are easier to read
alPredictionsArray = np.array(alPredictions)
alPredictionsDf = pd.DataFrame(alPredictionsArray, columns=["mvp"])
alPredictionsDf

Unnamed: 0,mvp
0,MVP
1,Not MVP
2,Not MVP
3,Not MVP
4,Not MVP
5,Not MVP
6,Not MVP
7,Not MVP
8,Not MVP
9,Not MVP


In [38]:
#calculate probabilities of our model's predictions for ALMVP
alPredictionProbabilities = model.predict_proba(namelessALData)
alProbabilitiesArray = np.array(alPredictionProbabilities)

#predict_proba orders its columns like model.classes_, so label them from the fitted classes
#and then rename/select by class name; a missing class raises KeyError instead of mislabelling a column
alProbabilitiesDf = (
    pd.DataFrame(alProbabilitiesArray, columns=model.classes_)
    .rename(columns={"MVP": "almvp", "Not MVP": "not almvp"})
    [["almvp", "not almvp"]]
)
alProbabilitiesDf

Unnamed: 0,almvp,not almvp
0,0.666913,0.333087
1,0.414183,0.585817
2,0.355016,0.644984
3,0.366983,0.633017
4,0.469946,0.530054
5,0.357627,0.642373
6,0.463709,0.536291
7,0.087529,0.912471
8,0.363391,0.636609
9,0.277779,0.722221


In [39]:
#show the AL input rows; the predictions above were made from these rows in this order (names dropped)
alPredictionData

Unnamed: 0,player name,ops,avg,rbi,hr
0,Vladimir Guerrero,1.002,0.311,111,48
1,Shohei Ohtani,0.965,0.257,100,46
2,Kyle Tucker,0.917,0.294,92,30
3,Aaron Judge,0.916,0.287,98,39
4,Matt Olson,0.911,0.271,111,39
5,Jose Ramirez,0.893,0.266,103,36
6,Rafael Devers,0.89,0.279,113,38
7,Cedric Mullins,0.878,0.291,59,30
8,Yordan Alvarez,0.877,0.277,104,33
9,Marcus Semien,0.873,0.265,102,45


In [40]:
#load the NL candidates, drop the name column as was done for the training data, and predict
nlPredictionData = pd.read_csv("MLBStatScraper_20211004_national_ops_2021.csv")
namelessNLData = nlPredictionData.drop("player name", axis=1)
nlPredictions = model.predict(namelessNLData)

#wrap the predicted labels in a single-column dataframe so they are easier to read
nlPredictionsArray = np.array(nlPredictions)
nlPredictionsDf = pd.DataFrame(nlPredictionsArray, columns=["mvp"])
nlPredictionsDf

Unnamed: 0,mvp
0,MVP
1,MVP
2,Not MVP
3,Not MVP
4,Not MVP
5,Not MVP
6,Not MVP
7,Not MVP
8,Not MVP
9,Not MVP


In [41]:
#calculate probabilities of our model's predictions for NLMVP
nlPredictionProbabilities = model.predict_proba(namelessNLData)
nlProbabilitiesArray = np.array(nlPredictionProbabilities)

#predict_proba orders its columns like model.classes_, so label them from the fitted classes
#and then rename/select by class name; a missing class raises KeyError instead of mislabelling a column
nlProbabilitiesDf = (
    pd.DataFrame(nlProbabilitiesArray, columns=model.classes_)
    .rename(columns={"MVP": "nlmvp", "Not MVP": "not nlmvp"})
    [["nlmvp", "not nlmvp"]]
)
nlProbabilitiesDf

Unnamed: 0,nlmvp,not nlmvp
0,0.525454,0.474546
1,0.581493,0.418507
2,0.454348,0.545652
3,0.484896,0.515104
4,0.404543,0.595457
5,0.360273,0.639727
6,0.220061,0.779939
7,0.256877,0.743123
8,0.324435,0.675565
9,0.471388,0.528612


In [42]:
#show the NL input rows; the predictions above were made from these rows in this order (names dropped)
nlPredictionData

Unnamed: 0,player name,ops,avg,rbi,hr
0,Bryce Harper,1.044,0.309,84,35
1,Juan Soto,0.999,0.313,95,29
2,Fernando Tatis,0.975,0.282,97,42
3,Nick Castellanos,0.939,0.309,100,34
4,Joey Votto,0.938,0.266,99,36
5,Bryan Reynolds,0.912,0.302,90,24
6,Tyler O'Neill,0.912,0.286,80,34
7,Trea Turner,0.911,0.328,77,28
8,C.J. Cron,0.905,0.281,92,28
9,Austin Riley,0.898,0.303,107,33
