## Purpose: Try different models-- Part2.
### Logistic Regression with scaling.

In [1]:
# import dependencies.
import pandas as pd
import numpy as np

from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score, classification_report
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings(module="sklearn*", action="ignore")

#### STEP1: Read in dataset.  Remove data from 2016-2019.
- data from 2016-2018 will be used to bs test the model.
- data from 2019 will be used to predict the winners of the 2019 WS.

In [2]:
# read in the data.
team_data = pd.read_csv("../../Resources/clean_data_1969.csv")
del team_data["Unnamed: 0"]
team_data.head()

Unnamed: 0,team,year,A,DP,E,G2,GS2,INN,PB,PO,...,SHO,SO1,SV,SVO,TBF,W,WHIP,WP,WPCT,winners
0,St. Louis Cardinals,2019,1033,114,43,104,936,8313.0,3,2771,...,4,895,33,42,3896,56,1.29,21,0.538,0
1,Arizona Diamondbacks,2019,1010,83,45,105,945,8538.0,2,2846,...,7,925,24,37,4001,53,1.28,35,0.505,0
2,Kansas City Royals,2019,990,105,45,106,954,8421.0,6,2807,...,5,816,24,41,4125,39,1.46,34,0.368,0
3,Houston Astros,2019,875,54,50,106,954,8589.0,6,2863,...,7,1074,27,42,3929,67,1.14,31,0.632,0
4,Tampa Bay Rays,2019,975,92,53,107,963,8760.0,11,2920,...,6,1037,26,43,3985,59,1.16,40,0.551,0


In [3]:
# remove data from 2016 through 2019.
team_data_new = team_data.loc[team_data["year"] < 2016]
team_data_new.head()

Unnamed: 0,team,year,A,DP,E,G2,GS2,INN,PB,PO,...,SHO,SO1,SV,SVO,TBF,W,WHIP,WP,WPCT,winners
120,San Francisco Giants,2015,1639,136,72,162,1458,13143.0,6,4381,...,11,1309,43,72,6048,87,1.21,40,0.537,0
121,Washington Nationals,2015,1425,142,73,162,1458,13137.0,17,4379,...,12,1476,46,60,6036,95,1.19,47,0.586,0
122,Houston Astros,2015,1599,135,77,162,1458,13212.0,18,4404,...,8,1396,44,64,6180,84,1.29,98,0.519,0
123,Detroit Tigers,2015,1537,148,75,161,1449,12852.0,5,4284,...,8,1232,47,66,6048,86,1.32,44,0.534,0
124,Boston Red Sox,2015,1427,139,75,162,1458,12957.0,37,4319,...,5,1362,43,61,6073,93,1.27,52,0.574,0


In [4]:
# models require numbers! Drop team and year columns.
target = team_data_new["winners"]
features = team_data_new.drop({"team", "year", "winners"}, axis=1)
feature_columns = list(features.columns)
print (target.shape)
print (features.shape)
print (feature_columns)

(1266,)
(1266, 59)
['A', 'DP', 'E', 'G2', 'GS2', 'INN', 'PB', 'PO', 'TC', '2B', '3B', 'AB', 'AO', 'BB', 'CS', 'G', 'GDP', 'H', 'HBP', 'HR', 'IBB', 'NP_x', 'OBP', 'OPS_x', 'PA', 'R', 'RBI', 'SAC', 'SB', 'SF', 'SLG', 'SO', 'TB', 'XBH', 'BB1', 'BK', 'CG', 'ER', 'ERA', 'G1', 'GF', 'GS', 'H1', 'HB', 'HR1', 'IBB1', 'IP', 'L', 'OBP1', 'R1', 'SHO', 'SO1', 'SV', 'SVO', 'TBF', 'W', 'WHIP', 'WP', 'WPCT']


In [5]:
# split data to train and test sets
X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=42)

#### STEP2: Try Logistic Regression.

In [6]:
# reshape the target data.
target1 = target.values.reshape(-1, 1)

In [7]:
# split data into train and test sets.
X_train1, X_test1, y_train1, y_test1 = train_test_split(features, target1, random_state=42)

In [8]:
# create standard scaler model and fit the training data.
X_scaler = StandardScaler().fit(X_train1)
y_scaler = StandardScaler().fit(y_train1)

# transform the training and testing data.
X_train_scaled = X_scaler.transform(X_train1)
X_test_scaled = X_scaler.transform(X_test1)



In [9]:
# fit the model.
model = LogisticRegression(solver="lbfgs")
model.fit(X_train_scaled, y_train1)

# predict.
prediction = model.predict(X_test_scaled)

# evaluate the model.
mse = mean_squared_error(y_test1, prediction)
r2 = model.score(X_test_scaled, y_test1)
print (f"MSE: {mse}")
print (f"R2: {r2}")

MSE: 0.03785488958990536
R2: 0.9621451104100947


Wow.  That's a pretty good fit!  Since the scores are so high, try bs test with 2016-2018 data. But wait...  

In [10]:
print (classification_report(y_test1, prediction, target_names=["0", "1"]))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98       304
           1       1.00      0.08      0.14        13

   micro avg       0.96      0.96      0.96       317
   macro avg       0.98      0.54      0.56       317
weighted avg       0.96      0.96      0.95       317



Ah hah!.  The model is predicting 0's correctly but only predicts 1's 50% of the time.  And it isn't terribly accurate.  That means that the two classes (0, 1) are very imbalanced.  Need to address this one.

#### STEP3: Try MinMaxScaler with upsampling

In [11]:
# start over.  grab the data.
team_data_new = team_data_new.reset_index().drop({"index"}, axis=1)
team_data_new.head()

Unnamed: 0,team,year,A,DP,E,G2,GS2,INN,PB,PO,...,SHO,SO1,SV,SVO,TBF,W,WHIP,WP,WPCT,winners
0,San Francisco Giants,2015,1639,136,72,162,1458,13143.0,6,4381,...,11,1309,43,72,6048,87,1.21,40,0.537,0
1,Washington Nationals,2015,1425,142,73,162,1458,13137.0,17,4379,...,12,1476,46,60,6036,95,1.19,47,0.586,0
2,Houston Astros,2015,1599,135,77,162,1458,13212.0,18,4404,...,8,1396,44,64,6180,84,1.29,98,0.519,0
3,Detroit Tigers,2015,1537,148,75,161,1449,12852.0,5,4284,...,8,1232,47,66,6048,86,1.32,44,0.534,0
4,Boston Red Sox,2015,1427,139,75,162,1458,12957.0,37,4319,...,5,1362,43,61,6073,93,1.27,52,0.574,0


In [12]:
team_data_new.head()

Unnamed: 0,team,year,A,DP,E,G2,GS2,INN,PB,PO,...,SHO,SO1,SV,SVO,TBF,W,WHIP,WP,WPCT,winners
0,San Francisco Giants,2015,1639,136,72,162,1458,13143.0,6,4381,...,11,1309,43,72,6048,87,1.21,40,0.537,0
1,Washington Nationals,2015,1425,142,73,162,1458,13137.0,17,4379,...,12,1476,46,60,6036,95,1.19,47,0.586,0
2,Houston Astros,2015,1599,135,77,162,1458,13212.0,18,4404,...,8,1396,44,64,6180,84,1.29,98,0.519,0
3,Detroit Tigers,2015,1537,148,75,161,1449,12852.0,5,4284,...,8,1232,47,66,6048,86,1.32,44,0.534,0
4,Boston Red Sox,2015,1427,139,75,162,1458,12957.0,37,4319,...,5,1362,43,61,6073,93,1.27,52,0.574,0


In [13]:
def scale_data(team_data_new):

    # separate into features and target values.
    target = team_data_new["winners"]
    features = team_data_new.drop({"team", "winners"}, axis=1)

    # apply annual minmaxscaler to features.
    features_scaled = pd.DataFrame()
    for year in list(set(features["year"])):
        df = features.loc[features["year"] == year]
        df = df.select_dtypes(include=["float", "int"])

        scaler = MinMaxScaler()
        features_transformed = scaler.fit_transform(df)
        features_scaled = features_scaled.append(pd.DataFrame(features_transformed))

    # grab column names, put back to features_scaled.
    features_scaled.columns = features.select_dtypes(include=["float", "int"]).columns
    features_scaled.reset_index(drop=True, inplace=True)

    # remove year column.
    features_scaled = features_scaled.drop({"year"}, axis=1)

    return features_scaled, target

In [14]:
features_scaled, target = scale_data(team_data_new)

In [15]:
# split into train and test sets.
X_train, X_test, y_train, y_test = train_test_split(features_scaled, target, random_state=42)
    
# fit the model.
model = LogisticRegression(solver="lbfgs")
model.fit(X_train, y_train)

# predict.
prediction = model.predict(X_test)

print (classification_report(y_test, prediction, target_names=["0", "1"]))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98       304
           1       0.00      0.00      0.00        13

   micro avg       0.96      0.96      0.96       317
   macro avg       0.48      0.50      0.49       317
weighted avg       0.92      0.96      0.94       317



#### STEP3: Try Logistic Regression with Upsampled Data.

In [16]:
# start over.  grab the data.
team_data_new = team_data_new.reset_index().drop({"index"}, axis=1)
team_data_new.head()

Unnamed: 0,team,year,A,DP,E,G2,GS2,INN,PB,PO,...,SHO,SO1,SV,SVO,TBF,W,WHIP,WP,WPCT,winners
0,San Francisco Giants,2015,1639,136,72,162,1458,13143.0,6,4381,...,11,1309,43,72,6048,87,1.21,40,0.537,0
1,Washington Nationals,2015,1425,142,73,162,1458,13137.0,17,4379,...,12,1476,46,60,6036,95,1.19,47,0.586,0
2,Houston Astros,2015,1599,135,77,162,1458,13212.0,18,4404,...,8,1396,44,64,6180,84,1.29,98,0.519,0
3,Detroit Tigers,2015,1537,148,75,161,1449,12852.0,5,4284,...,8,1232,47,66,6048,86,1.32,44,0.534,0
4,Boston Red Sox,2015,1427,139,75,162,1458,12957.0,37,4319,...,5,1362,43,61,6073,93,1.27,52,0.574,0


In [17]:
# add columns back to scaled data.
features_scaled["winners"] = team_data_new["winners"]
features_scaled.head()

Unnamed: 0,A,DP,E,G2,GS2,INN,PB,PO,TC,2B,...,SHO,SO1,SV,SVO,TBF,W,WHIP,WP,WPCT,winners
0,0.079304,0.207792,0.0,0.0,0.0,0.642412,0.592593,0.451362,0.09329,0.547945,...,0.4,0.907643,0.4,0.444444,0.669192,0.326923,0.607143,0.338983,0.333333,0
1,0.816248,0.714286,0.2,0.5,0.5,0.56341,0.333333,0.59144,0.795417,0.0,...,0.6,0.194268,0.5,0.577778,0.449495,0.269231,0.535714,0.322034,0.267913,0
2,0.491296,0.662338,0.233333,0.5,0.5,0.665281,0.888889,0.474708,0.474632,0.178082,...,0.533333,0.509554,0.725,0.8,0.179293,0.576923,0.178571,0.338983,0.576324,0
3,0.0,0.233766,0.183333,0.5,0.5,0.60499,0.111111,0.346304,0.0,0.369863,...,0.533333,0.961783,0.3,0.333333,0.265152,0.519231,0.178571,0.271186,0.517134,0
4,0.2147,0.155844,0.2,0.5,0.5,0.37422,0.296296,0.0,0.037643,0.630137,...,0.666667,0.566879,0.95,1.0,0.106061,0.807692,0.142857,0.101695,0.806854,0


In [18]:
# upsample for a more balanced dataset.
def upsample(dataset, no_samples):
    '''
    INPUT: 
    -dataset = dataset without team names and year.
    -n_samples = number of minority_unsampled.
    
    OUTPUT:
    -X_train_scaled = scaled X train data.
    -X_test_scaled = scaled X test data.
    -y_train = y train data
    -y_test = y test data
    
    DESCRIPTION:
    -dataset is taken in and split into minority and majority classes.
    -dataset is then upsampled for the mainority class
    -split the data into features and targets
    -split data into train and test sets
    -train and test sets were are scaled.
    '''
    
    # separate majority and minority classes.
    df_majority = dataset.loc[dataset["winners"] == 0]
    df_minority = dataset.loc[dataset["winners"] == 1]

    # upsample minority class.
    df_minority_unsampled = resample(df_minority,
                                    replace=True,
                                    n_samples=no_samples,
                                    random_state=123)

    # combine majority class with upsampled minority class.
    df_upsampled = pd.concat([df_majority, df_minority_unsampled])

    # separate features and target.
    y = df_upsampled["winners"]
    X = df_upsampled[feature_columns]
    
    # split into train and test sets.
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    
    return X_train, X_test, y_train, y_test

In [19]:
# Do three different upsamplings.
X_train_100, X_test_100, y_train_100, y_test_100 = upsample(features_scaled, 2234)
X_train_50, X_test_50, y_train_50, y_test_50 = upsample(features_scaled, 1117)
X_train_25, X_test_25, y_train_25, y_test_25 = upsample(features_scaled, 559)

In [20]:
def fit_logistic(X_train, X_test, y_train, y_test):
    '''
    INPUT: 
    -X_train = scaled X train data.
    -X_test = scaled X test data.
    -y_train = y train data.
    -y_test = y test data.
    
    OUTPUT:
    -classification report (has F1 score, precision and recall).
    -model = saved model for prediction. 
    
    DESCRIPTION:
    -the scaled and split data is put through a logistic regression.
    -the model is trained.
    -a prediction is made.
    -print out the classification report and give the model.
    '''
    
    # fit the model.
    model = LogisticRegression(solver="lbfgs", max_iter= 2000)
    model.fit(X_train, y_train)

    # predict.
    prediction = model.predict(X_test)
    
    # print out the classification report
    print (classification_report(y_test, prediction, target_names=["0", "1"]))
    
    return model

In [21]:
# for 1 part 0 to 1 part 1
model_100 = fit_logistic(X_train_100, X_test_100, y_train_100, y_test_100)

              precision    recall  f1-score   support

           0       0.88      0.39      0.54       340
           1       0.71      0.97      0.82       524

   micro avg       0.74      0.74      0.74       864
   macro avg       0.80      0.68      0.68       864
weighted avg       0.78      0.74      0.71       864



In [22]:
# for 1 part 0 to 0.5 part 1
model_50 = fit_logistic(X_train_50, X_test_50, y_train_50, y_test_50)

              precision    recall  f1-score   support

           0       0.65      0.66      0.66       306
           1       0.62      0.62      0.62       279

   micro avg       0.64      0.64      0.64       585
   macro avg       0.64      0.64      0.64       585
weighted avg       0.64      0.64      0.64       585



In [23]:
# for 1 part 0 to 0.25 part 1
model_25 = fit_logistic(X_train_25, X_test_25, y_train_25, y_test_25)

              precision    recall  f1-score   support

           0       0.70      0.91      0.79       303
           1       0.47      0.17      0.25       142

   micro avg       0.67      0.67      0.67       445
   macro avg       0.59      0.54      0.52       445
weighted avg       0.63      0.67      0.62       445



Ok.  That looks a little better -- at least it is predicting 0 and 1s.  The model_100 seems to be most even between predicting 0 and 1s.  Use that one.

#### STEP4: Predict 2016-2018 winners.

In [24]:
def predict_the_winner(model, year, team_data, X_train):
    '''
    INPUT: 
    -X_train = scaled X train data.
    -model = the saved model.
    -team_data = complete dataframe with all data.
    -year = the year want to look at.
    
    OUTPUT:
    -printed prediction.
    
    DESCRIPTION:
    -data from year of interest is isolated.
    -the data are scaled.
    -the prediction is made.
    -print out the resulting probability and the name of the team.
    '''
    
    # grab the data.
    team_data = team_data.loc[team_data["year"] == year].reset_index()
    
    # set features (no team, year, winners).
    # set target (winners).
    features = team_data[feature_columns]
    
    # scale X_train and X_test.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    features = scaler.fit_transform(features)
    
    # fit the model.
    probabilities = model.predict_proba(features)

    # convert predictions to datafram.e
    WS_predictions = pd.DataFrame(probabilities[:,1])

    # Sort the DataFrame (descending)
    WS_predictions = WS_predictions.sort_values(0, ascending=False)

    WS_predictions['Probability'] = WS_predictions[0]

    # Print 50 highest probability HoF inductees from still eligible players
    for i, row in WS_predictions.head(50).iterrows():
       prob = ' '.join(('WS Probability =', str(row['Probability'])))
       print('')
       print(prob)
       print(team_data.iloc[i, :]["team"])

##### 2018 Prediction

In [27]:
# predict for 2018.
predict_the_winner(model_100, 2018, team_data, X_train_100)


WS Probability = 0.9927780156604565
Philadelphia Phillies

WS Probability = 0.9819942788460531
Miami Marlins

WS Probability = 0.972014980440298
Oakland Athletics

WS Probability = 0.9023238988516328
Tampa Bay Rays

WS Probability = 0.8407429242665494
Baltimore Orioles

WS Probability = 0.8402172355958232
Texas Rangers

WS Probability = 0.8242165787696276
Atlanta Braves

WS Probability = 0.8209098522866287
Kansas City Royals

WS Probability = 0.7585046691120783
Chicago Cubs

WS Probability = 0.586272314944325
New York Mets

WS Probability = 0.468153075273536
Detroit Tigers

WS Probability = 0.33665793265787064
Cincinnati Reds

WS Probability = 0.19678117415409416
Washington Nationals

WS Probability = 0.1615258480888124
Cleveland Indians

WS Probability = 0.13295899375093584
Chicago White Sox

WS Probability = 0.0941784441099002
San Francisco Giants

WS Probability = 0.08083653225176154
New York Yankees

WS Probability = 0.06743705261026949
Seattle Mariners

WS Probability = 0.0672847

This does not look good.  The Red Sox won 2018.

##### 2017 Prediction

In [28]:
# predict for 2017.
predict_the_winner(model_100, 2017, team_data, X_train_100)


WS Probability = 0.9967336849515465
Miami Marlins

WS Probability = 0.9944752215799231
Tampa Bay Rays

WS Probability = 0.9902382448435176
Oakland Athletics

WS Probability = 0.969011828210464
Chicago White Sox

WS Probability = 0.8750622492624983
Los Angeles Dodgers

WS Probability = 0.8641509495555776
New York Mets

WS Probability = 0.830455125758249
Chicago Cubs

WS Probability = 0.6762858629569851
Milwaukee Brewers

WS Probability = 0.6477262791026757
Boston Red Sox

WS Probability = 0.48353877841409476
Minnesota Twins

WS Probability = 0.3201697284303745
Arizona Diamondbacks

WS Probability = 0.26635265468839836
Texas Rangers

WS Probability = 0.22153392522098136
Washington Nationals

WS Probability = 0.20058765238736653
Seattle Mariners

WS Probability = 0.1908590242905949
Detroit Tigers

WS Probability = 0.1794072565127242
New York Yankees

WS Probability = 0.13702165314543413
San Francisco Giants

WS Probability = 0.12694632282316853
Los Angeles Angels

WS Probability = 0.0900

And the Astros won 2017. 

##### 2016 Prediction

In [29]:
# predict for 2016.
predict_the_winner(model_100, 2016, team_data, X_train_100)


WS Probability = 0.9945919337880463
Philadelphia Phillies

WS Probability = 0.9935986506291473
Minnesota Twins

WS Probability = 0.9538995899499126
Tampa Bay Rays

WS Probability = 0.9156203174776603
Seattle Mariners

WS Probability = 0.9109868019778568
Los Angeles Dodgers

WS Probability = 0.8875702652024116
Milwaukee Brewers

WS Probability = 0.8224077673522732
Chicago White Sox

WS Probability = 0.8174877041807032
Chicago Cubs

WS Probability = 0.8156650559710545
Baltimore Orioles

WS Probability = 0.7895317622726482
Texas Rangers

WS Probability = 0.7762134145403885
New York Yankees

WS Probability = 0.7297103764304513
Cincinnati Reds

WS Probability = 0.6645576277107077
Miami Marlins

WS Probability = 0.4603695059313815
Detroit Tigers

WS Probability = 0.30182306858339075
Los Angeles Angels

WS Probability = 0.24451176983272813
St. Louis Cardinals

WS Probability = 0.20230216347931793
San Diego Padres

WS Probability = 0.18861618736932317
Pittsburgh Pirates

WS Probability = 0.11

The Cubs won 2016. I want to try another model--want to improve F1 score.  Also, minmax scaling doesn't do anything else.