## Purpose: Try different models-- Part2.
### Logistic Regression with scaling.

In [1]:
# import dependencies.
import pandas as pd
import numpy as np

from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score, classification_report
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings(module="sklearn*", action="ignore")

#### STEP1: Read in dataset.  Remove data from 2016-2019.
- data from 2016-2018 will be used to bs test the model.
- data from 2019 will be used to predict the winners of the 2019 WS.

In [2]:
# read in the data.
team_data = pd.read_csv("../../Resources/clean_data_1905.csv")
del team_data["Unnamed: 0"]
team_data.head()

Unnamed: 0,team,year,A,DP,E,G2,GS2,INN,PB,PO,...,R1,SHO,SO1,SV,TBF,W,WHIP,WP,WPCT,winners
0,St. Louis Cardinals,2019,1033,114,43,104,936,8313.0,3,2771,...,456,4,895,33,3896,56,1.29,21,0.538,0
1,Arizona Diamondbacks,2019,1010,83,45,105,945,8538.0,2,2846,...,472,7,925,24,4001,53,1.28,35,0.505,0
2,Kansas City Royals,2019,990,105,45,106,954,8421.0,6,2807,...,543,5,816,24,4125,39,1.46,34,0.368,0
3,Houston Astros,2019,875,54,50,106,954,8589.0,6,2863,...,432,7,1074,27,3929,67,1.14,31,0.632,0
4,Tampa Bay Rays,2019,975,92,53,107,963,8760.0,11,2920,...,409,6,1037,26,3985,59,1.16,40,0.551,0


In [3]:
# remove data from 2016 through 2019.
team_data_new = team_data.loc[team_data["year"] < 2016]
team_data_new.head()

Unnamed: 0,team,year,A,DP,E,G2,GS2,INN,PB,PO,...,R1,SHO,SO1,SV,TBF,W,WHIP,WP,WPCT,winners
120,San Francisco Giants,2015,1639,136,72,162,1458,13143.0,6,4381,...,631,11,1309,43,6048,87,1.21,40,0.537,0
121,Washington Nationals,2015,1425,142,73,162,1458,13137.0,17,4379,...,612,12,1476,46,6036,95,1.19,47,0.586,0
122,Houston Astros,2015,1599,135,77,162,1458,13212.0,18,4404,...,701,8,1396,44,6180,84,1.29,98,0.519,0
123,Detroit Tigers,2015,1537,148,75,161,1449,12852.0,5,4284,...,721,8,1232,47,6048,86,1.32,44,0.534,0
124,Boston Red Sox,2015,1427,139,75,162,1458,12957.0,37,4319,...,694,5,1362,43,6073,93,1.27,52,0.574,0


In [4]:
# models require numbers! Drop team and year columns.
target = team_data_new["winners"]
features = team_data_new.drop({"team", "year", "winners"}, axis=1)
feature_columns = list(features.columns)
print (target.shape)
print (features.shape)
print (feature_columns)

(2344,)
(2344, 52)
['A', 'DP', 'E', 'G2', 'GS2', 'INN', 'PB', 'PO', 'TC', '2B', '3B', 'AB', 'AO', 'BB', 'G', 'H', 'HBP', 'HR', 'NP_x', 'OBP', 'OPS_x', 'PA', 'R', 'RBI', 'SAC', 'SB', 'SLG', 'TB', 'XBH', 'BB1', 'BK', 'CG', 'ER', 'ERA', 'G1', 'GF', 'GS', 'H1', 'HB', 'HR1', 'IP', 'L', 'OBP1', 'R1', 'SHO', 'SO1', 'SV', 'TBF', 'W', 'WHIP', 'WP', 'WPCT']


In [5]:
# split data to train and test sets
X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=42)

#### STEP2: Try Logistic Regression.

In [6]:
# reshape the target data.
target1 = target.values.reshape(-1, 1)

In [7]:
# split data into train and test sets.
X_train1, X_test1, y_train1, y_test1 = train_test_split(features, target1, random_state=42)

In [8]:
# create standard scaler model and fit the training data.
X_scaler = StandardScaler().fit(X_train1)
y_scaler = StandardScaler().fit(y_train1)

# transform the training and testing data.
X_train_scaled = X_scaler.transform(X_train1)
X_test_scaled = X_scaler.transform(X_test1)



In [9]:
# fit the model.
model = LogisticRegression(solver="lbfgs")
model.fit(X_train_scaled, y_train1)

# predict.
prediction = model.predict(X_test_scaled)

# evaluate the model.
mse = mean_squared_error(y_test1, prediction)
r2 = model.score(X_test_scaled, y_test1)
print (f"MSE: {mse}")
print (f"R2: {r2}")

MSE: 0.05631399317406143
R2: 0.9436860068259386


Wow.  That's a pretty good fit!  Since the scores are so high, try bs test with 2016-2018 data. But wait...  

In [10]:
print (classification_report(y_test1, prediction, target_names=["0", "1"]))

              precision    recall  f1-score   support

           0       0.95      1.00      0.97       553
           1       0.50      0.03      0.06        33

   micro avg       0.94      0.94      0.94       586
   macro avg       0.72      0.51      0.51       586
weighted avg       0.92      0.94      0.92       586



Ah hah!.  The model is predicting 0's correctly but only predicts 1's 50% of the time.  And it isn't terribly accurate.  That means that the two classes (0, 1) are very imbalanced.  Need to address this one.

#### STEP3: Try MinMaxScaler with upsampling

In [11]:
# start over.  grab the data.
team_data_new = team_data_new.reset_index().drop({"index"}, axis=1)
team_data_new.head()

Unnamed: 0,team,year,A,DP,E,G2,GS2,INN,PB,PO,...,R1,SHO,SO1,SV,TBF,W,WHIP,WP,WPCT,winners
0,San Francisco Giants,2015,1639,136,72,162,1458,13143.0,6,4381,...,631,11,1309,43,6048,87,1.21,40,0.537,0
1,Washington Nationals,2015,1425,142,73,162,1458,13137.0,17,4379,...,612,12,1476,46,6036,95,1.19,47,0.586,0
2,Houston Astros,2015,1599,135,77,162,1458,13212.0,18,4404,...,701,8,1396,44,6180,84,1.29,98,0.519,0
3,Detroit Tigers,2015,1537,148,75,161,1449,12852.0,5,4284,...,721,8,1232,47,6048,86,1.32,44,0.534,0
4,Boston Red Sox,2015,1427,139,75,162,1458,12957.0,37,4319,...,694,5,1362,43,6073,93,1.27,52,0.574,0


In [12]:
team_data_new.head()

Unnamed: 0,team,year,A,DP,E,G2,GS2,INN,PB,PO,...,R1,SHO,SO1,SV,TBF,W,WHIP,WP,WPCT,winners
0,San Francisco Giants,2015,1639,136,72,162,1458,13143.0,6,4381,...,631,11,1309,43,6048,87,1.21,40,0.537,0
1,Washington Nationals,2015,1425,142,73,162,1458,13137.0,17,4379,...,612,12,1476,46,6036,95,1.19,47,0.586,0
2,Houston Astros,2015,1599,135,77,162,1458,13212.0,18,4404,...,701,8,1396,44,6180,84,1.29,98,0.519,0
3,Detroit Tigers,2015,1537,148,75,161,1449,12852.0,5,4284,...,721,8,1232,47,6048,86,1.32,44,0.534,0
4,Boston Red Sox,2015,1427,139,75,162,1458,12957.0,37,4319,...,694,5,1362,43,6073,93,1.27,52,0.574,0


In [13]:
def scale_data(team_data_new):

    # separate into features and target values.
    target = team_data_new["winners"]
    features = team_data_new.drop({"team", "winners"}, axis=1)

    # apply annual minmaxscaler to features.
    features_scaled = pd.DataFrame()
    for year in list(set(features["year"])):
        df = features.loc[features["year"] == year]
        df = df.select_dtypes(include=["float", "int"])

        scaler = MinMaxScaler()
        features_transformed = scaler.fit_transform(df)
        features_scaled = features_scaled.append(pd.DataFrame(features_transformed))

    # grab column names, put back to features_scaled.
    features_scaled.columns = features.select_dtypes(include=["float", "int"]).columns
    features_scaled.reset_index(drop=True, inplace=True)

    # remove year column.
    features_scaled = features_scaled.drop({"year"}, axis=1)

    return features_scaled, target

In [14]:
features_scaled, target = scale_data(team_data_new)

In [15]:
# split into train and test sets.
X_train, X_test, y_train, y_test = train_test_split(features_scaled, target, random_state=42)
    
# fit the model.
model = LogisticRegression(solver="lbfgs")
model.fit(X_train, y_train)

# predict.
prediction = model.predict(X_test)

print (classification_report(y_test, prediction, target_names=["0", "1"]))

              precision    recall  f1-score   support

           0       0.94      1.00      0.97       553
           1       0.00      0.00      0.00        33

   micro avg       0.94      0.94      0.94       586
   macro avg       0.47      0.50      0.49       586
weighted avg       0.89      0.94      0.92       586



#### STEP3: Try Logistic Regression with Upsampled Data.

In [16]:
# start over.  grab the data.
team_data_new = team_data_new.reset_index().drop({"index"}, axis=1)
team_data_new.head()

Unnamed: 0,team,year,A,DP,E,G2,GS2,INN,PB,PO,...,R1,SHO,SO1,SV,TBF,W,WHIP,WP,WPCT,winners
0,San Francisco Giants,2015,1639,136,72,162,1458,13143.0,6,4381,...,631,11,1309,43,6048,87,1.21,40,0.537,0
1,Washington Nationals,2015,1425,142,73,162,1458,13137.0,17,4379,...,612,12,1476,46,6036,95,1.19,47,0.586,0
2,Houston Astros,2015,1599,135,77,162,1458,13212.0,18,4404,...,701,8,1396,44,6180,84,1.29,98,0.519,0
3,Detroit Tigers,2015,1537,148,75,161,1449,12852.0,5,4284,...,721,8,1232,47,6048,86,1.32,44,0.534,0
4,Boston Red Sox,2015,1427,139,75,162,1458,12957.0,37,4319,...,694,5,1362,43,6073,93,1.27,52,0.574,0


In [17]:
# add columns back to scaled data.
features_scaled["winners"] = team_data_new["winners"]
features_scaled.head()

Unnamed: 0,A,DP,E,G2,GS2,INN,PB,PO,TC,2B,...,R1,SHO,SO1,SV,TBF,W,WHIP,WP,WPCT,winners
0,0.276442,0.738095,0.0,0.75,0.05,0.196998,0.142857,0.711111,0.444947,0.432692,...,0.0,0.928571,0.832143,0.555556,0.411184,1.0,0.0,0.1,1.0,0
1,0.829327,1.0,0.135593,1.0,1.0,1.0,0.428571,1.0,0.972851,1.0,...,0.306061,0.821429,0.217857,0.222222,0.848684,0.597015,0.2,0.25,0.593258,0
2,0.71875,0.357143,0.169492,0.5,0.75,0.633677,0.571429,0.572222,0.677225,0.25,...,0.384848,0.535714,0.607143,1.0,0.404605,0.701493,0.5,0.3,0.705618,0
3,0.415865,0.952381,0.152542,0.625,0.85,0.745779,0.571429,0.502778,0.446456,0.269231,...,0.251515,0.821429,0.225,0.111111,0.279605,0.656716,0.3,0.4,0.651685,0
4,1.0,0.261905,0.313559,0.625,0.85,0.825985,0.321429,0.794444,1.0,0.153846,...,0.239394,1.0,0.264286,0.222222,0.141447,0.656716,0.066667,0.75,0.669663,0


In [28]:
# upsample for a more balanced dataset.
def upsample(dataset, no_samples):
    '''
    INPUT: 
    -dataset = dataset without team names and year.
    -n_samples = number of minority_unsampled.
    
    OUTPUT:
    -X_train_scaled = scaled X train data.
    -X_test_scaled = scaled X test data.
    -y_train = y train data
    -y_test = y test data
    
    DESCRIPTION:
    -dataset is taken in and split into minority and majority classes.
    -dataset is then upsampled for the mainority class
    -split the data into features and targets
    -split data into train and test sets
    -train and test sets were are scaled.
    '''
    
    # separate majority and minority classes.
    df_majority = dataset.loc[dataset["winners"] == 0]
    df_minority = dataset.loc[dataset["winners"] == 1]

    # upsample minority class.
    df_minority_unsampled = resample(df_minority,
                                    replace=True,
                                    n_samples=no_samples,
                                    random_state=123)

    # combine majority class with upsampled minority class.
    df_upsampled = pd.concat([df_majority, df_minority_unsampled])

    # separate features and target.
    y = df_upsampled["winners"]
    X = df_upsampled[feature_columns]
    
    # split into train and test sets.
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    
    return X_train, X_test, y_train, y_test

In [29]:
# Do three different upsamplings.
X_train_100, X_test_100, y_train_100, y_test_100 = upsample(features_scaled, 2234)
X_train_50, X_test_50, y_train_50, y_test_50 = upsample(features_scaled, 1117)
X_train_25, X_test_25, y_train_25, y_test_25 = upsample(features_scaled, 559)

In [30]:
def fit_logistic(X_train, X_test, y_train, y_test):
    '''
    INPUT: 
    -X_train = scaled X train data.
    -X_test = scaled X test data.
    -y_train = y train data.
    -y_test = y test data.
    
    OUTPUT:
    -classification report (has F1 score, precision and recall).
    -model = saved model for prediction. 
    
    DESCRIPTION:
    -the scaled and split data is put through a logistic regression.
    -the model is trained.
    -a prediction is made.
    -print out the classification report and give the model.
    '''
    
    # fit the model.
    model = LogisticRegression(solver="lbfgs", max_iter= 2000)
    model.fit(X_train, y_train)

    # predict.
    prediction = model.predict(X_test)
    
    # print out the classification report
    print (classification_report(y_test, prediction, target_names=["0", "1"]))
    
    return model

In [31]:
# for 1 part 0 to 1 part 1
model_100 = fit_logistic(X_train_100, X_test_100, y_train_100, y_test_100)

              precision    recall  f1-score   support

           0       0.62      0.62      0.62       573
           1       0.60      0.60      0.60       544

   micro avg       0.61      0.61      0.61      1117
   macro avg       0.61      0.61      0.61      1117
weighted avg       0.61      0.61      0.61      1117



In [32]:
# for 1 part 0 to 0.5 part 1
model_50 = fit_logistic(X_train_50, X_test_50, y_train_50, y_test_50)

              precision    recall  f1-score   support

           0       0.70      0.94      0.80       570
           1       0.54      0.16      0.25       268

   micro avg       0.69      0.69      0.69       838
   macro avg       0.62      0.55      0.52       838
weighted avg       0.65      0.69      0.63       838



In [33]:
# for 1 part 0 to 0.25 part 1
model_25 = fit_logistic(X_train_25, X_test_25, y_train_25, y_test_25)

              precision    recall  f1-score   support

           0       0.80      1.00      0.89       556
           1       1.00      0.01      0.01       143

   micro avg       0.80      0.80      0.80       699
   macro avg       0.90      0.50      0.45       699
weighted avg       0.84      0.80      0.71       699



Ok.  That looks a little better -- at least it is predicting 0 and 1s.  The model_100 seems to be most even between predicting 0 and 1s.  Use that one.

#### STEP4: Predict 2016-2018 winners.

In [34]:
def predict_the_winner(model, year, team_data, X_train):
    '''
    INPUT: 
    -X_train = scaled X train data.
    -model = the saved model.
    -team_data = complete dataframe with all data.
    -year = the year want to look at.
    
    OUTPUT:
    -printed prediction.
    
    DESCRIPTION:
    -data from year of interest is isolated.
    -the data are scaled.
    -the prediction is made.
    -print out the resulting probability and the name of the team.
    '''
    
    # grab the data.
    team_data = team_data.loc[team_data["year"] == year].reset_index()
    
    # set features (no team, year, winners).
    # set target (winners).
    features = team_data[feature_columns]
    
    # scale X_train and X_test.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    features = scaler.fit_transform(features)
    
    # fit the model.
    probabilities = model.predict_proba(features)

    # convert predictions to datafram.e
    WS_predictions = pd.DataFrame(probabilities[:,1])

    # Sort the DataFrame (descending)
    WS_predictions = WS_predictions.sort_values(0, ascending=False)

    WS_predictions['Probability'] = WS_predictions[0]

    # Print 50 highest probability HoF inductees from still eligible players
    for i, row in WS_predictions.head(50).iterrows():
       prob = ' '.join(('WS Probability =', str(row['Probability'])))
       print('')
       print(prob)
       print(team_data.iloc[i, :]["team"])

##### 2018 Prediction

In [35]:
# predict for 2018.
predict_the_winner(model_100, 2018, team_data, X_train_100)


WS Probability = 0.9975794655717883
Detroit Tigers

WS Probability = 0.9959083981506428
St. Louis Cardinals

WS Probability = 0.9941698360822696
Cincinnati Reds

WS Probability = 0.9926828366599859
Chicago White Sox

WS Probability = 0.988583727896264
Baltimore Orioles

WS Probability = 0.9699691786267038
New York Yankees

WS Probability = 0.9599357252922823
Minnesota Twins

WS Probability = 0.956121272014548
Philadelphia Phillies

WS Probability = 0.9484819085879085
San Diego Padres

WS Probability = 0.9482318897331948
Los Angeles Angels

WS Probability = 0.944800524443883
Chicago Cubs

WS Probability = 0.9442227787034219
Washington Nationals

WS Probability = 0.9338441241684621
Oakland Athletics

WS Probability = 0.9237707570442406
Toronto Blue Jays

WS Probability = 0.9114868579603316
Los Angeles Dodgers

WS Probability = 0.9061417941024638
Miami Marlins

WS Probability = 0.9042634352946205
Pittsburgh Pirates

WS Probability = 0.8928922850810614
Seattle Mariners

WS Probability = 0

This does not look good.  The Red Sox won 2018.

##### 2017 Prediction

In [36]:
# predict for 2017.
predict_the_winner(model_100, 2017, team_data, X_train_100)


WS Probability = 0.9976771620111261
Cincinnati Reds

WS Probability = 0.9964880877795317
Texas Rangers

WS Probability = 0.995137080797179
St. Louis Cardinals

WS Probability = 0.9869887943810981
Minnesota Twins

WS Probability = 0.9824046349273565
Detroit Tigers

WS Probability = 0.9803267265141581
Baltimore Orioles

WS Probability = 0.9793174130026131
Kansas City Royals

WS Probability = 0.9750706162673903
Miami Marlins

WS Probability = 0.9714227495067738
Toronto Blue Jays

WS Probability = 0.9683278605862046
Oakland Athletics

WS Probability = 0.9618523695857589
Atlanta Braves

WS Probability = 0.9303566972543154
Pittsburgh Pirates

WS Probability = 0.9149312510868394
Tampa Bay Rays

WS Probability = 0.9068681161094939
Chicago Cubs

WS Probability = 0.9015484911119062
Washington Nationals

WS Probability = 0.8959187104555753
New York Mets

WS Probability = 0.877695243900447
Los Angeles Angels

WS Probability = 0.8555277081574868
San Francisco Giants

WS Probability = 0.83855815923

And the Astros won 2017. 

##### 2016 Prediction

In [37]:
# predict for 2016.
predict_the_winner(model_100, 2016, team_data, X_train_100)


WS Probability = 0.9922688704768011
Oakland Athletics

WS Probability = 0.990484529307817
Texas Rangers

WS Probability = 0.9885874748273272
Cincinnati Reds

WS Probability = 0.9842633583141229
New York Mets

WS Probability = 0.9812421229540422
Minnesota Twins

WS Probability = 0.9774026463411918
Kansas City Royals

WS Probability = 0.9724185473649842
Detroit Tigers

WS Probability = 0.9682397292467926
Atlanta Braves

WS Probability = 0.9655416727872286
St. Louis Cardinals

WS Probability = 0.9607905786752322
Chicago White Sox

WS Probability = 0.9566913259907093
Toronto Blue Jays

WS Probability = 0.9553898838060676
Seattle Mariners

WS Probability = 0.9507513906713907
Baltimore Orioles

WS Probability = 0.9485221496814161
San Diego Padres

WS Probability = 0.9137292124866053
Colorado Rockies

WS Probability = 0.8995159905218364
Pittsburgh Pirates

WS Probability = 0.8980513140930948
Los Angeles Angels

WS Probability = 0.8611567602883019
San Francisco Giants

WS Probability = 0.8459

The Cubs won 2016.  Nope.