## Purpose: Try different models-- Part2.
### Logistic Regression with scaling.

In [1]:
# import dependencies.
import pandas as pd
import numpy as np

from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score, classification_report
from sklearn.model_selection import train_test_split

#### STEP1: Read in dataset.  Remove data from 2016-2019.
- data from 2016-2018 will be used to bs test the model.
- data from 2019 will be used to predict the winners of the 2019 WS.

In [2]:
# read in the data.
team_data = pd.read_csv("../Resources/clean_data.csv")
del team_data["Unnamed: 0"]
team_data.head()

Unnamed: 0,team,year,A,DP,E,G2,GS2,INN,PB,PO,...,R1,SHO,SO1,SV,TBF,W,WHIP,WP,WPCT,winners
0,St. Louis Cardinals,2019,1033,114,43,104,936,8313.0,3,2771,...,456,4,895,33,3896,56,1.29,21,0.538,0
1,Arizona Diamondbacks,2019,1010,83,45,105,945,8538.0,2,2846,...,472,7,925,24,4001,53,1.28,35,0.505,0
2,Kansas City Royals,2019,990,105,45,106,954,8421.0,6,2807,...,543,5,816,24,4125,39,1.46,34,0.368,0
3,Houston Astros,2019,875,54,50,106,954,8589.0,6,2863,...,432,7,1074,27,3929,67,1.14,31,0.632,0
4,Tampa Bay Rays,2019,975,92,53,107,963,8760.0,11,2920,...,409,6,1037,26,3985,59,1.16,40,0.551,0


In [3]:
# remove data from 2016 through 2019.
team_data_new = team_data.loc[team_data["year"] < 2016]
team_data_new.head()

Unnamed: 0,team,year,A,DP,E,G2,GS2,INN,PB,PO,...,R1,SHO,SO1,SV,TBF,W,WHIP,WP,WPCT,winners
120,San Francisco Giants,2015,1639,136,72,162,1458,13143.0,6,4381,...,631,11,1309,43,6048,87,1.21,40,0.537,0
121,Washington Nationals,2015,1425,142,73,162,1458,13137.0,17,4379,...,612,12,1476,46,6036,95,1.19,47,0.586,0
122,Houston Astros,2015,1599,135,77,162,1458,13212.0,18,4404,...,701,8,1396,44,6180,84,1.29,98,0.519,0
123,Detroit Tigers,2015,1537,148,75,161,1449,12852.0,5,4284,...,721,8,1232,47,6048,86,1.32,44,0.534,0
124,Boston Red Sox,2015,1427,139,75,162,1458,12957.0,37,4319,...,694,5,1362,43,6073,93,1.27,52,0.574,0


In [4]:
# models require numbers! Drop team and year columns.
target = team_data_new["winners"]
features = team_data_new.drop({"team", "year", "winners"}, axis=1)
print (target.shape)
print (features.shape)

(2344,)
(2344, 52)


In [5]:
# split data to train and test sets
X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=42)

#### STEP2: Try Logistic Regression.

In [6]:
# reshape the target data.
target1 = target.values.reshape(-1, 1)

In [7]:
# split data into train and test sets.
X_train1, X_test1, y_train1, y_test1 = train_test_split(features, target1, random_state=42)

In [8]:
# create standard scaler model and fit the training data.
X_scaler = StandardScaler().fit(X_train1)
y_scaler = StandardScaler().fit(y_train1)

# transform the training and testing data.
X_train_scaled = X_scaler.transform(X_train1)
X_test_scaled = X_scaler.transform(X_test1)

  return self.partial_fit(X, y)


In [9]:
# fit the model.
model = LogisticRegression(solver="lbfgs")
model.fit(X_train_scaled, y_train1)

# predict.
prediction = model.predict(X_test_scaled)

# evaluate the model.
mse = mean_squared_error(y_test1, prediction)
r2 = model.score(X_test_scaled, y_test1)
print (f"MSE: {mse}")
print (f"R2: {r2}")

MSE: 0.05631399317406143
R2: 0.9436860068259386


  y = column_or_1d(y, warn=True)


Wow.  That's a pretty good fit!  Since the scores are so high, try bs test with 2016-2018 data. But wait...  

In [10]:
print (classification_report(y_test1, prediction, target_names=["0", "1"]))

              precision    recall  f1-score   support

           0       0.95      1.00      0.97       553
           1       0.50      0.03      0.06        33

   micro avg       0.94      0.94      0.94       586
   macro avg       0.72      0.51      0.51       586
weighted avg       0.92      0.94      0.92       586



Ah hah!.  The model is predicting 0's correctly but only predicts 1's 50% of the time.  And it isn't terribly accurate.  That means that the two classes (0, 1) are very imbalanced.  Need to address this one.

#### STEP3: Try Logistic Regression with Upsampled Data.

In [12]:
# start over.  grab the data.
team_data_new = team_data_new.reset_index().drop({"index"}, axis=1)
team_data_new.head()

Unnamed: 0,team,year,A,DP,E,G2,GS2,INN,PB,PO,...,R1,SHO,SO1,SV,TBF,W,WHIP,WP,WPCT,winners
0,San Francisco Giants,2015,1639,136,72,162,1458,13143.0,6,4381,...,631,11,1309,43,6048,87,1.21,40,0.537,0
1,Washington Nationals,2015,1425,142,73,162,1458,13137.0,17,4379,...,612,12,1476,46,6036,95,1.19,47,0.586,0
2,Houston Astros,2015,1599,135,77,162,1458,13212.0,18,4404,...,701,8,1396,44,6180,84,1.29,98,0.519,0
3,Detroit Tigers,2015,1537,148,75,161,1449,12852.0,5,4284,...,721,8,1232,47,6048,86,1.32,44,0.534,0
4,Boston Red Sox,2015,1427,139,75,162,1458,12957.0,37,4319,...,694,5,1362,43,6073,93,1.27,52,0.574,0


In [13]:
# remove team and year from the df.
team_data_new = team_data_new.drop({"team", "year"}, axis=1)
team_data_new.head()

Unnamed: 0,A,DP,E,G2,GS2,INN,PB,PO,TC,2B,...,R1,SHO,SO1,SV,TBF,W,WHIP,WP,WPCT,winners
0,1639,136,72,162,1458,13143.0,6,4381,6092,280,...,631,11,1309,43,6048,87,1.21,40,0.537,0
1,1425,142,73,162,1458,13137.0,17,4379,5877,268,...,612,12,1476,46,6036,95,1.19,47,0.586,0
2,1599,135,77,162,1458,13212.0,18,4404,6080,291,...,701,8,1396,44,6180,84,1.29,98,0.519,0
3,1537,148,75,161,1449,12852.0,5,4284,5896,252,...,721,8,1232,47,6048,86,1.32,44,0.534,0
4,1427,139,75,162,1458,12957.0,37,4319,5821,343,...,694,5,1362,43,6073,93,1.27,52,0.574,0


In [14]:
# separate majority and minority classes.
df_majority = team_data_new.loc[team_data_new["winners"] == 0]
df_minority = team_data_new.loc[team_data_new["winners"] == 1]

# upsample minority class.
df_minority_unsampled = resample(df_minority,
                                replace=True,
                                n_samples=2234,
                                random_state=123)

# combine majority class with upsampled minority class.
df_upsampled = pd.concat([df_majority, df_minority_unsampled])

# display new class counts.
df_upsampled["winners"].value_counts()

1    2234
0    2234
Name: winners, dtype: int64

In [15]:
# separate features and target.
y = df_upsampled["winners"].values.reshape(-1,1)
X = df_upsampled.drop("winners", axis=1)
print (y.shape)
print (X.shape)

# split into train and test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# scale X_train and X_test.
scaler = StandardScaler()

# transform the training and testing data.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

# fit the model.
model = LogisticRegression(solver="lbfgs")
model.fit(X_train_scaled, y_train)

# predict.
prediction = model.predict(X_test_scaled)

(4468, 1)
(4468, 52)


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  y = column_or_1d(y, warn=True)


In [16]:
print (classification_report(y_test, prediction, target_names=["0", "1"]))

              precision    recall  f1-score   support

           0       0.75      0.68      0.71       573
           1       0.69      0.76      0.72       544

   micro avg       0.72      0.72      0.72      1117
   macro avg       0.72      0.72      0.72      1117
weighted avg       0.72      0.72      0.72      1117



Ok.  That looks a little better -- at least it is predicting 0 and 1s.  Let's try this model on the bs test data (2016, 2018).

#### STEP4: Predict 2016-2018 winners.

##### 2018 Prediction

In [18]:
# grab the 2018 data.
team_data_2018 = team_data.loc[team_data["year"] == 2018].reset_index()

# set features (no team, year, winners).
# set target (winners).
features_2018 = team_data_2018.drop({"team", "year", "winners"}, axis=1).drop({"index"}, axis=1)
features_2018 = scaler.fit_transform(features_2018)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [19]:
# fit the model.
probabilities = model.predict_proba(features_2018)

# convert predictions to datafram.e
WS_predictions = pd.DataFrame(probabilities[:,1])

# Sort the DataFrame (descending)
WS_predictions = WS_predictions.sort_values(0, ascending=False)

WS_predictions['Probability'] = WS_predictions[0]

# Print 50 highest probability HoF inductees from still eligible players
for i, row in WS_predictions.head(50).iterrows():
   prob = ' '.join(('WS Probability =', str(row['Probability'])))
   print('')
   print(prob)
   print(team_data_2018.iloc[i,1:27]["team"])


WS Probability = 0.9983351763525983
Los Angeles Dodgers

WS Probability = 0.9977231764350494
Houston Astros

WS Probability = 0.9967250519115325
Cleveland Indians

WS Probability = 0.9957379480686269
Minnesota Twins

WS Probability = 0.9938146345121649
Washington Nationals

WS Probability = 0.9935808532671789
Cincinnati Reds

WS Probability = 0.9864768712747255
St. Louis Cardinals

WS Probability = 0.9388161722525327
Arizona Diamondbacks

WS Probability = 0.8967298864009554
Oakland Athletics

WS Probability = 0.7757335563561325
Los Angeles Angels

WS Probability = 0.7587494683393395
Tampa Bay Rays

WS Probability = 0.7534313234238927
Boston Red Sox

WS Probability = 0.7163743981423119
Atlanta Braves

WS Probability = 0.7041543443128453
New York Yankees

WS Probability = 0.32746264936640035
Miami Marlins

WS Probability = 0.2664533155931836
New York Mets

WS Probability = 0.17631643041328735
Texas Rangers

WS Probability = 0.16770315667388888
Chicago White Sox

WS Probability = 0.10138

This does not look good.  The Red Sox won 2018.

##### 2017 Prediction

In [20]:
# grab the 2017 data.
team_data_2017 = team_data.loc[team_data["year"] == 2017].reset_index()

# set features (no team, year, winners).
# set target (winners).
features_2017 = team_data_2017.drop({"team", "year", "winners"}, axis=1).drop({"index"}, axis=1)
features_2017 = scaler.fit_transform(features_2017)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [21]:
# fit the model.
probabilities = model.predict_proba(features_2017)

# convert predictions to datafram.e
WS_predictions = pd.DataFrame(probabilities[:,1])

# Sort the DataFrame (descending)
WS_predictions = WS_predictions.sort_values(0, ascending=False)

WS_predictions['Probability'] = WS_predictions[0]

# Print 50 highest probability HoF inductees from still eligible players
for i, row in WS_predictions.head(50).iterrows():
   prob = ' '.join(('WS Probability =', str(row['Probability'])))
   print('')
   print(prob)
   print(team_data_2017.iloc[i,1:27]["team"])


WS Probability = 0.9999893579631512
Houston Astros

WS Probability = 0.9999761101573533
Cleveland Indians

WS Probability = 0.9964428488106882
Boston Red Sox

WS Probability = 0.995360294533806
Atlanta Braves

WS Probability = 0.9936723328401077
Washington Nationals

WS Probability = 0.9854765060721314
New York Yankees

WS Probability = 0.9843113938433664
Oakland Athletics

WS Probability = 0.9831887388557188
Los Angeles Angels

WS Probability = 0.9681217296745668
Los Angeles Dodgers

WS Probability = 0.9679421620264258
Tampa Bay Rays

WS Probability = 0.8294825796777769
Seattle Mariners

WS Probability = 0.6661839741218599
New York Mets

WS Probability = 0.6101226188274288
Pittsburgh Pirates

WS Probability = 0.33717946739735677
Minnesota Twins

WS Probability = 0.33377447629965434
Detroit Tigers

WS Probability = 0.18202796335504068
Arizona Diamondbacks

WS Probability = 0.15992975145160662
Milwaukee Brewers

WS Probability = 0.1523032206252995
Colorado Rockies

WS Probability = 0.1

And the Astros won 2017.  Huh.  This one worked.

##### 2016 Prediction

In [22]:
# grab the 2016 data.
team_data_2016 = team_data.loc[team_data["year"] == 2016].reset_index()

# set features (no team, year, winners).
# set target (winners).
features_2016 = team_data_2016.drop({"team", "year", "winners"}, axis=1).drop({"index"}, axis=1)
features_2016 = scaler.fit_transform(features_2016)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [23]:
# fit the model.
probabilities = model.predict_proba(features_2016)

# convert predictions to datafram.e
WS_predictions = pd.DataFrame(probabilities[:,1])

# Sort the DataFrame (descending)
WS_predictions = WS_predictions.sort_values(0, ascending=False)

WS_predictions['Probability'] = WS_predictions[0]

# Print 50 highest probability HoF inductees from still eligible players
for i, row in WS_predictions.head(50).iterrows():
   prob = ' '.join(('WS Probability =', str(row['Probability'])))
   print('')
   print(prob)
   print(team_data_2016.iloc[i,1:27]["team"])


WS Probability = 0.9999997672797294
Cleveland Indians

WS Probability = 0.9988420351702614
Houston Astros

WS Probability = 0.9986891840652153
Los Angeles Dodgers

WS Probability = 0.9983061803845389
New York Yankees

WS Probability = 0.9932132987913331
Washington Nationals

WS Probability = 0.9872386949647212
Minnesota Twins

WS Probability = 0.9661466124284565
Boston Red Sox

WS Probability = 0.8114139310491425
Kansas City Royals

WS Probability = 0.8072916329738508
Arizona Diamondbacks

WS Probability = 0.7310464782094956
Los Angeles Angels

WS Probability = 0.7296303883884093
St. Louis Cardinals

WS Probability = 0.6875986733904021
Seattle Mariners

WS Probability = 0.5642080844967279
Detroit Tigers

WS Probability = 0.49891318058705786
Chicago Cubs

WS Probability = 0.28181116756568275
Tampa Bay Rays

WS Probability = 0.2496846079230339
New York Mets

WS Probability = 0.20883075004149265
Oakland Athletics

WS Probability = 0.20042333719696123
Colorado Rockies

WS Probability = 0.

The Cubs won 2016.  Two of three predictions were incorrect.  I want to try another model--want to improve F1 score.