## Purpose: Try different models-- Part3.
### Grid search with scaling.

In [1]:
# import dependencies.
import pandas as pd
import numpy as np

from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.decomposition import PCA

#### STEP1: Read in dataset.  Remove data from 2016-2019.
- data from 2016-2018 will be used to bs test the model.
- data from 2019 will be used to predict the winners of the 2019 WS.

In [2]:
# read in the data.
team_data = pd.read_csv("../Resources/clean_data.csv")
del team_data["Unnamed: 0"]
team_data.head()

Unnamed: 0,team,year,A,DP,E,G2,GS2,INN,PB,PO,...,R1,SHO,SO1,SV,TBF,W,WHIP,WP,WPCT,winners
0,St. Louis Cardinals,2019,1033,114,43,104,936,8313.0,3,2771,...,456,4,895,33,3896,56,1.29,21,0.538,0
1,Arizona Diamondbacks,2019,1010,83,45,105,945,8538.0,2,2846,...,472,7,925,24,4001,53,1.28,35,0.505,0
2,Kansas City Royals,2019,990,105,45,106,954,8421.0,6,2807,...,543,5,816,24,4125,39,1.46,34,0.368,0
3,Houston Astros,2019,875,54,50,106,954,8589.0,6,2863,...,432,7,1074,27,3929,67,1.14,31,0.632,0
4,Tampa Bay Rays,2019,975,92,53,107,963,8760.0,11,2920,...,409,6,1037,26,3985,59,1.16,40,0.551,0


In [3]:
# remove data from 2016 through 2019.
team_data_new = team_data.loc[team_data["year"] < 2016]
team_data_new.head()

Unnamed: 0,team,year,A,DP,E,G2,GS2,INN,PB,PO,...,R1,SHO,SO1,SV,TBF,W,WHIP,WP,WPCT,winners
120,San Francisco Giants,2015,1639,136,72,162,1458,13143.0,6,4381,...,631,11,1309,43,6048,87,1.21,40,0.537,0
121,Washington Nationals,2015,1425,142,73,162,1458,13137.0,17,4379,...,612,12,1476,46,6036,95,1.19,47,0.586,0
122,Houston Astros,2015,1599,135,77,162,1458,13212.0,18,4404,...,701,8,1396,44,6180,84,1.29,98,0.519,0
123,Detroit Tigers,2015,1537,148,75,161,1449,12852.0,5,4284,...,721,8,1232,47,6048,86,1.32,44,0.534,0
124,Boston Red Sox,2015,1427,139,75,162,1458,12957.0,37,4319,...,694,5,1362,43,6073,93,1.27,52,0.574,0


#### STEP2: Upsample and scale data.

In [4]:
# reset the index.
team_data_new = team_data_new.reset_index().drop({"index"}, axis=1)
team_data_new.head()

Unnamed: 0,team,year,A,DP,E,G2,GS2,INN,PB,PO,...,R1,SHO,SO1,SV,TBF,W,WHIP,WP,WPCT,winners
0,San Francisco Giants,2015,1639,136,72,162,1458,13143.0,6,4381,...,631,11,1309,43,6048,87,1.21,40,0.537,0
1,Washington Nationals,2015,1425,142,73,162,1458,13137.0,17,4379,...,612,12,1476,46,6036,95,1.19,47,0.586,0
2,Houston Astros,2015,1599,135,77,162,1458,13212.0,18,4404,...,701,8,1396,44,6180,84,1.29,98,0.519,0
3,Detroit Tigers,2015,1537,148,75,161,1449,12852.0,5,4284,...,721,8,1232,47,6048,86,1.32,44,0.534,0
4,Boston Red Sox,2015,1427,139,75,162,1458,12957.0,37,4319,...,694,5,1362,43,6073,93,1.27,52,0.574,0


In [5]:
# remove team and year from the df.
team_data_new = team_data_new.drop({"team", "year"}, axis=1)
team_data_new.head()

Unnamed: 0,A,DP,E,G2,GS2,INN,PB,PO,TC,2B,...,R1,SHO,SO1,SV,TBF,W,WHIP,WP,WPCT,winners
0,1639,136,72,162,1458,13143.0,6,4381,6092,280,...,631,11,1309,43,6048,87,1.21,40,0.537,0
1,1425,142,73,162,1458,13137.0,17,4379,5877,268,...,612,12,1476,46,6036,95,1.19,47,0.586,0
2,1599,135,77,162,1458,13212.0,18,4404,6080,291,...,701,8,1396,44,6180,84,1.29,98,0.519,0
3,1537,148,75,161,1449,12852.0,5,4284,5896,252,...,721,8,1232,47,6048,86,1.32,44,0.534,0
4,1427,139,75,162,1458,12957.0,37,4319,5821,343,...,694,5,1362,43,6073,93,1.27,52,0.574,0


In [6]:
# separate majority and minority classes.
df_majority = team_data_new.loc[team_data_new["winners"] == 0]
df_minority = team_data_new.loc[team_data_new["winners"] == 1]

# upsample minority class.
df_minority_unsampled = resample(df_minority,
                                replace=True,
                                n_samples=2234,
                                random_state=123)

# combine majority class with upsampled minority class.
df_upsampled = pd.concat([df_majority, df_minority_unsampled])

# display new class counts.
df_upsampled["winners"].value_counts()

1    2234
0    2234
Name: winners, dtype: int64

In [7]:
# separate features and target.
y = df_upsampled["winners"]
X = df_upsampled.drop("winners", axis=1)
print (y.shape)
print (X.shape)

# split into train and test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# scale X_train and X_test.
scaler = StandardScaler()

# transform the training and testing data.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

(4468,)
(4468, 52)


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


#### STEP3: Grid Search Model--Logistic Regression.

In [8]:
# set up svc model.
model = LogisticRegression(solver="lbfgs")

# create gridsearch estimator.
param_grid = {"C": [0.001, 0.01, 0.1, 1, 10, 100]}
grid = GridSearchCV(model, param_grid, verbose=3)

# fit the model.
grid.fit(X_train_scaled, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s


Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV] C=0.001 .........................................................
[CV] ................ C=0.001, score=0.7423971377459749, total=   0.0s
[CV] C=0.001 .........................................................
[CV] ................ C=0.001, score=0.7457475380483438, total=   0.0s
[CV] C=0.001 .........................................................
[CV] ................ C=0.001, score=0.7186379928315412, total=   0.0s
[CV] C=0.01 ..........................................................
[CV] ................. C=0.01, score=0.7280858676207513, total=   0.0s
[CV] C=0.01 ..........................................................
[CV] ................. C=0.01, score=0.7314234556848702, total=   0.0s
[CV] C=0.01 ..........................................................
[CV] ................. C=0.01, score=0.7168458781362007, total=   0.0s
[CV] C=0.1 ...........................................................
[CV] ............

[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed:    0.4s finished


[CV] ..................... C=1, score=0.735663082437276, total=   0.0s
[CV] C=10 ............................................................
[CV] ................... C=10, score=0.7584973166368515, total=   0.0s
[CV] C=10 ............................................................
[CV] ................... C=10, score=0.7547000895255148, total=   0.0s
[CV] C=10 ............................................................
[CV] .................... C=10, score=0.739247311827957, total=   0.0s
[CV] C=100 ...........................................................
[CV] .................. C=100, score=0.7638640429338104, total=   0.0s
[CV] C=100 ...........................................................
[CV] .................. C=100, score=0.7493285586392122, total=   0.0s
[CV] C=100 ...........................................................
[CV] .................. C=100, score=0.7419354838709677, total=   0.0s




GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [9]:
print (grid.best_params_)
print (grid.best_score_)
print (grid.best_estimator_)

{'C': 100}
0.7517159056997911
LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)


In [10]:
grid = grid.best_estimator_
predictions = grid.predict(X_test_scaled)
print (classification_report(y_test, predictions, target_names=["0", "1"]))

              precision    recall  f1-score   support

           0       0.74      0.69      0.71       573
           1       0.70      0.74      0.72       544

   micro avg       0.72      0.72      0.72      1117
   macro avg       0.72      0.72      0.72      1117
weighted avg       0.72      0.72      0.72      1117



This is slightly better than the straight logistic regression.

#### STEP4: Grid Search Model--SVC.

In [11]:
# set up svc model.
model = SVC(kernel="linear", probability=True)

# create gridsearch estimator.
param_grid = {"C": [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
             "gamma": [0.0001, 0.001, 0.01, 0.1, 1, 10]}
grid = GridSearchCV(model, param_grid, verbose=3)

# fit the model.
grid.fit(X_train_scaled, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 42 candidates, totalling 126 fits
[CV] C=0.0001, gamma=0.0001 ..........................................
[CV] . C=0.0001, gamma=0.0001, score=0.7057245080500895, total=   1.4s
[CV] C=0.0001, gamma=0.0001 ..........................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.6s remaining:    0.0s


[CV] . C=0.0001, gamma=0.0001, score=0.7162041181736795, total=   1.4s
[CV] C=0.0001, gamma=0.0001 ..........................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    3.1s remaining:    0.0s


[CV] . C=0.0001, gamma=0.0001, score=0.6998207885304659, total=   1.3s
[CV] C=0.0001, gamma=0.001 ...........................................
[CV] .. C=0.0001, gamma=0.001, score=0.7057245080500895, total=   1.3s
[CV] C=0.0001, gamma=0.001 ...........................................
[CV] .. C=0.0001, gamma=0.001, score=0.7162041181736795, total=   1.3s
[CV] C=0.0001, gamma=0.001 ...........................................
[CV] .. C=0.0001, gamma=0.001, score=0.6998207885304659, total=   1.3s
[CV] C=0.0001, gamma=0.01 ............................................
[CV] ... C=0.0001, gamma=0.01, score=0.7057245080500895, total=   1.3s
[CV] C=0.0001, gamma=0.01 ............................................
[CV] ... C=0.0001, gamma=0.01, score=0.7162041181736795, total=   1.3s
[CV] C=0.0001, gamma=0.01 ............................................
[CV] ... C=0.0001, gamma=0.01, score=0.6998207885304659, total=   1.3s
[CV] C=0.0001, gamma=0.1 .............................................
[CV] .

[CV] ...... C=0.1, gamma=0.01, score=0.7647584973166368, total=   1.1s
[CV] C=0.1, gamma=0.01 ...............................................
[CV] ...... C=0.1, gamma=0.01, score=0.7439570277529096, total=   1.2s
[CV] C=0.1, gamma=0.01 ...............................................
[CV] ...... C=0.1, gamma=0.01, score=0.7526881720430108, total=   1.1s
[CV] C=0.1, gamma=0.1 ................................................
[CV] ....... C=0.1, gamma=0.1, score=0.7647584973166368, total=   1.1s
[CV] C=0.1, gamma=0.1 ................................................
[CV] ....... C=0.1, gamma=0.1, score=0.7439570277529096, total=   1.1s
[CV] C=0.1, gamma=0.1 ................................................
[CV] ....... C=0.1, gamma=0.1, score=0.7526881720430108, total=   1.1s
[CV] C=0.1, gamma=1 ..................................................
[CV] ......... C=0.1, gamma=1, score=0.7647584973166368, total=   1.1s
[CV] C=0.1, gamma=1 ..................................................
[CV] .

[CV] ....... C=100, gamma=0.1, score=0.7726051924798567, total= 2.8min
[CV] C=100, gamma=0.1 ................................................
[CV] ....... C=100, gamma=0.1, score=0.7491039426523297, total= 3.2min
[CV] C=100, gamma=1 ..................................................
[CV] ......... C=100, gamma=1, score=0.7737030411449016, total= 3.1min
[CV] C=100, gamma=1 ..................................................
[CV] ......... C=100, gamma=1, score=0.7726051924798567, total= 3.0min
[CV] C=100, gamma=1 ..................................................
[CV] ......... C=100, gamma=1, score=0.7491039426523297, total= 3.0min
[CV] C=100, gamma=10 .................................................
[CV] ........ C=100, gamma=10, score=0.7737030411449016, total= 2.7min
[CV] C=100, gamma=10 .................................................
[CV] ........ C=100, gamma=10, score=0.7726051924798567, total= 2.9min
[CV] C=100, gamma=10 .................................................
[CV] .

[Parallel(n_jobs=1)]: Done 126 out of 126 | elapsed: 63.3min finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=True, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100], 'gamma': [0.0001, 0.001, 0.01, 0.1, 1, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [12]:
print (grid.best_params_)
print (grid.best_score_)
print (grid.best_estimator_)

{'C': 100, 'gamma': 0.0001}
0.7651447329155476
SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.0001, kernel='linear',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)


In [13]:
grid = grid.best_estimator_
predictions = grid.predict(X_test_scaled)
print (classification_report(y_test, predictions, target_names=["0", "1"]))

              precision    recall  f1-score   support

           0       0.78      0.65      0.71       573
           1       0.69      0.81      0.74       544

   micro avg       0.73      0.73      0.73      1117
   macro avg       0.74      0.73      0.73      1117
weighted avg       0.74      0.73      0.73      1117



#### STEP5: Predict 2016-2018 winners with SVC Grid Search.

In [14]:
# grab the 2018 data.
team_data_2018 = team_data.loc[team_data["year"] == 2018].reset_index()

# set features (no team, year, winners).
# set target (winners).
features_2018 = team_data_2018.drop({"team", "year", "winners"}, axis=1).drop({"index"}, axis=1)
features_2018 = scaler.fit_transform(features_2018)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [16]:
# fit the model.
probabilities = grid.predict_proba(features_2018)

# convert predictions to datafram.e
WS_predictions = pd.DataFrame(probabilities[:,1])

# Sort the DataFrame (descending)
WS_predictions = WS_predictions.sort_values(0, ascending=False)

WS_predictions['Probability'] = WS_predictions[0]

# Print 50 highest probability HoF inductees from still eligible players
for i, row in WS_predictions.head(50).iterrows():
   prob = ' '.join(('WS Probability =', str(row['Probability'])))
   print('')
   print(prob)
   print(team_data_2018.iloc[i,1:27]["team"])


WS Probability = 0.9999999999999699
Detroit Tigers

WS Probability = 0.9999999999999699
Cincinnati Reds

WS Probability = 0.9999999999445727
Washington Nationals

WS Probability = 0.9999999992278406
Los Angeles Angels

WS Probability = 0.9999999977284594
Chicago White Sox

WS Probability = 0.9999999954413378
Baltimore Orioles

WS Probability = 0.9999999709980233
St. Louis Cardinals

WS Probability = 0.9999998149202
Miami Marlins

WS Probability = 0.9999982143266664
Minnesota Twins

WS Probability = 0.9679258472656711
New York Yankees

WS Probability = 0.9564101490968399
Houston Astros

WS Probability = 0.93631472937284
Atlanta Braves

WS Probability = 0.7361036936956478
San Diego Padres

WS Probability = 0.6638114741363039
Arizona Diamondbacks

WS Probability = 0.37211805902424155
Los Angeles Dodgers

WS Probability = 0.24735728685161118
Chicago Cubs

WS Probability = 0.10469901698983923
Kansas City Royals

WS Probability = 0.10285195296498209
Oakland Athletics

WS Probability = 0.092

In [18]:
# grab the 2017 data.
team_data_2017 = team_data.loc[team_data["year"] == 2017].reset_index()

# set features (no team, year, winners).
# set target (winners).
features_2017 = team_data_2017.drop({"team", "year", "winners"}, axis=1).drop({"index"}, axis=1)
features_2017 = scaler.fit_transform(features_2017)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [19]:
# fit the model.
probabilities = grid.predict_proba(features_2017)

# convert predictions to datafram.e
WS_predictions = pd.DataFrame(probabilities[:,1])

# Sort the DataFrame (descending)
WS_predictions = WS_predictions.sort_values(0, ascending=False)

WS_predictions['Probability'] = WS_predictions[0]

# Print 50 highest probability HoF inductees from still eligible players
for i, row in WS_predictions.head(50).iterrows():
   prob = ' '.join(('WS Probability =', str(row['Probability'])))
   print('')
   print(prob)
   print(team_data_2017.iloc[i,1:27]["team"])


WS Probability = 0.9999999999998259
Detroit Tigers

WS Probability = 0.9999999999989544
Texas Rangers

WS Probability = 0.9999999999982622
Los Angeles Angels

WS Probability = 0.9999999999980312
Colorado Rockies

WS Probability = 0.9999999963486171
Washington Nationals

WS Probability = 0.9999999941454486
Milwaukee Brewers

WS Probability = 0.9999998894591676
Toronto Blue Jays

WS Probability = 0.9999997031838538
Tampa Bay Rays

WS Probability = 0.9999991009393887
Kansas City Royals

WS Probability = 0.9999964312096713
Los Angeles Dodgers

WS Probability = 0.9999862902482916
Houston Astros

WS Probability = 0.9968215307551554
Philadelphia Phillies

WS Probability = 0.9928693693146651
Cincinnati Reds

WS Probability = 0.831503211307288
New York Yankees

WS Probability = 0.6989816729055747
Baltimore Orioles

WS Probability = 0.6821574346059165
Minnesota Twins

WS Probability = 0.4092446149840321
Cleveland Indians

WS Probability = 0.05312080033412291
Chicago Cubs

WS Probability = 0.022

This is not better.  The straight up logistic regression worked better.  At least it accurately predicted the Astros winning 2017.  This one didn't.  So neither of these models work.  Next!