## Purpose: Try different models-- Part3.
### Grid search with scaling.

In [1]:
# import dependencies.
import pandas as pd
import numpy as np

from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC

#### STEP1: Read in dataset.  Remove data from 2016-2019.
- data from 2016-2018 will be used to bs test the model.
- data from 2019 will be used to predict the winners of the 2019 WS.

In [2]:
# read in the data.
team_data = pd.read_csv("../Resources/clean_data.csv")
del team_data["Unnamed: 0"]
team_data.head()

Unnamed: 0,team,year,A,DP,E,G2,GS2,INN,PB,PO,...,R1,SHO,SO1,SV,TBF,W,WHIP,WP,WPCT,winners
0,St. Louis Cardinals,2019,1033,114,43,104,936,8313.0,3,2771,...,456,4,895,33,3896,56,1.29,21,0.538,0
1,Arizona Diamondbacks,2019,1010,83,45,105,945,8538.0,2,2846,...,472,7,925,24,4001,53,1.28,35,0.505,0
2,Kansas City Royals,2019,990,105,45,106,954,8421.0,6,2807,...,543,5,816,24,4125,39,1.46,34,0.368,0
3,Houston Astros,2019,875,54,50,106,954,8589.0,6,2863,...,432,7,1074,27,3929,67,1.14,31,0.632,0
4,Tampa Bay Rays,2019,975,92,53,107,963,8760.0,11,2920,...,409,6,1037,26,3985,59,1.16,40,0.551,0


In [3]:
# remove data from 2016 through 2019.
team_data_new = team_data.loc[team_data["year"] < 2016]
team_data_new.head()

Unnamed: 0,team,year,A,DP,E,G2,GS2,INN,PB,PO,...,R1,SHO,SO1,SV,TBF,W,WHIP,WP,WPCT,winners
120,San Francisco Giants,2015,1639,136,72,162,1458,13143.0,6,4381,...,631,11,1309,43,6048,87,1.21,40,0.537,0
121,Washington Nationals,2015,1425,142,73,162,1458,13137.0,17,4379,...,612,12,1476,46,6036,95,1.19,47,0.586,0
122,Houston Astros,2015,1599,135,77,162,1458,13212.0,18,4404,...,701,8,1396,44,6180,84,1.29,98,0.519,0
123,Detroit Tigers,2015,1537,148,75,161,1449,12852.0,5,4284,...,721,8,1232,47,6048,86,1.32,44,0.534,0
124,Boston Red Sox,2015,1427,139,75,162,1458,12957.0,37,4319,...,694,5,1362,43,6073,93,1.27,52,0.574,0


#### STEP2: Upsample and scale data.

In [4]:
# reset the index.
team_data_new = team_data_new.reset_index().drop({"index"}, axis=1)
team_data_new.head()

Unnamed: 0,team,year,A,DP,E,G2,GS2,INN,PB,PO,...,R1,SHO,SO1,SV,TBF,W,WHIP,WP,WPCT,winners
0,San Francisco Giants,2015,1639,136,72,162,1458,13143.0,6,4381,...,631,11,1309,43,6048,87,1.21,40,0.537,0
1,Washington Nationals,2015,1425,142,73,162,1458,13137.0,17,4379,...,612,12,1476,46,6036,95,1.19,47,0.586,0
2,Houston Astros,2015,1599,135,77,162,1458,13212.0,18,4404,...,701,8,1396,44,6180,84,1.29,98,0.519,0
3,Detroit Tigers,2015,1537,148,75,161,1449,12852.0,5,4284,...,721,8,1232,47,6048,86,1.32,44,0.534,0
4,Boston Red Sox,2015,1427,139,75,162,1458,12957.0,37,4319,...,694,5,1362,43,6073,93,1.27,52,0.574,0


In [5]:
# remove team and year from the df.
team_data_new = team_data_new.drop({"team", "year"}, axis=1)
team_data_new.head()

Unnamed: 0,A,DP,E,G2,GS2,INN,PB,PO,TC,2B,...,R1,SHO,SO1,SV,TBF,W,WHIP,WP,WPCT,winners
0,1639,136,72,162,1458,13143.0,6,4381,6092,280,...,631,11,1309,43,6048,87,1.21,40,0.537,0
1,1425,142,73,162,1458,13137.0,17,4379,5877,268,...,612,12,1476,46,6036,95,1.19,47,0.586,0
2,1599,135,77,162,1458,13212.0,18,4404,6080,291,...,701,8,1396,44,6180,84,1.29,98,0.519,0
3,1537,148,75,161,1449,12852.0,5,4284,5896,252,...,721,8,1232,47,6048,86,1.32,44,0.534,0
4,1427,139,75,162,1458,12957.0,37,4319,5821,343,...,694,5,1362,43,6073,93,1.27,52,0.574,0


In [None]:
# upsample for a more balanced dataset.
def upsample(dataset, no_samples):
    '''
    INPUT: 
    -dataset = dataset without team names and year.
    -n_samples = number of minority_unsampled.
    
    OUTPUT:
    -X_train_scaled = scaled X train data.
    -X_test_scaled = scaled X test data.
    -y_train = y train data
    -y_test = y test data
    
    DESCRIPTION:
    -dataset is taken in and split into minority and majority classes.
    -dataset is then upsampled for the mainority class
    -split the data into features and targets
    -split data into train and test sets
    -train and test sets were are scaled.
    '''
    
    # separate majority and minority classes.
    df_majority = dataset.loc[dataset["winners"] == 0]
    df_minority = dataset.loc[dataset["winners"] == 1]

    # upsample minority class.
    df_minority_unsampled = resample(df_minority,
                                    replace=True,
                                    n_samples=no_samples,
                                    random_state=123)

    # combine majority class with upsampled minority class.
    df_upsampled = pd.concat([df_majority, df_minority_unsampled])

    # separate features and target.
    y = df_upsampled["winners"]
    X = df_upsampled.drop("winners", axis=1)
    
    # split into train and test sets.
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    # scale X_train and X_test.
    scaler = StandardScaler()

    # transform the training and testing data.
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.fit_transform(X_test)
    
    return X_train_scaled, X_test_scaled, y_train, y_test

In [6]:
# separate majority and minority classes.
df_majority = team_data_new.loc[team_data_new["winners"] == 0]
df_minority = team_data_new.loc[team_data_new["winners"] == 1]

# upsample minority class.
df_minority_unsampled = resample(df_minority,
                                replace=True,
                                n_samples=(1117),
                                random_state=123)

# combine majority class with upsampled minority class.
df_upsampled = pd.concat([df_majority, df_minority_unsampled])

# display new class counts.
df_upsampled["winners"].value_counts()

0    2234
1    1117
Name: winners, dtype: int64

In [7]:
# separate features and target.
y = df_upsampled["winners"]
X = df_upsampled.drop("winners", axis=1)
print (y.shape)
print (X.shape)

# split into train and test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# scale X_train and X_test.
scaler = StandardScaler()

# transform the training and testing data.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

(3351,)
(3351, 52)


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


#### STEP3: Grid Search Model--Logistic Regression.

In [8]:
# set up svc model.
model = LogisticRegression(solver="lbfgs")

# create gridsearch estimator.
param_grid = {"C": [0.001, 0.01, 0.1, 1, 10, 100]}
grid = GridSearchCV(model, param_grid, verbose=3)

# fit the model.
grid.fit(X_train_scaled, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s


Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV] C=0.001 .........................................................
[CV] ................ C=0.001, score=0.7159904534606205, total=   0.0s
[CV] C=0.001 .........................................................
[CV] ................ C=0.001, score=0.7088305489260143, total=   0.0s
[CV] C=0.001 .........................................................
[CV] ................ C=0.001, score=0.7275985663082437, total=   0.0s
[CV] C=0.01 ..........................................................
[CV] ................. C=0.01, score=0.7362768496420048, total=   0.0s
[CV] C=0.01 ..........................................................
[CV] .................. C=0.01, score=0.711217183770883, total=   0.0s
[CV] C=0.01 ..........................................................
[CV] ................. C=0.01, score=0.7311827956989247, total=   0.0s
[CV] C=0.1 ...........................................................
[CV] ............

[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed:    0.4s finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [9]:
print (grid.best_params_)
print (grid.best_score_)
print (grid.best_estimator_)

{'C': 1}
0.7528849980103463
LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)


In [10]:
grid = grid.best_estimator_
predictions = grid.predict(X_test_scaled)
print (classification_report(y_test, predictions, target_names=["0", "1"]))

              precision    recall  f1-score   support

           0       0.81      0.86      0.83       570
           1       0.66      0.56      0.60       268

   micro avg       0.77      0.77      0.77       838
   macro avg       0.73      0.71      0.72       838
weighted avg       0.76      0.77      0.76       838



This is slightly better than the straight logistic regression.

#### STEP4: Predict 2016-2018 winners, Logistic Regression with Upscale.

In [11]:
# grab the 2018 data.
team_data_2018 = team_data.loc[team_data["year"] == 2018].reset_index()

# set features (no team, year, winners).
# set target (winners).
features_2018 = team_data_2018.drop({"team", "year", "winners"}, axis=1).drop({"index"}, axis=1)
features_2018 = scaler.fit_transform(features_2018)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [12]:
# fit the model.
probabilities = grid.predict_proba(features_2018)

# convert predictions to datafram.e
WS_predictions = pd.DataFrame(probabilities[:,1])

# Sort the DataFrame (descending)
WS_predictions = WS_predictions.sort_values(0, ascending=False)

WS_predictions['Probability'] = WS_predictions[0]

# Print 50 highest probability HoF inductees from still eligible players
for i, row in WS_predictions.head(50).iterrows():
   prob = ' '.join(('WS Probability =', str(row['Probability'])))
   print('')
   print(prob)
   print(team_data_2018.iloc[i,1:27]["team"])


WS Probability = 0.9898947040815076
Cleveland Indians

WS Probability = 0.9890973041873183
Los Angeles Dodgers

WS Probability = 0.9758472893083431
Houston Astros

WS Probability = 0.9646480134023846
Washington Nationals

WS Probability = 0.9440891204588948
Cincinnati Reds

WS Probability = 0.9268858539563188
St. Louis Cardinals

WS Probability = 0.9002484640394407
Minnesota Twins

WS Probability = 0.8611271093402917
Arizona Diamondbacks

WS Probability = 0.8233375565838049
Tampa Bay Rays

WS Probability = 0.5816284156258886
Boston Red Sox

WS Probability = 0.5713947577502062
New York Yankees

WS Probability = 0.4252050217553913
Atlanta Braves

WS Probability = 0.39328987280822814
Oakland Athletics

WS Probability = 0.28892515647197214
New York Mets

WS Probability = 0.26890337523277213
Los Angeles Angels

WS Probability = 0.2585264649129437
Miami Marlins

WS Probability = 0.14736167473446252
Chicago White Sox

WS Probability = 0.1196319942431059
Texas Rangers

WS Probability = 0.0908

In [13]:
# grab the 2017 data.
team_data_2017 = team_data.loc[team_data["year"] == 2017].reset_index()

# set features (no team, year, winners).
# set target (winners).
features_2017 = team_data_2017.drop({"team", "year", "winners"}, axis=1).drop({"index"}, axis=1)
features_2017 = scaler.fit_transform(features_2017)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [14]:
# fit the model.
probabilities = grid.predict_proba(features_2017)

# convert predictions to datafram.e
WS_predictions = pd.DataFrame(probabilities[:,1])

# Sort the DataFrame (descending)
WS_predictions = WS_predictions.sort_values(0, ascending=False)

WS_predictions['Probability'] = WS_predictions[0]

# Print 50 highest probability HoF inductees from still eligible players
for i, row in WS_predictions.head(50).iterrows():
   prob = ' '.join(('WS Probability =', str(row['Probability'])))
   print('')
   print(prob)
   print(team_data_2017.iloc[i,1:27]["team"])


WS Probability = 0.9996609448709619
Houston Astros

WS Probability = 0.9993871298546687
Cleveland Indians

WS Probability = 0.9908179917290707
Boston Red Sox

WS Probability = 0.9860852019694502
Atlanta Braves

WS Probability = 0.9753036872974937
Washington Nationals

WS Probability = 0.9395926528764766
Tampa Bay Rays

WS Probability = 0.9203819562052903
New York Yankees

WS Probability = 0.9056163117658068
Los Angeles Dodgers

WS Probability = 0.7544808135776877
Oakland Athletics

WS Probability = 0.7289041300229998
Seattle Mariners

WS Probability = 0.6676011972000717
Los Angeles Angels

WS Probability = 0.5556352054959984
Colorado Rockies

WS Probability = 0.4909582554080065
Milwaukee Brewers

WS Probability = 0.22998937420570462
New York Mets

WS Probability = 0.22836891581927218
Chicago Cubs

WS Probability = 0.17948654780020432
Pittsburgh Pirates

WS Probability = 0.12222355779353605
Arizona Diamondbacks

WS Probability = 0.06956716906150724
Minnesota Twins

WS Probability = 0.0

In [15]:
# grab the 2016 data.
team_data_2016 = team_data.loc[team_data["year"] == 2016].reset_index()

# set features (no team, year, winners).
# set target (winners).
features_2016 = team_data_2016.drop({"team", "year", "winners"}, axis=1).drop({"index"}, axis=1)
features_2016 = scaler.fit_transform(features_2016)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [16]:
# fit the model.
probabilities = grid.predict_proba(features_2016)

# convert predictions to datafram.e
WS_predictions = pd.DataFrame(probabilities[:,1])

# Sort the DataFrame (descending)
WS_predictions = WS_predictions.sort_values(0, ascending=False)

WS_predictions['Probability'] = WS_predictions[0]

# Print 50 highest probability HoF inductees from still eligible players
for i, row in WS_predictions.head(50).iterrows():
   prob = ' '.join(('WS Probability =', str(row['Probability'])))
   print('')
   print(prob)
   print(team_data_2016.iloc[i,1:27]["team"])


WS Probability = 0.9999933975879108
Cleveland Indians

WS Probability = 0.9930765404030228
Los Angeles Dodgers

WS Probability = 0.9863808115779764
Houston Astros

WS Probability = 0.9838677868347778
New York Yankees

WS Probability = 0.9806270928927661
Washington Nationals

WS Probability = 0.9233479520207871
Minnesota Twins

WS Probability = 0.8342464493783212
Arizona Diamondbacks

WS Probability = 0.784550953646454
Boston Red Sox

WS Probability = 0.4803739588327037
St. Louis Cardinals

WS Probability = 0.46898777979652784
Colorado Rockies

WS Probability = 0.43395200494370145
Chicago Cubs

WS Probability = 0.36999649228383985
Los Angeles Angels

WS Probability = 0.32918635043373434
Kansas City Royals

WS Probability = 0.2143467433510284
Detroit Tigers

WS Probability = 0.19738967433437182
Seattle Mariners

WS Probability = 0.1766589442845122
Tampa Bay Rays

WS Probability = 0.16703104473135175
Miami Marlins

WS Probability = 0.03947275476750499
New York Mets

WS Probability = 0.03

#### STEP5: Grid Search Model--SVC.

In [17]:
# set up svc model.
model = SVC(kernel="rbf", probability=True)

# create gridsearch estimator.
param_grid = {"C": [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
             "gamma": [0.0001, 0.001, 0.01, 0.1]}
grid = GridSearchCV(model, param_grid, verbose=3)

# fit the model.
grid.fit(X_train_scaled, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 28 candidates, totalling 84 fits
[CV] C=0.0001, gamma=0.0001 ..........................................
[CV] .. C=0.0001, gamma=0.0001, score=0.662291169451074, total=   0.6s
[CV] C=0.0001, gamma=0.0001 ..........................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.7s remaining:    0.0s


[CV] .. C=0.0001, gamma=0.0001, score=0.662291169451074, total=   0.6s
[CV] C=0.0001, gamma=0.0001 ..........................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.3s remaining:    0.0s


[CV] . C=0.0001, gamma=0.0001, score=0.6618876941457587, total=   0.6s
[CV] C=0.0001, gamma=0.001 ...........................................
[CV] ... C=0.0001, gamma=0.001, score=0.662291169451074, total=   0.6s
[CV] C=0.0001, gamma=0.001 ...........................................
[CV] ... C=0.0001, gamma=0.001, score=0.662291169451074, total=   0.6s
[CV] C=0.0001, gamma=0.001 ...........................................
[CV] .. C=0.0001, gamma=0.001, score=0.6618876941457587, total=   0.6s
[CV] C=0.0001, gamma=0.01 ............................................
[CV] .... C=0.0001, gamma=0.01, score=0.662291169451074, total=   0.6s
[CV] C=0.0001, gamma=0.01 ............................................
[CV] .... C=0.0001, gamma=0.01, score=0.662291169451074, total=   0.6s
[CV] C=0.0001, gamma=0.01 ............................................
[CV] ... C=0.0001, gamma=0.01, score=0.6618876941457587, total=   0.6s
[CV] C=0.0001, gamma=0.1 .............................................
[CV] .

[CV] ..... C=10, gamma=0.0001, score=0.7673031026252983, total=   6.0s
[CV] C=10, gamma=0.0001 ..............................................
[CV] ..... C=10, gamma=0.0001, score=0.7386634844868735, total=   6.0s
[CV] C=10, gamma=0.0001 ..............................................
[CV] ..... C=10, gamma=0.0001, score=0.7467144563918757, total=   6.9s
[CV] C=10, gamma=0.001 ...............................................
[CV] ...... C=10, gamma=0.001, score=0.7673031026252983, total=   5.7s
[CV] C=10, gamma=0.001 ...............................................
[CV] ...... C=10, gamma=0.001, score=0.7386634844868735, total=   6.6s
[CV] C=10, gamma=0.001 ...............................................
[CV] ...... C=10, gamma=0.001, score=0.7467144563918757, total=   7.3s
[CV] C=10, gamma=0.01 ................................................
[CV] ....... C=10, gamma=0.01, score=0.7673031026252983, total=   5.4s
[CV] C=10, gamma=0.01 ................................................
[CV] .

[Parallel(n_jobs=1)]: Done  84 out of  84 | elapsed: 15.9min finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=True, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100], 'gamma': [0.0001, 0.001, 0.01, 0.1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [18]:
print (grid.best_params_)
print (grid.best_score_)
print (grid.best_estimator_)

{'C': 100, 'gamma': 0.0001}
0.7600477516912058
SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.0001, kernel='linear',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)


In [19]:
grid = grid.best_estimator_
predictions = grid.predict(X_test_scaled)
print (classification_report(y_test, predictions, target_names=["0", "1"]))

              precision    recall  f1-score   support

           0       0.81      0.88      0.84       570
           1       0.69      0.56      0.62       268

   micro avg       0.78      0.78      0.78       838
   macro avg       0.75      0.72      0.73       838
weighted avg       0.77      0.78      0.77       838



#### STEP5: Predict 2016-2018 winners with SVC Grid Search.

In [20]:
# grab the 2018 data.
team_data_2018 = team_data.loc[team_data["year"] == 2018].reset_index()

# set features (no team, year, winners).
# set target (winners).
features_2018 = team_data_2018.drop({"team", "year", "winners"}, axis=1).drop({"index"}, axis=1)
features_2018 = scaler.fit_transform(features_2018)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [21]:
# fit the model.
probabilities = grid.predict_proba(features_2018)

# convert predictions to datafram.e
WS_predictions = pd.DataFrame(probabilities[:,1])

# Sort the DataFrame (descending)
WS_predictions = WS_predictions.sort_values(0, ascending=False)

WS_predictions['Probability'] = WS_predictions[0]

# Print 50 highest probability HoF inductees from still eligible players
for i, row in WS_predictions.head(50).iterrows():
   prob = ' '.join(('WS Probability =', str(row['Probability'])))
   print('')
   print(prob)
   print(team_data_2018.iloc[i,1:27]["team"])


WS Probability = 0.9999999999999307
Detroit Tigers

WS Probability = 0.9999999999916881
Cincinnati Reds

WS Probability = 0.9999999998265531
Chicago White Sox

WS Probability = 0.9999999985014809
Washington Nationals

WS Probability = 0.9999928668574126
St. Louis Cardinals

WS Probability = 0.9950727916877068
Los Angeles Angels

WS Probability = 0.9832820462039904
Baltimore Orioles

WS Probability = 0.9737467440030976
Los Angeles Dodgers

WS Probability = 0.9620824055087006
New York Yankees

WS Probability = 0.944872685455174
Kansas City Royals

WS Probability = 0.9185809122719995
Miami Marlins

WS Probability = 0.8009303761566805
Houston Astros

WS Probability = 0.5
Atlanta Braves

WS Probability = 0.4923290763706564
Cleveland Indians

WS Probability = 0.4880694690276627
San Diego Padres

WS Probability = 0.4095008423766043
Minnesota Twins

WS Probability = 0.2646439538716591
Pittsburgh Pirates

WS Probability = 0.2583764455674902
Toronto Blue Jays

WS Probability = 0.083014926108492

In [22]:
# grab the 2017 data.
team_data_2017 = team_data.loc[team_data["year"] == 2017].reset_index()

# set features (no team, year, winners).
# set target (winners).
features_2017 = team_data_2017.drop({"team", "year", "winners"}, axis=1).drop({"index"}, axis=1)
features_2017 = scaler.fit_transform(features_2017)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [23]:
# fit the model.
probabilities = grid.predict_proba(features_2017)

# convert predictions to datafram.e
WS_predictions = pd.DataFrame(probabilities[:,1])

# Sort the DataFrame (descending)
WS_predictions = WS_predictions.sort_values(0, ascending=False)

WS_predictions['Probability'] = WS_predictions[0]

# Print 50 highest probability HoF inductees from still eligible players
for i, row in WS_predictions.head(50).iterrows():
   prob = ' '.join(('WS Probability =', str(row['Probability'])))
   print('')
   print(prob)
   print(team_data_2017.iloc[i,1:27]["team"])


WS Probability = 0.9999999999999699
Colorado Rockies

WS Probability = 0.9999999999999699
Detroit Tigers

WS Probability = 0.999999999998201
Milwaukee Brewers

WS Probability = 0.9999999998681178
Texas Rangers

WS Probability = 0.9999999998203298
Kansas City Royals

WS Probability = 0.9999999993930205
Los Angeles Angels

WS Probability = 0.9999999923494618
Toronto Blue Jays

WS Probability = 0.9999996087734381
Baltimore Orioles

WS Probability = 0.9918644602393444
Washington Nationals

WS Probability = 0.9844780330857608
Cincinnati Reds

WS Probability = 0.9739590801220256
Tampa Bay Rays

WS Probability = 0.9599416030680565
Los Angeles Dodgers

WS Probability = 0.9114103573800123
Chicago White Sox

WS Probability = 0.8355977519942457
Minnesota Twins

WS Probability = 0.8284373724290154
Houston Astros

WS Probability = 0.6954722848810508
Philadelphia Phillies

WS Probability = 0.2408760528219042
Seattle Mariners

WS Probability = 0.04605168267682891
Chicago Cubs

WS Probability = 0.005

This is not better.  The logistic regression with grid and upsampling works just as good as the straight up logistic regression.  SVC didn't work well. Next!