## Purpose: Try different models-- Part3.
### Grid search with upsampling and scaling.

In [1]:
# import dependencies.
import pandas as pd
import numpy as np

from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC

#### STEP1: Read in dataset.  Remove data from 2016-2019.
- data from 2016-2018 will be used to bs test the model.
- data from 2019 will be used to predict the winners of the 2019 WS.

In [2]:
# read in the data.
team_data = pd.read_csv("../../Resources/clean_data_1969.csv")
del team_data["Unnamed: 0"]
team_data.head()

Unnamed: 0,team,year,A,DP,E,G2,GS2,INN,PB,PO,...,SHO,SO1,SV,SVO,TBF,W,WHIP,WP,WPCT,winners
0,St. Louis Cardinals,2019,1033,114,43,104,936,8313.0,3,2771,...,4,895,33,42,3896,56,1.29,21,0.538,0
1,Arizona Diamondbacks,2019,1010,83,45,105,945,8538.0,2,2846,...,7,925,24,37,4001,53,1.28,35,0.505,0
2,Kansas City Royals,2019,990,105,45,106,954,8421.0,6,2807,...,5,816,24,41,4125,39,1.46,34,0.368,0
3,Houston Astros,2019,875,54,50,106,954,8589.0,6,2863,...,7,1074,27,42,3929,67,1.14,31,0.632,0
4,Tampa Bay Rays,2019,975,92,53,107,963,8760.0,11,2920,...,6,1037,26,43,3985,59,1.16,40,0.551,0


In [3]:
# remove data from 2016 through 2019.
team_data_new = team_data.loc[team_data["year"] < 2016]
team_data_new.head()

Unnamed: 0,team,year,A,DP,E,G2,GS2,INN,PB,PO,...,SHO,SO1,SV,SVO,TBF,W,WHIP,WP,WPCT,winners
120,San Francisco Giants,2015,1639,136,72,162,1458,13143.0,6,4381,...,11,1309,43,72,6048,87,1.21,40,0.537,0
121,Washington Nationals,2015,1425,142,73,162,1458,13137.0,17,4379,...,12,1476,46,60,6036,95,1.19,47,0.586,0
122,Houston Astros,2015,1599,135,77,162,1458,13212.0,18,4404,...,8,1396,44,64,6180,84,1.29,98,0.519,0
123,Detroit Tigers,2015,1537,148,75,161,1449,12852.0,5,4284,...,8,1232,47,66,6048,86,1.32,44,0.534,0
124,Boston Red Sox,2015,1427,139,75,162,1458,12957.0,37,4319,...,5,1362,43,61,6073,93,1.27,52,0.574,0


In [4]:
target = team_data_new["winners"]
features = team_data_new.drop({"team", "year", "winners"}, axis=1)
feature_columns = list(features.columns)
print (target.shape)
print (features.shape)
print (feature_columns)

(1266,)
(1266, 59)
['A', 'DP', 'E', 'G2', 'GS2', 'INN', 'PB', 'PO', 'TC', '2B', '3B', 'AB', 'AO', 'BB', 'CS', 'G', 'GDP', 'H', 'HBP', 'HR', 'IBB', 'NP_x', 'OBP', 'OPS_x', 'PA', 'R', 'RBI', 'SAC', 'SB', 'SF', 'SLG', 'SO', 'TB', 'XBH', 'BB1', 'BK', 'CG', 'ER', 'ERA', 'G1', 'GF', 'GS', 'H1', 'HB', 'HR1', 'IBB1', 'IP', 'L', 'OBP1', 'R1', 'SHO', 'SO1', 'SV', 'SVO', 'TBF', 'W', 'WHIP', 'WP', 'WPCT']


#### STEP2: Upsample and scale data.

In [5]:
# reset the index.
team_data_new = team_data_new.reset_index().drop({"index"}, axis=1)
team_data_new.head()

Unnamed: 0,team,year,A,DP,E,G2,GS2,INN,PB,PO,...,SHO,SO1,SV,SVO,TBF,W,WHIP,WP,WPCT,winners
0,San Francisco Giants,2015,1639,136,72,162,1458,13143.0,6,4381,...,11,1309,43,72,6048,87,1.21,40,0.537,0
1,Washington Nationals,2015,1425,142,73,162,1458,13137.0,17,4379,...,12,1476,46,60,6036,95,1.19,47,0.586,0
2,Houston Astros,2015,1599,135,77,162,1458,13212.0,18,4404,...,8,1396,44,64,6180,84,1.29,98,0.519,0
3,Detroit Tigers,2015,1537,148,75,161,1449,12852.0,5,4284,...,8,1232,47,66,6048,86,1.32,44,0.534,0
4,Boston Red Sox,2015,1427,139,75,162,1458,12957.0,37,4319,...,5,1362,43,61,6073,93,1.27,52,0.574,0


In [6]:
# remove team and year.
feature_columns_new = feature_columns + ["winners"]
team_data_new = team_data[feature_columns_new]
team_data_new.head()

Unnamed: 0,A,DP,E,G2,GS2,INN,PB,PO,TC,2B,...,SHO,SO1,SV,SVO,TBF,W,WHIP,WP,WPCT,winners
0,1033,114,43,104,936,8313.0,3,2771,3847,157,...,4,895,33,42,3896,56,1.29,21,0.538,0
1,1010,83,45,105,945,8538.0,2,2846,3901,203,...,7,925,24,37,4001,53,1.28,35,0.505,0
2,990,105,45,106,954,8421.0,6,2807,3842,185,...,5,816,24,41,4125,39,1.46,34,0.368,0
3,875,54,50,106,954,8589.0,6,2863,3788,200,...,7,1074,27,42,3929,67,1.14,31,0.632,0
4,975,92,53,107,963,8760.0,11,2920,3948,195,...,6,1037,26,43,3985,59,1.16,40,0.551,0


In [7]:
# upsample for a more balanced dataset.
def upsample(dataset, no_samples):
    '''
    INPUT: 
    -dataset = dataset without team names and year.
    -n_samples = number of minority_unsampled.
    
    OUTPUT:
    -X_train_scaled = scaled X train data.
    -X_test_scaled = scaled X test data.
    -y_train = y train data
    -y_test = y test data
    
    DESCRIPTION:
    -dataset is taken in and split into minority and majority classes.
    -dataset is then upsampled for the mainority class
    -split the data into features and targets
    -split data into train and test sets
    -train and test sets were are scaled.
    '''
    
    # separate majority and minority classes.
    df_majority = dataset.loc[dataset["winners"] == 0]
    df_minority = dataset.loc[dataset["winners"] == 1]

    # upsample minority class.
    df_minority_unsampled = resample(df_minority,
                                    replace=True,
                                    n_samples=no_samples,
                                    random_state=123)

    # combine majority class with upsampled minority class.
    df_upsampled = pd.concat([df_majority, df_minority_unsampled])

    # separate features and target.
    y = df_upsampled["winners"]
    X = df_upsampled[feature_columns]
    
    # split into train and test sets.
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    # scale X_train and X_test.
    scaler = StandardScaler()

    # transform the training and testing data.
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.fit_transform(X_test)
    
    return X_train_scaled, X_test_scaled, y_train, y_test

In [8]:
# Do three different upsamplings.
X_train_100, X_test_100, y_train_100, y_test_100 = upsample(team_data_new, 2234)
X_train_50, X_test_50, y_train_50, y_test_50 = upsample(team_data_new, 1117)
X_train_25, X_test_25, y_train_25, y_test_25 = upsample(team_data_new, 559)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


#### STEP3: Grid Search Model--Logistic Regression.

In [9]:
def grid_search_logistic(X_train, X_test, y_train, y_test):
    '''
    INPUT: 
    -X_train = scaled X train data.
    -X_test = scaled X test data.
    -y_train = y train data.
    -y_test = y test data.
    
    OUTPUT:
    -classification report (has F1 score, precision and recall).
    -grid = saved model for prediction. 
    
    DESCRIPTION:
    -the scaled and split data is put through a grid search with logistic.
    -the model is trained.
    -a prediction is made.
    -print out the classification report and give the model.
    '''
    
    # fit the model.
    model = LogisticRegression(max_iter= 2000)
    
    # create gridsearch estimator.
    param_grid = {"C": [0.001, 0.01, 0.1, 1, 10, 100],
                 "solver": ["newton-cg", "lbfgs", "liblinear", "sag", "saga"]}
    grid = GridSearchCV(model, param_grid, verbose=3, cv=5)

    # fit the model.
    grid.fit(X_train, y_train)

    # predict.
    prediction = grid.predict(X_test)
    
    # print out the basic information about the grid search.
    print (grid.best_params_)
    print (grid.best_score_)
    print (grid.best_estimator_)
    
    grid = grid.best_estimator_
    predictions = grid.predict(X_test)
    print (classification_report(y_test, prediction, target_names=["0", "1"]))
    
    return grid

In [10]:
# for 1 part 0 to 1 part 1
model_100 = grid_search_logistic(X_train_100, X_test_100, y_train_100, y_test_100)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV] C=0.001, solver=newton-cg .......................................
[CV]  C=0.001, solver=newton-cg, score=0.7206703910614525, total=   0.0s
[CV] C=0.001, solver=newton-cg .......................................
[CV]  C=0.001, solver=newton-cg, score=0.746268656716418, total=   0.0s
[CV] C=0.001, solver=newton-cg .......................................
[CV]  C=0.001, solver=newton-cg, score=0.7102803738317757, total=   0.0s
[CV] C=0.001, solver=newton-cg .......................................
[CV]  C=0.001, solver=newton-cg, score=0.7364485981308411, total=   0.0s
[CV] C=0.001, solver=newton-cg .......................................
[CV]  C=0.001, solver=newton-cg, score=0.7252336448598131, total=   0.0s
[CV] C=0.001, solver=lbfgs ...........................................
[CV] .. C=0.001, solver=lbfgs, score=0.7206703910614525, total=   0.0s
[CV] C=0.001, solver=lbfgs ...........................................
[CV] .

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s


[CV]  C=0.001, solver=liblinear, score=0.6927374301675978, total=   0.0s
[CV] C=0.001, solver=liblinear .......................................
[CV]  C=0.001, solver=liblinear, score=0.7164179104477612, total=   0.0s
[CV] C=0.001, solver=liblinear .......................................
[CV]  C=0.001, solver=liblinear, score=0.7102803738317757, total=   0.0s
[CV] C=0.001, solver=liblinear .......................................
[CV]  C=0.001, solver=liblinear, score=0.6728971962616822, total=   0.0s
[CV] C=0.001, solver=liblinear .......................................
[CV]  C=0.001, solver=liblinear, score=0.6934579439252336, total=   0.0s
[CV] C=0.001, solver=sag .............................................
[CV] .... C=0.001, solver=sag, score=0.7206703910614525, total=   0.0s
[CV] C=0.001, solver=sag .............................................
[CV] ..... C=0.001, solver=sag, score=0.746268656716418, total=   0.0s
[CV] C=0.001, solver=sag ..........................................

[CV] ...... C=0.1, solver=sag, score=0.7700934579439253, total=   0.1s
[CV] C=0.1, solver=sag ...............................................
[CV] ...... C=0.1, solver=sag, score=0.7906542056074767, total=   0.1s
[CV] C=0.1, solver=saga ..............................................
[CV] ..... C=0.1, solver=saga, score=0.7988826815642458, total=   0.2s
[CV] C=0.1, solver=saga ..............................................
[CV] ..... C=0.1, solver=saga, score=0.7630597014925373, total=   0.2s
[CV] C=0.1, solver=saga ..............................................
[CV] ..... C=0.1, solver=saga, score=0.7607476635514019, total=   0.2s
[CV] C=0.1, solver=saga ..............................................
[CV] ..... C=0.1, solver=saga, score=0.7700934579439253, total=   0.2s
[CV] C=0.1, solver=saga ..............................................
[CV] ..... C=0.1, solver=saga, score=0.7906542056074767, total=   0.2s
[CV] C=1, solver=newton-cg ...........................................
[CV] .



[CV] ....... C=10, solver=sag, score=0.8081936685288641, total=   2.7s
[CV] C=10, solver=sag ................................................




[CV] ....... C=10, solver=sag, score=0.7742537313432836, total=   2.6s
[CV] C=10, solver=sag ................................................




[CV] ....... C=10, solver=sag, score=0.7700934579439253, total=   2.6s
[CV] C=10, solver=sag ................................................




[CV] ....... C=10, solver=sag, score=0.7962616822429907, total=   2.6s
[CV] C=10, solver=sag ................................................




[CV] ....... C=10, solver=sag, score=0.7663551401869159, total=   2.6s
[CV] C=10, solver=saga ...............................................




[CV] ...... C=10, solver=saga, score=0.8063314711359404, total=   2.7s
[CV] C=10, solver=saga ...............................................




[CV] ...... C=10, solver=saga, score=0.7723880597014925, total=   2.6s
[CV] C=10, solver=saga ...............................................




[CV] ...... C=10, solver=saga, score=0.7700934579439253, total=   2.7s
[CV] C=10, solver=saga ...............................................




[CV] ...... C=10, solver=saga, score=0.7962616822429907, total=   2.8s
[CV] C=10, solver=saga ...............................................




[CV] ...... C=10, solver=saga, score=0.7663551401869159, total=   2.8s
[CV] C=100, solver=newton-cg .........................................
[CV]  C=100, solver=newton-cg, score=0.8100558659217877, total=   0.1s
[CV] C=100, solver=newton-cg .........................................
[CV]  C=100, solver=newton-cg, score=0.7761194029850746, total=   0.2s
[CV] C=100, solver=newton-cg .........................................
[CV]  C=100, solver=newton-cg, score=0.7682242990654206, total=   0.1s
[CV] C=100, solver=newton-cg .........................................
[CV]  C=100, solver=newton-cg, score=0.8074766355140187, total=   0.1s
[CV] C=100, solver=newton-cg .........................................
[CV] . C=100, solver=newton-cg, score=0.794392523364486, total=   0.1s
[CV] C=100, solver=lbfgs .............................................
[CV] .... C=100, solver=lbfgs, score=0.8100558659217877, total=   0.5s
[CV] C=100, solver=lbfgs .............................................
[CV] .



[CV] ...... C=100, solver=sag, score=0.8081936685288641, total=   2.6s
[CV] C=100, solver=sag ...............................................




[CV] ...... C=100, solver=sag, score=0.7761194029850746, total=   2.6s
[CV] C=100, solver=sag ...............................................




[CV] ........ C=100, solver=sag, score=0.77196261682243, total=   2.6s
[CV] C=100, solver=sag ...............................................




[CV] ...... C=100, solver=sag, score=0.8018691588785046, total=   2.6s
[CV] C=100, solver=sag ...............................................




[CV] ...... C=100, solver=sag, score=0.7831775700934579, total=   2.5s
[CV] C=100, solver=saga ..............................................




[CV] ..... C=100, solver=saga, score=0.8100558659217877, total=   2.9s
[CV] C=100, solver=saga ..............................................




[CV] ..... C=100, solver=saga, score=0.7779850746268657, total=   2.8s
[CV] C=100, solver=saga ..............................................




[CV] ..... C=100, solver=saga, score=0.7757009345794392, total=   2.8s
[CV] C=100, solver=saga ..............................................




[CV] ..... C=100, solver=saga, score=0.7981308411214953, total=   2.8s
[CV] C=100, solver=saga ..............................................
[CV] ..... C=100, solver=saga, score=0.7813084112149533, total=   2.7s
{'C': 100, 'solver': 'newton-cg'}
0.7912621359223301
LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=2000, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='newton-cg',
          tol=0.0001, verbose=0, warm_start=False)
              precision    recall  f1-score   support

           0       0.77      0.47      0.59       356
           1       0.72      0.91      0.80       537

   micro avg       0.73      0.73      0.73       893
   macro avg       0.75      0.69      0.70       893
weighted avg       0.74      0.73      0.72       893



[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:  1.2min finished


In [11]:
# for 1 part 0 to 0.5 part 1
model_50 = grid_search_logistic(X_train_50, X_test_50, y_train_50, y_test_50)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV] C=0.001, solver=newton-cg .......................................
[CV]  C=0.001, solver=newton-cg, score=0.6684782608695652, total=   0.0s
[CV] C=0.001, solver=newton-cg .......................................
[CV]  C=0.001, solver=newton-cg, score=0.6793478260869565, total=   0.0s
[CV] C=0.001, solver=newton-cg .......................................
[CV]  C=0.001, solver=newton-cg, score=0.7119565217391305, total=   0.0s
[CV] C=0.001, solver=newton-cg .......................................
[CV]  C=0.001, solver=newton-cg, score=0.6766304347826086, total=   0.0s
[CV] C=0.001, solver=newton-cg .......................................
[CV]  C=0.001, solver=newton-cg, score=0.6902173913043478, total=   0.0s
[CV] C=0.001, solver=lbfgs ...........................................
[CV] .. C=0.001, solver=lbfgs, score=0.6684782608695652, total=   0.0s
[CV] C=0.001, solver=lbfgs ...........................................
[CV] 

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s


[CV]  C=0.001, solver=liblinear, score=0.6847826086956522, total=   0.0s
[CV] C=0.001, solver=sag .............................................
[CV] .... C=0.001, solver=sag, score=0.6684782608695652, total=   0.0s
[CV] C=0.001, solver=sag .............................................
[CV] .... C=0.001, solver=sag, score=0.6793478260869565, total=   0.0s
[CV] C=0.001, solver=sag .............................................
[CV] .... C=0.001, solver=sag, score=0.7119565217391305, total=   0.0s
[CV] C=0.001, solver=sag .............................................
[CV] .... C=0.001, solver=sag, score=0.6766304347826086, total=   0.0s
[CV] C=0.001, solver=sag .............................................
[CV] .... C=0.001, solver=sag, score=0.6902173913043478, total=   0.0s
[CV] C=0.001, solver=saga ............................................
[CV] ... C=0.001, solver=saga, score=0.6684782608695652, total=   0.0s
[CV] C=0.001, solver=saga ............................................
[CV]

[CV] ..... C=0.1, solver=saga, score=0.7364130434782609, total=   0.1s
[CV] C=0.1, solver=saga ..............................................
[CV] ..... C=0.1, solver=saga, score=0.7554347826086957, total=   0.1s
[CV] C=0.1, solver=saga ..............................................
[CV] ..... C=0.1, solver=saga, score=0.7690217391304348, total=   0.1s
[CV] C=1, solver=newton-cg ...........................................
[CV] .. C=1, solver=newton-cg, score=0.7336956521739131, total=   0.0s
[CV] C=1, solver=newton-cg ...........................................
[CV] .. C=1, solver=newton-cg, score=0.7364130434782609, total=   0.0s
[CV] C=1, solver=newton-cg ...........................................
[CV] .. C=1, solver=newton-cg, score=0.7581521739130435, total=   0.0s
[CV] C=1, solver=newton-cg ...........................................
[CV] .. C=1, solver=newton-cg, score=0.7581521739130435, total=   0.0s
[CV] C=1, solver=newton-cg ...........................................
[CV] .



[CV] ...... C=10, solver=saga, score=0.7445652173913043, total=   1.8s
[CV] C=10, solver=saga ...............................................




[CV] ...... C=10, solver=saga, score=0.7445652173913043, total=   1.8s
[CV] C=10, solver=saga ...............................................




[CV] ...... C=10, solver=saga, score=0.7989130434782609, total=   1.8s
[CV] C=10, solver=saga ...............................................




[CV] ...... C=10, solver=saga, score=0.7527173913043478, total=   1.8s
[CV] C=10, solver=saga ...............................................




[CV] ...... C=10, solver=saga, score=0.7880434782608695, total=   1.8s
[CV] C=100, solver=newton-cg .........................................
[CV]  C=100, solver=newton-cg, score=0.7717391304347826, total=   0.1s
[CV] C=100, solver=newton-cg .........................................
[CV]  C=100, solver=newton-cg, score=0.7554347826086957, total=   0.1s
[CV] C=100, solver=newton-cg .........................................
[CV]  C=100, solver=newton-cg, score=0.7934782608695652, total=   0.1s
[CV] C=100, solver=newton-cg .........................................
[CV]  C=100, solver=newton-cg, score=0.7445652173913043, total=   0.1s
[CV] C=100, solver=newton-cg .........................................
[CV]  C=100, solver=newton-cg, score=0.7717391304347826, total=   0.1s
[CV] C=100, solver=lbfgs .............................................
[CV] .... C=100, solver=lbfgs, score=0.7717391304347826, total=   0.3s
[CV] C=100, solver=lbfgs .............................................
[CV] .



[CV] ...... C=100, solver=sag, score=0.7717391304347826, total=   1.8s
[CV] C=100, solver=sag ...............................................




[CV] ...... C=100, solver=sag, score=0.7527173913043478, total=   1.8s
[CV] C=100, solver=sag ...............................................




[CV] ...... C=100, solver=sag, score=0.7907608695652174, total=   1.7s
[CV] C=100, solver=sag ...............................................




[CV] ...... C=100, solver=sag, score=0.7554347826086957, total=   1.7s
[CV] C=100, solver=sag ...............................................




[CV] ...... C=100, solver=sag, score=0.7907608695652174, total=   1.8s
[CV] C=100, solver=saga ..............................................




[CV] ..... C=100, solver=saga, score=0.7690217391304348, total=   2.0s
[CV] C=100, solver=saga ..............................................




[CV] ..... C=100, solver=saga, score=0.7472826086956522, total=   1.8s
[CV] C=100, solver=saga ..............................................




[CV] ..... C=100, solver=saga, score=0.7961956521739131, total=   1.8s
[CV] C=100, solver=saga ..............................................




[CV] ..... C=100, solver=saga, score=0.7527173913043478, total=   1.8s
[CV] C=100, solver=saga ..............................................


[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:   46.8s finished


[CV] ..... C=100, solver=saga, score=0.7961956521739131, total=   2.0s
{'C': 100, 'solver': 'sag'}
0.7722826086956521
LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=2000, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='sag',
          tol=0.0001, verbose=0, warm_start=False)
              precision    recall  f1-score   support

           0       0.80      0.74      0.77       337
           1       0.71      0.77      0.74       277

   micro avg       0.76      0.76      0.76       614
   macro avg       0.75      0.76      0.75       614
weighted avg       0.76      0.76      0.76       614





In [12]:
# for 1 part 0 to 0.25 part 1
model_25 = grid_search_logistic(X_train_25, X_test_25, y_train_25, y_test_25)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV] C=0.001, solver=newton-cg .......................................
[CV]  C=0.001, solver=newton-cg, score=0.6958041958041958, total=   0.0s
[CV] C=0.001, solver=newton-cg .......................................
[CV]  C=0.001, solver=newton-cg, score=0.704225352112676, total=   0.0s
[CV] C=0.001, solver=newton-cg .......................................
[CV]  C=0.001, solver=newton-cg, score=0.7323943661971831, total=   0.0s
[CV] C=0.001, solver=newton-cg .......................................
[CV]  C=0.001, solver=newton-cg, score=0.7112676056338029, total=   0.0s
[CV] C=0.001, solver=newton-cg .......................................
[CV]  C=0.001, solver=newton-cg, score=0.704225352112676, total=   0.0s
[CV] C=0.001, solver=lbfgs ...........................................
[CV] .. C=0.001, solver=lbfgs, score=0.6958041958041958, total=   0.0s
[CV] C=0.001, solver=lbfgs ...........................................
[CV] ..

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s


[CV] .... C=0.001, solver=sag, score=0.6958041958041958, total=   0.0s
[CV] C=0.001, solver=sag .............................................
[CV] ..... C=0.001, solver=sag, score=0.704225352112676, total=   0.0s
[CV] C=0.001, solver=sag .............................................
[CV] .... C=0.001, solver=sag, score=0.7323943661971831, total=   0.0s
[CV] C=0.001, solver=sag .............................................
[CV] .... C=0.001, solver=sag, score=0.7112676056338029, total=   0.0s
[CV] C=0.001, solver=sag .............................................
[CV] ..... C=0.001, solver=sag, score=0.704225352112676, total=   0.0s
[CV] C=0.001, solver=saga ............................................
[CV] ... C=0.001, solver=saga, score=0.6958041958041958, total=   0.0s
[CV] C=0.001, solver=saga ............................................
[CV] .... C=0.001, solver=saga, score=0.704225352112676, total=   0.0s
[CV] C=0.001, solver=saga ............................................
[CV] .

[CV] ..... C=0.1, solver=saga, score=0.7429577464788732, total=   0.1s
[CV] C=0.1, solver=saga ..............................................
[CV] ...... C=0.1, solver=saga, score=0.704225352112676, total=   0.1s
[CV] C=1, solver=newton-cg ...........................................
[CV] .. C=1, solver=newton-cg, score=0.7552447552447552, total=   0.0s
[CV] C=1, solver=newton-cg ...........................................
[CV] .. C=1, solver=newton-cg, score=0.7359154929577465, total=   0.0s
[CV] C=1, solver=newton-cg ...........................................
[CV] .. C=1, solver=newton-cg, score=0.7394366197183099, total=   0.0s
[CV] C=1, solver=newton-cg ...........................................
[CV] .. C=1, solver=newton-cg, score=0.7394366197183099, total=   0.0s
[CV] C=1, solver=newton-cg ...........................................
[CV] .. C=1, solver=newton-cg, score=0.7112676056338029, total=   0.0s
[CV] C=1, solver=lbfgs ...............................................
[CV] .



[CV] ...... C=10, solver=saga, score=0.7867132867132867, total=   1.4s
[CV] C=10, solver=saga ...............................................




[CV] .................... C=10, solver=saga, score=0.75, total=   1.4s
[CV] C=10, solver=saga ...............................................




[CV] ...... C=10, solver=saga, score=0.7570422535211268, total=   1.4s
[CV] C=10, solver=saga ...............................................




[CV] ...... C=10, solver=saga, score=0.7323943661971831, total=   1.4s
[CV] C=10, solver=saga ...............................................




[CV] ...... C=10, solver=saga, score=0.7288732394366197, total=   1.4s
[CV] C=100, solver=newton-cg .........................................
[CV]  C=100, solver=newton-cg, score=0.7867132867132867, total=   0.1s
[CV] C=100, solver=newton-cg .........................................
[CV]  C=100, solver=newton-cg, score=0.7394366197183099, total=   0.1s
[CV] C=100, solver=newton-cg .........................................
[CV]  C=100, solver=newton-cg, score=0.7640845070422535, total=   0.1s
[CV] C=100, solver=newton-cg .........................................
[CV]  C=100, solver=newton-cg, score=0.7323943661971831, total=   0.1s
[CV] C=100, solver=newton-cg .........................................
[CV]  C=100, solver=newton-cg, score=0.7394366197183099, total=   0.1s
[CV] C=100, solver=lbfgs .............................................
[CV] .... C=100, solver=lbfgs, score=0.7867132867132867, total=   0.2s
[CV] C=100, solver=lbfgs .............................................
[CV] .



[CV] ...... C=100, solver=sag, score=0.7902097902097902, total=   1.8s
[CV] C=100, solver=sag ...............................................




[CV] ...... C=100, solver=sag, score=0.7394366197183099, total=   1.7s
[CV] C=100, solver=sag ...............................................




[CV] .................... C=100, solver=sag, score=0.75, total=   1.4s
[CV] C=100, solver=sag ...............................................




[CV] ...... C=100, solver=sag, score=0.7359154929577465, total=   1.3s
[CV] C=100, solver=sag ...............................................




[CV] ...... C=100, solver=sag, score=0.7429577464788732, total=   1.3s
[CV] C=100, solver=saga ..............................................




[CV] ..... C=100, solver=saga, score=0.7902097902097902, total=   1.4s
[CV] C=100, solver=saga ..............................................




[CV] ..... C=100, solver=saga, score=0.7429577464788732, total=   1.4s
[CV] C=100, solver=saga ..............................................




[CV] ..... C=100, solver=saga, score=0.7535211267605634, total=   1.5s
[CV] C=100, solver=saga ..............................................




[CV] ..... C=100, solver=saga, score=0.7359154929577465, total=   1.4s
[CV] C=100, solver=saga ..............................................
[CV] ..... C=100, solver=saga, score=0.7394366197183099, total=   1.5s
{'C': 100, 'solver': 'newton-cg'}
0.7524613220815752
LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=2000, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='newton-cg',
          tol=0.0001, verbose=0, warm_start=False)
              precision    recall  f1-score   support

           0       0.78      0.87      0.82       341
           1       0.52      0.35      0.42       133

   micro avg       0.73      0.73      0.73       474
   macro avg       0.65      0.61      0.62       474
weighted avg       0.70      0.73      0.71       474



[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:   36.2s finished


This is not significantly better than the straight logistic regression.

In [13]:
def predict_the_winner(model, year, team_data, X_train):
    '''
    INPUT: 
    -X_train = scaled X train data.
    -model = the saved model.
    -team_data = complete dataframe with all data.
    -year = the year want to look at.
    
    OUTPUT:
    -printed prediction.
    
    DESCRIPTION:
    -data from year of interest is isolated.
    -the data are scaled.
    -the prediction is made.
    -print out the resulting probability and the name of the team.
    '''
    
    # grab the data.
    team_data = team_data.loc[team_data["year"] == year].reset_index()

    # set features (no team, year, winners).
    # set target (winners).
    features = team_data[feature_columns]
    
    # scale X_train and X_test.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    features = scaler.fit_transform(features)
    
    # fit the model.
    probabilities = model.predict_proba(features)

    # convert predictions to datafram.e
    WS_predictions = pd.DataFrame(probabilities[:,1])

    # Sort the DataFrame (descending)
    WS_predictions = WS_predictions.sort_values(0, ascending=False)

    WS_predictions['Probability'] = WS_predictions[0]

    # Print 50 highest probability HoF inductees from still eligible players
    for i, row in WS_predictions.head(50).iterrows():
       prob = ' '.join(('WS Probability =', str(row['Probability'])))
       print(prob)
       print(team_data.iloc[i,1:27]["team"])
       print('')

Prediction for 2018

In [14]:
print ("model_100----------------------------------------")
predict_the_winner(model_100, 2018, team_data, X_train_100)

model_100----------------------------------------
WS Probability = 0.9999999782604523
Detroit Tigers

WS Probability = 0.9999999736642939
Chicago White Sox

WS Probability = 0.9999998838473336
Miami Marlins

WS Probability = 0.9999997749311071
Cincinnati Reds

WS Probability = 0.9999877807975708
Minnesota Twins

WS Probability = 0.9999818191926908
Cleveland Indians

WS Probability = 0.9994769526714442
Pittsburgh Pirates

WS Probability = 0.997422594192222
Baltimore Orioles

WS Probability = 0.9973161150174978
St. Louis Cardinals

WS Probability = 0.9969791639040348
Washington Nationals

WS Probability = 0.9933464061760664
Arizona Diamondbacks

WS Probability = 0.9929378331759867
New York Mets

WS Probability = 0.9512642157088542
Los Angeles Angels

WS Probability = 0.7391998559103201
Houston Astros

WS Probability = 0.6045613917226097
Los Angeles Dodgers

WS Probability = 0.4495103608455275
Boston Red Sox

WS Probability = 0.430323288028354
New York Yankees

WS Probability = 0.12599727

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [15]:
print ("model_50----------------------------------------")
predict_the_winner(model_50, 2018, team_data, X_train_50)

model_50----------------------------------------
WS Probability = 0.99955681828879
Minnesota Twins

WS Probability = 0.9983572429698656
Houston Astros

WS Probability = 0.9948222014798827
Miami Marlins

WS Probability = 0.9917524712565098
Cincinnati Reds

WS Probability = 0.9909253238088531
Tampa Bay Rays

WS Probability = 0.9895003705594901
Arizona Diamondbacks

WS Probability = 0.9727144750762786
Boston Red Sox

WS Probability = 0.9722085420056601
Cleveland Indians

WS Probability = 0.9505499686279417
Detroit Tigers

WS Probability = 0.9299195153116382
Los Angeles Angels

WS Probability = 0.8154741981144067
Los Angeles Dodgers

WS Probability = 0.7948559108890336
New York Mets

WS Probability = 0.6684105451110249
St. Louis Cardinals

WS Probability = 0.6367811718284374
Pittsburgh Pirates

WS Probability = 0.6327877571783991
Washington Nationals

WS Probability = 0.49080838828409473
San Francisco Giants

WS Probability = 0.3075386387250637
Atlanta Braves

WS Probability = 0.2004278632

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [16]:
print ("model_25----------------------------------------")
predict_the_winner(model_25, 2018, team_data, X_train_25)

model_25----------------------------------------
WS Probability = 0.999997470967751
Detroit Tigers

WS Probability = 0.9999544294300281
Miami Marlins

WS Probability = 0.9998295267824855
Cincinnati Reds

WS Probability = 0.9991741550398853
Chicago White Sox

WS Probability = 0.9959648393113707
Minnesota Twins

WS Probability = 0.9838494166082896
Pittsburgh Pirates

WS Probability = 0.9557605635075817
Baltimore Orioles

WS Probability = 0.937734256873439
Arizona Diamondbacks

WS Probability = 0.8244619976196327
Washington Nationals

WS Probability = 0.8115459803800784
Los Angeles Angels

WS Probability = 0.583629008080197
New York Mets

WS Probability = 0.5462892214471617
Cleveland Indians

WS Probability = 0.49971251276078743
Houston Astros

WS Probability = 0.4370423950185507
St. Louis Cardinals

WS Probability = 0.1747019049650412
San Diego Padres

WS Probability = 0.10752945418211189
Boston Red Sox

WS Probability = 0.08690233044723936
New York Yankees

WS Probability = 0.0440419208

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


Precition for 2017

In [17]:
# predict for 2017.
print ("model_100----------------------------------------")
predict_the_winner(model_100, 2017, team_data, X_train_100)

model_100----------------------------------------
WS Probability = 0.9999999999994598
Cleveland Indians

WS Probability = 0.9999999999921205
Boston Red Sox

WS Probability = 0.9999999999888429
Miami Marlins

WS Probability = 0.9999999975560929
Atlanta Braves

WS Probability = 0.9999999841079461
Pittsburgh Pirates

WS Probability = 0.9999995568463503
Detroit Tigers

WS Probability = 0.9999909860056091
Washington Nationals

WS Probability = 0.9999908768223079
Oakland Athletics

WS Probability = 0.9983435358112226
Minnesota Twins

WS Probability = 0.9973594437813035
Seattle Mariners

WS Probability = 0.996288579373326
Los Angeles Angels

WS Probability = 0.977675176313142
Houston Astros

WS Probability = 0.9600510995121289
Chicago White Sox

WS Probability = 0.9480115236838605
San Francisco Giants

WS Probability = 0.936377739418358
New York Yankees

WS Probability = 0.8375638271929086
Kansas City Royals

WS Probability = 0.3672612526493443
Cincinnati Reds

WS Probability = 0.267528153146

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [18]:
print ("model_50----------------------------------------")
predict_the_winner(model_50, 2017, team_data, X_train_50)

model_50----------------------------------------
WS Probability = 0.9999997348701207
Cleveland Indians

WS Probability = 0.999999502347679
Boston Red Sox

WS Probability = 0.9999932349784955
Atlanta Braves

WS Probability = 0.9998972816714242
Miami Marlins

WS Probability = 0.9994629474797923
Oakland Athletics

WS Probability = 0.9994235994278279
Washington Nationals

WS Probability = 0.995647809192693
New York Yankees

WS Probability = 0.9941985170251374
Houston Astros

WS Probability = 0.9925864310080335
Pittsburgh Pirates

WS Probability = 0.6774144296204657
San Francisco Giants

WS Probability = 0.6331142710484409
New York Mets

WS Probability = 0.5672989470221802
Minnesota Twins

WS Probability = 0.43800430826899167
Tampa Bay Rays

WS Probability = 0.2780003091457526
Seattle Mariners

WS Probability = 0.19514672052101562
Los Angeles Angels

WS Probability = 0.17550728179914718
St. Louis Cardinals

WS Probability = 0.16522980208791987
Chicago Cubs

WS Probability = 0.10747731903894

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [19]:
print ("model_25----------------------------------------")
predict_the_winner(model_25, 2017, team_data, X_train_25)

model_25----------------------------------------
WS Probability = 0.9999944869668562
Cleveland Indians

WS Probability = 0.9999854086878237
Boston Red Sox

WS Probability = 0.9999417078836151
Miami Marlins

WS Probability = 0.999857737942763
Atlanta Braves

WS Probability = 0.9976239530917363
Washington Nationals

WS Probability = 0.9964374295009782
Pittsburgh Pirates

WS Probability = 0.9961038251133563
Detroit Tigers

WS Probability = 0.935661367021604
Chicago White Sox

WS Probability = 0.7885880220981903
Minnesota Twins

WS Probability = 0.7621470509869217
Oakland Athletics

WS Probability = 0.7460547839247833
Los Angeles Angels

WS Probability = 0.4950789621158265
New York Yankees

WS Probability = 0.4946177854365386
Kansas City Royals

WS Probability = 0.4399093809979812
Seattle Mariners

WS Probability = 0.4102239666762821
Baltimore Orioles

WS Probability = 0.3604899108815529
Cincinnati Reds

WS Probability = 0.12940093801772176
San Francisco Giants

WS Probability = 0.12604629

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


Prediction for 2016

In [20]:
# predict for 2016.
print ("model_100----------------------------------------")
predict_the_winner(model_100, 2016, team_data, X_train_100)

model_100----------------------------------------
WS Probability = 0.9999999995191746
Cleveland Indians

WS Probability = 0.9999999983822894
Houston Astros

WS Probability = 0.9999999874942161
New York Yankees

WS Probability = 0.9999992716610169
Washington Nationals

WS Probability = 0.9999990490892664
Detroit Tigers

WS Probability = 0.9999989406060815
Baltimore Orioles

WS Probability = 0.9999885468514206
Boston Red Sox

WS Probability = 0.9999697549063205
Seattle Mariners

WS Probability = 0.9997633180448677
Minnesota Twins

WS Probability = 0.9982241833294789
Kansas City Royals

WS Probability = 0.9939375622168857
Chicago White Sox

WS Probability = 0.9891811745918971
Atlanta Braves

WS Probability = 0.9879398937062728
Arizona Diamondbacks

WS Probability = 0.9408611949471772
Miami Marlins

WS Probability = 0.5282879698122732
Tampa Bay Rays

WS Probability = 0.2736661505695799
Philadelphia Phillies

WS Probability = 0.1434461970492908
San Francisco Giants

WS Probability = 0.11544

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [21]:
print ("model_50----------------------------------------")
predict_the_winner(model_50, 2016, team_data, X_train_50)

model_50----------------------------------------
WS Probability = 0.9999999973853455
Boston Red Sox

WS Probability = 0.9999996194665175
New York Yankees

WS Probability = 0.9999961856769625
Cleveland Indians

WS Probability = 0.9999947346175232
Houston Astros

WS Probability = 0.9999198549244095
Washington Nationals

WS Probability = 0.9531060390821534
Seattle Mariners

WS Probability = 0.9272434557909819
Minnesota Twins

WS Probability = 0.9157701690377954
Los Angeles Dodgers

WS Probability = 0.911292326985706
Tampa Bay Rays

WS Probability = 0.8427179273882576
Arizona Diamondbacks

WS Probability = 0.8123966844696189
Miami Marlins

WS Probability = 0.7844092937194191
San Francisco Giants

WS Probability = 0.7389571027375844
Chicago Cubs

WS Probability = 0.6045435086070338
Atlanta Braves

WS Probability = 0.5858365784958312
Baltimore Orioles

WS Probability = 0.21300078960084048
St. Louis Cardinals

WS Probability = 0.07258668511955671
Detroit Tigers

WS Probability = 0.04261552979

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [22]:
print ("model_25----------------------------------------")
predict_the_winner(model_25, 2016, team_data, X_train_25)

model_25----------------------------------------
WS Probability = 0.9999241848314029
Cleveland Indians

WS Probability = 0.999905731892709
New York Yankees

WS Probability = 0.9996646568471165
Houston Astros

WS Probability = 0.9994934471126166
Detroit Tigers

WS Probability = 0.9974777780253008
Washington Nationals

WS Probability = 0.9907699089175646
Baltimore Orioles

WS Probability = 0.9903079333193144
Minnesota Twins

WS Probability = 0.9815765719620956
Seattle Mariners

WS Probability = 0.9795952389814863
Boston Red Sox

WS Probability = 0.8953169235742957
Atlanta Braves

WS Probability = 0.8728420927447612
Chicago White Sox

WS Probability = 0.8704779213661804
Miami Marlins

WS Probability = 0.507741413283478
Kansas City Royals

WS Probability = 0.3822701187616777
Tampa Bay Rays

WS Probability = 0.350189298789115
Arizona Diamondbacks

WS Probability = 0.2946601654806726
San Francisco Giants

WS Probability = 0.1269196411851953
New York Mets

WS Probability = 0.10657146889730758

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


#### STEP4: Grid Search Model--SVC.

In [23]:
def grid_search_svc(X_train, X_test, y_train, y_test):
    '''
    INPUT: 
    -X_train = scaled X train data.
    -X_test = scaled X test data.
    -y_train = y train data.
    -y_test = y test data.
    
    OUTPUT:
    -classification report (has F1 score, precision and recall).
    -grid = saved model for prediction. 
    
    DESCRIPTION:
    -the scaled and split data is put through a grid search with svc.
    -the model is trained.
    -a prediction is made.
    -print out the classification report and give the model.
    '''
    
    # set up svc model.
    model = SVC(probability=True)

    # create gridsearch estimator.
    param_grid = {"C": [0.0001, 0.001, 0.01, 0.1, 1],
                  "gamma": ["auto", "scale"],
                 "kernel": ["poly"]}
    grid = GridSearchCV(model, param_grid, verbose=3)

    # fit the model.
    grid.fit(X_train, y_train)

    # predict.
    prediction = grid.predict(X_test)
    
    # print out the basic information about the grid search.
    print (grid.best_params_)
    print (grid.best_score_)
    print (grid.best_estimator_)
    
    grid = grid.best_estimator_
    predictions = grid.predict(X_test)
    print (classification_report(y_test, prediction, target_names=["0", "1"]))
    
    return grid

In [24]:
# for 1 part 0 to 1 part 1
model_100 = grid_search_svc(X_train_100, X_test_100, y_train_100, y_test_100)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] C=0.0001, gamma=auto, kernel=poly ...............................
[CV]  C=0.0001, gamma=auto, kernel=poly, score=0.6338185890257558, total=   0.9s
[CV] C=0.0001, gamma=auto, kernel=poly ...............................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.0s remaining:    0.0s


[CV]  C=0.0001, gamma=auto, kernel=poly, score=0.6338185890257558, total=   0.8s
[CV] C=0.0001, gamma=auto, kernel=poly ...............................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.9s remaining:    0.0s


[CV]  C=0.0001, gamma=auto, kernel=poly, score=0.6334080717488789, total=   0.8s
[CV] C=0.0001, gamma=scale, kernel=poly ..............................
[CV]  C=0.0001, gamma=scale, kernel=poly, score=0.6338185890257558, total=   0.8s
[CV] C=0.0001, gamma=scale, kernel=poly ..............................
[CV]  C=0.0001, gamma=scale, kernel=poly, score=0.6338185890257558, total=   0.9s
[CV] C=0.0001, gamma=scale, kernel=poly ..............................
[CV]  C=0.0001, gamma=scale, kernel=poly, score=0.6334080717488789, total=   0.8s
[CV] C=0.001, gamma=auto, kernel=poly ................................
[CV]  C=0.001, gamma=auto, kernel=poly, score=0.6371780515117581, total=   0.9s
[CV] C=0.001, gamma=auto, kernel=poly ................................
[CV]  C=0.001, gamma=auto, kernel=poly, score=0.6382978723404256, total=   0.8s
[CV] C=0.001, gamma=auto, kernel=poly ................................
[CV]  C=0.001, gamma=auto, kernel=poly, score=0.6345291479820628, total=   0.9s
[CV] C=

[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   26.6s finished


{'C': 1, 'gamma': 'auto', 'kernel': 'poly'}
0.9115011202389843
SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='poly',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
              precision    recall  f1-score   support

           0       1.00      0.86      0.93       356
           1       0.92      1.00      0.96       537

   micro avg       0.95      0.95      0.95       893
   macro avg       0.96      0.93      0.94       893
weighted avg       0.95      0.95      0.94       893



In [25]:
# for 1 part 0\ to 0.5 part 1
model_50 = grid_search_svc(X_train_50, X_test_50, y_train_50, y_test_50)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] C=0.0001, gamma=auto, kernel=poly ...............................
[CV]  C=0.0001, gamma=auto, kernel=poly, score=0.5439739413680782, total=   0.5s
[CV] C=0.0001, gamma=auto, kernel=poly ...............................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.6s remaining:    0.0s


[CV]  C=0.0001, gamma=auto, kernel=poly, score=0.5432300163132137, total=   0.5s
[CV] C=0.0001, gamma=auto, kernel=poly ...............................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.1s remaining:    0.0s


[CV]  C=0.0001, gamma=auto, kernel=poly, score=0.5432300163132137, total=   0.5s
[CV] C=0.0001, gamma=scale, kernel=poly ..............................
[CV]  C=0.0001, gamma=scale, kernel=poly, score=0.5439739413680782, total=   0.6s
[CV] C=0.0001, gamma=scale, kernel=poly ..............................
[CV]  C=0.0001, gamma=scale, kernel=poly, score=0.5432300163132137, total=   0.5s
[CV] C=0.0001, gamma=scale, kernel=poly ..............................
[CV]  C=0.0001, gamma=scale, kernel=poly, score=0.5432300163132137, total=   0.5s
[CV] C=0.001, gamma=auto, kernel=poly ................................
[CV]  C=0.001, gamma=auto, kernel=poly, score=0.5439739413680782, total=   0.5s
[CV] C=0.001, gamma=auto, kernel=poly ................................
[CV]  C=0.001, gamma=auto, kernel=poly, score=0.5432300163132137, total=   0.5s
[CV] C=0.001, gamma=auto, kernel=poly ................................
[CV]  C=0.001, gamma=auto, kernel=poly, score=0.5432300163132137, total=   0.5s
[CV] C=

[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   15.5s finished


{'C': 1, 'gamma': 'scale', 'kernel': 'poly'}
0.8608695652173913
SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='scale', kernel='poly',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
              precision    recall  f1-score   support

           0       0.92      0.93      0.93       337
           1       0.92      0.90      0.91       277

   micro avg       0.92      0.92      0.92       614
   macro avg       0.92      0.92      0.92       614
weighted avg       0.92      0.92      0.92       614



In [26]:
# for 1 part 0 to 0.25 part 1
model_25 = grid_search_svc(X_train_25, X_test_25, y_train_25, y_test_25)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] C=0.0001, gamma=auto, kernel=poly ...............................
[CV]  C=0.0001, gamma=auto, kernel=poly, score=0.70042194092827, total=   0.2s
[CV] C=0.0001, gamma=auto, kernel=poly ...............................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s


[CV]  C=0.0001, gamma=auto, kernel=poly, score=0.70042194092827, total=   0.2s
[CV] C=0.0001, gamma=auto, kernel=poly ...............................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.5s remaining:    0.0s


[CV]  C=0.0001, gamma=auto, kernel=poly, score=0.70042194092827, total=   0.2s
[CV] C=0.0001, gamma=scale, kernel=poly ..............................
[CV]  C=0.0001, gamma=scale, kernel=poly, score=0.70042194092827, total=   0.2s
[CV] C=0.0001, gamma=scale, kernel=poly ..............................
[CV]  C=0.0001, gamma=scale, kernel=poly, score=0.70042194092827, total=   0.2s
[CV] C=0.0001, gamma=scale, kernel=poly ..............................
[CV]  C=0.0001, gamma=scale, kernel=poly, score=0.70042194092827, total=   0.2s
[CV] C=0.001, gamma=auto, kernel=poly ................................
[CV]  C=0.001, gamma=auto, kernel=poly, score=0.70042194092827, total=   0.2s
[CV] C=0.001, gamma=auto, kernel=poly ................................
[CV]  C=0.001, gamma=auto, kernel=poly, score=0.70042194092827, total=   0.2s
[CV] C=0.001, gamma=auto, kernel=poly ................................
[CV]  C=0.001, gamma=auto, kernel=poly, score=0.70042194092827, total=   0.2s
[CV] C=0.001, gamma=s

[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    7.6s finished


{'C': 1, 'gamma': 'scale', 'kernel': 'poly'}
0.8143459915611815
SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='scale', kernel='poly',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
              precision    recall  f1-score   support

           0       0.86      0.98      0.92       341
           1       0.92      0.59      0.72       133

   micro avg       0.87      0.87      0.87       474
   macro avg       0.89      0.78      0.82       474
weighted avg       0.88      0.87      0.86       474



Huh.  That's pretty good.  Try out model_100 and model_50 with the 2016-2018 stuff.

#### STEP5: Predict 2016-2018 winners with SVC Grid Search.

In [27]:
# predict for 2018.
predict_the_winner(model_100, 2018, team_data, X_train_100)

WS Probability = 0.9956239611867604
Boston Red Sox

WS Probability = 0.9590764825898606
Cleveland Indians

WS Probability = 0.9546190573819126
Detroit Tigers

WS Probability = 0.6172689312091171
Minnesota Twins

WS Probability = 0.21864411205472745
Los Angeles Angels

WS Probability = 0.2149358895643747
Tampa Bay Rays

WS Probability = 0.16610583702922316
St. Louis Cardinals

WS Probability = 0.0844524235043789
Arizona Diamondbacks

WS Probability = 0.07134292035712121
New York Yankees

WS Probability = 0.04929907469852173
Atlanta Braves

WS Probability = 0.04383248506579274
Pittsburgh Pirates

WS Probability = 0.04309544109443426
Washington Nationals

WS Probability = 0.03130203132052158
San Diego Padres

WS Probability = 0.03014107457827023
Houston Astros

WS Probability = 0.029299982826124223
Chicago Cubs

WS Probability = 0.026485685753982177
San Francisco Giants

WS Probability = 0.018974727881140477
New York Mets

WS Probability = 0.015327532806735234
Cincinnati Reds

WS Probabil

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [28]:
# predict for 2017.
predict_the_winner(model_100, 2017, team_data, X_train_100)

WS Probability = 0.9999999999999699
Houston Astros

WS Probability = 0.9999999996523358
Boston Red Sox

WS Probability = 0.9749576223720854
Chicago White Sox

WS Probability = 0.9291133235167777
Cleveland Indians

WS Probability = 0.867631140634842
Los Angeles Dodgers

WS Probability = 0.7065827801729095
New York Yankees

WS Probability = 0.6920123813353085
Washington Nationals

WS Probability = 0.25711850222272403
Colorado Rockies

WS Probability = 0.17084236635351466
Atlanta Braves

WS Probability = 0.1534409589864023
Tampa Bay Rays

WS Probability = 0.14395321312189635
Pittsburgh Pirates

WS Probability = 0.14057239347736306
Seattle Mariners

WS Probability = 0.05253368895532756
Minnesota Twins

WS Probability = 0.0246142937279138
Philadelphia Phillies

WS Probability = 0.02129074400449587
Los Angeles Angels

WS Probability = 0.013617615175373998
New York Mets

WS Probability = 0.013584396374493261
Arizona Diamondbacks

WS Probability = 0.008888065026672283
St. Louis Cardinals

WS P

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [29]:
# predict for 2016
predict_the_winner(model_100, 2016, team_data, X_train_100)

WS Probability = 0.9999999999999699
Cleveland Indians

WS Probability = 0.999999999999906
Houston Astros

WS Probability = 0.9841012301502365
Boston Red Sox

WS Probability = 0.9217132417202071
Washington Nationals

WS Probability = 0.9114796677031863
New York Yankees

WS Probability = 0.7326049425342304
Chicago Cubs

WS Probability = 0.5332611745343127
Los Angeles Dodgers

WS Probability = 0.4603590990017047
Arizona Diamondbacks

WS Probability = 0.30649578601432137
Detroit Tigers

WS Probability = 0.2714666010375934
Los Angeles Angels

WS Probability = 0.06603601729556853
Tampa Bay Rays

WS Probability = 0.04952055048911798
Philadelphia Phillies

WS Probability = 0.04486975314792595
Minnesota Twins

WS Probability = 0.03860343855506139
Colorado Rockies

WS Probability = 0.030943270808182054
Seattle Mariners

WS Probability = 0.022361392923149804
Miami Marlins

WS Probability = 0.021370654468489052
Atlanta Braves

WS Probability = 0.020224028530232835
Milwaukee Brewers

WS Probability

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [38]:
# predict for 2019.
predict_the_winner(model_100, 2019, team_data, X_train_100)

WS Probability = 0.9956239611867604
Boston Red Sox

WS Probability = 0.9590764825898606
Cleveland Indians

WS Probability = 0.9546190573819126
Detroit Tigers

WS Probability = 0.6172689312091171
Minnesota Twins

WS Probability = 0.21864411205472745
Los Angeles Angels

WS Probability = 0.2149358895643747
Tampa Bay Rays

WS Probability = 0.16610583702922316
St. Louis Cardinals

WS Probability = 0.0844524235043789
Arizona Diamondbacks

WS Probability = 0.07134292035712121
New York Yankees

WS Probability = 0.04929907469852173
Atlanta Braves

WS Probability = 0.04383248506579274
Pittsburgh Pirates

WS Probability = 0.04309544109443426
Washington Nationals

WS Probability = 0.03130203132052158
San Diego Padres

WS Probability = 0.03014107457827023
Houston Astros

WS Probability = 0.029299982826124223
Chicago Cubs

WS Probability = 0.026485685753982177
San Francisco Giants

WS Probability = 0.018974727881140477
New York Mets

WS Probability = 0.015327532806735234
Cincinnati Reds

WS Probabil

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [30]:
# predict for 2018
predict_the_winner(model_50, 2018, team_data, X_train_50)

WS Probability = 0.9864225268542672
Boston Red Sox

WS Probability = 0.6260946935018998
Houston Astros

WS Probability = 0.5759645768371823
Cleveland Indians

WS Probability = 0.5342340242751272
Los Angeles Angels

WS Probability = 0.3788515985418775
Tampa Bay Rays

WS Probability = 0.23537217685958542
St. Louis Cardinals

WS Probability = 0.2332680515746027
Atlanta Braves

WS Probability = 0.1788807847785954
Arizona Diamondbacks

WS Probability = 0.16479271329049727
Chicago Cubs

WS Probability = 0.15988655514727404
Pittsburgh Pirates

WS Probability = 0.15119653566691998
San Francisco Giants

WS Probability = 0.14656532349122325
New York Yankees

WS Probability = 0.1434206464645377
Washington Nationals

WS Probability = 0.13938732818605779
San Diego Padres

WS Probability = 0.1349339391990964
New York Mets

WS Probability = 0.12125681447687245
Detroit Tigers

WS Probability = 0.09014307639011586
Oakland Athletics

WS Probability = 0.08958689884133877
Philadelphia Phillies

WS Probabi

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [31]:
# predict for 2017
predict_the_winner(model_50, 2017, team_data, X_train_50)

WS Probability = 0.9999993406030236
Houston Astros

WS Probability = 0.9622844251980901
Los Angeles Dodgers

WS Probability = 0.8682386831473867
Boston Red Sox

WS Probability = 0.5571326739214212
Chicago White Sox

WS Probability = 0.43914568593020004
Washington Nationals

WS Probability = 0.42603069133280275
Cleveland Indians

WS Probability = 0.4118730729558242
Colorado Rockies

WS Probability = 0.26596563840146653
Seattle Mariners

WS Probability = 0.24875532053214858
Atlanta Braves

WS Probability = 0.22753102725904953
New York Yankees

WS Probability = 0.17628538251699785
Minnesota Twins

WS Probability = 0.1635035353972247
Philadelphia Phillies

WS Probability = 0.1510409579606459
Pittsburgh Pirates

WS Probability = 0.13711939637953913
Tampa Bay Rays

WS Probability = 0.12489844305355147
Los Angeles Angels

WS Probability = 0.09415718761520218
New York Mets

WS Probability = 0.08566760851868437
St. Louis Cardinals

WS Probability = 0.06904873766771169
Miami Marlins

WS Probabil

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [32]:
# predict for 2016
predict_the_winner(model_50, 2016, team_data, X_train_50)

WS Probability = 0.9888494883378908
Cleveland Indians

WS Probability = 0.981124955574024
Houston Astros

WS Probability = 0.9075593817995368
Boston Red Sox

WS Probability = 0.7221545818566063
Washington Nationals

WS Probability = 0.5660751590566543
Chicago Cubs

WS Probability = 0.5459950834852813
Detroit Tigers

WS Probability = 0.545214350326214
New York Yankees

WS Probability = 0.4787863752401891
Los Angeles Angels

WS Probability = 0.3968921144929021
Arizona Diamondbacks

WS Probability = 0.20606978847722002
Colorado Rockies

WS Probability = 0.18101445001908376
Philadelphia Phillies

WS Probability = 0.16815641156311986
Los Angeles Dodgers

WS Probability = 0.16026808388144836
Milwaukee Brewers

WS Probability = 0.15596734914268687
Tampa Bay Rays

WS Probability = 0.15470056558937156
Atlanta Braves

WS Probability = 0.14042782071988347
Seattle Mariners

WS Probability = 0.1312543525495099
Minnesota Twins

WS Probability = 0.10587396390395568
St. Louis Cardinals

WS Probability

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [37]:
# predict for 2019.
predict_the_winner(model_50, 2019, team_data, X_train_50)

WS Probability = 0.9864225268542672
Boston Red Sox

WS Probability = 0.6260946935018998
Houston Astros

WS Probability = 0.5759645768371823
Cleveland Indians

WS Probability = 0.5342340242751272
Los Angeles Angels

WS Probability = 0.3788515985418775
Tampa Bay Rays

WS Probability = 0.23537217685958542
St. Louis Cardinals

WS Probability = 0.2332680515746027
Atlanta Braves

WS Probability = 0.1788807847785954
Arizona Diamondbacks

WS Probability = 0.16479271329049727
Chicago Cubs

WS Probability = 0.15988655514727404
Pittsburgh Pirates

WS Probability = 0.15119653566691998
San Francisco Giants

WS Probability = 0.14656532349122325
New York Yankees

WS Probability = 0.1434206464645377
Washington Nationals

WS Probability = 0.13938732818605779
San Diego Padres

WS Probability = 0.1349339391990964
New York Mets

WS Probability = 0.12125681447687245
Detroit Tigers

WS Probability = 0.09014307639011586
Oakland Athletics

WS Probability = 0.08958689884133877
Philadelphia Phillies

WS Probabi

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [33]:
# predict for 2018
predict_the_winner(model_25, 2018, team_data, X_train_25)

WS Probability = 0.8934392592134591
Boston Red Sox

WS Probability = 0.4299158242955364
Los Angeles Angels

WS Probability = 0.2248668974001906
Kansas City Royals

WS Probability = 0.22443688956429875
Atlanta Braves

WS Probability = 0.2096197853517553
San Diego Padres

WS Probability = 0.19269271845833236
Houston Astros

WS Probability = 0.18163750218246563
Oakland Athletics

WS Probability = 0.17695267533612566
Arizona Diamondbacks

WS Probability = 0.1651740955613368
Chicago Cubs

WS Probability = 0.16456729315928073
St. Louis Cardinals

WS Probability = 0.15626188330465815
Pittsburgh Pirates

WS Probability = 0.14731215598634617
New York Mets

WS Probability = 0.14679783279159522
San Francisco Giants

WS Probability = 0.14313921981576072
Washington Nationals

WS Probability = 0.138657742319001
Milwaukee Brewers

WS Probability = 0.13193994262840308
Toronto Blue Jays

WS Probability = 0.12432133311409375
Philadelphia Phillies

WS Probability = 0.11245076757477078
Baltimore Orioles



  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [34]:
# predict for 2017
predict_the_winner(model_25, 2017, team_data, X_train_25)

WS Probability = 0.9469535985851513
Houston Astros

WS Probability = 0.5
Boston Red Sox

WS Probability = 0.37477335238417986
Los Angeles Dodgers

WS Probability = 0.32308783164246613
Chicago White Sox

WS Probability = 0.27610529357704566
Washington Nationals

WS Probability = 0.27006612582636175
Seattle Mariners

WS Probability = 0.22820297019111976
Atlanta Braves

WS Probability = 0.22003851821462841
Colorado Rockies

WS Probability = 0.20271594689252304
Minnesota Twins

WS Probability = 0.19673248562000828
Cleveland Indians

WS Probability = 0.1751485132448546
Tampa Bay Rays

WS Probability = 0.16702020214772903
Philadelphia Phillies

WS Probability = 0.16609089160194057
Miami Marlins

WS Probability = 0.1477029694169547
Los Angeles Angels

WS Probability = 0.1330522682708825
St. Louis Cardinals

WS Probability = 0.13257564702842192
Kansas City Royals

WS Probability = 0.12773582129400055
Cincinnati Reds

WS Probability = 0.1237210686872658
New York Mets

WS Probability = 0.1206441

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [35]:
# predict for 2016
predict_the_winner(model_25, 2016, team_data, X_train_25)

WS Probability = 0.8800316496063236
Houston Astros

WS Probability = 0.5855889634798465
Cleveland Indians

WS Probability = 0.46388257808880895
Boston Red Sox

WS Probability = 0.42180372824413215
Chicago Cubs

WS Probability = 0.400147825414875
Detroit Tigers

WS Probability = 0.36565510801336665
Los Angeles Angels

WS Probability = 0.26250359560383524
San Diego Padres

WS Probability = 0.22607998371134233
Colorado Rockies

WS Probability = 0.2203410505858895
Philadelphia Phillies

WS Probability = 0.20116915911275612
Atlanta Braves

WS Probability = 0.1748659739576072
Minnesota Twins

WS Probability = 0.17368566716836212
Baltimore Orioles

WS Probability = 0.17084577052921096
Tampa Bay Rays

WS Probability = 0.14867704941666723
Washington Nationals

WS Probability = 0.14652596824164096
Seattle Mariners

WS Probability = 0.14567633139303618
New York Yankees

WS Probability = 0.1413984071234174
Pittsburgh Pirates

WS Probability = 0.1375198515036931
St. Louis Cardinals

WS Probability 

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [36]:
# predict for 2019.
predict_the_winner(model_25, 2019, team_data, X_train_25)

WS Probability = 0.8934392592134591
Boston Red Sox

WS Probability = 0.4299158242955364
Los Angeles Angels

WS Probability = 0.2248668974001906
Kansas City Royals

WS Probability = 0.22443688956429875
Atlanta Braves

WS Probability = 0.2096197853517553
San Diego Padres

WS Probability = 0.19269271845833236
Houston Astros

WS Probability = 0.18163750218246563
Oakland Athletics

WS Probability = 0.17695267533612566
Arizona Diamondbacks

WS Probability = 0.1651740955613368
Chicago Cubs

WS Probability = 0.16456729315928073
St. Louis Cardinals

WS Probability = 0.15626188330465815
Pittsburgh Pirates

WS Probability = 0.14731215598634617
New York Mets

WS Probability = 0.14679783279159522
San Francisco Giants

WS Probability = 0.14313921981576072
Washington Nationals

WS Probability = 0.138657742319001
Milwaukee Brewers

WS Probability = 0.13193994262840308
Toronto Blue Jays

WS Probability = 0.12432133311409375
Philadelphia Phillies

WS Probability = 0.11245076757477078
Baltimore Orioles



  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


All three models are rock stars for 2018 and 2017.  I think we can agree that 2016 was a fluke.  I mean, it was the Cubs.  But all model_100 and model_50 both predicted the Indians--who the Cubs went up against.


In [39]:
# save model_100, model_50, model_25.
import pickle

with open("svc_grid_100.pkl", "wb") as f:
    pickle.dump(model_100, f)

with open("svc_grid_50.pkl", "wb") as f:
    pickle.dump(model_50, f)
    
with open("svc_grid_25.pkl", "wb") as f:
    pickle.dump(model_25, f)