## Purpose: Try different models-- Part3.
### Grid search with upsampling and scaling.

In [1]:
# import dependencies.
import pandas as pd
import numpy as np

from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC

#### STEP1: Read in dataset.  Remove data from 2016-2019.
- data from 2016-2018 will be used to bs test the model.
- data from 2019 will be used to predict the winners of the 2019 WS.

In [2]:
# read in the data.
team_data = pd.read_csv("../../Resources/clean_data_1905.csv")
del team_data["Unnamed: 0"]
team_data.head()

Unnamed: 0,team,year,A,DP,E,G2,GS2,INN,PB,PO,...,R1,SHO,SO1,SV,TBF,W,WHIP,WP,WPCT,winners
0,St. Louis Cardinals,2019,1033,114,43,104,936,8313.0,3,2771,...,456,4,895,33,3896,56,1.29,21,0.538,0
1,Arizona Diamondbacks,2019,1010,83,45,105,945,8538.0,2,2846,...,472,7,925,24,4001,53,1.28,35,0.505,0
2,Kansas City Royals,2019,990,105,45,106,954,8421.0,6,2807,...,543,5,816,24,4125,39,1.46,34,0.368,0
3,Houston Astros,2019,875,54,50,106,954,8589.0,6,2863,...,432,7,1074,27,3929,67,1.14,31,0.632,0
4,Tampa Bay Rays,2019,975,92,53,107,963,8760.0,11,2920,...,409,6,1037,26,3985,59,1.16,40,0.551,0


In [3]:
# remove data from 2016 through 2019.
team_data_new = team_data.loc[team_data["year"] < 2016]
team_data_new.head()

Unnamed: 0,team,year,A,DP,E,G2,GS2,INN,PB,PO,...,R1,SHO,SO1,SV,TBF,W,WHIP,WP,WPCT,winners
120,San Francisco Giants,2015,1639,136,72,162,1458,13143.0,6,4381,...,631,11,1309,43,6048,87,1.21,40,0.537,0
121,Washington Nationals,2015,1425,142,73,162,1458,13137.0,17,4379,...,612,12,1476,46,6036,95,1.19,47,0.586,0
122,Houston Astros,2015,1599,135,77,162,1458,13212.0,18,4404,...,701,8,1396,44,6180,84,1.29,98,0.519,0
123,Detroit Tigers,2015,1537,148,75,161,1449,12852.0,5,4284,...,721,8,1232,47,6048,86,1.32,44,0.534,0
124,Boston Red Sox,2015,1427,139,75,162,1458,12957.0,37,4319,...,694,5,1362,43,6073,93,1.27,52,0.574,0


In [4]:
target = team_data_new["winners"]
features = team_data_new.drop({"team", "year", "winners"}, axis=1)
feature_columns = list(features.columns)
print (target.shape)
print (features.shape)
print (feature_columns)

(2344,)
(2344, 52)
['A', 'DP', 'E', 'G2', 'GS2', 'INN', 'PB', 'PO', 'TC', '2B', '3B', 'AB', 'AO', 'BB', 'G', 'H', 'HBP', 'HR', 'NP_x', 'OBP', 'OPS_x', 'PA', 'R', 'RBI', 'SAC', 'SB', 'SLG', 'TB', 'XBH', 'BB1', 'BK', 'CG', 'ER', 'ERA', 'G1', 'GF', 'GS', 'H1', 'HB', 'HR1', 'IP', 'L', 'OBP1', 'R1', 'SHO', 'SO1', 'SV', 'TBF', 'W', 'WHIP', 'WP', 'WPCT']


#### STEP2: Upsample and scale data.

In [5]:
# reset the index.
team_data_new = team_data_new.reset_index().drop({"index"}, axis=1)
team_data_new.head()

Unnamed: 0,team,year,A,DP,E,G2,GS2,INN,PB,PO,...,R1,SHO,SO1,SV,TBF,W,WHIP,WP,WPCT,winners
0,San Francisco Giants,2015,1639,136,72,162,1458,13143.0,6,4381,...,631,11,1309,43,6048,87,1.21,40,0.537,0
1,Washington Nationals,2015,1425,142,73,162,1458,13137.0,17,4379,...,612,12,1476,46,6036,95,1.19,47,0.586,0
2,Houston Astros,2015,1599,135,77,162,1458,13212.0,18,4404,...,701,8,1396,44,6180,84,1.29,98,0.519,0
3,Detroit Tigers,2015,1537,148,75,161,1449,12852.0,5,4284,...,721,8,1232,47,6048,86,1.32,44,0.534,0
4,Boston Red Sox,2015,1427,139,75,162,1458,12957.0,37,4319,...,694,5,1362,43,6073,93,1.27,52,0.574,0


In [6]:
# remove team and year.
feature_columns_new = feature_columns + ["winners"]
team_data_new = team_data[feature_columns_new]
team_data_new.head()

Unnamed: 0,A,DP,E,G2,GS2,INN,PB,PO,TC,2B,...,R1,SHO,SO1,SV,TBF,W,WHIP,WP,WPCT,winners
0,1033,114,43,104,936,8313.0,3,2771,3847,157,...,456,4,895,33,3896,56,1.29,21,0.538,0
1,1010,83,45,105,945,8538.0,2,2846,3901,203,...,472,7,925,24,4001,53,1.28,35,0.505,0
2,990,105,45,106,954,8421.0,6,2807,3842,185,...,543,5,816,24,4125,39,1.46,34,0.368,0
3,875,54,50,106,954,8589.0,6,2863,3788,200,...,432,7,1074,27,3929,67,1.14,31,0.632,0
4,975,92,53,107,963,8760.0,11,2920,3948,195,...,409,6,1037,26,3985,59,1.16,40,0.551,0


In [7]:
# upsample for a more balanced dataset.
def upsample(dataset, no_samples):
    '''
    INPUT: 
    -dataset = dataset without team names and year.
    -n_samples = number of minority_unsampled.
    
    OUTPUT:
    -X_train_scaled = scaled X train data.
    -X_test_scaled = scaled X test data.
    -y_train = y train data
    -y_test = y test data
    
    DESCRIPTION:
    -dataset is taken in and split into minority and majority classes.
    -dataset is then upsampled for the mainority class
    -split the data into features and targets
    -split data into train and test sets
    -train and test sets were are scaled.
    '''
    
    # separate majority and minority classes.
    df_majority = dataset.loc[dataset["winners"] == 0]
    df_minority = dataset.loc[dataset["winners"] == 1]

    # upsample minority class.
    df_minority_unsampled = resample(df_minority,
                                    replace=True,
                                    n_samples=no_samples,
                                    random_state=123)

    # combine majority class with upsampled minority class.
    df_upsampled = pd.concat([df_majority, df_minority_unsampled])

    # separate features and target.
    y = df_upsampled["winners"]
    X = df_upsampled[feature_columns]
    
    # split into train and test sets.
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    # scale X_train and X_test.
    scaler = StandardScaler()

    # transform the training and testing data.
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.fit_transform(X_test)
    
    return X_train_scaled, X_test_scaled, y_train, y_test

In [8]:
# Do three different upsamplings.
X_train_100, X_test_100, y_train_100, y_test_100 = upsample(team_data_new, 2234)
X_train_50, X_test_50, y_train_50, y_test_50 = upsample(team_data_new, 1117)
X_train_25, X_test_25, y_train_25, y_test_25 = upsample(team_data_new, 559)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


#### STEP3: Grid Search Model--Logistic Regression.

In [9]:
def grid_search_logistic(X_train, X_test, y_train, y_test):
    '''
    INPUT: 
    -X_train = scaled X train data.
    -X_test = scaled X test data.
    -y_train = y train data.
    -y_test = y test data.
    
    OUTPUT:
    -classification report (has F1 score, precision and recall).
    -grid = saved model for prediction. 
    
    DESCRIPTION:
    -the scaled and split data is put through a grid search with logistic.
    -the model is trained.
    -a prediction is made.
    -print out the classification report and give the model.
    '''
    
    # fit the model.
    model = LogisticRegression(max_iter= 2000)
    
    # create gridsearch estimator.
    param_grid = {"C": [0.001, 0.01, 0.1, 1, 10, 100],
                 "solver": ["newton-cg", "lbfgs", "liblinear", "sag", "saga"]}
    grid = GridSearchCV(model, param_grid, verbose=3, cv=5)

    # fit the model.
    grid.fit(X_train, y_train)

    # predict.
    prediction = grid.predict(X_test)
    
    # print out the basic information about the grid search.
    print (grid.best_params_)
    print (grid.best_score_)
    print (grid.best_estimator_)
    
    grid = grid.best_estimator_
    predictions = grid.predict(X_test)
    print (classification_report(y_test, prediction, target_names=["0", "1"]))
    
    return grid

In [10]:
# for 1 part 0 to 1 part 1
model_100 = grid_search_logistic(X_train_100, X_test_100, y_train_100, y_test_100)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV] C=0.001, solver=newton-cg .......................................
[CV]  C=0.001, solver=newton-cg, score=0.7151162790697675, total=   0.0s
[CV] C=0.001, solver=newton-cg .......................................
[CV]  C=0.001, solver=newton-cg, score=0.7311046511627907, total=   0.0s
[CV] C=0.001, solver=newton-cg .......................................
[CV]  C=0.001, solver=newton-cg, score=0.7456395348837209, total=   0.0s
[CV] C=0.001, solver=newton-cg .......................................
[CV]  C=0.001, solver=newton-cg, score=0.7122093023255814, total=   0.0s
[CV] C=0.001, solver=newton-cg .......................................
[CV]  C=0.001, solver=newton-cg, score=0.7317784256559767, total=   0.0s
[CV] C=0.001, solver=lbfgs ...........................................
[CV] .. C=0.001, solver=lbfgs, score=0.7151162790697675, total=   0.0s
[CV] C=0.001, solver=lbfgs ...........................................
[CV] 

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s


[CV]  C=0.001, solver=liblinear, score=0.748546511627907, total=   0.0s
[CV] C=0.001, solver=liblinear .......................................
[CV]  C=0.001, solver=liblinear, score=0.7107558139534884, total=   0.0s
[CV] C=0.001, solver=liblinear .......................................
[CV]  C=0.001, solver=liblinear, score=0.7259475218658892, total=   0.0s
[CV] C=0.001, solver=sag .............................................
[CV] .... C=0.001, solver=sag, score=0.7151162790697675, total=   0.0s
[CV] C=0.001, solver=sag .............................................
[CV] .... C=0.001, solver=sag, score=0.7311046511627907, total=   0.0s
[CV] C=0.001, solver=sag .............................................
[CV] .... C=0.001, solver=sag, score=0.7456395348837209, total=   0.0s
[CV] C=0.001, solver=sag .............................................
[CV] .... C=0.001, solver=sag, score=0.7122093023255814, total=   0.0s
[CV] C=0.001, solver=sag .............................................
[

[CV] ..... C=0.1, solver=saga, score=0.7252906976744186, total=   0.2s
[CV] C=0.1, solver=saga ..............................................
[CV] ..... C=0.1, solver=saga, score=0.7383720930232558, total=   0.2s
[CV] C=0.1, solver=saga ..............................................
[CV] ..... C=0.1, solver=saga, score=0.7543604651162791, total=   0.2s
[CV] C=0.1, solver=saga ..............................................
[CV] ..... C=0.1, solver=saga, score=0.7238372093023255, total=   0.2s
[CV] C=0.1, solver=saga ..............................................
[CV] ..... C=0.1, solver=saga, score=0.7405247813411079, total=   0.2s
[CV] C=1, solver=newton-cg ...........................................
[CV] .. C=1, solver=newton-cg, score=0.7572674418604651, total=   0.1s
[CV] C=1, solver=newton-cg ...........................................
[CV] ................ C=1, solver=newton-cg, score=0.75, total=   0.1s
[CV] C=1, solver=newton-cg ...........................................
[CV] .



[CV] ...... C=10, solver=saga, score=0.7674418604651163, total=   3.1s
[CV] C=10, solver=saga ...............................................




[CV] ...... C=10, solver=saga, score=0.7223837209302325, total=   3.2s
[CV] C=10, solver=saga ...............................................




[CV] ...... C=10, solver=saga, score=0.7718023255813954, total=   3.1s
[CV] C=10, solver=saga ...............................................




[CV] ...... C=10, solver=saga, score=0.7688953488372093, total=   3.1s
[CV] C=10, solver=saga ...............................................




[CV] ...... C=10, solver=saga, score=0.7594752186588921, total=   3.2s
[CV] C=100, solver=newton-cg .........................................
[CV]  C=100, solver=newton-cg, score=0.7645348837209303, total=   0.1s
[CV] C=100, solver=newton-cg .........................................
[CV]  C=100, solver=newton-cg, score=0.7383720930232558, total=   0.2s
[CV] C=100, solver=newton-cg .........................................
[CV]  C=100, solver=newton-cg, score=0.7776162790697675, total=   0.1s
[CV] C=100, solver=newton-cg .........................................
[CV]  C=100, solver=newton-cg, score=0.7659883720930233, total=   0.1s
[CV] C=100, solver=newton-cg .........................................
[CV]  C=100, solver=newton-cg, score=0.7696793002915452, total=   0.1s
[CV] C=100, solver=lbfgs .............................................
[CV] .... C=100, solver=lbfgs, score=0.7659883720930233, total=   0.4s
[CV] C=100, solver=lbfgs .............................................
[CV] .



[CV] ...... C=100, solver=sag, score=0.7674418604651163, total=   3.0s
[CV] C=100, solver=sag ...............................................




[CV] ...... C=100, solver=sag, score=0.7340116279069767, total=   3.0s
[CV] C=100, solver=sag ...............................................




[CV] ...... C=100, solver=sag, score=0.7790697674418605, total=   3.0s
[CV] C=100, solver=sag ...............................................




[CV] ...... C=100, solver=sag, score=0.7645348837209303, total=   3.0s
[CV] C=100, solver=sag ...............................................




[CV] ....... C=100, solver=sag, score=0.760932944606414, total=   3.0s
[CV] C=100, solver=saga ..............................................




[CV] ..... C=100, solver=saga, score=0.7587209302325582, total=   3.4s
[CV] C=100, solver=saga ..............................................




[CV] ..... C=100, solver=saga, score=0.7267441860465116, total=   4.2s
[CV] C=100, solver=saga ..............................................




[CV] ..... C=100, solver=saga, score=0.7761627906976745, total=   3.2s
[CV] C=100, solver=saga ..............................................




[CV] ..... C=100, solver=saga, score=0.7718023255813954, total=   3.1s
[CV] C=100, solver=saga ..............................................


[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:  1.3min finished


[CV] ..... C=100, solver=saga, score=0.7623906705539358, total=   3.1s
{'C': 100, 'solver': 'liblinear'}
0.7638161721931356
LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=2000, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)
              precision    recall  f1-score   support

           0       0.77      0.72      0.75       607
           1       0.71      0.76      0.73       540

   micro avg       0.74      0.74      0.74      1147
   macro avg       0.74      0.74      0.74      1147
weighted avg       0.74      0.74      0.74      1147



In [11]:
# for 1 part 0 to 0.5 part 1
model_50 = grid_search_logistic(X_train_50, X_test_50, y_train_50, y_test_50)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV] C=0.001, solver=newton-cg .......................................
[CV]  C=0.001, solver=newton-cg, score=0.727447216890595, total=   0.0s
[CV] C=0.001, solver=newton-cg .......................................
[CV]  C=0.001, solver=newton-cg, score=0.7423076923076923, total=   0.0s
[CV] C=0.001, solver=newton-cg .......................................
[CV]  C=0.001, solver=newton-cg, score=0.7442307692307693, total=   0.0s
[CV] C=0.001, solver=newton-cg .......................................
[CV]  C=0.001, solver=newton-cg, score=0.7096153846153846, total=   0.0s
[CV] C=0.001, solver=newton-cg .......................................
[CV] ........... C=0.001, solver=newton-cg, score=0.725, total=   0.0s
[CV] C=0.001, solver=lbfgs ...........................................
[CV] ... C=0.001, solver=lbfgs, score=0.727447216890595, total=   0.0s
[CV] C=0.001, solver=lbfgs ...........................................
[CV] .. 

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s


[CV]  C=0.001, solver=liblinear, score=0.7153846153846154, total=   0.0s
[CV] C=0.001, solver=liblinear .......................................
[CV]  C=0.001, solver=liblinear, score=0.7153846153846154, total=   0.0s
[CV] C=0.001, solver=sag .............................................
[CV] ..... C=0.001, solver=sag, score=0.727447216890595, total=   0.0s
[CV] C=0.001, solver=sag .............................................
[CV] .... C=0.001, solver=sag, score=0.7423076923076923, total=   0.0s
[CV] C=0.001, solver=sag .............................................
[CV] .... C=0.001, solver=sag, score=0.7442307692307693, total=   0.0s
[CV] C=0.001, solver=sag .............................................
[CV] .... C=0.001, solver=sag, score=0.7096153846153846, total=   0.0s
[CV] C=0.001, solver=sag .............................................
[CV] ................. C=0.001, solver=sag, score=0.725, total=   0.0s
[CV] C=0.001, solver=saga ............................................
[C

[CV] ..... C=0.1, solver=saga, score=0.7653846153846153, total=   0.1s
[CV] C=0.1, solver=saga ..............................................
[CV] ..... C=0.1, solver=saga, score=0.7519230769230769, total=   0.1s
[CV] C=0.1, solver=saga ..............................................
[CV] ..... C=0.1, solver=saga, score=0.7423076923076923, total=   0.1s
[CV] C=1, solver=newton-cg ...........................................
[CV] .. C=1, solver=newton-cg, score=0.7485604606525912, total=   0.0s
[CV] C=1, solver=newton-cg ...........................................
[CV] .. C=1, solver=newton-cg, score=0.7788461538461539, total=   0.0s
[CV] C=1, solver=newton-cg ...........................................
[CV] .. C=1, solver=newton-cg, score=0.7673076923076924, total=   0.0s
[CV] C=1, solver=newton-cg ...........................................
[CV] .. C=1, solver=newton-cg, score=0.7576923076923077, total=   0.0s
[CV] C=1, solver=newton-cg ...........................................
[CV] .



[CV] ...... C=10, solver=saga, score=0.7581573896353166, total=   2.3s
[CV] C=10, solver=saga ...............................................




[CV] ...... C=10, solver=saga, score=0.7634615384615384, total=   2.3s
[CV] C=10, solver=saga ...............................................




[CV] ...... C=10, solver=saga, score=0.7596153846153846, total=   2.3s
[CV] C=10, solver=saga ...............................................




[CV] ...... C=10, solver=saga, score=0.7576923076923077, total=   2.3s
[CV] C=10, solver=saga ...............................................




[CV] ................... C=10, solver=saga, score=0.725, total=   2.3s
[CV] C=100, solver=newton-cg .........................................
[CV]  C=100, solver=newton-cg, score=0.7485604606525912, total=   0.1s
[CV] C=100, solver=newton-cg .........................................
[CV]  C=100, solver=newton-cg, score=0.7673076923076924, total=   0.1s
[CV] C=100, solver=newton-cg .........................................
[CV]  C=100, solver=newton-cg, score=0.7557692307692307, total=   0.1s
[CV] C=100, solver=newton-cg .........................................
[CV]  C=100, solver=newton-cg, score=0.7653846153846153, total=   0.1s
[CV] C=100, solver=newton-cg .........................................
[CV] ............. C=100, solver=newton-cg, score=0.725, total=   0.1s
[CV] C=100, solver=lbfgs .............................................
[CV] .... C=100, solver=lbfgs, score=0.7485604606525912, total=   0.3s
[CV] C=100, solver=lbfgs .............................................
[CV] .



[CV] ...... C=100, solver=sag, score=0.7523992322456814, total=   2.2s
[CV] C=100, solver=sag ...............................................




[CV] ...... C=100, solver=sag, score=0.7711538461538462, total=   2.2s
[CV] C=100, solver=sag ...............................................




[CV] ...... C=100, solver=sag, score=0.7538461538461538, total=   2.3s
[CV] C=100, solver=sag ...............................................




[CV] ...... C=100, solver=sag, score=0.7653846153846153, total=   2.2s
[CV] C=100, solver=sag ...............................................




[CV] ...... C=100, solver=sag, score=0.7269230769230769, total=   2.2s
[CV] C=100, solver=saga ..............................................




[CV] ..... C=100, solver=saga, score=0.7485604606525912, total=   2.3s
[CV] C=100, solver=saga ..............................................




[CV] ..... C=100, solver=saga, score=0.7634615384615384, total=   2.3s
[CV] C=100, solver=saga ..............................................




[CV] ..... C=100, solver=saga, score=0.7538461538461538, total=   2.3s
[CV] C=100, solver=saga ..............................................




[CV] ..... C=100, solver=saga, score=0.7596153846153846, total=   2.3s
[CV] C=100, solver=saga ..............................................
[CV] ..... C=100, solver=saga, score=0.7269230769230769, total=   2.3s
{'C': 1, 'solver': 'newton-cg'}
0.7581699346405228
LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=2000, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='newton-cg',
          tol=0.0001, verbose=0, warm_start=False)
              precision    recall  f1-score   support

           0       0.82      0.89      0.85       606
           1       0.68      0.55      0.61       261

   micro avg       0.79      0.79      0.79       867
   macro avg       0.75      0.72      0.73       867
weighted avg       0.78      0.79      0.78       867



[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:   55.8s finished


In [12]:
# for 1 part 0 to 0.25 part 1
model_25 = grid_search_logistic(X_train_25, X_test_25, y_train_25, y_test_25)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV] C=0.001, solver=newton-cg .......................................
[CV]  C=0.001, solver=newton-cg, score=0.8100686498855835, total=   0.0s
[CV] C=0.001, solver=newton-cg .......................................
[CV]  C=0.001, solver=newton-cg, score=0.8146453089244852, total=   0.0s
[CV] C=0.001, solver=newton-cg .......................................
[CV]  C=0.001, solver=newton-cg, score=0.8119266055045872, total=   0.0s
[CV] C=0.001, solver=newton-cg .......................................
[CV]  C=0.001, solver=newton-cg, score=0.8119266055045872, total=   0.0s
[CV] C=0.001, solver=newton-cg .......................................
[CV]  C=0.001, solver=newton-cg, score=0.8119266055045872, total=   0.0s
[CV] C=0.001, solver=lbfgs ...........................................
[CV] .. C=0.001, solver=lbfgs, score=0.8100686498855835, total=   0.0s
[CV] C=0.001, solver=lbfgs ...........................................
[CV] 

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s


[CV] .... C=0.001, solver=sag, score=0.8146453089244852, total=   0.0s
[CV] C=0.001, solver=sag .............................................
[CV] .... C=0.001, solver=sag, score=0.8119266055045872, total=   0.0s
[CV] C=0.001, solver=sag .............................................
[CV] .... C=0.001, solver=sag, score=0.8119266055045872, total=   0.0s
[CV] C=0.001, solver=sag .............................................
[CV] .... C=0.001, solver=sag, score=0.8119266055045872, total=   0.0s
[CV] C=0.001, solver=saga ............................................
[CV] ... C=0.001, solver=saga, score=0.8100686498855835, total=   0.0s
[CV] C=0.001, solver=saga ............................................
[CV] ... C=0.001, solver=saga, score=0.8146453089244852, total=   0.0s
[CV] C=0.001, solver=saga ............................................
[CV] ... C=0.001, solver=saga, score=0.8119266055045872, total=   0.0s
[CV] C=0.001, solver=saga ............................................
[CV] .

[CV] ..... C=0.1, solver=saga, score=0.8211009174311926, total=   0.1s
[CV] C=1, solver=newton-cg ...........................................
[CV] .. C=1, solver=newton-cg, score=0.8123569794050344, total=   0.0s
[CV] C=1, solver=newton-cg ...........................................
[CV] .. C=1, solver=newton-cg, score=0.8398169336384439, total=   0.0s
[CV] C=1, solver=newton-cg ...........................................
[CV] ... C=1, solver=newton-cg, score=0.841743119266055, total=   0.0s
[CV] C=1, solver=newton-cg ...........................................
[CV] .. C=1, solver=newton-cg, score=0.8279816513761468, total=   0.0s
[CV] C=1, solver=newton-cg ...........................................
[CV] .. C=1, solver=newton-cg, score=0.8211009174311926, total=   0.0s
[CV] C=1, solver=lbfgs ...............................................
[CV] ...... C=1, solver=lbfgs, score=0.8123569794050344, total=   0.0s
[CV] C=1, solver=lbfgs ...............................................
[CV] .



[CV] ...... C=10, solver=saga, score=0.8123569794050344, total=   1.9s
[CV] C=10, solver=saga ...............................................




[CV] ...... C=10, solver=saga, score=0.8352402745995423, total=   1.9s
[CV] C=10, solver=saga ...............................................




[CV] ....... C=10, solver=saga, score=0.841743119266055, total=   1.9s
[CV] C=10, solver=saga ...............................................




[CV] ...... C=10, solver=saga, score=0.8302752293577982, total=   2.0s
[CV] C=10, solver=saga ...............................................




[CV] ...... C=10, solver=saga, score=0.8256880733944955, total=   2.1s
[CV] C=100, solver=newton-cg .........................................
[CV]  C=100, solver=newton-cg, score=0.8100686498855835, total=   0.1s
[CV] C=100, solver=newton-cg .........................................
[CV]  C=100, solver=newton-cg, score=0.8306636155606407, total=   0.1s
[CV] C=100, solver=newton-cg .........................................
[CV]  C=100, solver=newton-cg, score=0.8371559633027523, total=   0.1s
[CV] C=100, solver=newton-cg .........................................
[CV]  C=100, solver=newton-cg, score=0.8325688073394495, total=   0.1s
[CV] C=100, solver=newton-cg .........................................
[CV]  C=100, solver=newton-cg, score=0.8279816513761468, total=   0.1s
[CV] C=100, solver=lbfgs .............................................
[CV] .... C=100, solver=lbfgs, score=0.8100686498855835, total=   0.3s
[CV] C=100, solver=lbfgs .............................................
[CV] .



[CV] ...... C=100, solver=sag, score=0.8077803203661327, total=   2.1s
[CV] C=100, solver=sag ...............................................




[CV] ...... C=100, solver=sag, score=0.8329519450800915, total=   2.0s
[CV] C=100, solver=sag ...............................................




[CV] ...... C=100, solver=sag, score=0.8394495412844036, total=   1.9s
[CV] C=100, solver=sag ...............................................




[CV] ...... C=100, solver=sag, score=0.8348623853211009, total=   1.9s
[CV] C=100, solver=sag ...............................................




[CV] ...... C=100, solver=sag, score=0.8302752293577982, total=   1.8s
[CV] C=100, solver=saga ..............................................




[CV] ..... C=100, solver=saga, score=0.8123569794050344, total=   1.9s
[CV] C=100, solver=saga ..............................................




[CV] ..... C=100, solver=saga, score=0.8329519450800915, total=   1.9s
[CV] C=100, solver=saga ..............................................




[CV] ..... C=100, solver=saga, score=0.8394495412844036, total=   1.9s
[CV] C=100, solver=saga ..............................................




[CV] ..... C=100, solver=saga, score=0.8302752293577982, total=   1.9s
[CV] C=100, solver=saga ..............................................


[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:   46.7s finished


[CV] ..... C=100, solver=saga, score=0.8279816513761468, total=   1.9s
{'C': 10, 'solver': 'sag'}
0.8290559120073328
LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=2000, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='sag',
          tol=0.0001, verbose=0, warm_start=False)
              precision    recall  f1-score   support

           0       0.84      0.95      0.89       579
           1       0.60      0.27      0.37       149

   micro avg       0.81      0.81      0.81       728
   macro avg       0.72      0.61      0.63       728
weighted avg       0.79      0.81      0.78       728



This is not significantly better than the straight logistic regression.

In [13]:
def predict_the_winner(model, year, team_data, X_train):
    '''
    INPUT: 
    -X_train = scaled X train data.
    -model = the saved model.
    -team_data = complete dataframe with all data.
    -year = the year want to look at.
    
    OUTPUT:
    -printed prediction.
    
    DESCRIPTION:
    -data from year of interest is isolated.
    -the data are scaled.
    -the prediction is made.
    -print out the resulting probability and the name of the team.
    '''
    
    # grab the data.
    team_data = team_data.loc[team_data["year"] == year].reset_index()

    # set features (no team, year, winners).
    # set target (winners).
    features = team_data[feature_columns]
    
    # scale X_train and X_test.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    features = scaler.fit_transform(features)
    
    # fit the model.
    probabilities = model.predict_proba(features)

    # convert predictions to datafram.e
    WS_predictions = pd.DataFrame(probabilities[:,1])

    # Sort the DataFrame (descending)
    WS_predictions = WS_predictions.sort_values(0, ascending=False)

    WS_predictions['Probability'] = WS_predictions[0]

    # Print 50 highest probability HoF inductees from still eligible players
    for i, row in WS_predictions.head(50).iterrows():
       prob = ' '.join(('WS Probability =', str(row['Probability'])))
       print(prob)
       print(team_data.iloc[i,1:27]["team"])
       print('')

Prediction for 2018

In [14]:
print ("model_100----------------------------------------")
predict_the_winner(model_100, 2018, team_data, X_train_100)

model_100----------------------------------------
WS Probability = 0.9999999507832171
Detroit Tigers

WS Probability = 0.9999995467046062
Cincinnati Reds

WS Probability = 0.9999849646049372
Washington Nationals

WS Probability = 0.9997809036939881
St. Louis Cardinals

WS Probability = 0.9995403353892793
Los Angeles Angels

WS Probability = 0.9984988544849107
Chicago White Sox

WS Probability = 0.9949636755278962
Houston Astros

WS Probability = 0.9918219395658201
Los Angeles Dodgers

WS Probability = 0.9872645944073091
Minnesota Twins

WS Probability = 0.9668929376606923
Atlanta Braves

WS Probability = 0.9259132305095067
New York Yankees

WS Probability = 0.9213012179329771
Miami Marlins

WS Probability = 0.9108287526384264
Kansas City Royals

WS Probability = 0.8833878057060507
Arizona Diamondbacks

WS Probability = 0.870908134526758
Baltimore Orioles

WS Probability = 0.6128919875394272
Toronto Blue Jays

WS Probability = 0.3308201108001493
Oakland Athletics

WS Probability = 0.148

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [15]:
print ("model_50----------------------------------------")
predict_the_winner(model_50, 2018, team_data, X_train_50)

model_50----------------------------------------
WS Probability = 0.9873425039755489
Los Angeles Dodgers

WS Probability = 0.9861004835313941
Cleveland Indians

WS Probability = 0.98251149010313
Houston Astros

WS Probability = 0.956312121432039
Washington Nationals

WS Probability = 0.9518531682432401
Minnesota Twins

WS Probability = 0.9410812386375842
St. Louis Cardinals

WS Probability = 0.9195503403977436
Cincinnati Reds

WS Probability = 0.8003720111678071
Oakland Athletics

WS Probability = 0.793528952833421
Arizona Diamondbacks

WS Probability = 0.7468851288962158
Tampa Bay Rays

WS Probability = 0.6562118203440391
Boston Red Sox

WS Probability = 0.5604944547452536
New York Yankees

WS Probability = 0.5328698499518533
Atlanta Braves

WS Probability = 0.5000547117732359
Los Angeles Angels

WS Probability = 0.20222927547548325
Miami Marlins

WS Probability = 0.1507930820220956
New York Mets

WS Probability = 0.0778761568158136
Texas Rangers

WS Probability = 0.06554868701431861


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [16]:
print ("model_25----------------------------------------")
predict_the_winner(model_25, 2018, team_data, X_train_25)

model_25----------------------------------------
WS Probability = 0.9876767281504318
Cincinnati Reds

WS Probability = 0.9085251512209893
Houston Astros

WS Probability = 0.864875289717516
Minnesota Twins

WS Probability = 0.8138905764966672
St. Louis Cardinals

WS Probability = 0.7506103267056896
Washington Nationals

WS Probability = 0.733682737099985
Arizona Diamondbacks

WS Probability = 0.7152819913798588
Tampa Bay Rays

WS Probability = 0.6910700294911415
Los Angeles Dodgers

WS Probability = 0.6156636444549811
Miami Marlins

WS Probability = 0.5649932751731225
Cleveland Indians

WS Probability = 0.5276434123265982
Oakland Athletics

WS Probability = 0.5216158086797604
New York Yankees

WS Probability = 0.4924571875768348
Atlanta Braves

WS Probability = 0.2849147194209658
Detroit Tigers

WS Probability = 0.2621025934423404
Chicago Cubs

WS Probability = 0.24101087499141857
Los Angeles Angels

WS Probability = 0.07396391754580155
San Diego Padres

WS Probability = 0.0648610993766

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


Precition for 2017

In [17]:
# predict for 2017.
print ("model_100----------------------------------------")
predict_the_winner(model_100, 2017, team_data, X_train_100)

model_100----------------------------------------
WS Probability = 0.99999999882261
Colorado Rockies

WS Probability = 0.9999998928164447
Detroit Tigers

WS Probability = 0.9999998035062123
Milwaukee Brewers

WS Probability = 0.9999983643361261
Los Angeles Angels

WS Probability = 0.999991743049209
Los Angeles Dodgers

WS Probability = 0.9999746648151113
Texas Rangers

WS Probability = 0.9998265050397431
Washington Nationals

WS Probability = 0.9992274306742231
Kansas City Royals

WS Probability = 0.9991172424801942
Toronto Blue Jays

WS Probability = 0.9950263661475003
Houston Astros

WS Probability = 0.985268210190956
Tampa Bay Rays

WS Probability = 0.9783247010315036
Cincinnati Reds

WS Probability = 0.9601963653017139
Chicago Cubs

WS Probability = 0.9127057752193434
Baltimore Orioles

WS Probability = 0.8388795548874193
Philadelphia Phillies

WS Probability = 0.8283102858272211
Minnesota Twins

WS Probability = 0.7186036830742364
Cleveland Indians

WS Probability = 0.471945183658

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [18]:
print ("model_50----------------------------------------")
predict_the_winner(model_50, 2017, team_data, X_train_50)

model_50----------------------------------------
WS Probability = 0.9999458756548997
Houston Astros

WS Probability = 0.9989956506131888
Cleveland Indians

WS Probability = 0.9805611254024863
Boston Red Sox

WS Probability = 0.9599856218698107
Atlanta Braves

WS Probability = 0.9499002312201494
New York Yankees

WS Probability = 0.924390263390398
Washington Nationals

WS Probability = 0.9167404244430902
Los Angeles Dodgers

WS Probability = 0.890906760960234
Los Angeles Angels

WS Probability = 0.8526350794733162
Oakland Athletics

WS Probability = 0.738125696779178
Tampa Bay Rays

WS Probability = 0.6398622562267948
Seattle Mariners

WS Probability = 0.4640262703499726
Colorado Rockies

WS Probability = 0.37688014296347727
Milwaukee Brewers

WS Probability = 0.3109300033432931
New York Mets

WS Probability = 0.21837653319202155
Minnesota Twins

WS Probability = 0.2128897073820588
Arizona Diamondbacks

WS Probability = 0.16520717717716485
Chicago Cubs

WS Probability = 0.15202655430923

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [19]:
print ("model_25----------------------------------------")
predict_the_winner(model_25, 2017, team_data, X_train_25)

model_25----------------------------------------
WS Probability = 0.9998796941538302
Houston Astros

WS Probability = 0.9671919230640552
New York Yankees

WS Probability = 0.9608950640542029
Cleveland Indians

WS Probability = 0.9583152505871078
Los Angeles Dodgers

WS Probability = 0.8251318303251219
Oakland Athletics

WS Probability = 0.8206102841601355
Boston Red Sox

WS Probability = 0.803729220607319
Washington Nationals

WS Probability = 0.7936490020549267
Tampa Bay Rays

WS Probability = 0.6644508790365156
Atlanta Braves

WS Probability = 0.6551990301786715
Arizona Diamondbacks

WS Probability = 0.5593731866551522
Milwaukee Brewers

WS Probability = 0.5525032372438307
Los Angeles Angels

WS Probability = 0.387358463097562
Colorado Rockies

WS Probability = 0.35020975280533856
Chicago Cubs

WS Probability = 0.31839411059667944
New York Mets

WS Probability = 0.12296688191915638
Seattle Mariners

WS Probability = 0.079341804595618
Philadelphia Phillies

WS Probability = 0.05574799

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


Prediction for 2016

In [20]:
# predict for 2016.
print ("model_100----------------------------------------")
predict_the_winner(model_100, 2016, team_data, X_train_100)

model_100----------------------------------------
WS Probability = 0.9999999998584037
Cleveland Indians

WS Probability = 0.9999999969415321
Detroit Tigers

WS Probability = 0.9999996089117351
Minnesota Twins

WS Probability = 0.9999675730777194
Cincinnati Reds

WS Probability = 0.9998210695073947
Oakland Athletics

WS Probability = 0.9993820518236723
Chicago White Sox

WS Probability = 0.9993759841606927
New York Yankees

WS Probability = 0.9992579643904091
Los Angeles Dodgers

WS Probability = 0.9960542762319331
Arizona Diamondbacks

WS Probability = 0.9954804290878182
New York Mets

WS Probability = 0.9873290631075351
Houston Astros

WS Probability = 0.9768067811130619
Texas Rangers

WS Probability = 0.7596006521488354
Chicago Cubs

WS Probability = 0.7340899087859907
Colorado Rockies

WS Probability = 0.7141126029881957
Washington Nationals

WS Probability = 0.6095752681612683
Seattle Mariners

WS Probability = 0.4746101831045438
Los Angeles Angels

WS Probability = 0.1443751030960

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [21]:
print ("model_50----------------------------------------")
predict_the_winner(model_50, 2016, team_data, X_train_50)

model_50----------------------------------------
WS Probability = 0.9999929154747303
Cleveland Indians

WS Probability = 0.9945561617822243
Houston Astros

WS Probability = 0.98847307805359
New York Yankees

WS Probability = 0.987906484611737
Los Angeles Dodgers

WS Probability = 0.9715602711150808
Washington Nationals

WS Probability = 0.8713293185014833
Minnesota Twins

WS Probability = 0.8409713538851562
Boston Red Sox

WS Probability = 0.744085461364684
Arizona Diamondbacks

WS Probability = 0.527138089823609
Kansas City Royals

WS Probability = 0.5046890875920984
Los Angeles Angels

WS Probability = 0.4857919923818349
Chicago Cubs

WS Probability = 0.48210200840012135
St. Louis Cardinals

WS Probability = 0.22008811250756527
Colorado Rockies

WS Probability = 0.20422090655835579
Seattle Mariners

WS Probability = 0.16401053287129386
Tampa Bay Rays

WS Probability = 0.1508596413172163
Detroit Tigers

WS Probability = 0.0763017882871379
New York Mets

WS Probability = 0.048615447981

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [22]:
print ("model_25----------------------------------------")
predict_the_winner(model_25, 2016, team_data, X_train_25)

model_25----------------------------------------
WS Probability = 0.999979568965724
Cleveland Indians

WS Probability = 0.9952757645040042
Los Angeles Dodgers

WS Probability = 0.9934884015233895
New York Yankees

WS Probability = 0.9421885096356508
Boston Red Sox

WS Probability = 0.921166544205374
Chicago Cubs

WS Probability = 0.9124871693422674
Houston Astros

WS Probability = 0.8947348944871364
Washington Nationals

WS Probability = 0.8070546001677497
Arizona Diamondbacks

WS Probability = 0.7975560477318313
Minnesota Twins

WS Probability = 0.6416985293669119
St. Louis Cardinals

WS Probability = 0.4193102673625983
Los Angeles Angels

WS Probability = 0.22944240207341215
Colorado Rockies

WS Probability = 0.1596342596334687
Tampa Bay Rays

WS Probability = 0.031724018543375525
Pittsburgh Pirates

WS Probability = 0.02655154592891983
Seattle Mariners

WS Probability = 0.026278666166044923
Toronto Blue Jays

WS Probability = 0.022589254663456806
Miami Marlins

WS Probability = 0.01

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


#### STEP4: Grid Search Model--SVC.

In [62]:
def grid_search_svc(X_train, X_test, y_train, y_test):
    '''
    INPUT: 
    -X_train = scaled X train data.
    -X_test = scaled X test data.
    -y_train = y train data.
    -y_test = y test data.
    
    OUTPUT:
    -classification report (has F1 score, precision and recall).
    -grid = saved model for prediction. 
    
    DESCRIPTION:
    -the scaled and split data is put through a grid search with svc.
    -the model is trained.
    -a prediction is made.
    -print out the classification report and give the model.
    '''
    
    # set up svc model.
    model = SVC(probability=True)

    # create gridsearch estimator.
    param_grid = {"C": [0.0001, 0.001, 0.01, 0.1, 1],
                  "gamma": ["auto", "scale"],
                 "kernel": ["poly"]}
    grid = GridSearchCV(model, param_grid, verbose=3)

    # fit the model.
    grid.fit(X_train, y_train)

    # predict.
    prediction = grid.predict(X_test)
    
    # print out the basic information about the grid search.
    print (grid.best_params_)
    print (grid.best_score_)
    print (grid.best_estimator_)
    
    grid = grid.best_estimator_
    predictions = grid.predict(X_test)
    print (classification_report(y_test, prediction, target_names=["0", "1"]))
    
    return grid

In [63]:
# for 1 part 0 to 1 part 1
model_100 = grid_search_svc(X_train_100, X_test_100, y_train_100, y_test_100)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] C=0.0001, gamma=auto, kernel=poly ...............................
[CV]  C=0.0001, gamma=auto, kernel=poly, score=0.5074106364428945, total=   1.6s
[CV] C=0.0001, gamma=auto, kernel=poly ...............................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.8s remaining:    0.0s


[CV]  C=0.0001, gamma=auto, kernel=poly, score=0.5069808027923212, total=   1.6s
[CV] C=0.0001, gamma=auto, kernel=poly ...............................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    3.6s remaining:    0.0s


[CV]  C=0.0001, gamma=auto, kernel=poly, score=0.5074235807860262, total=   1.6s
[CV] C=0.0001, gamma=scale, kernel=poly ..............................
[CV]  C=0.0001, gamma=scale, kernel=poly, score=0.5074106364428945, total=   1.6s
[CV] C=0.0001, gamma=scale, kernel=poly ..............................
[CV]  C=0.0001, gamma=scale, kernel=poly, score=0.5069808027923212, total=   1.6s
[CV] C=0.0001, gamma=scale, kernel=poly ..............................
[CV]  C=0.0001, gamma=scale, kernel=poly, score=0.5074235807860262, total=   1.6s
[CV] C=0.001, gamma=auto, kernel=poly ................................
[CV]  C=0.001, gamma=auto, kernel=poly, score=0.5108979947689625, total=   1.6s
[CV] C=0.001, gamma=auto, kernel=poly ................................
[CV]  C=0.001, gamma=auto, kernel=poly, score=0.5069808027923212, total=   1.6s
[CV] C=0.001, gamma=auto, kernel=poly ................................
[CV]  C=0.001, gamma=auto, kernel=poly, score=0.5065502183406113, total=   1.5s
[CV] C=

[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   49.5s finished


{'C': 1, 'gamma': 'auto', 'kernel': 'poly'}
0.8411867364746946
SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='poly',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
              precision    recall  f1-score   support

           0       0.88      0.84      0.86       607
           1       0.83      0.87      0.85       540

   micro avg       0.85      0.85      0.85      1147
   macro avg       0.85      0.85      0.85      1147
weighted avg       0.85      0.85      0.85      1147



In [64]:
# for 1 part 0\ to 0.5 part 1
model_50 = grid_search_svc(X_train_50, X_test_50, y_train_50, y_test_50)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] C=0.0001, gamma=auto, kernel=poly ...............................
[CV]  C=0.0001, gamma=auto, kernel=poly, score=0.6705069124423964, total=   0.7s
[CV] C=0.0001, gamma=auto, kernel=poly ...............................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.8s remaining:    0.0s


[CV]  C=0.0001, gamma=auto, kernel=poly, score=0.671280276816609, total=   0.6s
[CV] C=0.0001, gamma=auto, kernel=poly ...............................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.5s remaining:    0.0s


[CV]  C=0.0001, gamma=auto, kernel=poly, score=0.6709006928406467, total=   0.6s
[CV] C=0.0001, gamma=scale, kernel=poly ..............................
[CV]  C=0.0001, gamma=scale, kernel=poly, score=0.6705069124423964, total=   0.6s
[CV] C=0.0001, gamma=scale, kernel=poly ..............................
[CV]  C=0.0001, gamma=scale, kernel=poly, score=0.671280276816609, total=   0.6s
[CV] C=0.0001, gamma=scale, kernel=poly ..............................
[CV]  C=0.0001, gamma=scale, kernel=poly, score=0.6709006928406467, total=   0.6s
[CV] C=0.001, gamma=auto, kernel=poly ................................
[CV]  C=0.001, gamma=auto, kernel=poly, score=0.6705069124423964, total=   0.6s
[CV] C=0.001, gamma=auto, kernel=poly ................................
[CV]  C=0.001, gamma=auto, kernel=poly, score=0.671280276816609, total=   0.6s
[CV] C=0.001, gamma=auto, kernel=poly ................................
[CV]  C=0.001, gamma=auto, kernel=poly, score=0.6709006928406467, total=   0.6s
[CV] C=0.

[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   21.8s finished


{'C': 1, 'gamma': 'auto', 'kernel': 'poly'}
0.7973856209150327
SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='poly',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
              precision    recall  f1-score   support

           0       0.82      0.96      0.88       606
           1       0.85      0.51      0.64       261

   micro avg       0.82      0.82      0.82       867
   macro avg       0.83      0.73      0.76       867
weighted avg       0.83      0.82      0.81       867



In [65]:
# for 1 part 0 to 0.25 part 1
model_25 = grid_search_svc(X_train_25, X_test_25, y_train_25, y_test_25)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] C=0.0001, gamma=auto, kernel=poly ...............................
[CV]  C=0.0001, gamma=auto, kernel=poly, score=0.8118131868131868, total=   0.3s
[CV] C=0.0001, gamma=auto, kernel=poly ...............................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s


[CV]  C=0.0001, gamma=auto, kernel=poly, score=0.8118131868131868, total=   0.3s
[CV] C=0.0001, gamma=auto, kernel=poly ...............................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.6s remaining:    0.0s


[CV]  C=0.0001, gamma=auto, kernel=poly, score=0.8126721763085399, total=   0.3s
[CV] C=0.0001, gamma=scale, kernel=poly ..............................
[CV]  C=0.0001, gamma=scale, kernel=poly, score=0.8118131868131868, total=   0.3s
[CV] C=0.0001, gamma=scale, kernel=poly ..............................
[CV]  C=0.0001, gamma=scale, kernel=poly, score=0.8118131868131868, total=   0.3s
[CV] C=0.0001, gamma=scale, kernel=poly ..............................
[CV]  C=0.0001, gamma=scale, kernel=poly, score=0.8126721763085399, total=   0.3s
[CV] C=0.001, gamma=auto, kernel=poly ................................
[CV]  C=0.001, gamma=auto, kernel=poly, score=0.8118131868131868, total=   0.3s
[CV] C=0.001, gamma=auto, kernel=poly ................................
[CV]  C=0.001, gamma=auto, kernel=poly, score=0.8118131868131868, total=   0.3s
[CV] C=0.001, gamma=auto, kernel=poly ................................
[CV]  C=0.001, gamma=auto, kernel=poly, score=0.8126721763085399, total=   0.3s
[CV] C=

[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   10.2s finished


{'C': 1, 'gamma': 'scale', 'kernel': 'poly'}
0.8547204399633364
SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='scale', kernel='poly',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
              precision    recall  f1-score   support

           0       0.85      0.97      0.91       579
           1       0.77      0.36      0.49       149

   micro avg       0.85      0.85      0.85       728
   macro avg       0.81      0.66      0.70       728
weighted avg       0.84      0.85      0.82       728



Huh.  That's pretty good.  Try out model_100 and model_50 with the 2016-2018 stuff.

#### STEP5: Predict 2016-2018 winners with SVC Grid Search.

In [66]:
# predict for 2018.
predict_the_winner(model_100, 2018, team_data, X_train_100)

WS Probability = 0.9672681274154478
Tampa Bay Rays

WS Probability = 0.8380433619258659
New York Yankees

WS Probability = 0.8258653281915737
Cleveland Indians

WS Probability = 0.7675545468923061
St. Louis Cardinals

WS Probability = 0.6932602636988303
Minnesota Twins

WS Probability = 0.672279482445197
Cincinnati Reds

WS Probability = 0.45122816682086053
Washington Nationals

WS Probability = 0.42069818004077575
Chicago White Sox

WS Probability = 0.4010662014721142
Chicago Cubs

WS Probability = 0.3910822495608485
Los Angeles Angels

WS Probability = 0.3731939146907646
San Diego Padres

WS Probability = 0.33753143668217245
New York Mets

WS Probability = 0.2771476589489649
Arizona Diamondbacks

WS Probability = 0.26532127248186493
Baltimore Orioles

WS Probability = 0.2388098303702007
San Francisco Giants

WS Probability = 0.23490797877407696
Oakland Athletics

WS Probability = 0.2248918101826335
Philadelphia Phillies

WS Probability = 0.22401599906395436
Texas Rangers

WS Probabil

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [67]:
# predict for 2017.
predict_the_winner(model_100, 2017, team_data, X_train_100)

WS Probability = 0.987296710839614
Houston Astros

WS Probability = 0.9790525570344367
Boston Red Sox

WS Probability = 0.8677110306372905
Cleveland Indians

WS Probability = 0.7493516063931069
New York Yankees

WS Probability = 0.6524807333566126
Colorado Rockies

WS Probability = 0.5820173774748483
Chicago White Sox

WS Probability = 0.46969655290930373
Tampa Bay Rays

WS Probability = 0.44991635658540463
Seattle Mariners

WS Probability = 0.41668870356349386
Washington Nationals

WS Probability = 0.39657968858920767
Atlanta Braves

WS Probability = 0.35153186028985756
Milwaukee Brewers

WS Probability = 0.3395206004839719
Los Angeles Angels

WS Probability = 0.31073039818095005
St. Louis Cardinals

WS Probability = 0.30850731192920167
Philadelphia Phillies

WS Probability = 0.28574713719617734
Arizona Diamondbacks

WS Probability = 0.2819991667166262
Minnesota Twins

WS Probability = 0.21218775605680604
Pittsburgh Pirates

WS Probability = 0.19636899998566903
New York Mets

WS Proba

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [68]:
# predict for 2016
predict_the_winner(model_100, 2016, team_data, X_train_100)

WS Probability = 0.9999999765896943
Cleveland Indians

WS Probability = 0.999987049005604
Houston Astros

WS Probability = 0.9304841793301912
Los Angeles Dodgers

WS Probability = 0.8444112198386886
Colorado Rockies

WS Probability = 0.8163435005323842
New York Yankees

WS Probability = 0.7555108051749393
Chicago Cubs

WS Probability = 0.7507357563633643
Los Angeles Angels

WS Probability = 0.6951046687807705
Boston Red Sox

WS Probability = 0.6481039549509179
Washington Nationals

WS Probability = 0.5679446431842795
Arizona Diamondbacks

WS Probability = 0.5
Tampa Bay Rays

WS Probability = 0.4303512598882254
St. Louis Cardinals

WS Probability = 0.41928831511295755
Chicago White Sox

WS Probability = 0.4135124477888461
Minnesota Twins

WS Probability = 0.3028548940673995
Seattle Mariners

WS Probability = 0.27486919674029914
Milwaukee Brewers

WS Probability = 0.23654494172749513
New York Mets

WS Probability = 0.22473638590922493
Oakland Athletics

WS Probability = 0.202738868946679

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [69]:
# predict for 2018
predict_the_winner(model_50, 2018, team_data, X_train_50)

WS Probability = 0.7975398553719102
Tampa Bay Rays

WS Probability = 0.5901846721034164
Cleveland Indians

WS Probability = 0.42462875817217394
St. Louis Cardinals

WS Probability = 0.40642007030055194
New York Yankees

WS Probability = 0.34498810153828724
Cincinnati Reds

WS Probability = 0.2961954108142491
Boston Red Sox

WS Probability = 0.2465596572981947
Washington Nationals

WS Probability = 0.22671522998117466
Los Angeles Angels

WS Probability = 0.20776475856396684
Chicago Cubs

WS Probability = 0.2011144702186935
Atlanta Braves

WS Probability = 0.19396757824854624
New York Mets

WS Probability = 0.19062123910289838
Houston Astros

WS Probability = 0.17881621366810954
Miami Marlins

WS Probability = 0.17242754297577717
Oakland Athletics

WS Probability = 0.16914282969381333
San Francisco Giants

WS Probability = 0.1671686549167636
Arizona Diamondbacks

WS Probability = 0.16613444783593648
San Diego Padres

WS Probability = 0.14381910187915484
Philadelphia Phillies

WS Probabil

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [70]:
# predict for 2017
predict_the_winner(model_50, 2017, team_data, X_train_50)

WS Probability = 0.8427022432788881
Houston Astros

WS Probability = 0.4223987827727567
Chicago White Sox

WS Probability = 0.2907878847271067
Colorado Rockies

WS Probability = 0.2726538261651381
Seattle Mariners

WS Probability = 0.23105784259359113
Tampa Bay Rays

WS Probability = 0.21423692354884888
Minnesota Twins

WS Probability = 0.2073463446136215
Milwaukee Brewers

WS Probability = 0.2065411791686153
Atlanta Braves

WS Probability = 0.20559789645788953
Washington Nationals

WS Probability = 0.19838975200473422
New York Mets

WS Probability = 0.1975014490238672
Boston Red Sox

WS Probability = 0.19490385538168722
Philadelphia Phillies

WS Probability = 0.18596410699731128
Los Angeles Angels

WS Probability = 0.18080759644980302
Pittsburgh Pirates

WS Probability = 0.18041784734460561
St. Louis Cardinals

WS Probability = 0.1718268071755939
Arizona Diamondbacks

WS Probability = 0.1613541096818689
San Francisco Giants

WS Probability = 0.15017342546518264
New York Yankees

WS Pr

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [71]:
# predict for 2016
predict_the_winner(model_50, 2016, team_data, X_train_50)

WS Probability = 0.9602257526759964
Cleveland Indians

WS Probability = 0.5240449691019795
Los Angeles Dodgers

WS Probability = 0.49055231351691386
Houston Astros

WS Probability = 0.46259247926200153
Colorado Rockies

WS Probability = 0.44035753960036855
Los Angeles Angels

WS Probability = 0.37914483016449263
Boston Red Sox

WS Probability = 0.27661027653619413
San Francisco Giants

WS Probability = 0.2607423199708921
New York Yankees

WS Probability = 0.2515361365784386
Tampa Bay Rays

WS Probability = 0.24511220514230253
Arizona Diamondbacks

WS Probability = 0.24087361977352922
Pittsburgh Pirates

WS Probability = 0.2379157683438693
Minnesota Twins

WS Probability = 0.2345034840903647
St. Louis Cardinals

WS Probability = 0.2329766881265978
Chicago Cubs

WS Probability = 0.20887928950040346
Washington Nationals

WS Probability = 0.1808913269904801
Philadelphia Phillies

WS Probability = 0.18048545742184008
Seattle Mariners

WS Probability = 0.17609411639731204
Miami Marlins

WS P

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [72]:
# predict for 2018
predict_the_winner(model_25, 2018, team_data, X_train_25)

WS Probability = 0.465636808618478
Cleveland Indians

WS Probability = 0.4211690196282958
Boston Red Sox

WS Probability = 0.2441341491840563
Tampa Bay Rays

WS Probability = 0.23918937488688846
New York Yankees

WS Probability = 0.2299291347689194
Houston Astros

WS Probability = 0.20753199667805572
St. Louis Cardinals

WS Probability = 0.1865756072517449
Miami Marlins

WS Probability = 0.15385950202031584
Washington Nationals

WS Probability = 0.13286803823398288
Los Angeles Angels

WS Probability = 0.13090510656047083
Chicago Cubs

WS Probability = 0.12772297055660692
San Diego Padres

WS Probability = 0.11725720339526755
San Francisco Giants

WS Probability = 0.11275629933449453
New York Mets

WS Probability = 0.11213069585108555
Oakland Athletics

WS Probability = 0.11105730301980869
Atlanta Braves

WS Probability = 0.11085040770421538
Minnesota Twins

WS Probability = 0.10986975837625425
Texas Rangers

WS Probability = 0.09729933897071673
Philadelphia Phillies

WS Probability = 0

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [74]:
# predict for 2017
predict_the_winner(model_25, 2017, team_data, X_train_25)

WS Probability = 0.8649270133260044
Houston Astros

WS Probability = 0.1575466675148739
Colorado Rockies

WS Probability = 0.15070096228720414
Minnesota Twins

WS Probability = 0.1461229995634083
Seattle Mariners

WS Probability = 0.1334177444805497
Philadelphia Phillies

WS Probability = 0.11797252192970367
New York Mets

WS Probability = 0.11689288203000858
Atlanta Braves

WS Probability = 0.11667238812643295
Arizona Diamondbacks

WS Probability = 0.11197853742793157
Los Angeles Dodgers

WS Probability = 0.10429583788931064
New York Yankees

WS Probability = 0.10390945774961756
St. Louis Cardinals

WS Probability = 0.10237419968395202
Tampa Bay Rays

WS Probability = 0.10028838242133929
Los Angeles Angels

WS Probability = 0.0982347771740441
Chicago White Sox

WS Probability = 0.0956823015361471
Oakland Athletics

WS Probability = 0.09305358243727041
Washington Nationals

WS Probability = 0.08994834321967701
Chicago Cubs

WS Probability = 0.08670743447570958
Milwaukee Brewers

WS Pro

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [75]:
# predict for 2016
predict_the_winner(model_25, 2016, team_data, X_train_25)

WS Probability = 0.9999942753558461
Cleveland Indians

WS Probability = 0.44605610939929435
Los Angeles Dodgers

WS Probability = 0.288731484876201
Pittsburgh Pirates

WS Probability = 0.23316545065843852
Colorado Rockies

WS Probability = 0.21678508048837428
Houston Astros

WS Probability = 0.2154198753566374
Los Angeles Angels

WS Probability = 0.20581963765732075
New York Yankees

WS Probability = 0.19595376679177975
San Francisco Giants

WS Probability = 0.19136591570110784
Boston Red Sox

WS Probability = 0.15049123064045772
Minnesota Twins

WS Probability = 0.1493669788539413
Chicago Cubs

WS Probability = 0.14299825810634004
St. Louis Cardinals

WS Probability = 0.12973857541668984
Tampa Bay Rays

WS Probability = 0.12790165169367937
Arizona Diamondbacks

WS Probability = 0.11833723415696912
New York Mets

WS Probability = 0.11334321434941935
Milwaukee Brewers

WS Probability = 0.11271537280775212
Seattle Mariners

WS Probability = 0.10638338858846726
Philadelphia Phillies

WS P

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [76]:
# predict for 2019.
predict_the_winner(model_25, 2019, team_data, X_train_25)

WS Probability = 0.465636808618478
Cleveland Indians

WS Probability = 0.4211690196282958
Boston Red Sox

WS Probability = 0.2441341491840563
Tampa Bay Rays

WS Probability = 0.23918937488688846
New York Yankees

WS Probability = 0.2299291347689194
Houston Astros

WS Probability = 0.20753199667805572
St. Louis Cardinals

WS Probability = 0.1865756072517449
Miami Marlins

WS Probability = 0.15385950202031584
Washington Nationals

WS Probability = 0.13286803823398288
Los Angeles Angels

WS Probability = 0.13090510656047083
Chicago Cubs

WS Probability = 0.12772297055660692
San Diego Padres

WS Probability = 0.11725720339526755
San Francisco Giants

WS Probability = 0.11275629933449453
New York Mets

WS Probability = 0.11213069585108555
Oakland Athletics

WS Probability = 0.11105730301980869
Atlanta Braves

WS Probability = 0.11085040770421538
Minnesota Twins

WS Probability = 0.10986975837625425
Texas Rangers

WS Probability = 0.09729933897071673
Philadelphia Phillies

WS Probability = 0

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


hm.  model_25 seems to be pretty good.  I mean, red sox is #2 for 2018, astros #1 for 2017, and the indians for 2016 (and they made it to the finals.