## Purpose: Try different models-- Part3.
### Grid search with upsampling and scaling.

In [1]:
# import dependencies.
import pandas as pd
import numpy as np

from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC

#### STEP1: Read in dataset.  Remove data from 2016-2019.
- data from 2016-2018 will be used to bs test the model.
- data from 2019 will be used to predict the winners of the 2019 WS.

In [2]:
# read in the data.
team_data = pd.read_csv("../Resources/clean_data.csv")
del team_data["Unnamed: 0"]
team_data.head()

Unnamed: 0,team,year,A,DP,E,GS2,INN,PB,PO,TC,...,OBP1,R1,SHO,SO1,SV,TBF,W,WHIP,WP,winners
0,St. Louis Cardinals,2019,1033,114,43,936,8313.0,3,2771,3847,...,0.319,456,4,895,33,3896,56,1.29,21,0
1,Arizona Diamondbacks,2019,1010,83,45,945,8538.0,2,2846,3901,...,0.315,472,7,925,24,4001,53,1.28,35,0
2,Kansas City Royals,2019,990,105,45,954,8421.0,6,2807,3842,...,0.346,543,5,816,24,4125,39,1.46,34,0
3,Houston Astros,2019,875,54,50,954,8589.0,6,2863,3788,...,0.284,432,7,1074,27,3929,67,1.14,31,0
4,Tampa Bay Rays,2019,975,92,53,963,8760.0,11,2920,3948,...,0.291,409,6,1037,26,3985,59,1.16,40,0


In [3]:
# remove data from 2016 through 2019.
team_data_new = team_data.loc[team_data["year"] < 2016]
team_data_new.head()

Unnamed: 0,team,year,A,DP,E,GS2,INN,PB,PO,TC,...,OBP1,R1,SHO,SO1,SV,TBF,W,WHIP,WP,winners
120,San Francisco Giants,2015,1639,136,72,1458,13143.0,6,4381,6092,...,0.303,631,11,1309,43,6048,87,1.21,40,0
121,Washington Nationals,2015,1425,142,73,1458,13137.0,17,4379,5877,...,0.3,612,12,1476,46,6036,95,1.19,47,0
122,Houston Astros,2015,1599,135,77,1458,13212.0,18,4404,6080,...,0.314,701,8,1396,44,6180,84,1.29,98,0
123,Detroit Tigers,2015,1537,148,75,1449,12852.0,5,4284,5896,...,0.32,721,8,1232,47,6048,86,1.32,44,0
124,Boston Red Sox,2015,1427,139,75,1458,12957.0,37,4319,5821,...,0.314,694,5,1362,43,6073,93,1.27,52,0


In [4]:
target = team_data_new["winners"]
features = team_data_new.drop({"team", "year", "winners"}, axis=1)
feature_columns = list(features.columns)
print (target.shape)
print (features.shape)
print (feature_columns)

(2344,)
(2344, 46)
['A', 'DP', 'E', 'GS2', 'INN', 'PB', 'PO', 'TC', '2B', '3B', 'AB', 'AO', 'BB', 'H', 'HBP', 'HR', 'NP_x', 'OBP', 'OPS_x', 'R', 'RBI', 'SAC', 'SB', 'SLG', 'BB1', 'BK', 'CG', 'ER', 'ERA', 'G1', 'GF', 'GS', 'H1', 'HB', 'HR1', 'IP', 'L', 'OBP1', 'R1', 'SHO', 'SO1', 'SV', 'TBF', 'W', 'WHIP', 'WP']


#### STEP2: Upsample and scale data.

In [5]:
# reset the index.
team_data_new = team_data_new.reset_index().drop({"index"}, axis=1)
team_data_new.head()

Unnamed: 0,team,year,A,DP,E,GS2,INN,PB,PO,TC,...,OBP1,R1,SHO,SO1,SV,TBF,W,WHIP,WP,winners
0,San Francisco Giants,2015,1639,136,72,1458,13143.0,6,4381,6092,...,0.303,631,11,1309,43,6048,87,1.21,40,0
1,Washington Nationals,2015,1425,142,73,1458,13137.0,17,4379,5877,...,0.3,612,12,1476,46,6036,95,1.19,47,0
2,Houston Astros,2015,1599,135,77,1458,13212.0,18,4404,6080,...,0.314,701,8,1396,44,6180,84,1.29,98,0
3,Detroit Tigers,2015,1537,148,75,1449,12852.0,5,4284,5896,...,0.32,721,8,1232,47,6048,86,1.32,44,0
4,Boston Red Sox,2015,1427,139,75,1458,12957.0,37,4319,5821,...,0.314,694,5,1362,43,6073,93,1.27,52,0


In [6]:
# remove team and year.
feature_columns_new = feature_columns + ["winners"]
team_data_new = team_data[feature_columns_new]
team_data_new.head()

Unnamed: 0,A,DP,E,GS2,INN,PB,PO,TC,2B,3B,...,OBP1,R1,SHO,SO1,SV,TBF,W,WHIP,WP,winners
0,1033,114,43,936,8313.0,3,2771,3847,157,10,...,0.319,456,4,895,33,3896,56,1.29,21,0
1,1010,83,45,945,8538.0,2,2846,3901,203,27,...,0.315,472,7,925,24,4001,53,1.28,35,0
2,990,105,45,954,8421.0,6,2807,3842,185,31,...,0.346,543,5,816,24,4125,39,1.46,34,0
3,875,54,50,954,8589.0,6,2863,3788,200,15,...,0.284,432,7,1074,27,3929,67,1.14,31,0
4,975,92,53,963,8760.0,11,2920,3948,195,21,...,0.291,409,6,1037,26,3985,59,1.16,40,0


In [7]:
# upsample for a more balanced dataset.
def upsample(dataset, no_samples):
    '''
    INPUT: 
    -dataset = dataset without team names and year.
    -n_samples = number of minority_unsampled.
    
    OUTPUT:
    -X_train_scaled = scaled X train data.
    -X_test_scaled = scaled X test data.
    -y_train = y train data
    -y_test = y test data
    
    DESCRIPTION:
    -dataset is taken in and split into minority and majority classes.
    -dataset is then upsampled for the mainority class
    -split the data into features and targets
    -split data into train and test sets
    -train and test sets were are scaled.
    '''
    
    # separate majority and minority classes.
    df_majority = dataset.loc[dataset["winners"] == 0]
    df_minority = dataset.loc[dataset["winners"] == 1]

    # upsample minority class.
    df_minority_unsampled = resample(df_minority,
                                    replace=True,
                                    n_samples=no_samples,
                                    random_state=123)

    # combine majority class with upsampled minority class.
    df_upsampled = pd.concat([df_majority, df_minority_unsampled])

    # separate features and target.
    y = df_upsampled["winners"]
    X = df_upsampled[feature_columns]
    
    # split into train and test sets.
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    # scale X_train and X_test.
    scaler = StandardScaler()

    # transform the training and testing data.
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.fit_transform(X_test)
    
    return X_train_scaled, X_test_scaled, y_train, y_test

In [8]:
# Do three different upsamplings.
X_train_100, X_test_100, y_train_100, y_test_100 = upsample(team_data_new, 2234)
X_train_50, X_test_50, y_train_50, y_test_50 = upsample(team_data_new, 1117)
X_train_25, X_test_25, y_train_25, y_test_25 = upsample(team_data_new, 559)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


#### STEP3: Grid Search Model--Logistic Regression.

In [9]:
def grid_search_logistic(X_train, X_test, y_train, y_test):
    '''
    INPUT: 
    -X_train = scaled X train data.
    -X_test = scaled X test data.
    -y_train = y train data.
    -y_test = y test data.
    
    OUTPUT:
    -classification report (has F1 score, precision and recall).
    -grid = saved model for prediction. 
    
    DESCRIPTION:
    -the scaled and split data is put through a grid search with logistic.
    -the model is trained.
    -a prediction is made.
    -print out the classification report and give the model.
    '''
    
    # fit the model.
    model = LogisticRegression(solver="lbfgs", max_iter= 2000)
    
    # create gridsearch estimator.
    param_grid = {"C": [0.001, 0.01, 0.1, 1, 10, 100]}
    grid = GridSearchCV(model, param_grid, verbose=3)

    # fit the model.
    grid.fit(X_train, y_train)

    # predict.
    prediction = grid.predict(X_test)
    
    # print out the basic information about the grid search.
    print (grid.best_params_)
    print (grid.best_score_)
    print (grid.best_estimator_)
    
    grid = grid.best_estimator_
    predictions = grid.predict(X_test)
    print (classification_report(y_test, prediction, target_names=["0", "1"]))
    
    return grid

In [10]:
# for 1 part 0 to 1 part 1
model_100 = grid_search_logistic(X_train_100, X_test_100, y_train_100, y_test_100)

Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV] C=0.001 .........................................................
[CV] ................ C=0.001, score=0.7096774193548387, total=   0.0s
[CV] C=0.001 .........................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s


[CV] ................ C=0.001, score=0.7329842931937173, total=   0.0s
[CV] C=0.001 .........................................................
[CV] ................ C=0.001, score=0.7161572052401747, total=   0.0s
[CV] C=0.01 ..........................................................
[CV] ................. C=0.01, score=0.7253705318221447, total=   0.0s
[CV] C=0.01 ..........................................................
[CV] ................. C=0.01, score=0.7382198952879581, total=   0.0s
[CV] C=0.01 ..........................................................
[CV] ................. C=0.01, score=0.7222707423580786, total=   0.0s
[CV] C=0.1 ...........................................................
[CV] .................. C=0.1, score=0.7340889276373147, total=   0.0s
[CV] C=0.1 ...........................................................
[CV] .................. C=0.1, score=0.7478184991273996, total=   0.0s
[CV] C=0.1 ...........................................................
[CV] .

[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed:    1.6s finished


{'C': 100}
0.7646887725421757
LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=2000, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)
              precision    recall  f1-score   support

           0       0.78      0.72      0.75       607
           1       0.71      0.77      0.74       540

   micro avg       0.74      0.74      0.74      1147
   macro avg       0.74      0.74      0.74      1147
weighted avg       0.75      0.74      0.74      1147



In [11]:
# for 1 part 0 to 0.5 part 1
model_50 = grid_search_logistic(X_train_50, X_test_50, y_train_50, y_test_50)

Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV] C=0.001 .........................................................
[CV] ................ C=0.001, score=0.7350230414746544, total=   0.0s
[CV] C=0.001 .........................................................
[CV] ................ C=0.001, score=0.7220299884659747, total=   0.0s
[CV] C=0.001 .........................................................
[CV] ................ C=0.001, score=0.7066974595842956, total=   0.0s
[CV] C=0.01 ..........................................................
[CV] ................. C=0.01, score=0.7419354838709677, total=   0.0s
[CV] C=0.01 ..........................................................
[CV] ................. C=0.01, score=0.7531718569780853, total=   0.0s
[CV] C=0.01 ..........................................................
[CV] ................. C=0.01, score=0.7159353348729792, total=   0.0s
[CV] C=0.1 ...........................................................
[CV] ............

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s


[CV] .................... C=1, score=0.7424942263279446, total=   0.0s
[CV] C=10 ............................................................
[CV] ................... C=10, score=0.7557603686635944, total=   0.1s
[CV] C=10 ............................................................
[CV] ................... C=10, score=0.7670126874279123, total=   0.1s
[CV] C=10 ............................................................
[CV] ................... C=10, score=0.7355658198614319, total=   0.1s
[CV] C=100 ...........................................................
[CV] .................. C=100, score=0.7557603686635944, total=   0.2s
[CV] C=100 ...........................................................
[CV] .................. C=100, score=0.7704728950403691, total=   0.2s
[CV] C=100 ...........................................................
[CV] .................. C=100, score=0.7251732101616628, total=   0.2s
{'C': 1}
0.7566320645905421
LogisticRegression(C=1, class_weight=None, dual=F

[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed:    1.0s finished


In [12]:
# for 1 part 0 to 0.25 part 1
model_25 = grid_search_logistic(X_train_25, X_test_25, y_train_25, y_test_25)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s


Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV] C=0.001 .........................................................
[CV] ................ C=0.001, score=0.8118131868131868, total=   0.0s
[CV] C=0.001 .........................................................
[CV] ................ C=0.001, score=0.8118131868131868, total=   0.0s
[CV] C=0.001 .........................................................
[CV] ................ C=0.001, score=0.8112947658402204, total=   0.0s
[CV] C=0.01 ..........................................................
[CV] ................. C=0.01, score=0.8159340659340659, total=   0.0s
[CV] C=0.01 ..........................................................
[CV] ................. C=0.01, score=0.8255494505494505, total=   0.0s
[CV] C=0.01 ..........................................................
[CV] ................. C=0.01, score=0.8305785123966942, total=   0.0s
[CV] C=0.1 ...........................................................
[CV] ............

[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed:    0.8s finished


This is not significantly better than the straight logistic regression.

In [13]:
def predict_the_winner(model, year, team_data, X_train):
    '''
    INPUT: 
    -X_train = scaled X train data.
    -model = the saved model.
    -team_data = complete dataframe with all data.
    -year = the year want to look at.
    
    OUTPUT:
    -printed prediction.
    
    DESCRIPTION:
    -data from year of interest is isolated.
    -the data are scaled.
    -the prediction is made.
    -print out the resulting probability and the name of the team.
    '''
    
    # grab the data.
    team_data = team_data.loc[team_data["year"] == year].reset_index()

    # set features (no team, year, winners).
    # set target (winners).
    features = team_data[feature_columns]
    
    # scale X_train and X_test.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    features = scaler.fit_transform(features)
    
    # fit the model.
    probabilities = model.predict_proba(features)

    # convert predictions to datafram.e
    WS_predictions = pd.DataFrame(probabilities[:,1])

    # Sort the DataFrame (descending)
    WS_predictions = WS_predictions.sort_values(0, ascending=False)

    WS_predictions['Probability'] = WS_predictions[0]

    # Print 50 highest probability HoF inductees from still eligible players
    for i, row in WS_predictions.head(50).iterrows():
       prob = ' '.join(('WS Probability =', str(row['Probability'])))
       print('')
       print(prob)
       print(team_data.iloc[i,1:27]["team"])

In [14]:
# predict for 2018.
predict_the_winner(model_100, 2018, team_data, X_train_100)


WS Probability = 0.9999999052489915
Detroit Tigers

WS Probability = 0.9999988139366737
Cincinnati Reds

WS Probability = 0.9999661522153225
Washington Nationals

WS Probability = 0.9996989976312443
Chicago White Sox

WS Probability = 0.9996300155270222
St. Louis Cardinals

WS Probability = 0.9979037566965535
Los Angeles Angels

WS Probability = 0.9713211341655993
Minnesota Twins

WS Probability = 0.9648146757028638
Houston Astros

WS Probability = 0.9643392737378867
Los Angeles Dodgers

WS Probability = 0.951954740649106
Miami Marlins

WS Probability = 0.9476277694388671
Oakland Athletics

WS Probability = 0.9452527965028643
Baltimore Orioles

WS Probability = 0.9069092987114178
Kansas City Royals

WS Probability = 0.8466571100136441
Atlanta Braves

WS Probability = 0.7830384184371494
New York Yankees

WS Probability = 0.766045035414245
Arizona Diamondbacks

WS Probability = 0.6917632939184729
Toronto Blue Jays

WS Probability = 0.5522730341042508
San Diego Padres

WS Probability = 0

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [15]:
# predict for 2017.
predict_the_winner(model_100, 2017, team_data, X_train_100)


WS Probability = 0.9999999998107856
Colorado Rockies

WS Probability = 0.999999998496695
Detroit Tigers

WS Probability = 0.9999997111619617
Milwaukee Brewers

WS Probability = 0.9999975212713917
Los Angeles Angels

WS Probability = 0.9999836645950128
Kansas City Royals

WS Probability = 0.9997314905952179
Los Angeles Dodgers

WS Probability = 0.9997228531158449
Toronto Blue Jays

WS Probability = 0.9990434508090177
Baltimore Orioles

WS Probability = 0.9988635497806689
Texas Rangers

WS Probability = 0.9921326454825007
Washington Nationals

WS Probability = 0.969491264255341
Minnesota Twins

WS Probability = 0.9577827568696026
Cincinnati Reds

WS Probability = 0.942225602708828
Houston Astros

WS Probability = 0.8670104005197429
Chicago White Sox

WS Probability = 0.8338589617781162
Cleveland Indians

WS Probability = 0.8236404023976702
Chicago Cubs

WS Probability = 0.5563154011390936
Tampa Bay Rays

WS Probability = 0.30358826992761
Atlanta Braves

WS Probability = 0.27825136657953

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [16]:
# predict for 2016.
predict_the_winner(model_100, 2016, team_data, X_train_100)


WS Probability = 0.9999999998158646
Detroit Tigers

WS Probability = 0.9999999958109862
Cleveland Indians

WS Probability = 0.9999978556755493
Chicago White Sox

WS Probability = 0.9999949613706851
Minnesota Twins

WS Probability = 0.9998081124205839
Oakland Athletics

WS Probability = 0.9994167804282162
Cincinnati Reds

WS Probability = 0.9972216258454755
Kansas City Royals

WS Probability = 0.9970866668067313
New York Mets

WS Probability = 0.9871614182268569
Houston Astros

WS Probability = 0.9675442036148363
Arizona Diamondbacks

WS Probability = 0.9193772417899525
Seattle Mariners

WS Probability = 0.8713933684805023
Texas Rangers

WS Probability = 0.8441804855960847
Colorado Rockies

WS Probability = 0.82725135775933
New York Yankees

WS Probability = 0.7942786266339376
Washington Nationals

WS Probability = 0.7832414336715168
Los Angeles Dodgers

WS Probability = 0.7593246394172488
San Diego Padres

WS Probability = 0.6015781196943032
Baltimore Orioles

WS Probability = 0.45517

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)



Philadelphia Phillies

WS Probability = 0.03319089598581181
Miami Marlins

WS Probability = 0.00565910390965712
Chicago Cubs

WS Probability = 0.0027731247060876272
Tampa Bay Rays

WS Probability = 0.00015584351434147384
St. Louis Cardinals

WS Probability = 1.395529057535134e-05
Milwaukee Brewers

WS Probability = 4.345555995010309e-06
San Francisco Giants

WS Probability = 6.366127100842571e-13
Toronto Blue Jays

WS Probability = 2.369822029725234e-15
Boston Red Sox


#### STEP4: Grid Search Model--SVC.

In [17]:
def grid_search_svc(X_train, X_test, y_train, y_test):
    '''
    INPUT: 
    -X_train = scaled X train data.
    -X_test = scaled X test data.
    -y_train = y train data.
    -y_test = y test data.
    
    OUTPUT:
    -classification report (has F1 score, precision and recall).
    -grid = saved model for prediction. 
    
    DESCRIPTION:
    -the scaled and split data is put through a grid search with svc.
    -the model is trained.
    -a prediction is made.
    -print out the classification report and give the model.
    '''
    
    # set up svc model.
    model = SVC(kernel="rbf", probability=True)

    # create gridsearch estimator.
    param_grid = {"C": [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
                 "gamma": [0.0001, 0.001, 0.01, 0.1]}
    grid = GridSearchCV(model, param_grid, verbose=3)

    # fit the model.
    grid.fit(X_train, y_train)

    # predict.
    prediction = grid.predict(X_test)
    
    # print out the basic information about the grid search.
    print (grid.best_params_)
    print (grid.best_score_)
    print (grid.best_estimator_)
    
    grid = grid.best_estimator_
    predictions = grid.predict(X_test)
    print (classification_report(y_test, prediction, target_names=["0", "1"]))
    
    return grid

In [18]:
# for 1 part 0 to 1 part 1
model_100 = grid_search_svc(X_train_100, X_test_100, y_train_100, y_test_100)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 28 candidates, totalling 84 fits
[CV] C=0.0001, gamma=0.0001 ..........................................
[CV] . C=0.0001, gamma=0.0001, score=0.5074106364428945, total=   1.8s
[CV] C=0.0001, gamma=0.0001 ..........................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.1s remaining:    0.0s


[CV] . C=0.0001, gamma=0.0001, score=0.5069808027923212, total=   1.9s
[CV] C=0.0001, gamma=0.0001 ..........................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    4.3s remaining:    0.0s


[CV] . C=0.0001, gamma=0.0001, score=0.5074235807860262, total=   1.9s
[CV] C=0.0001, gamma=0.001 ...........................................
[CV] .. C=0.0001, gamma=0.001, score=0.5074106364428945, total=   1.9s
[CV] C=0.0001, gamma=0.001 ...........................................
[CV] .. C=0.0001, gamma=0.001, score=0.5069808027923212, total=   1.8s
[CV] C=0.0001, gamma=0.001 ...........................................
[CV] .. C=0.0001, gamma=0.001, score=0.5074235807860262, total=   1.8s
[CV] C=0.0001, gamma=0.01 ............................................
[CV] ... C=0.0001, gamma=0.01, score=0.5074106364428945, total=   1.9s
[CV] C=0.0001, gamma=0.01 ............................................
[CV] ... C=0.0001, gamma=0.01, score=0.5069808027923212, total=   1.8s
[CV] C=0.0001, gamma=0.01 ............................................
[CV] ... C=0.0001, gamma=0.01, score=0.5074235807860262, total=   2.1s
[CV] C=0.0001, gamma=0.1 .............................................
[CV] .

[CV] ..... C=10, gamma=0.0001, score=0.7384481255448997, total=   1.4s
[CV] C=10, gamma=0.0001 ..............................................
[CV] ..... C=10, gamma=0.0001, score=0.7399650959860384, total=   1.4s
[CV] C=10, gamma=0.0001 ..............................................
[CV] ..... C=10, gamma=0.0001, score=0.7100436681222707, total=   1.4s
[CV] C=10, gamma=0.001 ...............................................
[CV] ...... C=10, gamma=0.001, score=0.7602441150828247, total=   1.3s
[CV] C=10, gamma=0.001 ...............................................
[CV] ...... C=10, gamma=0.001, score=0.7914485165794066, total=   1.3s
[CV] C=10, gamma=0.001 ...............................................
[CV] ...... C=10, gamma=0.001, score=0.7606986899563318, total=   1.3s
[CV] C=10, gamma=0.01 ................................................
[CV] ....... C=10, gamma=0.01, score=0.9032258064516129, total=   1.0s
[CV] C=10, gamma=0.01 ................................................
[CV] .

[Parallel(n_jobs=1)]: Done  84 out of  84 | elapsed:  2.4min finished


{'C': 10, 'gamma': 0.1}
0.9880744618964514
SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.1, kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
              precision    recall  f1-score   support

           0       1.00      0.99      0.99       607
           1       0.99      1.00      0.99       540

   micro avg       0.99      0.99      0.99      1147
   macro avg       0.99      0.99      0.99      1147
weighted avg       0.99      0.99      0.99      1147



In [19]:
# for 1 part 0 to 0.5 part 1
model_50 = grid_search_svc(X_train_50, X_test_50, y_train_50, y_test_50)

Fitting 3 folds for each of 28 candidates, totalling 84 fits
[CV] C=0.0001, gamma=0.0001 ..........................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] . C=0.0001, gamma=0.0001, score=0.6705069124423964, total=   0.7s
[CV] C=0.0001, gamma=0.0001 ..........................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.8s remaining:    0.0s


[CV] .. C=0.0001, gamma=0.0001, score=0.671280276816609, total=   0.7s
[CV] C=0.0001, gamma=0.0001 ..........................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.6s remaining:    0.0s


[CV] . C=0.0001, gamma=0.0001, score=0.6709006928406467, total=   0.7s
[CV] C=0.0001, gamma=0.001 ...........................................
[CV] .. C=0.0001, gamma=0.001, score=0.6705069124423964, total=   0.7s
[CV] C=0.0001, gamma=0.001 ...........................................
[CV] ... C=0.0001, gamma=0.001, score=0.671280276816609, total=   0.7s
[CV] C=0.0001, gamma=0.001 ...........................................
[CV] .. C=0.0001, gamma=0.001, score=0.6709006928406467, total=   0.7s
[CV] C=0.0001, gamma=0.01 ............................................
[CV] ... C=0.0001, gamma=0.01, score=0.6705069124423964, total=   0.7s
[CV] C=0.0001, gamma=0.01 ............................................
[CV] .... C=0.0001, gamma=0.01, score=0.671280276816609, total=   0.7s
[CV] C=0.0001, gamma=0.01 ............................................
[CV] ... C=0.0001, gamma=0.01, score=0.6709006928406467, total=   0.7s
[CV] C=0.0001, gamma=0.1 .............................................
[CV] .

[CV] ................... C=10, gamma=0.0001, score=0.75, total=   0.7s
[CV] C=10, gamma=0.0001 ..............................................
[CV] ..... C=10, gamma=0.0001, score=0.7381776239907728, total=   0.7s
[CV] C=10, gamma=0.0001 ..............................................
[CV] ..... C=10, gamma=0.0001, score=0.7228637413394919, total=   0.7s
[CV] C=10, gamma=0.001 ...............................................
[CV] ...... C=10, gamma=0.001, score=0.7661290322580645, total=   0.7s
[CV] C=10, gamma=0.001 ...............................................
[CV] ...... C=10, gamma=0.001, score=0.7773933102652826, total=   0.7s
[CV] C=10, gamma=0.001 ...............................................
[CV] ...... C=10, gamma=0.001, score=0.7355658198614319, total=   0.7s
[CV] C=10, gamma=0.01 ................................................
[CV] ....... C=10, gamma=0.01, score=0.8525345622119815, total=   0.6s
[CV] C=10, gamma=0.01 ................................................
[CV] .

[Parallel(n_jobs=1)]: Done  84 out of  84 | elapsed:  1.1min finished


{'C': 10, 'gamma': 0.1}
0.9846212995001923
SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.1, kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
              precision    recall  f1-score   support

           0       1.00      0.99      0.99       606
           1       0.97      1.00      0.98       261

   micro avg       0.99      0.99      0.99       867
   macro avg       0.98      0.99      0.99       867
weighted avg       0.99      0.99      0.99       867



In [20]:
# for 1 part 0 to 0.25 part 1
model_25 = grid_search_svc(X_train_25, X_test_25, y_train_25, y_test_25)

Fitting 3 folds for each of 28 candidates, totalling 84 fits
[CV] C=0.0001, gamma=0.0001 ..........................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] . C=0.0001, gamma=0.0001, score=0.8118131868131868, total=   0.3s
[CV] C=0.0001, gamma=0.0001 ..........................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s


[CV] . C=0.0001, gamma=0.0001, score=0.8118131868131868, total=   0.3s
[CV] C=0.0001, gamma=0.0001 ..........................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.6s remaining:    0.0s


[CV] . C=0.0001, gamma=0.0001, score=0.8126721763085399, total=   0.3s
[CV] C=0.0001, gamma=0.001 ...........................................
[CV] .. C=0.0001, gamma=0.001, score=0.8118131868131868, total=   0.3s
[CV] C=0.0001, gamma=0.001 ...........................................
[CV] .. C=0.0001, gamma=0.001, score=0.8118131868131868, total=   0.3s
[CV] C=0.0001, gamma=0.001 ...........................................
[CV] .. C=0.0001, gamma=0.001, score=0.8126721763085399, total=   0.3s
[CV] C=0.0001, gamma=0.01 ............................................
[CV] ... C=0.0001, gamma=0.01, score=0.8118131868131868, total=   0.3s
[CV] C=0.0001, gamma=0.01 ............................................
[CV] ... C=0.0001, gamma=0.01, score=0.8118131868131868, total=   0.3s
[CV] C=0.0001, gamma=0.01 ............................................
[CV] ... C=0.0001, gamma=0.01, score=0.8126721763085399, total=   0.3s
[CV] C=0.0001, gamma=0.1 .............................................
[CV] .

[CV] ..... C=10, gamma=0.0001, score=0.8118131868131868, total=   0.4s
[CV] C=10, gamma=0.0001 ..............................................
[CV] ..... C=10, gamma=0.0001, score=0.8118131868131868, total=   0.4s
[CV] C=10, gamma=0.0001 ..............................................
[CV] ..... C=10, gamma=0.0001, score=0.8126721763085399, total=   0.4s
[CV] C=10, gamma=0.001 ...............................................
[CV] ...... C=10, gamma=0.001, score=0.8118131868131868, total=   0.4s
[CV] C=10, gamma=0.001 ...............................................
[CV] ...... C=10, gamma=0.001, score=0.8118131868131868, total=   0.4s
[CV] C=10, gamma=0.001 ...............................................
[CV] ...... C=10, gamma=0.001, score=0.8126721763085399, total=   0.3s
[CV] C=10, gamma=0.01 ................................................
[CV] ....... C=10, gamma=0.01, score=0.8763736263736264, total=   0.3s
[CV] C=10, gamma=0.01 ................................................
[CV] .

[Parallel(n_jobs=1)]: Done  84 out of  84 | elapsed:   32.8s finished


{'C': 10, 'gamma': 0.1}
0.965627864344638
SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.1, kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
              precision    recall  f1-score   support

           0       0.98      0.99      0.98       579
           1       0.94      0.91      0.93       149

   micro avg       0.97      0.97      0.97       728
   macro avg       0.96      0.95      0.96       728
weighted avg       0.97      0.97      0.97       728



Huh.  That's pretty good.  Try out model_100 and model_50 with the 2016-2018 stuff.

#### STEP5: Predict 2016-2018 winners with SVC Grid Search.

In [21]:
# predict for 2018.
predict_the_winner(model_100, 2018, team_data, X_train_100)


WS Probability = 3.6760639449814633e-06
Arizona Diamondbacks

WS Probability = 2.5582409892237516e-06
St. Louis Cardinals

WS Probability = 2.5560855939961604e-06
Boston Red Sox

WS Probability = 1.9576242631179643e-06
Oakland Athletics

WS Probability = 1.7472926034905176e-06
Chicago Cubs

WS Probability = 1.521496618033603e-06
Texas Rangers

WS Probability = 1.3765462578049393e-06
Tampa Bay Rays

WS Probability = 1.3547950455168502e-06
Cleveland Indians

WS Probability = 1.3498599040851868e-06
Washington Nationals

WS Probability = 1.1941428856819847e-06
Miami Marlins

WS Probability = 1.1693690384421752e-06
Houston Astros

WS Probability = 1.1388021277191365e-06
Chicago White Sox

WS Probability = 1.0955416220284706e-06
Detroit Tigers

WS Probability = 1.0848589854978277e-06
Minnesota Twins

WS Probability = 1.0750665280407774e-06
Los Angeles Dodgers

WS Probability = 1.0615076164991741e-06
New York Yankees

WS Probability = 9.880081293562964e-07
Cincinnati Reds

WS Probability = 9

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [22]:
# predict for 2017.
predict_the_winner(model_100, 2017, team_data, X_train_100)


WS Probability = 2.6852731897058092e-06
Los Angeles Angels

WS Probability = 1.8160005734892082e-06
Los Angeles Dodgers

WS Probability = 1.6955679713593655e-06
Washington Nationals

WS Probability = 1.6091551391193876e-06
Houston Astros

WS Probability = 1.3931619602313293e-06
Tampa Bay Rays

WS Probability = 1.3453662421882901e-06
Milwaukee Brewers

WS Probability = 1.3149935718325086e-06
Kansas City Royals

WS Probability = 1.2066128880071095e-06
Oakland Athletics

WS Probability = 1.1653714261935242e-06
Chicago Cubs

WS Probability = 1.1528460466227418e-06
Chicago White Sox

WS Probability = 1.146519356336665e-06
Boston Red Sox

WS Probability = 1.1266185481733852e-06
Cleveland Indians

WS Probability = 1.0053776806940313e-06
Baltimore Orioles

WS Probability = 9.582567588170387e-07
Miami Marlins

WS Probability = 9.077908089294138e-07
Atlanta Braves

WS Probability = 8.68069960613252e-07
Arizona Diamondbacks

WS Probability = 8.553107595778217e-07
Colorado Rockies

WS Probability

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [23]:
# predict for 2016
predict_the_winner(model_100, 2016, team_data, X_train_100)


WS Probability = 1.8248384048249684e-06
Oakland Athletics

WS Probability = 1.52440395548903e-06
Colorado Rockies

WS Probability = 1.4349436878984857e-06
Minnesota Twins

WS Probability = 1.2801482029644495e-06
New York Yankees

WS Probability = 1.244369764580155e-06
Seattle Mariners

WS Probability = 1.2110856137968616e-06
Washington Nationals

WS Probability = 1.2056961044780404e-06
Houston Astros

WS Probability = 1.1347632773862286e-06
Cleveland Indians

WS Probability = 1.1262399184143883e-06
Boston Red Sox

WS Probability = 1.0727842034721411e-06
Los Angeles Dodgers

WS Probability = 1.000294744530405e-06
Chicago Cubs

WS Probability = 9.479875612891692e-07
San Diego Padres

WS Probability = 9.26333827420569e-07
San Francisco Giants

WS Probability = 9.176077961977231e-07
Detroit Tigers

WS Probability = 8.407647924564116e-07
Los Angeles Angels

WS Probability = 8.022776501415965e-07
St. Louis Cardinals

WS Probability = 7.331854945277729e-07
Chicago White Sox

WS Probability =

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [24]:
# predict for 2018
predict_the_winner(model_50, 2018, team_data, X_train_50)


WS Probability = 2.2330870600882815e-06
Boston Red Sox

WS Probability = 2.088285456705992e-06
Arizona Diamondbacks

WS Probability = 1.7699167413565397e-06
Cleveland Indians

WS Probability = 1.6299286646924794e-06
St. Louis Cardinals

WS Probability = 1.570787230054825e-06
Texas Rangers

WS Probability = 1.4782182160242947e-06
Tampa Bay Rays

WS Probability = 1.450523618790206e-06
Washington Nationals

WS Probability = 1.184446688755143e-06
Oakland Athletics

WS Probability = 1.1630284471452722e-06
Chicago Cubs

WS Probability = 1.1611235876666644e-06
Chicago White Sox

WS Probability = 1.1562021491075153e-06
Miami Marlins

WS Probability = 1.1270177559628125e-06
Detroit Tigers

WS Probability = 1.0594875085746544e-06
Los Angeles Dodgers

WS Probability = 1.0031458945631e-06
Houston Astros

WS Probability = 9.86296408986498e-07
Seattle Mariners

WS Probability = 9.854387865433162e-07
Cincinnati Reds

WS Probability = 8.980266545758968e-07
Minnesota Twins

WS Probability = 8.94403184

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [25]:
# predict for 2017
predict_the_winner(model_50, 2017, team_data, X_train_50)


WS Probability = 4.435803472991916e-06
Los Angeles Angels

WS Probability = 3.0317059228436747e-06
Washington Nationals

WS Probability = 2.0099579263214645e-06
Tampa Bay Rays

WS Probability = 1.9127924761816737e-06
Milwaukee Brewers

WS Probability = 1.7878867077440746e-06
Houston Astros

WS Probability = 1.7791793673669025e-06
Los Angeles Dodgers

WS Probability = 1.3535134582069066e-06
Chicago White Sox

WS Probability = 1.2961231319767085e-06
Kansas City Royals

WS Probability = 1.2159367304216571e-06
Cleveland Indians

WS Probability = 1.1568287321212171e-06
Chicago Cubs

WS Probability = 1.1365328186998113e-06
Boston Red Sox

WS Probability = 1.1329154316495082e-06
Oakland Athletics

WS Probability = 1.0711390423793435e-06
Baltimore Orioles

WS Probability = 1.00133009063549e-06
Miami Marlins

WS Probability = 9.97086644998179e-07
Atlanta Braves

WS Probability = 8.047500954776112e-07
Pittsburgh Pirates

WS Probability = 7.457325974552707e-07
New York Yankees

WS Probability = 

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [26]:
# predict for 2016
predict_the_winner(model_50, 2016, team_data, X_train_50)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)



WS Probability = 3.431271612863942e-06
Oakland Athletics

WS Probability = 2.0002482260870595e-06
Seattle Mariners

WS Probability = 1.8518680926599044e-06
Washington Nationals

WS Probability = 1.294097497843082e-06
New York Yankees

WS Probability = 1.245563975952465e-06
Minnesota Twins

WS Probability = 1.1913326951150271e-06
Cleveland Indians

WS Probability = 1.1557454429334776e-06
Boston Red Sox

WS Probability = 1.131008152215997e-06
Houston Astros

WS Probability = 1.0994566075199944e-06
Los Angeles Dodgers

WS Probability = 1.0229249529485798e-06
San Diego Padres

WS Probability = 9.413790316254095e-07
Detroit Tigers

WS Probability = 8.997081990604724e-07
Colorado Rockies

WS Probability = 8.985727783706932e-07
San Francisco Giants

WS Probability = 8.091018419139008e-07
Chicago White Sox

WS Probability = 6.676247443435629e-07
Cincinnati Reds

WS Probability = 6.387539328559e-07
Baltimore Orioles

WS Probability = 4.861584750155302e-07
Philadelphia Phillies

WS Probability 

This is not better.  The logistic regression without grid and upsampling works better than either upsampled thing.