## Purpose: Try different models-- Part5.
### Penalized_SVM.

In [1]:
# import dependencies.
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC

#### STEP1: Read in dataset.  Remove data from 2016-2019.
- data from 2016-2018 will be used to bs test the model.
- data from 2019 will be used to predict the winners of the 2019 WS.

In [2]:
# read in the data.
team_data = pd.read_csv("../../Resources/clean_data_1969.csv")
del team_data["Unnamed: 0"]
team_data.head()

Unnamed: 0,team,year,A,DP,E,G2,GS2,INN,PB,PO,...,SHO,SO1,SV,SVO,TBF,W,WHIP,WP,WPCT,winners
0,St. Louis Cardinals,2019,1033,114,43,104,936,8313.0,3,2771,...,4,895,33,42,3896,56,1.29,21,0.538,0
1,Arizona Diamondbacks,2019,1010,83,45,105,945,8538.0,2,2846,...,7,925,24,37,4001,53,1.28,35,0.505,0
2,Kansas City Royals,2019,990,105,45,106,954,8421.0,6,2807,...,5,816,24,41,4125,39,1.46,34,0.368,0
3,Houston Astros,2019,875,54,50,106,954,8589.0,6,2863,...,7,1074,27,42,3929,67,1.14,31,0.632,0
4,Tampa Bay Rays,2019,975,92,53,107,963,8760.0,11,2920,...,6,1037,26,43,3985,59,1.16,40,0.551,0


In [3]:
# remove data from 2016 through 2019.
team_data_new = team_data.loc[team_data["year"] < 2016]
team_data_new.head()

Unnamed: 0,team,year,A,DP,E,G2,GS2,INN,PB,PO,...,SHO,SO1,SV,SVO,TBF,W,WHIP,WP,WPCT,winners
120,San Francisco Giants,2015,1639,136,72,162,1458,13143.0,6,4381,...,11,1309,43,72,6048,87,1.21,40,0.537,0
121,Washington Nationals,2015,1425,142,73,162,1458,13137.0,17,4379,...,12,1476,46,60,6036,95,1.19,47,0.586,0
122,Houston Astros,2015,1599,135,77,162,1458,13212.0,18,4404,...,8,1396,44,64,6180,84,1.29,98,0.519,0
123,Detroit Tigers,2015,1537,148,75,161,1449,12852.0,5,4284,...,8,1232,47,66,6048,86,1.32,44,0.534,0
124,Boston Red Sox,2015,1427,139,75,162,1458,12957.0,37,4319,...,5,1362,43,61,6073,93,1.27,52,0.574,0


In [4]:
target = team_data_new["winners"]
features = team_data_new.drop({"team", "year", "winners"}, axis=1)
feature_columns = list(features.columns)
print (target.shape)
print (features.shape)
print (feature_columns)

(1266,)
(1266, 59)
['A', 'DP', 'E', 'G2', 'GS2', 'INN', 'PB', 'PO', 'TC', '2B', '3B', 'AB', 'AO', 'BB', 'CS', 'G', 'GDP', 'H', 'HBP', 'HR', 'IBB', 'NP_x', 'OBP', 'OPS_x', 'PA', 'R', 'RBI', 'SAC', 'SB', 'SF', 'SLG', 'SO', 'TB', 'XBH', 'BB1', 'BK', 'CG', 'ER', 'ERA', 'G1', 'GF', 'GS', 'H1', 'HB', 'HR1', 'IBB1', 'IP', 'L', 'OBP1', 'R1', 'SHO', 'SO1', 'SV', 'SVO', 'TBF', 'W', 'WHIP', 'WP', 'WPCT']


#### STEP2: Split and scale the data.

In [5]:
# split data.
X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=42)

# scale data.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


#### STEP3: Try the SVC model.

In [6]:
# generate the model.
model = SVC(kernel="rbf",
           class_weight="balanced",
           probability=True)

# fit the model.
model.fit(X_train_scaled, y_train)

# predict.
prediction = model.predict(X_test_scaled)

print ((classification_report(y_test, prediction, target_names=["0", "1"])))

              precision    recall  f1-score   support

           0       0.97      0.87      0.92       304
           1       0.09      0.31      0.14        13

   micro avg       0.85      0.85      0.85       317
   macro avg       0.53      0.59      0.53       317
weighted avg       0.93      0.85      0.89       317



#### STEP4: Predict the winner 2016-2018.

In [7]:
def predict_the_winner(model, year, team_data, X_train):
    '''
    INPUT: 
    -X_train = scaled X train data.
    -model = the saved model.
    -team_data = complete dataframe with all data.
    -year = the year want to look at.
    
    OUTPUT:
    -printed prediction.
    
    DESCRIPTION:
    -data from year of interest is isolated.
    -the data are scaled.
    -the prediction is made.
    -print out the resulting probability and the name of the team.
    '''
    
    # grab the data.
    team_data = team_data.loc[team_data["year"] == year].reset_index()

    # set features (no team, year, winners).
    # set target (winners).
    features = team_data[feature_columns]
    
    # scale.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    features = scaler.fit_transform(features)
    
    # fit the model.
    probabilities = model.predict_proba(features)

    # convert predictions to datafram.e
    WS_predictions = pd.DataFrame(probabilities[:,1])

    # Sort the DataFrame (descending)
    WS_predictions = WS_predictions.sort_values(0, ascending=False)

    WS_predictions['Probability'] = WS_predictions[0]

    # Print 50 highest probability HoF inductees from still eligible players
    for i, row in WS_predictions.head(50).iterrows():
       prob = ' '.join(('WS Probability =', str(row['Probability'])))
       print('')
       print(prob)
       print(team_data.iloc[i,1:27]["team"])

In [8]:
# predict for 2018.
predict_the_winner(model, 2018, team_data, X_train_scaled)


WS Probability = 0.060852004508653385
Atlanta Braves

WS Probability = 0.06028130072090872
Washington Nationals

WS Probability = 0.059648318375392376
Arizona Diamondbacks

WS Probability = 0.05476396521274732
New York Yankees

WS Probability = 0.05233526002093639
Oakland Athletics

WS Probability = 0.049913132415739164
Minnesota Twins

WS Probability = 0.04882868794944363
Los Angeles Angels

WS Probability = 0.04738620245195279
Boston Red Sox

WS Probability = 0.041756522353870035
Cleveland Indians

WS Probability = 0.039316978281950944
Chicago Cubs

WS Probability = 0.03488299799967733
Tampa Bay Rays

WS Probability = 0.03337499483550707
New York Mets

WS Probability = 0.03274768978329956
St. Louis Cardinals

WS Probability = 0.028468565693069783
Cincinnati Reds

WS Probability = 0.027829060811489174
Milwaukee Brewers

WS Probability = 0.02632645299460872
Los Angeles Dodgers

WS Probability = 0.025663015009439162
Houston Astros

WS Probability = 0.024676155224458533
Texas Rangers

W

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [9]:
# predict for 2017.
predict_the_winner(model, 2017, team_data, X_train_scaled)


WS Probability = 0.08984049057305296
Washington Nationals

WS Probability = 0.057445408892973594
Los Angeles Angels

WS Probability = 0.05635823472062213
Boston Red Sox

WS Probability = 0.05190520611531737
Cleveland Indians

WS Probability = 0.050463324998347915
Seattle Mariners

WS Probability = 0.04888431760027947
Atlanta Braves

WS Probability = 0.046303631017353276
Tampa Bay Rays

WS Probability = 0.04273026559203425
New York Yankees

WS Probability = 0.03930025498556103
Houston Astros

WS Probability = 0.03768582846748057
New York Mets

WS Probability = 0.036353868121461665
Milwaukee Brewers

WS Probability = 0.03557061393914468
Oakland Athletics

WS Probability = 0.030991035748564457
Colorado Rockies

WS Probability = 0.0307854271217912
Los Angeles Dodgers

WS Probability = 0.02953121500312916
Minnesota Twins

WS Probability = 0.026272963230368766
Arizona Diamondbacks

WS Probability = 0.02497380042928017
Pittsburgh Pirates

WS Probability = 0.020629277499001553
Chicago Cubs

W

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


Ok.  This didn't work. Let's try this penalized model with a grid search.

In [10]:
def grid_search_svc(X_train, X_test, y_train, y_test):
    '''
    INPUT: 
    -X_train = scaled X train data.
    -X_test = scaled X test data.
    -y_train = y train data.
    -y_test = y test data.
    
    OUTPUT:
    -classification report (has F1 score, precision and recall).
    -grid = saved model for prediction. 
    
    DESCRIPTION:
    -the scaled and split data is put through a grid search with svc.
    -the model is trained.
    -a prediction is made.
    -print out the classification report and give the model.
    '''
    
    # set up svc model.
    model = SVC(kernel="rbf", 
                class_weight="balanced",
                probability=True)

    # create gridsearch estimator.
    param_grid = {"C": [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
                 "gamma": [0.0001, 0.001, 0.01, 0.1]}
    grid = GridSearchCV(model, param_grid, verbose=3)

    # fit the model.
    grid.fit(X_train, y_train)

    # predict.
    prediction = grid.predict(X_test)
    
    # print out the basic information about the grid search.
    print (grid.best_params_)
    print (grid.best_score_)
    print (grid.best_estimator_)
    
    grid = grid.best_estimator_
    predictions = grid.predict(X_test)
    print (classification_report(y_test, prediction, target_names=["0", "1"]))
    
    return grid

In [11]:
model_grid = grid_search_svc(X_train, X_test, y_train, y_test)

Fitting 3 folds for each of 28 candidates, totalling 84 fits
[CV] C=0.0001, gamma=0.0001 ..........................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  C=0.0001, gamma=0.0001, score=0.03470031545741325, total=   0.2s
[CV] C=0.0001, gamma=0.0001 ..........................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s


[CV] . C=0.0001, gamma=0.0001, score=0.9651898734177216, total=   0.2s
[CV] C=0.0001, gamma=0.0001 ..........................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.4s remaining:    0.0s


[CV] . C=0.0001, gamma=0.0001, score=0.9651898734177216, total=   0.2s
[CV] C=0.0001, gamma=0.001 ...........................................
[CV] . C=0.0001, gamma=0.001, score=0.03470031545741325, total=   0.2s
[CV] C=0.0001, gamma=0.001 ...........................................
[CV] .. C=0.0001, gamma=0.001, score=0.9651898734177216, total=   0.2s
[CV] C=0.0001, gamma=0.001 ...........................................
[CV] .. C=0.0001, gamma=0.001, score=0.9651898734177216, total=   0.2s
[CV] C=0.0001, gamma=0.01 ............................................
[CV] .. C=0.0001, gamma=0.01, score=0.03470031545741325, total=   0.1s
[CV] C=0.0001, gamma=0.01 ............................................
[CV] ... C=0.0001, gamma=0.01, score=0.9651898734177216, total=   0.1s
[CV] C=0.0001, gamma=0.01 ............................................
[CV] ... C=0.0001, gamma=0.01, score=0.9651898734177216, total=   0.1s
[CV] C=0.0001, gamma=0.1 .............................................
[CV] .

[CV] ..... C=10, gamma=0.0001, score=0.9652996845425867, total=   0.2s
[CV] C=10, gamma=0.0001 ..............................................
[CV] ..... C=10, gamma=0.0001, score=0.9651898734177216, total=   0.2s
[CV] C=10, gamma=0.0001 ..............................................
[CV] ..... C=10, gamma=0.0001, score=0.9651898734177216, total=   0.2s
[CV] C=10, gamma=0.001 ...............................................
[CV] ...... C=10, gamma=0.001, score=0.9652996845425867, total=   0.2s
[CV] C=10, gamma=0.001 ...............................................
[CV] ...... C=10, gamma=0.001, score=0.9651898734177216, total=   0.2s
[CV] C=10, gamma=0.001 ...............................................
[CV] ...... C=10, gamma=0.001, score=0.9651898734177216, total=   0.2s
[CV] C=10, gamma=0.01 ................................................
[CV] ....... C=10, gamma=0.01, score=0.9652996845425867, total=   0.2s
[CV] C=10, gamma=0.01 ................................................
[CV] .

[Parallel(n_jobs=1)]: Done  84 out of  84 | elapsed:   16.3s finished


{'C': 0.1, 'gamma': 0.0001}
0.9652265542676501
SVC(C=0.1, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.0001, kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       304
           1       0.00      0.00      0.00        13

   micro avg       0.96      0.96      0.96       317
   macro avg       0.48      0.50      0.49       317
weighted avg       0.92      0.96      0.94       317



  'precision', 'predicted', average, warn_for)


Nope.  This is terrible.  Lots of no.