## Purpose: Try different models-- Part4.
### PCA.

In [1]:
# import dependencies.
import pandas as pd
import numpy as np

from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

#### STEP1: Read in dataset.  Remove data from 2016-2019.
- data from 2016-2018 will be used to bs test the model.
- data from 2019 will be used to predict the winners of the 2019 WS.

In [2]:
# read in the data.
team_data = pd.read_csv("../Resources/clean_data.csv")
del team_data["Unnamed: 0"]
team_data.head()

Unnamed: 0,team,year,A,DP,E,GS2,INN,PB,PO,TC,...,OBP1,R1,SHO,SO1,SV,TBF,W,WHIP,WP,winners
0,St. Louis Cardinals,2019,1033,114,43,936,8313.0,3,2771,3847,...,0.319,456,4,895,33,3896,56,1.29,21,0
1,Arizona Diamondbacks,2019,1010,83,45,945,8538.0,2,2846,3901,...,0.315,472,7,925,24,4001,53,1.28,35,0
2,Kansas City Royals,2019,990,105,45,954,8421.0,6,2807,3842,...,0.346,543,5,816,24,4125,39,1.46,34,0
3,Houston Astros,2019,875,54,50,954,8589.0,6,2863,3788,...,0.284,432,7,1074,27,3929,67,1.14,31,0
4,Tampa Bay Rays,2019,975,92,53,963,8760.0,11,2920,3948,...,0.291,409,6,1037,26,3985,59,1.16,40,0


In [3]:
# remove data from 2016 through 2019.
team_data_new = team_data.loc[team_data["year"] < 2016]
team_data_new.head()

Unnamed: 0,team,year,A,DP,E,GS2,INN,PB,PO,TC,...,OBP1,R1,SHO,SO1,SV,TBF,W,WHIP,WP,winners
120,San Francisco Giants,2015,1639,136,72,1458,13143.0,6,4381,6092,...,0.303,631,11,1309,43,6048,87,1.21,40,0
121,Washington Nationals,2015,1425,142,73,1458,13137.0,17,4379,5877,...,0.3,612,12,1476,46,6036,95,1.19,47,0
122,Houston Astros,2015,1599,135,77,1458,13212.0,18,4404,6080,...,0.314,701,8,1396,44,6180,84,1.29,98,0
123,Detroit Tigers,2015,1537,148,75,1449,12852.0,5,4284,5896,...,0.32,721,8,1232,47,6048,86,1.32,44,0
124,Boston Red Sox,2015,1427,139,75,1458,12957.0,37,4319,5821,...,0.314,694,5,1362,43,6073,93,1.27,52,0


In [4]:
target = team_data_new["winners"]
features = team_data_new.drop({"team", "year", "winners"}, axis=1)
feature_columns = list(features.columns)
print (target.shape)
print (features.shape)
print (feature_columns)

(2344,)
(2344, 46)
['A', 'DP', 'E', 'GS2', 'INN', 'PB', 'PO', 'TC', '2B', '3B', 'AB', 'AO', 'BB', 'H', 'HBP', 'HR', 'NP_x', 'OBP', 'OPS_x', 'R', 'RBI', 'SAC', 'SB', 'SLG', 'BB1', 'BK', 'CG', 'ER', 'ERA', 'G1', 'GF', 'GS', 'H1', 'HB', 'HR1', 'IP', 'L', 'OBP1', 'R1', 'SHO', 'SO1', 'SV', 'TBF', 'W', 'WHIP', 'WP']


#### STEP2: Upsample and scale data.

In [5]:
# reset the index.
team_data_new = team_data_new.reset_index().drop({"index"}, axis=1)
team_data_new.head()

Unnamed: 0,team,year,A,DP,E,GS2,INN,PB,PO,TC,...,OBP1,R1,SHO,SO1,SV,TBF,W,WHIP,WP,winners
0,San Francisco Giants,2015,1639,136,72,1458,13143.0,6,4381,6092,...,0.303,631,11,1309,43,6048,87,1.21,40,0
1,Washington Nationals,2015,1425,142,73,1458,13137.0,17,4379,5877,...,0.3,612,12,1476,46,6036,95,1.19,47,0
2,Houston Astros,2015,1599,135,77,1458,13212.0,18,4404,6080,...,0.314,701,8,1396,44,6180,84,1.29,98,0
3,Detroit Tigers,2015,1537,148,75,1449,12852.0,5,4284,5896,...,0.32,721,8,1232,47,6048,86,1.32,44,0
4,Boston Red Sox,2015,1427,139,75,1458,12957.0,37,4319,5821,...,0.314,694,5,1362,43,6073,93,1.27,52,0


In [6]:
# remove team and year.
feature_columns_new = feature_columns + ["winners"]
team_data_new = team_data[feature_columns_new]
team_data_new.head()

Unnamed: 0,A,DP,E,GS2,INN,PB,PO,TC,2B,3B,...,OBP1,R1,SHO,SO1,SV,TBF,W,WHIP,WP,winners
0,1033,114,43,936,8313.0,3,2771,3847,157,10,...,0.319,456,4,895,33,3896,56,1.29,21,0
1,1010,83,45,945,8538.0,2,2846,3901,203,27,...,0.315,472,7,925,24,4001,53,1.28,35,0
2,990,105,45,954,8421.0,6,2807,3842,185,31,...,0.346,543,5,816,24,4125,39,1.46,34,0
3,875,54,50,954,8589.0,6,2863,3788,200,15,...,0.284,432,7,1074,27,3929,67,1.14,31,0
4,975,92,53,963,8760.0,11,2920,3948,195,21,...,0.291,409,6,1037,26,3985,59,1.16,40,0


In [7]:
# upsample for a more balanced dataset.
def upsample(dataset, no_samples, no_components):
    '''
    INPUT: 
    -dataset = dataset without team names and year.
    -n_samples = number of minority_unsampled.
    
    OUTPUT:
    -X_train_scaled = scaled X train data.
    -X_test_scaled = scaled X test data.
    -y_train = y train data
    -y_test = y test data
    
    DESCRIPTION:
    -dataset is taken in and split into minority and majority classes.
    -dataset is then upsampled for the mainority class
    -split the data into features and targets
    -split data into train and test sets
    -train and test sets were are scaled.
    '''
    
    # separate majority and minority classes.
    df_majority = dataset.loc[dataset["winners"] == 0]
    df_minority = dataset.loc[dataset["winners"] == 1]

    # upsample minority class.
    df_minority_unsampled = resample(df_minority,
                                    replace=True,
                                    n_samples=no_samples,
                                    random_state=123)

    # combine majority class with upsampled minority class.
    df_upsampled = pd.concat([df_majority, df_minority_unsampled])

    # separate features and target.
    y = df_upsampled["winners"]
    X = df_upsampled[feature_columns]
    
    # split into train and test sets.
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    # scale.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.fit_transform(X_test)
    
    # PCA.
    pca = PCA(n_components=no_components)
    X_train_scaled = pca.fit_transform(X_train_scaled)
    X_test_scaled = pca.fit_transform(X_test_scaled)
    
    return X_train_scaled, X_test_scaled, y_train, y_test

In [8]:
# Do three different upsamplings and three different pca conditions.
X_train_100_10, X_test_100_10, y_train_100_10, y_test_100_10 = upsample(team_data_new, 2234, 10)
X_train_100_5, X_test_100_5, y_train_100_5, y_test_100_5 = upsample(team_data_new, 2234, 5)
X_train_100_2, X_test_100_2, y_train_100_2, y_test_100_2 = upsample(team_data_new, 2234, 2)

X_train_50_10, X_test_50_10, y_train_50_10, y_test_50_10 = upsample(team_data_new, 1117, 10)
X_train_50_5, X_test_50_5, y_train_50_5, y_test_50_5 = upsample(team_data_new, 1117, 5)
X_train_50_2, X_test_50_2, y_train_50_2, y_test_50_2 = upsample(team_data_new, 1117, 2)

X_train_25_10, X_test_25_10, y_train_25_10, y_test_25_10 = upsample(team_data_new, 559, 10)
X_train_25_5, X_test_25_5, y_train_25_5, y_test_25_5 = upsample(team_data_new, 559, 5)
X_train_25_2, X_test_25_2, y_train_25_2, y_test_25_2 = upsample(team_data_new, 559, 2)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return

In [9]:
def logistic(X_train, X_test, y_train, y_test):
    '''
    INPUT: 
    -X_train = scaled X train data.
    -X_test = scaled X test data.
    -y_train = y train data.
    -y_test = y test data.
    
    OUTPUT:
    -classification report (has F1 score, precision and recall).
    -grid = saved model for prediction. 
    
    DESCRIPTION:
    -the scaled and split data is put through a grid search with logistic.
    -the model is trained.
    -a prediction is made.
    -print out the classification report and give the model.
    '''
    
    # fit the model.
    model = LogisticRegression(solver="lbfgs", max_iter= 2000)

    # fit the model.
    model.fit(X_train, y_train)

    # predict.
    prediction = model.predict(X_test)
    
    print (classification_report(y_test, prediction, target_names=["0", "1"]))
    
    return model

In [10]:
model_100_10 = logistic(X_train_100_10, X_test_100_10, y_train_100_10, y_test_100_10)
model_100_10

              precision    recall  f1-score   support

           0       0.72      0.64      0.68       607
           1       0.64      0.72      0.68       540

   micro avg       0.68      0.68      0.68      1147
   macro avg       0.68      0.68      0.68      1147
weighted avg       0.68      0.68      0.68      1147



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=2000, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [11]:
model_100_5 = logistic(X_train_100_5, X_test_100_5, y_train_100_5, y_test_100_5)
model_100_5

              precision    recall  f1-score   support

           0       0.71      0.65      0.68       607
           1       0.64      0.71      0.67       540

   micro avg       0.68      0.68      0.68      1147
   macro avg       0.68      0.68      0.68      1147
weighted avg       0.68      0.68      0.68      1147



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=2000, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [12]:
model_100_2 = logistic(X_train_100_2, X_test_100_2, y_train_100_2, y_test_100_2)
model_100_2

              precision    recall  f1-score   support

           0       0.62      0.55      0.58       607
           1       0.55      0.63      0.59       540

   micro avg       0.59      0.59      0.59      1147
   macro avg       0.59      0.59      0.58      1147
weighted avg       0.59      0.59      0.58      1147



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=2000, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [13]:
model_50_10 = logistic(X_train_50_10, X_test_50_10, y_train_50_10, y_test_50_10)
model_50_10

              precision    recall  f1-score   support

           0       0.79      0.86      0.83       606
           1       0.60      0.46      0.52       261

   micro avg       0.74      0.74      0.74       867
   macro avg       0.69      0.66      0.67       867
weighted avg       0.73      0.74      0.73       867



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=2000, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [14]:
model_50_5 = logistic(X_train_50_5, X_test_50_5, y_train_50_5, y_test_50_5)
model_50_5

              precision    recall  f1-score   support

           0       0.79      0.87      0.83       606
           1       0.61      0.48      0.54       261

   micro avg       0.75      0.75      0.75       867
   macro avg       0.70      0.67      0.68       867
weighted avg       0.74      0.75      0.74       867



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=2000, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [15]:
model_25_10 = logistic(X_train_25_10, X_test_25_10, y_train_25_10, y_test_25_10)
model_25_10

              precision    recall  f1-score   support

           0       0.78      0.91      0.84       579
           1       0.02      0.01      0.01       149

   micro avg       0.73      0.73      0.73       728
   macro avg       0.40      0.46      0.43       728
weighted avg       0.62      0.73      0.67       728



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=2000, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [16]:
model_25_5 = logistic(X_train_25_5, X_test_25_5, y_train_25_5, y_test_25_5)
model_25_5

              precision    recall  f1-score   support

           0       0.78      0.92      0.85       579
           1       0.02      0.01      0.01       149

   micro avg       0.73      0.73      0.73       728
   macro avg       0.40      0.46      0.43       728
weighted avg       0.63      0.73      0.68       728



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=2000, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [17]:
model_25_2 = logistic(X_train_25_2, X_test_25_2, y_train_25_2, y_test_25_2)
model_25_2

              precision    recall  f1-score   support

           0       0.80      1.00      0.89       579
           1       0.00      0.00      0.00       149

   micro avg       0.80      0.80      0.80       728
   macro avg       0.40      0.50      0.44       728
weighted avg       0.63      0.80      0.70       728



  'precision', 'predicted', average, warn_for)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=2000, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

Use model_100_10.

In [20]:
def predict_the_winner(model, year, team_data, X_train, no_components):
    '''
    INPUT: 
    -X_train = scaled X train data.
    -model = the saved model.
    -team_data = complete dataframe with all data.
    -year = the year want to look at.
    
    OUTPUT:
    -printed prediction.
    
    DESCRIPTION:
    -data from year of interest is isolated.
    -the data are scaled.
    -the prediction is made.
    -print out the resulting probability and the name of the team.
    '''
    
    # grab the data.
    team_data = team_data.loc[team_data["year"] == year].reset_index()

    # set features (no team, year, winners).
    # set target (winners).
    features = team_data[feature_columns]
    
    # scale.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    features = scaler.fit_transform(features)
    
    # PCA.
    pca = PCA(n_components=no_components)
    X_train_scaled = pca.fit_transform(X_train_scaled)
    features = pca.fit_transform(features)
    
    # fit the model.
    probabilities = model.predict_proba(features)

    # convert predictions to datafram.e
    WS_predictions = pd.DataFrame(probabilities[:,1])

    # Sort the DataFrame (descending)
    WS_predictions = WS_predictions.sort_values(0, ascending=False)

    WS_predictions['Probability'] = WS_predictions[0]

    # Print 50 highest probability HoF inductees from still eligible players
    for i, row in WS_predictions.head(50).iterrows():
       prob = ' '.join(('WS Probability =', str(row['Probability'])))
       print('')
       print(prob)
       print(team_data.iloc[i,1:27]["team"])

In [21]:
# predict for 2018.
predict_the_winner(model_100_10, 2018, team_data, X_train_100_10, 10)


WS Probability = 0.8430477404468506
Baltimore Orioles

WS Probability = 0.8208655819654536
Miami Marlins

WS Probability = 0.8086564007736856
Detroit Tigers

WS Probability = 0.7654182765506058
Los Angeles Angels

WS Probability = 0.7252839246249292
Washington Nationals

WS Probability = 0.7215690326451414
Cincinnati Reds

WS Probability = 0.6690108834211279
Houston Astros

WS Probability = 0.6620060099454211
San Diego Padres

WS Probability = 0.6426065811129382
Oakland Athletics

WS Probability = 0.5864037714700981
Minnesota Twins

WS Probability = 0.5535304307475479
New York Mets

WS Probability = 0.5180849597942915
Tampa Bay Rays

WS Probability = 0.4974867402485644
Texas Rangers

WS Probability = 0.4641482034131479
Chicago White Sox

WS Probability = 0.4522471785111992
Los Angeles Dodgers

WS Probability = 0.44514365389337024
New York Yankees

WS Probability = 0.44258311085649527
Cleveland Indians

WS Probability = 0.4239020939444365
Toronto Blue Jays

WS Probability = 0.420238140

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [22]:
# predict for 2017.
predict_the_winner(model_100_10, 2017, team_data, X_train_100_10, 10)


WS Probability = 0.8689325077331056
Chicago White Sox

WS Probability = 0.7572103916171553
Philadelphia Phillies

WS Probability = 0.752874741073026
Detroit Tigers

WS Probability = 0.7357194632847872
Tampa Bay Rays

WS Probability = 0.7321819742899162
Washington Nationals

WS Probability = 0.64798267706221
Colorado Rockies

WS Probability = 0.6423145698040247
New York Yankees

WS Probability = 0.6343882117537152
Pittsburgh Pirates

WS Probability = 0.6252455839180293
Toronto Blue Jays

WS Probability = 0.6153883761718913
San Diego Padres

WS Probability = 0.5922845728991086
Texas Rangers

WS Probability = 0.5734518700995734
Houston Astros

WS Probability = 0.5725736899130952
Los Angeles Angels

WS Probability = 0.538809899273642
Miami Marlins

WS Probability = 0.5380144651186222
Los Angeles Dodgers

WS Probability = 0.5107694187442878
Boston Red Sox

WS Probability = 0.49987681642637954
Baltimore Orioles

WS Probability = 0.4643881066075196
Arizona Diamondbacks

WS Probability = 0.45

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


PCA is not it.  The F1 scores are terrible and the bs test doesn't work.