## Purpose: Try different models-- Part1.
### Linear Regression

In [95]:
# import dependencies.
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, classification_report
from sklearn.model_selection import train_test_split

#### STEP1: Read in dataset.  Remove data from 2016-2019.
- data from 2016-2018 will be used to bs test the model.
- data from 2019 will be used to predict the winners of the 2019 WS.

In [96]:
# read in the data.
team_data = pd.read_csv("../Resources/clean_data.csv")
del team_data["Unnamed: 0"]
team_data.head()

Unnamed: 0,team,year,A,DP,E,GS2,INN,PB,PO,TC,...,OBP1,R1,SHO,SO1,SV,TBF,W,WHIP,WP,winners
0,St. Louis Cardinals,2019,1033,114,43,936,8313.0,3,2771,3847,...,0.319,456,4,895,33,3896,56,1.29,21,0
1,Arizona Diamondbacks,2019,1010,83,45,945,8538.0,2,2846,3901,...,0.315,472,7,925,24,4001,53,1.28,35,0
2,Kansas City Royals,2019,990,105,45,954,8421.0,6,2807,3842,...,0.346,543,5,816,24,4125,39,1.46,34,0
3,Houston Astros,2019,875,54,50,954,8589.0,6,2863,3788,...,0.284,432,7,1074,27,3929,67,1.14,31,0
4,Tampa Bay Rays,2019,975,92,53,963,8760.0,11,2920,3948,...,0.291,409,6,1037,26,3985,59,1.16,40,0


In [97]:
# remove data from 2016 through 2019.
team_data_new = team_data.loc[team_data["year"] < 2016]
team_data_new.head()

Unnamed: 0,team,year,A,DP,E,GS2,INN,PB,PO,TC,...,OBP1,R1,SHO,SO1,SV,TBF,W,WHIP,WP,winners
120,San Francisco Giants,2015,1639,136,72,1458,13143.0,6,4381,6092,...,0.303,631,11,1309,43,6048,87,1.21,40,0
121,Washington Nationals,2015,1425,142,73,1458,13137.0,17,4379,5877,...,0.3,612,12,1476,46,6036,95,1.19,47,0
122,Houston Astros,2015,1599,135,77,1458,13212.0,18,4404,6080,...,0.314,701,8,1396,44,6180,84,1.29,98,0
123,Detroit Tigers,2015,1537,148,75,1449,12852.0,5,4284,5896,...,0.32,721,8,1232,47,6048,86,1.32,44,0
124,Boston Red Sox,2015,1427,139,75,1458,12957.0,37,4319,5821,...,0.314,694,5,1362,43,6073,93,1.27,52,0


In [98]:
# models require numbers! Drop team and year columns.
target = team_data_new["winners"]
features = team_data_new.drop({"team", "year", "winners"}, axis=1)
print (target.shape)
print (features.shape)

(2344,)
(2344, 46)


In [99]:
X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=42)

#### STEP2: Try Linear Regression.

In [100]:
# fit the model.
model = LinearRegression()
model.fit(X_train, y_train)

# predict.
predicted = model.predict(X_test)

# evaluate the model.
mse = mean_squared_error(y_test, predicted)
r2 = r2_score(y_test, predicted)
print (f"MSE: {mse}")
print (f"R-squared: {r2}")

MSE: 0.05114210799959861
R-squared: 0.03764615493286427


The linear model doesn't work.  The R-squared value is absolutely abysmal.  I could pick a one at random and do a better job!

#### STEP6: PCA.

In [34]:
# split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=42)

# scale X_train and X_test.
scaler = StandardScaler()

# transform the training and testing data.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

# apply pca on the X components.
pca = PCA(n_components = 2)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.fit_transform(X_test_scaled)

explained_variance = pca.explained_variance_ratio_ 

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [44]:
classifier = LogisticRegression(solver="lbfgs", random_state=42) 
classifier.fit(X_train_pca, y_train) 
predictions = classifier.predict(X_test_pca)

In [45]:
print (classification_report(y_test, predictions, target_names=["0", "1"]))

              precision    recall  f1-score   support

           0       0.94      1.00      0.97       553
           1       0.00      0.00      0.00        33

   micro avg       0.94      0.94      0.94       586
   macro avg       0.47      0.50      0.49       586
weighted avg       0.89      0.94      0.92       586



  'precision', 'predicted', average, warn_for)


In [36]:
# grab the 2018 data.
team_data_2018 = team_data.loc[team_data["year"] == 2018].reset_index()

# set features (no team, year, winners).
# set target (winners).
features_2018 = team_data_2018.drop({"team", "year", "winners"}, axis=1).drop({"index"}, axis=1)
features_2018 = scaler.fit_transform(features_2018)
features_2018 = pca.fit_transform(features_2018)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [37]:
# fit the model.
probabilities = classifier.predict_proba(features_2018)

# convert predictions to datafram.e
WS_predictions = pd.DataFrame(probabilities[:,1])

# Sort the DataFrame (descending)
WS_predictions = WS_predictions.sort_values(0, ascending=False)

WS_predictions['Probability'] = WS_predictions[0]

# Print 50 highest probability HoF inductees from still eligible players
for i, row in WS_predictions.head(50).iterrows():
   prob = ' '.join(('WS Probability =', str(row['Probability'])))
   print('')
   print(prob)
   print(team_data_2018.iloc[i,1:27]["team"])


WS Probability = 0.09035555848469622
Cincinnati Reds

WS Probability = 0.08726213645815503
Cleveland Indians

WS Probability = 0.0747569248671306
Miami Marlins

WS Probability = 0.07278240658813295
Los Angeles Dodgers

WS Probability = 0.06114789170583012
Washington Nationals

WS Probability = 0.06036181368769516
San Diego Padres

WS Probability = 0.06026986153959294
St. Louis Cardinals

WS Probability = 0.05841762176744002
Chicago White Sox

WS Probability = 0.05490991970500289
Houston Astros

WS Probability = 0.05115205473420071
New York Yankees

WS Probability = 0.04869697783059115
Chicago Cubs

WS Probability = 0.04770953978200814
Detroit Tigers

WS Probability = 0.0457877111644281
New York Mets

WS Probability = 0.04562240410874296
Minnesota Twins

WS Probability = 0.04330624781567202
Tampa Bay Rays

WS Probability = 0.039817307651562446
San Francisco Giants

WS Probability = 0.038401955417204967
Philadelphia Phillies

WS Probability = 0.0382985084449339
Arizona Diamondbacks

WS 

In [38]:
# grab the 2017 data.
team_data_2017 = team_data.loc[team_data["year"] == 2017].reset_index()

# set features (no team, year, winners).
# set target (winners).
features_2017 = team_data_2017.drop({"team", "year", "winners"}, axis=1).drop({"index"}, axis=1)
features_2017 = scaler.fit_transform(features_2017)
features_2017 = pca.fit_transform(features_2017)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [39]:
# fit the model.
probabilities = classifier.predict_proba(features_2017)

# convert predictions to datafram.e
WS_predictions = pd.DataFrame(probabilities[:,1])

# Sort the DataFrame (descending)
WS_predictions = WS_predictions.sort_values(0, ascending=False)

WS_predictions['Probability'] = WS_predictions[0]

# Print 50 highest probability HoF inductees from still eligible players
for i, row in WS_predictions.head(50).iterrows():
   prob = ' '.join(('WS Probability =', str(row['Probability'])))
   print('')
   print(prob)
   print(team_data_2017.iloc[i,1:27]["team"])


WS Probability = 0.08334583135908803
Pittsburgh Pirates

WS Probability = 0.06892370448408
Miami Marlins

WS Probability = 0.059441137841465985
Houston Astros

WS Probability = 0.058251944323909476
Cleveland Indians

WS Probability = 0.05256772875130675
Detroit Tigers

WS Probability = 0.05191074112615728
Minnesota Twins

WS Probability = 0.0509044678642867
New York Yankees

WS Probability = 0.050431775817083704
Los Angeles Angels

WS Probability = 0.05002827483292677
Boston Red Sox

WS Probability = 0.0496906770060075
Kansas City Royals

WS Probability = 0.04948943800265006
Toronto Blue Jays

WS Probability = 0.04927404697045466
Washington Nationals

WS Probability = 0.04789597724912886
Seattle Mariners

WS Probability = 0.04775645593544893
New York Mets

WS Probability = 0.047608915751616784
Baltimore Orioles

WS Probability = 0.04256153670154059
Oakland Athletics

WS Probability = 0.04220271114102812
Chicago White Sox

WS Probability = 0.041130583432789934
Texas Rangers

WS Probabi

In [40]:
# grab the 2016 data.
team_data_2016 = team_data.loc[team_data["year"] == 2016].reset_index()

# set features (no team, year, winners).
# set target (winners).
features_2016 = team_data_2016.drop({"team", "year", "winners"}, axis=1).drop({"index"}, axis=1)
features_2016 = scaler.fit_transform(features_2016)
features_2016 = pca.fit_transform(features_2016)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [41]:
# fit the model.
probabilities = classifier.predict_proba(features_2016)

# convert predictions to datafram.e
WS_predictions = pd.DataFrame(probabilities[:,1])

# Sort the DataFrame (descending)
WS_predictions = WS_predictions.sort_values(0, ascending=False)

WS_predictions['Probability'] = WS_predictions[0]

# Print 50 highest probability HoF inductees from still eligible players
for i, row in WS_predictions.head(50).iterrows():
   prob = ' '.join(('WS Probability =', str(row['Probability'])))
   print('')
   print(prob)
   print(team_data_2016.iloc[i,1:27]["team"])


WS Probability = 0.07518649138549548
Boston Red Sox

WS Probability = 0.07173895535288484
Los Angeles Dodgers

WS Probability = 0.06913223259173937
Los Angeles Angels

WS Probability = 0.0628083102599146
San Francisco Giants

WS Probability = 0.06095787881986291
Cleveland Indians

WS Probability = 0.05938123010619937
Pittsburgh Pirates

WS Probability = 0.05812916659804329
Tampa Bay Rays

WS Probability = 0.0567781185071319
Toronto Blue Jays

WS Probability = 0.05649292718003708
San Diego Padres

WS Probability = 0.050149676502835895
St. Louis Cardinals

WS Probability = 0.05012894831889737
Milwaukee Brewers

WS Probability = 0.04375991998036347
Arizona Diamondbacks

WS Probability = 0.04284524701441389
Kansas City Royals

WS Probability = 0.04230299886796214
Washington Nationals

WS Probability = 0.041082393645504936
Philadelphia Phillies

WS Probability = 0.040559063104420444
Seattle Mariners

WS Probability = 0.040401080287095964
New York Yankees

WS Probability = 0.038446263572630

2016-2018 checks doesn't work. And this is worse than the straight up logistic regression.  I'm going to end this notebook here and try other models in another notebook.

The reason the logistic models aren't working is because the data is not balanced--there are way more ws losers than there are winners.  Might want to try Kappa .