In [123]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [12]:
#!/usr/bin/env python
# coding: utf-8

import pandas as pd, numpy as np, matplotlib, matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.svm import LinearSVR
#import seaborn as seabornInstance
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn import preprocessing
from sklearn import linear_model
%matplotlib inline

In [13]:
allData = pd.read_csv('all-np15-17.csv')

# Some preprocessing on the column names
allData.astype({'Season': 'category'}).dtypes # set season to category instead of int
allData.set_index(['playerid', 'Name'])

# drop NaN values, which only come into play for pitchers running the bases (a very atypical occurrence)
allData = allData.dropna()

# sort all the rows by player ID first, then season. Makes it easier to figure out which player played which seasons
allData = allData.sort_values(by=['playerid', 'Season'])

runsCorr = allData.corr()['R'].sort_values(ascending=False)
print(runsCorr[:10])

hrCorr = allData.corr()['HR'].sort_values(ascending=False)
print(hrCorr[:10])

rbiCorr = allData.corr()['RBI'].sort_values(ascending=False)
print(rbiCorr[:10])

opsCorr = allData.corr()['OPS'].sort_values(ascending=False)
print(opsCorr[:10])

avgCorr = allData.corr()['AVG'].sort_values(ascending=False)
print(avgCorr[:10])

R      1.000000
wRC    0.934454
PA     0.910258
H      0.899199
AB     0.887578
RBI    0.817627
G      0.795114
2B     0.791685
1B     0.772649
BB     0.735760
Name: R, dtype: float64
HR       1.000000
RBI      0.875344
ISO      0.835865
wRC      0.795607
SLG      0.787282
HR/FB    0.748119
R        0.724578
SO       0.706100
OPS      0.705641
wRAA     0.700970
Name: HR, dtype: float64
RBI    1.000000
wRC    0.886090
HR     0.875344
PA     0.833839
R      0.817627
AB     0.816274
H      0.806104
2B     0.757831
G      0.738276
SLG    0.700665
Name: RBI, dtype: float64
OPS     1.000000
wOBA    0.992368
wRC+    0.970170
SLG     0.957861
wRAA    0.934246
OBP     0.824722
ISO     0.790124
wRC     0.778788
HR      0.705641
AVG     0.703624
Name: OPS, dtype: float64
AVG      1.000000
BABIP    0.768853
OBP      0.734719
wOBA     0.717520
OPS      0.703624
wRC+     0.699201
wRAA     0.665636
1B       0.656045
H        0.640521
wRC      0.617360
Name: AVG, dtype: float64


In [14]:
allData = pd.read_csv('all-np15-17.csv')


# Some preprocessing on the column names
allData.astype({'Season': 'category'}).dtypes # set season to category instead of int
allData.set_index(['playerid', 'Name'])

# drop NaN values, which only come into play for pitchers running the bases (a very atypical occurrence)
allData = allData.dropna()

# dictionary assigning number of years in our data set played to each player
playerYears = dict() # dict of {'player ID': 'years in our data set that they played (1, 2, or 3)'}
pid, yearsPlayed = np.unique(allData['playerid'], return_counts=True)
for i in range(len(pid)):
    playerYears[pid[i]] = yearsPlayed[i]

# dictionary assigning names to player IDs
playerNames = dict() # dict where key is player ID, value is player name
# Assign each player ID to a name for future purposes
for i, r in allData.iterrows():
    playerNames[r['playerid']] = r['Name']

# assigns weights to each player and the data for that year. If played 2016-18, keep the weights of 0.6, 0.3, 0.1. If played two years, 0.7 and 0.3. If only played one year, assign 1 for now
for index, row in allData.iterrows():
    if playerYears[row['playerid']] == 3:
        continue
    elif playerYears[row['playerid']] == 2: # played in 16-17 or 17-18
        played1617 = False
        if row['Season'] == 2015:
            played1617 = True
            allData.set_value(index, 'weight', 0.3)
        elif row['Season'] == 2016:
            if played1617:
                allData.set_value(index, 'weight', 0.7)
            else:
                allData.set_value(index, 'weight', 0.3)
        else:
            allData.set_value(index, 'weight', 0.7)
    else:
        allData.set_value(index, 'weight', 1)

# sort all the rows by player ID first, then season. Makes it easier to figure out which player played which seasons
allData = allData.sort_values(by=['playerid', 'Season'])

# scale all relevant stats
allData[['wRC', 'PA', 'H', 'AB', 'RBI', 'G', '2B']] = preprocessing.scale(allData[['wRC', 'PA', 'H', 'AB', 'RBI', 'G', '2B']])

# assign X and Y sets for regression
X = allData[['wRC', 'PA', 'H', 'AB', 'RBI', 'G', '2B']].values
Y = allData[['R']].values

# add newly calculated weighted stats to our dataframe as columns
allData['weightWRC'] = allData['weight']*allData['wRC']
allData['weightPA'] = allData['weight']*allData['PA']
allData['weightH'] = allData['weight']*allData['H']
allData['weightAB'] = allData['weight']*allData['AB']
allData['weightRBI'] = allData['weight']*allData['RBI']
allData['weightG'] = allData['weight']*allData['G']
allData['weight2B'] = allData['weight']*allData['2B']

# sum up weighted averages by player, so that all years are combined
weightedRunStats2019 = allData.groupby('playerid', as_index=False).sum()
weightedRunStats2019 = weightedRunStats2019[['playerid', 'weightWRC', 'weightPA', 'weightH', 'weightAB', 'weightRBI', 'weightG', 'weight2B']]

# we lost player names when doing the group by and sum, so get the player names and put it back in our new data frame
names = []
for i in pid:
    names.append(playerNames[i])
weightedRunStats2019['Name'] = names
weightedRunStats2019['Season'] = 2018

# move name and season columns to the front of the dataframe
cols = weightedRunStats2019.columns.tolist()
cols = cols[-2:] + cols[:-2]
weightedRunStats2019 = weightedRunStats2019[cols]


# X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
regressor = LinearRegression()
regressor.fit(X, Y)

X_2019 = weightedRunStats2019[['weightWRC', 'weightPA', 'weightH', 'weightAB', 'weightRBI', 'weightG', 'weight2B']].values
y_pred = regressor.predict(X_2019)


# In[241]:


y_pred_list = [] # list of y_pred so we can add it to a dataframe
for i in range(len(y_pred)):
    y_pred_list.append(y_pred[i][0])

weightedRunStats2019['runsPredicted'] = y_pred_list

# X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
regr = LinearSVR(random_state=0)
regr.fit(X, Y)

# In[240]:
y_pred = regr.predict(X_2019)
weightedRunStats2019['linearSVRRuns'] = y_pred

regr = SVR()
regr.fit(X, Y)

y_pred = regr.predict(X_2019)
weightedRunStats2019['svrRuns'] = y_pred

regr = linear_model.BayesianRidge()
regr.fit(X, Y)

y_pred = regr.predict(X_2019)
weightedRunStats2019['BayesRidgeRuns'] = y_pred

regr = linear_model.HuberRegressor()
regr.fit(X, Y)

y_pred = regr.predict(X_2019)
weightedRunStats2019['HuberRegressorRuns'] = y_pred

regr = linear_model.Ridge()
regr.fit(X, Y)

y_pred = regr.predict(X_2019)
weightedRunStats2019['RidgeRuns'] = y_pred

regr = linear_model.ARDRegression()
regr.fit(X, Y)

y_pred = regr.predict(X_2019)
weightedRunStats2019['ARDRegressionRuns'] = y_pred

weightedRunStats2019 = weightedRunStats2019.sort_values(by=['linearSVRRuns'], ascending=False)
print(weightedRunStats2019)
# weightedRunStats2019.to_csv('RunsPredicted.csv')

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


                      Name  Season  playerid  weightWRC  weightPA   weightH  \
111             Joey Votto    2018      4314   3.097043  1.607073  1.704605   
474            Aaron Judge    2018     15640   3.132649  1.483595  1.117666   
232       Charlie Blackmon    2018      7859   2.662660  1.598534  2.241844   
160            Jose Altuve    2018      5417   2.402742  1.504612  2.376745   
470            Kris Bryant    2018     15429   2.363576  1.455352  1.387469   
..                     ...     ...       ...        ...       ...       ...   
351           Brett Eibner    2018     11369  -1.460421 -1.603351 -1.675029   
305  Christian Bethancourt    2018     10028  -1.460421 -1.629623 -1.485693   
318          Rafael Ortega    2018     10323  -1.531631 -1.642759 -1.509360   
216            Tyler Moore    2018      7244  -1.431937 -1.642102 -1.544861   
366             Elias Diaz    2018     11680  -1.567236 -1.655895 -1.533027   

     weightAB  weightRBI   weightG  weight2B  runsP

In [15]:
allData = pd.read_csv('all-np15-17.csv')


# In[238]:


# Some preprocessing on the column names
allData.astype({'Season': 'category'}).dtypes # set season to category instead of int
allData.set_index(['playerid', 'Name'])

# drop NaN values, which only come into play for pitchers running the bases (a very atypical occurrence)
allData = allData.dropna()

# dictionary assigning number of years in our data set played to each player
playerYears = dict() # dict of {'player ID': 'years in our data set that they played (1, 2, or 3)'}
pid, yearsPlayed = np.unique(allData['playerid'], return_counts=True)
for i in range(len(pid)):
    playerYears[pid[i]] = yearsPlayed[i]

# dictionary assigning names to player IDs
playerNames = dict() # dict where key is player ID, value is player name
# Assign each player ID to a name for future purposes
for i, r in allData.iterrows():
    playerNames[r['playerid']] = r['Name']

# assigns weights to each player and the data for that year. If played 2016-18, keep the weights of 0.6, 0.3, 0.1. If played two years, 0.7 and 0.3. If only played one year, assign 1 for now
for index, row in allData.iterrows():
    if playerYears[row['playerid']] == 3:
        continue
    elif playerYears[row['playerid']] == 2: # played in 16-17 or 17-18
        played1617 = False
        if row['Season'] == 2015:
            played1617 = True
            allData.set_value(index, 'weight', 0.3)
        elif row['Season'] == 2016:
            if played1617:
                allData.set_value(index, 'weight', 0.7)
            else:
                allData.set_value(index, 'weight', 0.3)
        else:
            allData.set_value(index, 'weight', 0.7)
    else:
        allData.set_value(index, 'weight', 1)

# sort all the rows by player ID first, then season. Makes it easier to figure out which player played which seasons
allData = allData.sort_values(by=['playerid', 'Season'])

# scale relevants stats
allData[['BABIP', 'OBP', 'wOBA', 'OPS', 'wRC+', 'wRAA', '1B']] = preprocessing.scale(allData[['BABIP', 'OBP', 'wOBA', 'OPS', 'wRC+', 'wRAA', '1B']])

# assign X and Y sets for regression
X = allData[['BABIP', 'OBP', 'wOBA', 'OPS', 'wRC+', 'wRAA', '1B']].values
Y = allData[['AVG']].values

# add newly calculated weighted stats to our dataframe as columns
allData['weightBABIP'] = allData['weight']*allData['BABIP']
allData['weightOBP'] = allData['weight']*allData['OBP']
allData['weightwOBA'] = allData['weight']*allData['wOBA']
allData['weightOPS'] = allData['weight']*allData['OPS']
allData['weightwRC+'] = allData['weight']*allData['wRC+']
allData['weightwRAA'] = allData['weight']*allData['wRAA']
allData['weight1B'] = allData['weight']*allData['1B']

# sum up weighted averages by player, so that all years are combined
weightedAvgStats2019 = allData.groupby('playerid', as_index=False).sum()
weightedAvgStats2019 = weightedAvgStats2019[['playerid', 'weightBABIP', 'weightOBP', 'weightwOBA', 'weightOPS', 'weightwRC+', 'weightwRAA', 'weight1B']]

# we lost player names when doing the group by and sum, so get the player names and put it back in our new data frame
names = []
for i in pid:
    names.append(playerNames[i])
weightedAvgStats2019['Name'] = names
weightedAvgStats2019['Season'] = 2018

# move name and season columns to the front of the dataframe
cols = weightedAvgStats2019.columns.tolist()
cols = cols[-2:] + cols[:-2]
weightedAvgStats2019 = weightedAvgStats2019[cols]


# In[13]:


# X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
regressor = LinearRegression()
regressor.fit(X, Y)


# In[14]:


X_2019 = weightedAvgStats2019[['weightBABIP', 'weightOBP', 'weightwOBA', 'weightOPS', 'weightwRC+', 'weightwRAA', 'weight1B']].values
y_pred = regressor.predict(X_2019)


# In[15]:


y_pred_list = [] # list of y_pred so we can add it to a dataframe
for i in range(len(y_pred)):
    y_pred_list.append(y_pred[i][0])

weightedAvgStats2019['AVGPredicted'] = y_pred_list

# X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
regr = LinearSVR(random_state=0)
regr.fit(X, Y)

# In[240]:

y_pred = regr.predict(X_2019)
weightedAvgStats2019['linearSvrAVG'] = y_pred

regr = SVR()
regr.fit(X, Y)

# In[240]:

y_pred = regr.predict(X_2019)
weightedAvgStats2019['svrAVG'] = y_pred

regr = linear_model.BayesianRidge()
regr.fit(X, Y)

y_pred = regr.predict(X_2019)
weightedAvgStats2019['BayesRidgeAVG'] = y_pred

regr = linear_model.HuberRegressor()
regr.fit(X, Y)

y_pred = regr.predict(X_2019)
weightedAvgStats2019['HuberRegressorAVG'] = y_pred

regr = linear_model.Ridge()
regr.fit(X, Y)

y_pred = regr.predict(X_2019)
weightedAvgStats2019['RidgeAVG'] = y_pred

regr = linear_model.ARDRegression()
regr.fit(X, Y)

y_pred = regr.predict(X_2019)
weightedAvgStats2019['ARDRegressionAVG'] = y_pred


weightedAvgStats2019 = weightedAvgStats2019.sort_values(by=['linearSvrAVG'], ascending=False)
print(weightedAvgStats2019)
# weightedAvgStats2019.to_csv('AVGPredictions.csv')

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


                 Name  Season  playerid  weightBABIP  weightOBP  weightwOBA  \
160       Jose Altuve    2018      5417     1.573139   2.097674    1.955426   
111        Joey Votto    2018      4314     1.034413   3.467079    2.722558   
232  Charlie Blackmon    2018      7859     1.603528   1.766641    2.120197   
294       DJ LeMahieu    2018      9874     1.689172   1.670443    0.880361   
474       Aaron Judge    2018     15640     1.517885   2.717302    2.900835   
..                ...     ...       ...          ...        ...         ...   
351      Brett Eibner    2018     11369    -2.073622  -1.696481   -1.475059   
424     Tyler Goeddel    2018     13180    -1.880233  -1.922829   -2.123339   
109      Stephen Drew    2018      4251    -2.791923  -1.555014   -1.042872   
274    Chris Herrmann    2018      9284    -2.626161  -1.498427   -1.367012   
200          JB Shuck    2018      6677    -2.543280  -2.205764   -2.231386   

     weightOPS  weightwRC+  weightwRAA  weight1B  A

In [16]:

allData = pd.read_csv('all-np15-17.csv')


# In[238]:


# Some preprocessing on the column names
allData.astype({'Season': 'category'}).dtypes # set season to category instead of int
allData.set_index(['playerid', 'Name'])

# drop NaN values, which only come into play for pitchers running the bases (a very atypical occurrence)
allData = allData.dropna()

# dictionary assigning number of years in our data set played to each player
playerYears = dict() # dict of {'player ID': 'years in our data set that they played (1, 2, or 3)'}
pid, yearsPlayed = np.unique(allData['playerid'], return_counts=True)
for i in range(len(pid)):
    playerYears[pid[i]] = yearsPlayed[i]

# dictionary assigning names to player IDs
playerNames = dict() # dict where key is player ID, value is player name
# Assign each player ID to a name for future purposes
for i, r in allData.iterrows():
    playerNames[r['playerid']] = r['Name']

# assigns weights to each player and the data for that year. If played 2016-18, keep the weights of 0.6, 0.3, 0.1. If played two years, 0.7 and 0.3. If only played one year, assign 1 for now
for index, row in allData.iterrows():
    if playerYears[row['playerid']] == 3:
        continue
    elif playerYears[row['playerid']] == 2: # played in 16-17 or 17-18
        played1617 = False
        if row['Season'] == 2015:
            played1617 = True
            allData.set_value(index, 'weight', 0.3)
        elif row['Season'] == 2016:
            if played1617:
                allData.set_value(index, 'weight', 0.7)
            else:
                allData.set_value(index, 'weight', 0.3)
        else:
            allData.set_value(index, 'weight', 0.7)
    else:
        allData.set_value(index, 'weight', 1)


# sort all the rows by player ID first, then season. Makes it easier to figure out which player played which seasons
allData = allData.sort_values(by=['playerid', 'Season'])

# scale relevants stats
allData[['RBI', 'ISO', 'wRC', 'SLG', 'R', 'HR/FB', 'OPS']] = preprocessing.scale(allData[['RBI', 'ISO', 'wRC', 'SLG', 'R', 'HR/FB', 'OPS']])

# assign X and Y sets for regression
X = allData[['RBI', 'ISO', 'wRC', 'SLG', 'R', 'HR/FB', 'OPS']].values
Y = allData[['HR']].values

# add newly calculated weighted stats to our dataframe as columns
allData['weightRBI'] = allData['weight']*allData['RBI']
allData['weightISO'] = allData['weight']*allData['ISO']
allData['weightwRC'] = allData['weight']*allData['wRC']
allData['weightSLG'] = allData['weight']*allData['SLG']
allData['weightR'] = allData['weight']*allData['R']
allData['weightHRFB'] = allData['weight']*allData['HR/FB']
allData['weightOPS'] = allData['weight']*allData['OPS']

# sum up weighted averages by player, so that all years are combined
weightedHRStats2019 = allData.groupby('playerid', as_index=False).sum()
weightedHRStats2019 = weightedHRStats2019[['playerid', 'weightRBI', 'weightISO', 'weightwRC', 'weightSLG', 'weightR', 'weightHRFB', 'weightOPS']]

# we lost player names when doing the group by and sum, so get the player names and put it back in our new data frame
names = []
for i in pid:
    names.append(playerNames[i])
weightedHRStats2019['Name'] = names
weightedHRStats2019['Season'] = 2018

# move name and season columns to the front of the dataframe
cols = weightedHRStats2019.columns.tolist()
cols = cols[-2:] + cols[:-2]
weightedHRStats2019 = weightedHRStats2019[cols]


# In[13]:


# X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
regressor = LinearRegression()
regressor.fit(X, Y)


# In[14]:


X_2019 = weightedHRStats2019[['weightRBI', 'weightISO', 'weightwRC', 'weightSLG', 'weightR', 'weightHRFB', 'weightOPS']].values
y_pred = regressor.predict(X_2019)


# In[15]:


y_pred_list = [] # list of y_pred so we can add it to a dataframe
for i in range(len(y_pred)):
    y_pred_list.append(y_pred[i][0])

weightedHRStats2019['HRsPredicted'] = y_pred_list

# X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
regr = LinearSVR(random_state=0)
regr.fit(X, Y)

# In[240]:

y_pred = regr.predict(X_2019)
weightedHRStats2019['linearSvrHR'] = y_pred

regr = SVR()
regr.fit(X, Y)

# In[240]:

y_pred = regr.predict(X_2019)
weightedHRStats2019['SvrHR'] = y_pred

regr = linear_model.BayesianRidge()
regr.fit(X, Y)

y_pred = regr.predict(X_2019)
weightedHRStats2019['BayesRidgeHR'] = y_pred

regr = linear_model.HuberRegressor()
regr.fit(X, Y)

y_pred = regr.predict(X_2019)
weightedHRStats2019['HuberRegressorHR'] = y_pred

regr = linear_model.Ridge()
regr.fit(X, Y)

y_pred = regr.predict(X_2019)
weightedHRStats2019['RidgeHR'] = y_pred

regr = linear_model.ARDRegression()
regr.fit(X, Y)

y_pred = regr.predict(X_2019)
weightedHRStats2019['ARDRegressionHR'] = y_pred


weightedHRStats2019 = weightedHRStats2019.sort_values(by=['linearSvrHR'], ascending=False)
print(weightedHRStats2019)
# weightedHRStats2019.to_csv('HRPredictions.csv')

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


                  Name  Season  playerid  weightRBI  weightISO  weightwRC  \
474        Aaron Judge    2018     15640   2.359202   3.127421   3.132649   
133  Giancarlo Stanton    2018      4949   2.131862   2.701805   1.790333   
450         Joey Gallo    2018     14128   1.049107   2.846022   0.960732   
288      Nolan Arenado    2018      9777   3.010397   1.973687   2.331532   
264        Khris Davis    2018      9112   1.943054   1.975446   1.306102   
..                 ...     ...       ...        ...        ...        ...   
429       Omar Narvaez    2018     13338  -1.494019  -1.797052  -0.748317   
26     Shane Victorino    2018      1677  -1.763744  -1.814640  -1.460421   
80    Alberto Callaspo    2018      3336  -1.455487  -2.148800  -1.318000   
353         Tyler Holt    2018     11391  -1.532551  -1.832227  -1.353605   
416    Kelby Tomlinson    2018     13005  -1.609615  -1.902577  -1.318000   

     weightSLG   weightR  weightHRFB  weightOPS  HRsPredicted  linearSvrHR 

In [17]:

allData = pd.read_csv('all-np15-17.csv')


# In[238]:


# Some preprocessing on the column names
allData.astype({'Season': 'category'}).dtypes # set season to category instead of int
allData.set_index(['playerid', 'Name'])

# drop NaN values, which only come into play for pitchers running the bases (a very atypical occurrence)
allData = allData.dropna()

# dictionary assigning number of years in our data set played to each player
playerYears = dict() # dict of {'player ID': 'years in our data set that they played (1, 2, or 3)'}
pid, yearsPlayed = np.unique(allData['playerid'], return_counts=True)
for i in range(len(pid)):
    playerYears[pid[i]] = yearsPlayed[i]

# dictionary assigning names to player IDs
playerNames = dict() # dict where key is player ID, value is player name
# Assign each player ID to a name for future purposes
for i, r in allData.iterrows():
    playerNames[r['playerid']] = r['Name']

# assigns weights to each player and the data for that year. If played 2016-18, keep the weights of 0.6, 0.3, 0.1. If played two years, 0.7 and 0.3. If only played one year, assign 1 for now
for index, row in allData.iterrows():
    if playerYears[row['playerid']] == 3:
        continue
    elif playerYears[row['playerid']] == 2: # played in 16-17 or 17-18
        played1617 = False
        if row['Season'] == 2015:
            played1617 = True
            allData.set_value(index, 'weight', 0.3)
        elif row['Season'] == 2016:
            if played1617:
                allData.set_value(index, 'weight', 0.7)
            else:
                allData.set_value(index, 'weight', 0.3)
        else:
            allData.set_value(index, 'weight', 0.7)
    else:
        allData.set_value(index, 'weight', 1)


# sort all the rows by player ID first, then season. Makes it easier to figure out which player played which seasons
allData = allData.sort_values(by=['playerid', 'Season'])

# scale all relevant stats
allData[['wOBA', 'wRC+', 'SLG', 'wRAA', 'OBP', 'ISO', 'HR']] = preprocessing.scale(allData[['wOBA', 'wRC+', 'SLG', 'wRAA', 'OBP', 'ISO', 'HR']])

# assign X and Y sets for regression
X = allData[['wOBA', 'wRC+', 'SLG', 'wRAA', 'OBP', 'ISO', 'HR']].values
Y = allData[['OPS']].values


# add newly calculated weighted stats to our dataframe as columns
allData['weightwOBA'] = allData['weight']*allData['wOBA']
allData['weightwRC+'] = allData['weight']*allData['wRC+']
allData['weightSLG'] = allData['weight']*allData['SLG']
allData['weightwRAA'] = allData['weight']*allData['wRAA']
allData['weightOBP'] = allData['weight']*allData['OBP']
allData['weightISO'] = allData['weight']*allData['ISO']
allData['weightHR'] = allData['weight']*allData['HR']

# sum up weighted averages by player, so that all years are combined
weightedOPSStats2019 = allData.groupby('playerid', as_index=False).sum()
weightedOPSStats2019 = weightedOPSStats2019[['playerid', 'weightwOBA', 'weightwRC+', 'weightSLG', 'weightwRAA', 'weightOBP', 'weightISO', 'weightHR']]

# we lost player names when doing the group by and sum, so get the player names and put it back in our new data frame
names = []
for i in pid:
    names.append(playerNames[i])
weightedOPSStats2019['Name'] = names
weightedOPSStats2019['Season'] = 2018

# move name and season columns to the front of the dataframe
cols = weightedOPSStats2019.columns.tolist()
cols = cols[-2:] + cols[:-2]
weightedOPSStats2019 = weightedOPSStats2019[cols]


# In[239]:


# X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
regressor = LinearRegression()
regressor.fit(X, Y)


# In[240]:


X_2019 = weightedOPSStats2019[['weightwOBA', 'weightwRC+', 'weightSLG', 'weightwRAA', 'weightOBP', 'weightISO', 'weightHR']].values
y_pred = regressor.predict(X_2019)


# In[241]:


y_pred_list = [] # list of y_pred so we can add it to a dataframe
for i in range(len(y_pred)):
    y_pred_list.append(y_pred[i][0])

weightedOPSStats2019['opsPredicted'] = y_pred_list

# X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
regr = LinearSVR(random_state=0)
regr.fit(X, Y)

# In[240]:

y_pred = regr.predict(X_2019)
weightedOPSStats2019['linearSvrOPS'] = y_pred

regr = SVR()
regr.fit(X, Y)

# In[240]:

y_pred = regr.predict(X_2019)
weightedOPSStats2019['svrOPS'] = y_pred

regr = linear_model.BayesianRidge()
regr.fit(X, Y)

y_pred = regr.predict(X_2019)
weightedOPSStats2019['BayesRidgeOPS'] = y_pred

regr = linear_model.HuberRegressor()
regr.fit(X, Y)

y_pred = regr.predict(X_2019)
weightedOPSStats2019['HuberRegressorOPS'] = y_pred

regr = linear_model.Ridge()
regr.fit(X, Y)

y_pred = regr.predict(X_2019)
weightedOPSStats2019['RidgeOPS'] = y_pred

regr = linear_model.ARDRegression()
regr.fit(X, Y)

y_pred = regr.predict(X_2019)
weightedOPSStats2019['ARDRegressionOPS'] = y_pred


weightedOPSStats2019 = weightedOPSStats2019.sort_values(by=['linearSvrOPS'], ascending=False)
print(weightedOPSStats2019)
# weightedOPSStats2019.to_csv('OPSPredictions.csv')

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


               Name  Season  playerid  weightwOBA  weightwRC+  weightSLG  \
474     Aaron Judge    2018     15640    2.900835    2.954472   2.917772   
310      Mike Trout    2018     10155    2.876524    3.038714   2.549750   
483    Rhys Hoskins    2018     16472    2.549683    2.312631   2.788390   
111      Joey Votto    2018      4314    2.722558    2.497160   2.039406   
459      Matt Olson    2018     14344    2.387613    2.553321   3.262794   
..              ...     ...       ...         ...         ...        ...   
424   Tyler Goeddel    2018     13180   -2.123339   -2.140143  -1.912524   
200        JB Shuck    2018      6677   -2.231386   -2.180258  -1.797517   
262  Alexi Amarista    2018      9063   -2.339433   -2.100028  -1.970028   
184   Casey McGehee    2018      6086   -2.150351   -1.939568  -2.156915   
464      Adam Engel    2018     15082   -2.501503   -2.461064  -2.041907   

     weightwRAA  weightOBP  weightISO  weightHR  opsPredicted  linearSvrOPS  \
474    3

In [18]:

allData = pd.read_csv('all-np15-17.csv')


# In[238]:


# Some preprocessing on the column names
allData.astype({'Season': 'category'}).dtypes # set season to category instead of int
allData.set_index(['playerid', 'Name'])

# drop NaN values, which only come into play for pitchers running the bases (a very atypical occurrence)
allData = allData.dropna()

# dictionary assigning number of years in our data set played to each player
playerYears = dict() # dict of {'player ID': 'years in our data set that they played (1, 2, or 3)'}
pid, yearsPlayed = np.unique(allData['playerid'], return_counts=True)
for i in range(len(pid)):
    playerYears[pid[i]] = yearsPlayed[i]

# dictionary assigning names to player IDs
playerNames = dict() # dict where key is player ID, value is player name
# Assign each player ID to a name for future purposes
for i, r in allData.iterrows():
    playerNames[r['playerid']] = r['Name']

# assigns weights to each player and the data for that year. If played 2016-18, keep the weights of 0.6, 0.3, 0.1. If played two years, 0.7 and 0.3. If only played one year, assign 1 for now
for index, row in allData.iterrows():
    if playerYears[row['playerid']] == 3:
        continue
    elif playerYears[row['playerid']] == 2: # played in 16-17 or 17-18
        played1617 = False
        if row['Season'] == 2015:
            played1617 = True
            allData.set_value(index, 'weight', 0.3)
        elif row['Season'] == 2016:
            if played1617:
                allData.set_value(index, 'weight', 0.7)
            else:
                allData.set_value(index, 'weight', 0.3)
        else:
            allData.set_value(index, 'weight', 0.7)
    else:
        allData.set_value(index, 'weight', 1)


# sort all the rows by player ID first, then season. Makes it easier to figure out which player played which seasons
allData = allData.sort_values(by=['playerid', 'Season'])

# scale relevants stats
allData[['wRC', 'HR',  'PA', 'AB', 'R', 'H', '2B']] = preprocessing.scale(allData[['wRC', 'HR',  'PA', 'AB', 'R', 'H', '2B']])

# assign X and Y sets for regression
X = allData[['wRC', 'HR',  'PA', 'AB', 'R', 'H', '2B']].values
Y = allData[['RBI']].values

# add newly calculated weighted stats to our dataframe as columns
allData['weightwRC'] = allData['weight']*allData['wRC']
allData['weightHR'] = allData['weight']*allData['HR']
allData['weightPA'] = allData['weight']*allData['PA']
allData['weightAB'] = allData['weight']*allData['AB']
allData['weightR'] = allData['weight']*allData['R']
allData['weightH'] = allData['weight']*allData['H']
allData['weight2B'] = allData['weight']*allData['2B']


# sum up weighted averages by player, so that all years are combined
weightedRBIStats2019 = allData.groupby('playerid', as_index=False).sum()
weightedRBIStats2019 = weightedRBIStats2019[['playerid', 'weightwRC', 'weightHR', 'weightPA', 'weightAB', 'weightR' ,'weightH', 'weight2B']]

# we lost player names when doing the group by and sum, so get the player names and put it back in our new data frame
names = []
for i in pid:
    names.append(playerNames[i])
weightedRBIStats2019['Name'] = names
weightedRBIStats2019['Season'] = 2018

# move name and season columns to the front of the dataframe
cols = weightedRBIStats2019.columns.tolist()
cols = cols[-2:] + cols[:-2]
weightedRBIStats2019 = weightedRBIStats2019[cols]


# In[13]:


# X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
regressor = LinearRegression()
regressor.fit(X, Y)


# In[14]:


X_2019 = weightedRBIStats2019[['weightwRC', 'weightHR', 'weightPA', 'weightAB', 'weightR' ,'weightH', 'weight2B']].values
y_pred = regressor.predict(X_2019)


# In[15]:


y_pred_list = [] # list of y_pred so we can add it to a dataframe
for i in range(len(y_pred)):
    y_pred_list.append(y_pred[i][0])

weightedRBIStats2019['RBIsPredicted'] = y_pred_list

# X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
regr = LinearSVR(random_state=0)
regr.fit(X, Y)

# In[240]:

y_pred = regr.predict(X_2019)
weightedRBIStats2019['linearSvrRBI'] = y_pred

regr = SVR()
regr.fit(X, Y)

# In[240]:

y_pred = regr.predict(X_2019)
weightedRBIStats2019['svrRBI'] = y_pred

regr = linear_model.BayesianRidge()
regr.fit(X, Y)

y_pred = regr.predict(X_2019)
weightedRBIStats2019['BayesRidgeRBI'] = y_pred

regr = linear_model.HuberRegressor()
regr.fit(X, Y)

y_pred = regr.predict(X_2019)
weightedRBIStats2019['HuberRegressorRBI'] = y_pred

regr = linear_model.Ridge()
regr.fit(X, Y)

y_pred = regr.predict(X_2019)
weightedRBIStats2019['RidgeRBI'] = y_pred

regr = linear_model.ARDRegression()
regr.fit(X, Y)

y_pred = regr.predict(X_2019)
weightedRBIStats2019['ARDRegressionRBI'] = y_pred


weightedRBIStats2019 = weightedRBIStats2019.sort_values(by=['linearSvrRBI'], ascending=False)
print(weightedRBIStats2019)
# weightedRBIStats2019.to_csv('RBIPredictions.csv')

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


                  Name  Season  playerid  weightwRC  weightHR  weightPA  \
474        Aaron Judge    2018     15640   3.132649  3.744537  1.483595   
288      Nolan Arenado    2018      9777   2.331532  2.416911  1.518405   
133  Giancarlo Stanton    2018      4949   1.790333  3.165572  0.892477   
53         Nelson Cruz    2018      2434   2.018206  2.616554  1.316768   
111         Joey Votto    2018      4314   3.097043  1.867893  1.607073   
..                 ...     ...       ...        ...       ...       ...   
210      Eric Campbell    2018      6938  -1.424816 -1.146716 -1.616487   
318      Rafael Ortega    2018     10323  -1.531631 -1.346359 -1.642759   
26     Shane Victorino    2018      1677  -1.460421 -1.346359 -1.629623   
416    Kelby Tomlinson    2018     13005  -1.318000 -1.346359 -1.511399   
353         Tyler Holt    2018     11391  -1.353605 -1.446180 -1.603351   

     weightAB   weightR   weightH  weight2B  RBIsPredicted  linearSvrRBI  \
474  1.001647  2.979669

In [19]:
weightedRBIStats2019.reset_index()
dfa = weightedRBIStats2019.join(weightedAvgStats2019, how='outer', lsuffix='playerid', rsuffix='playerid')
dfa = dfa.join(weightedHRStats2019, how='outer', lsuffix='playerid', rsuffix='playerid')
dfa = dfa.join(weightedOPSStats2019, how='outer', lsuffix='playerid', rsuffix='playerid')
dfa = dfa.join(weightedRunStats2019, how='outer', lsuffix='playerid', rsuffix='playerid')
print(dfa)
dfa.to_csv('allPredictions2018.csv')

          Nameplayerid  Seasonplayerid  playeridplayerid  weightwRCplayerid  \
0         Will Venable            2018               211          -0.677107   
1      Victor Martinez            2018               393           0.049239   
2           Juan Uribe            2018               454          -0.534443   
3       Carlos Beltran            2018               589           0.166736   
4       Brandon Barnes            2018               629          -1.104369   
..                 ...             ...               ...                ...   
498      Byung-ho Park            2018             18497          -1.175579   
499       Hyun Soo Kim            2018             18718          -1.011795   
500  Guillermo Heredia            2018             18721          -0.605896   
501         Dae-Ho Lee            2018             18722          -0.712712   
502       Yuli Gurriel            2018             19198           0.818311   

     weightHRplayerid  weightPAplayerid  weightABpl

In [136]:
dfa.to_csv('allPredictions2019.csv')

In [133]:
from sklearn import linear_model
allData = pd.read_csv('all-np.csv')


# In[238]:


# Some preprocessing on the column names
allData.astype({'Season': 'category'}).dtypes # set season to category instead of int
allData.set_index(['playerid', 'Name'])

# drop NaN values, which only come into play for pitchers running the bases (a very atypical occurrence)
allData = allData.dropna()

# dictionary assigning number of years in our data set played to each player
playerYears = dict() # dict of {'player ID': 'years in our data set that they played (1, 2, or 3)'}
pid, yearsPlayed = np.unique(allData['playerid'], return_counts=True)
for i in range(len(pid)):
    playerYears[pid[i]] = yearsPlayed[i]

# dictionary assigning names to player IDs
playerNames = dict() # dict where key is player ID, value is player name
# Assign each player ID to a name for future purposes
for i, r in allData.iterrows():
    playerNames[r['playerid']] = r['Name']

# assigns weights to each player and the data for that year. If played 2016-18, keep the weights of 0.6, 0.3, 0.1. If played two years, 0.7 and 0.3. If only played one year, assign 1 for now
for index, row in allData.iterrows():
    if playerYears[row['playerid']] == 3:
        continue
    elif playerYears[row['playerid']] == 2: # played in 16-17 or 17-18
        played1617 = False
        if row['Season'] == 2016:
            played1617 = True
            allData.set_value(index, 'weight', 0.3)
        elif row['Season'] == 2017:
            if played1617:
                allData.set_value(index, 'weight', 0.7)
            else:
                allData.set_value(index, 'weight', 0.3)
        else:
            allData.set_value(index, 'weight', 0.7)
    else:
        allData.set_value(index, 'weight', 1)


# sort all the rows by player ID first, then season. Makes it easier to figure out which player played which seasons
allData = allData.sort_values(by=['playerid', 'Season'])

# scale relevants stats
allData[['wRC', 'HR',  'PA', 'AB', 'R', 'H', '2B']] = preprocessing.scale(allData[['wRC', 'HR',  'PA', 'AB', 'R', 'H', '2B']])

# assign X and Y sets for regression
X = allData[['wRC', 'HR',  'PA', 'AB', 'R', 'H', '2B']].values
Y = allData[['RBI']].values

# add newly calculated weighted stats to our dataframe as columns
allData['weightwRC'] = allData['weight']*allData['wRC']
allData['weightHR'] = allData['weight']*allData['HR']
allData['weightPA'] = allData['weight']*allData['PA']
allData['weightAB'] = allData['weight']*allData['AB']
allData['weightR'] = allData['weight']*allData['R']
allData['weightH'] = allData['weight']*allData['H']
allData['weight2B'] = allData['weight']*allData['2B']


# sum up weighted averages by player, so that all years are combined
weightedRBIStats2019 = allData.groupby('playerid', as_index=False).sum()
weightedRBIStats2019 = weightedRBIStats2019[['playerid', 'weightwRC', 'weightHR', 'weightPA', 'weightAB', 'weightR' ,'weightH', 'weight2B']]

# we lost player names when doing the group by and sum, so get the player names and put it back in our new data frame
names = []
for i in pid:
    names.append(playerNames[i])
weightedRBIStats2019['Name'] = names
weightedRBIStats2019['Season'] = 2019

# move name and season columns to the front of the dataframe
cols = weightedRBIStats2019.columns.tolist()
cols = cols[-2:] + cols[:-2]
weightedRBIStats2019 = weightedRBIStats2019[cols]


# In[13]:


# X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

regressor = LinearRegression()
regressor.fit(X, Y)


# In[14]:


X_2019 = weightedRBIStats2019[['weightwRC', 'weightHR', 'weightPA', 'weightAB', 'weightR' ,'weightH', 'weight2B']].values
y_pred = regressor.predict(X_2019)


y_pred_list = [] # list of y_pred so we can add it to a dataframe
for i in range(len(y_pred)):
    y_pred_list.append(y_pred[i][0])

weightedRBIStats2019['RBIsPredicted'] = y_pred_list

# X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
regr = LinearSVR(random_state=0)
regr.fit(X, Y)

y_pred = regr.predict(X_2019)
weightedRBIStats2019['linearSvrRBI'] = y_pred

regr = SVR()
regr.fit(X, Y)

y_pred = regr.predict(X_2019)
weightedRBIStats2019['svrRBI'] = y_pred

regr = linear_model.BayesianRidge()
regr.fit(X, Y)

y_pred = regr.predict(X_2019)
weightedRBIStats2019['BayesRidgeRBI'] = y_pred

regr = linear_model.HuberRegressor()
regr.fit(X, Y)

y_pred = regr.predict(X_2019)
weightedRBIStats2019['HuberRegressorRBI'] = y_pred

regr = linear_model.Ridge()
regr.fit(X, Y)

y_pred = regr.predict(X_2019)
weightedRBIStats2019['RidgeRBI'] = y_pred

regr = linear_model.ARDRegression()
regr.fit(X, Y)

y_pred = regr.predict(X_2019)
weightedRBIStats2019['ARDRegressionRBI'] = y_pred


weightedRBIStats2019 = weightedRBIStats2019.sort_values(by=['linearSvrRBI'], ascending=False)
print(weightedRBIStats2019)
weightedRBIStats2019.to_csv('RBIPredictions.csv')

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


                  Name  Season  playerid  weightwRC  weightHR  weightPA  \
4          David Ortiz    2019       745   2.403585  2.303024  1.147429   
107  Giancarlo Stanton    2019      4949   1.924209  2.825340  1.488313   
220        Khris Davis    2019      9112   1.525903  3.096543  1.299374   
242      Nolan Arenado    2019      9777   2.259068  2.303024  1.486992   
153      J.D. Martinez    2019      6184   2.262592  2.654583  0.895070   
..                 ...     ...       ...        ...       ...       ...   
442    Victor Caratini    2019     14968  -1.473725 -1.313013 -1.666844   
271      Rafael Ortega    2019     10323  -1.544221 -1.413459 -1.653631   
376    Kelby Tomlinson    2019     13005  -1.332732 -1.413459 -1.521506   
306         Tyler Holt    2019     11391  -1.367980 -1.513904 -1.613994   
458       Victor Reyes    2019     15487  -1.685214 -1.413459 -1.541325   

     weightAB   weightR   weightH  weight2B  RBIsPredicted  linearSvrRBI  \
4    0.984012  0.943859