In [32]:
#!/usr/bin/env python
# coding: utf-8

import pandas as pd, numpy as np, matplotlib, matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
import seaborn as seabornInstance
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn import preprocessing
%matplotlib inline


# In[237]:


allData = pd.read_csv('all-np.csv')


# In[238]:


# Some preprocessing on the column names
allData.astype({'Season': 'category'}).dtypes # set season to category instead of int
allData.set_index(['playerid', 'Name'])

# drop NaN values, which only come into play for pitchers running the bases (a very atypical occurrence)
allData = allData.dropna()

# sort all the rows by player ID first, then season. Makes it easier to figure out which player played which seasons
allData = allData.sort_values(by=['playerid', 'Season'])

runsCorr = allData.corr()['R'].sort_values(ascending=False)
print(runsCorr[:10])

hrCorr = allData.corr()['HR'].sort_values(ascending=False)
print(hrCorr[:10])

rbiCorr = allData.corr()['RBI'].sort_values(ascending=False)
print(rbiCorr[:10])

opsCorr = allData.corr()['OPS'].sort_values(ascending=False)
print(opsCorr[:10])

avgCorr = allData.corr()['AVG'].sort_values(ascending=False)
print(avgCorr[:10])

R      1.000000
wRC    0.935741
PA     0.914242
H      0.903099
AB     0.893115
RBI    0.818290
G      0.808387
2B     0.794936
1B     0.780092
BB     0.740837
Name: R, dtype: float64
HR       1.000000
RBI      0.881248
ISO      0.843345
wRC      0.802392
SLG      0.794585
R        0.740530
HR/FB    0.737600
OPS      0.710500
wRAA     0.702016
SO       0.701649
Name: HR, dtype: float64
RBI    1.000000
wRC    0.888195
HR     0.881248
PA     0.839335
AB     0.824583
R      0.818290
H      0.817837
2B     0.754585
G      0.746350
SLG    0.719561
Name: RBI, dtype: float64
OPS     1.000000
wOBA    0.992336
wRC+    0.973173
SLG     0.957429
wRAA    0.935156
OBP     0.826919
wRC     0.793114
ISO     0.786431
AVG     0.724403
HR      0.710500
Name: OPS, dtype: float64
AVG      1.000000
BABIP    0.755144
OBP      0.736842
wOBA     0.734102
OPS      0.724403
wRC+     0.713292
wRAA     0.676495
1B       0.673635
H        0.651439
wRC      0.624871
Name: AVG, dtype: float64


In [33]:
#!/usr/bin/env python
# coding: utf-8

# Runs

# In[236]:


import pandas as pd, numpy as np, matplotlib, matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
import seaborn as seabornInstance 
from sklearn.svm import LinearSVR
from sklearn import metrics
from sklearn import preprocessing
%matplotlib inline


# In[237]:

allData = pd.read_csv('all-np.csv')


# Some preprocessing on the column names
allData.astype({'Season': 'category'}).dtypes # set season to category instead of int
allData.set_index(['playerid', 'Name'])

# drop NaN values, which only come into play for pitchers running the bases (a very atypical occurrence)
allData = allData.dropna()

# dictionary assigning number of years in our data set played to each player
playerYears = dict() # dict of {'player ID': 'years in our data set that they played (1, 2, or 3)'}
pid, yearsPlayed = np.unique(allData['playerid'], return_counts=True)
for i in range(len(pid)):
    playerYears[pid[i]] = yearsPlayed[i]

# dictionary assigning names to player IDs
playerNames = dict() # dict where key is player ID, value is player name
# Assign each player ID to a name for future purposes
for i, r in allData.iterrows():
    playerNames[r['playerid']] = r['Name']

# assigns weights to each player and the data for that year. If played 2016-18, keep the weights of 0.6, 0.3, 0.1. If played two years, 0.7 and 0.3. If only played one year, assign 1 for now
for index, row in allData.iterrows():
    if playerYears[row['playerid']] == 3:
        continue
    elif playerYears[row['playerid']] == 2: # played in 16-17 or 17-18
        played1617 = False
        if row['Season'] == 2016:
            played1617 = True
            allData.set_value(index, 'weight', 0.3)
        elif row['Season'] == 2017:
            if played1617:
                allData.set_value(index, 'weight', 0.7)
            else:
                allData.set_value(index, 'weight', 0.3)
        else:
            allData.set_value(index, 'weight', 0.7)
    else:
        allData.set_value(index, 'weight', 1)

# sort all the rows by player ID first, then season. Makes it easier to figure out which player played which seasons
allData = allData.sort_values(by=['playerid', 'Season'])

# scale all relevant stats
allData[['wRC', 'PA', 'H', 'AB', 'RBI', 'G', '2B']] = preprocessing.scale(allData[['wRC', 'PA', 'H', 'AB', 'RBI', 'G', '2B']])

# assign X and Y sets for regression
X = allData[['wRC', 'PA', 'H', 'AB', 'RBI', 'G', '2B']].values
Y = allData[['R']].values

# add newly calculated weighted stats to our dataframe as columns
allData['weightWRC'] = allData['weight']*allData['wRC']
allData['weightPA'] = allData['weight']*allData['PA']
allData['weightH'] = allData['weight']*allData['H']
allData['weightAB'] = allData['weight']*allData['AB']
allData['weightRBI'] = allData['weight']*allData['RBI']
allData['weightG'] = allData['weight']*allData['G']
allData['weight2B'] = allData['weight']*allData['2B']

# sum up weighted averages by player, so that all years are combined
weightedRunStats2019 = allData.groupby('playerid', as_index=False).sum()
weightedRunStats2019 = weightedRunStats2019[['playerid', 'weightWRC', 'weightPA', 'weightH', 'weightAB', 'weightRBI', 'weightG', 'weight2B']]

# we lost player names when doing the group by and sum, so get the player names and put it back in our new data frame
names = []
for i in pid:
    names.append(playerNames[i])
weightedRunStats2019['Name'] = names
weightedRunStats2019['Season'] = 2019

# move name and season columns to the front of the dataframe
cols = weightedRunStats2019.columns.tolist()
cols = cols[-2:] + cols[:-2]
weightedRunStats2019 = weightedRunStats2019[cols]


# In[239]:


# X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
regressor = LinearRegression()
regressor.fit(X, Y)


# In[240]:


X_2019 = weightedRunStats2019[['weightWRC', 'weightPA', 'weightH', 'weightAB', 'weightRBI', 'weightG', 'weight2B']].values
y_pred = regressor.predict(X_2019)


# In[241]:


y_pred_list = [] # list of y_pred so we can add it to a dataframe
for i in range(len(y_pred)):
    y_pred_list.append(y_pred[i][0])

weightedRunStats2019['runsPredicted'] = y_pred_list

# X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
regr = LinearSVR(random_state=0)
regr.fit(X, Y)

# In[240]:
y_pred = regr.predict(X_2019)
weightedRunStats2019['linearSVRRuns'] = y_pred

regr = SVR()
regr.fit(X, Y)

y_pred = regr.predict(X_2019)
weightedRunStats2019['svrRuns'] = y_pred

weightedRunStats2019 = weightedRunStats2019.sort_values(by=['linearSVRRuns'], ascending=False)
print(weightedRunStats2019)
weightedRunStats2019.to_csv('RunsPredicted.csv')

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


                 Name  Season  playerid  weightWRC  weightPA   weightH  \
192  Charlie Blackmon    2019      7859   2.262592  1.631009  2.065036   
224  Paul Goldschmidt    2019      9218   2.318990  1.530593  1.547836   
399      Mookie Betts    2019     13611   2.371862  1.339011  1.763336   
263        Mike Trout    2019     10155   2.491706  0.876572  0.882180   
369  Francisco Lindor    2019     12916   1.970032  1.849677  1.816014   
..                ...     ...       ...        ...       ...       ...   
481     Dustin Fowler    2019     17098  -1.508973 -1.647025 -1.497899   
271     Rafael Ortega    2019     10323  -1.544221 -1.653631 -1.497899   
327   Ronny Rodriguez    2019     11875  -1.544221 -1.627206 -1.521844   
458      Victor Reyes    2019     15487  -1.685214 -1.541325 -1.402121   
431    Pedro Severino    2019     14523  -1.720463 -1.580962 -1.761288   

     weightAB  weightRBI   weightG  weight2B  runsPredicted  linearSVRRuns  \
192  1.648745   1.081317  1.24378

PermissionError: [Errno 13] Permission denied: 'RunsPredicted.csv'

In [34]:
allData = pd.read_csv('all-np.csv')


# In[238]:


# Some preprocessing on the column names
allData.astype({'Season': 'category'}).dtypes # set season to category instead of int
allData.set_index(['playerid', 'Name'])

# drop NaN values, which only come into play for pitchers running the bases (a very atypical occurrence)
allData = allData.dropna()

# dictionary assigning number of years in our data set played to each player
playerYears = dict() # dict of {'player ID': 'years in our data set that they played (1, 2, or 3)'}
pid, yearsPlayed = np.unique(allData['playerid'], return_counts=True)
for i in range(len(pid)):
    playerYears[pid[i]] = yearsPlayed[i]

# dictionary assigning names to player IDs
playerNames = dict() # dict where key is player ID, value is player name
# Assign each player ID to a name for future purposes
for i, r in allData.iterrows():
    playerNames[r['playerid']] = r['Name']

# assigns weights to each player and the data for that year. If played 2016-18, keep the weights of 0.6, 0.3, 0.1. If played two years, 0.7 and 0.3. If only played one year, assign 1 for now
for index, row in allData.iterrows():
    if playerYears[row['playerid']] == 3:
        continue
    elif playerYears[row['playerid']] == 2: # played in 16-17 or 17-18
        played1617 = False
        if row['Season'] == 2016:
            played1617 = True
            allData.set_value(index, 'weight', 0.3)
        elif row['Season'] == 2017:
            if played1617:
                allData.set_value(index, 'weight', 0.7)
            else:
                allData.set_value(index, 'weight', 0.3)
        else:
            allData.set_value(index, 'weight', 0.7)
    else:
        allData.set_value(index, 'weight', 1)

# sort all the rows by player ID first, then season. Makes it easier to figure out which player played which seasons
allData = allData.sort_values(by=['playerid', 'Season'])

# scale relevants stats
allData[['BABIP', 'OBP', 'wOBA', 'OPS', 'wRC+', 'wRAA', '1B']] = preprocessing.scale(allData[['BABIP', 'OBP', 'wOBA', 'OPS', 'wRC+', 'wRAA', '1B']])

# assign X and Y sets for regression
X = allData[['BABIP', 'OBP', 'wOBA', 'OPS', 'wRC+', 'wRAA', '1B']].values
Y = allData[['AVG']].values

# dictionary assigning number of years in our data set played to each player
playerYears = dict() # dict of {'player ID': 'years in our data set that they played (1, 2, or 3)'}
pid, yearsPlayed = np.unique(allData['playerid'], return_counts=True)
for i in range(len(pid)):
    playerYears[pid[i]] = yearsPlayed[i]

# dictionary assigning names to player IDs
playerNames = dict() # dict where key is player ID, value is player name
# Assign each player ID to a name for future purposes
for i, r in allData.iterrows():
    playerNames[r['playerid']] = r['Name']

# assigns weights to each player and the data for that year. If played 2016-18, keep the weights of 0.6, 0.3, 0.1. If played two years, 0.7 and 0.3. If only played one year, assign 1 for now
for index, row in allData.iterrows():
    if playerYears[row['playerid']] == 3:
        continue
    elif playerYears[row['playerid']] == 2: # played in 16-17 or 17-18
        played1617 = False
        if row['Season'] == 2016:
            played1617 = True
            allData.set_value(index, 'weight', 0.3)
        elif row['Season'] == 2017:
            if played1617:
                allData.set_value(index, 'weight', 0.7)
            else:
                allData.set_value(index, 'weight', 0.3)
        else:
            allData.set_value(index, 'weight', 0.7)
    else:
        allData.set_value(index, 'weight', 1)

# add newly calculated weighted stats to our dataframe as columns
allData['weightBABIP'] = allData['weight']*allData['BABIP']
allData['weightOBP'] = allData['weight']*allData['OBP']
allData['weightwOBA'] = allData['weight']*allData['wOBA']
allData['weightOPS'] = allData['weight']*allData['OPS']
allData['weightwRC+'] = allData['weight']*allData['wRC+']
allData['weightwRAA'] = allData['weight']*allData['wRAA']
allData['weight1B'] = allData['weight']*allData['1B']

# sum up weighted averages by player, so that all years are combined
weightedAvgStats2019 = allData.groupby('playerid', as_index=False).sum()
weightedAvgStats2019 = weightedAvgStats2019[['playerid', 'weightBABIP', 'weightOBP', 'weightwOBA', 'weightOPS', 'weightwRC+', 'weightwRAA', 'weight1B']]

# we lost player names when doing the group by and sum, so get the player names and put it back in our new data frame
names = []
for i in pid:
    names.append(playerNames[i])
weightedAvgStats2019['Name'] = names
weightedAvgStats2019['Season'] = 2019

# move name and season columns to the front of the dataframe
cols = weightedAvgStats2019.columns.tolist()
cols = cols[-2:] + cols[:-2]
weightedAvgStats2019 = weightedAvgStats2019[cols]


# In[13]:


# X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
regressor = LinearRegression()
regressor.fit(X, Y)


# In[14]:


X_2019 = weightedAvgStats2019[['weightBABIP', 'weightOBP', 'weightwOBA', 'weightOPS', 'weightwRC+', 'weightwRAA', 'weight1B']].values
y_pred = regressor.predict(X_2019)


# In[15]:


y_pred_list = [] # list of y_pred so we can add it to a dataframe
for i in range(len(y_pred)):
    y_pred_list.append(y_pred[i][0])

weightedAvgStats2019['AVGPredicted'] = y_pred_list

# X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
regr = LinearSVR(random_state=0)
regr.fit(X, Y)

# In[240]:

y_pred = regr.predict(X_2019)
weightedAvgStats2019['linearSvrAVG'] = y_pred

regr = SVR()
regr.fit(X, Y)

# In[240]:

y_pred = regr.predict(X_2019)
weightedAvgStats2019['svrAVG'] = y_pred


weightedAvgStats2019 = weightedAvgStats2019.sort_values(by=['linearSvrAVG'], ascending=False)
print(weightedAvgStats2019)
weightedAvgStats2019.to_csv('AVGPredictions.csv')

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


                 Name  Season  playerid  weightBABIP  weightOBP  weightwOBA  \
263        Mike Trout    2019     10155     1.084405   3.532253    3.182341   
153     J.D. Martinez    2019      6184     1.656939   1.817545    2.711331   
393      Tyler Naquin    2019     13359     3.035975   1.278558    1.376352   
128   Freddie Freeman    2019      5361     1.420218   1.884570    1.782767   
224  Paul Goldschmidt    2019      9218     1.469764   1.940424    1.866203   
..                ...     ...       ...          ...        ...         ...   
304      Brett Eibner    2019     11369    -2.028749  -1.681687   -1.476627   
385     Tyler Goeddel    2019     13180    -1.836069  -1.905102   -2.122585   
230    Chris Herrmann    2019      9284    -2.579262  -1.486199   -1.368968   
165          JB Shuck    2019      6677    -2.496685  -2.184370   -2.230244   
431    Pedro Severino    2019     14523    -2.469159  -2.016809   -2.741627   

     weightOPS  weightwRC+  weightwRAA  weight1B  A

In [35]:

allData = pd.read_csv('all-np.csv')


# In[238]:


# Some preprocessing on the column names
allData.astype({'Season': 'category'}).dtypes # set season to category instead of int
allData.set_index(['playerid', 'Name'])

# drop NaN values, which only come into play for pitchers running the bases (a very atypical occurrence)
allData = allData.dropna()

# dictionary assigning number of years in our data set played to each player
playerYears = dict() # dict of {'player ID': 'years in our data set that they played (1, 2, or 3)'}
pid, yearsPlayed = np.unique(allData['playerid'], return_counts=True)
for i in range(len(pid)):
    playerYears[pid[i]] = yearsPlayed[i]

# dictionary assigning names to player IDs
playerNames = dict() # dict where key is player ID, value is player name
# Assign each player ID to a name for future purposes
for i, r in allData.iterrows():
    playerNames[r['playerid']] = r['Name']

# assigns weights to each player and the data for that year. If played 2016-18, keep the weights of 0.6, 0.3, 0.1. If played two years, 0.7 and 0.3. If only played one year, assign 1 for now
for index, row in allData.iterrows():
    if playerYears[row['playerid']] == 3:
        continue
    elif playerYears[row['playerid']] == 2: # played in 16-17 or 17-18
        played1617 = False
        if row['Season'] == 2016:
            played1617 = True
            allData.set_value(index, 'weight', 0.3)
        elif row['Season'] == 2017:
            if played1617:
                allData.set_value(index, 'weight', 0.7)
            else:
                allData.set_value(index, 'weight', 0.3)
        else:
            allData.set_value(index, 'weight', 0.7)
    else:
        allData.set_value(index, 'weight', 1)


# sort all the rows by player ID first, then season. Makes it easier to figure out which player played which seasons
allData = allData.sort_values(by=['playerid', 'Season'])

# scale relevants stats
allData[['RBI', 'ISO', 'wRC', 'SLG', 'R', 'HR/FB', 'OPS']] = preprocessing.scale(allData[['RBI', 'ISO', 'wRC', 'SLG', 'R', 'HR/FB', 'OPS']])

# assign X and Y sets for regression
X = allData[['RBI', 'ISO', 'wRC', 'SLG', 'R', 'HR/FB', 'OPS']].values
Y = allData[['HR']].values

# add newly calculated weighted stats to our dataframe as columns
allData['weightRBI'] = allData['weight']*allData['RBI']
allData['weightISO'] = allData['weight']*allData['ISO']
allData['weightwRC'] = allData['weight']*allData['wRC']
allData['weightSLG'] = allData['weight']*allData['SLG']
allData['weightR'] = allData['weight']*allData['R']
allData['weightHRFB'] = allData['weight']*allData['HR/FB']
allData['weightOPS'] = allData['weight']*allData['OPS']

# sum up weighted averages by player, so that all years are combined
weightedHRStats2019 = allData.groupby('playerid', as_index=False).sum()
weightedHRStats2019 = weightedHRStats2019[['playerid', 'weightRBI', 'weightISO', 'weightwRC', 'weightSLG', 'weightR', 'weightHRFB', 'weightOPS']]

# we lost player names when doing the group by and sum, so get the player names and put it back in our new data frame
names = []
for i in pid:
    names.append(playerNames[i])
weightedHRStats2019['Name'] = names
weightedHRStats2019['Season'] = 2019

# move name and season columns to the front of the dataframe
cols = weightedHRStats2019.columns.tolist()
cols = cols[-2:] + cols[:-2]
weightedHRStats2019 = weightedHRStats2019[cols]


# In[13]:


# X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
regressor = LinearRegression()
regressor.fit(X, Y)


# In[14]:


X_2019 = weightedHRStats2019[['weightRBI', 'weightISO', 'weightwRC', 'weightSLG', 'weightR', 'weightHRFB', 'weightOPS']].values
y_pred = regressor.predict(X_2019)


# In[15]:


y_pred_list = [] # list of y_pred so we can add it to a dataframe
for i in range(len(y_pred)):
    y_pred_list.append(y_pred[i][0])

weightedHRStats2019['HRsPredicted'] = y_pred_list

# X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
regr = LinearSVR(random_state=0)
regr.fit(X, Y)

# In[240]:

y_pred = regr.predict(X_2019)
weightedHRStats2019['linearSvrHR'] = y_pred

regr = SVR()
regr.fit(X, Y)

# In[240]:

y_pred = regr.predict(X_2019)
weightedHRStats2019['SvrHR'] = y_pred


weightedHRStats2019 = weightedHRStats2019.sort_values(by=['linearSvrHR'], ascending=False)
print(weightedHRStats2019)
weightedHRStats2019.to_csv('HRPredictions.csv')

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


                  Name  Season  playerid  weightRBI  weightISO  weightwRC  \
153      J.D. Martinez    2019      6184   2.420194   2.677996   2.262592   
220        Khris Davis    2019      9112   2.458890   2.228543   1.525903   
107  Giancarlo Stanton    2019      4949   2.071931   1.915179   1.924209   
4          David Ortiz    2019       745   2.845849   2.439839   2.403585   
416         Joey Gallo    2019     14128   1.352188   2.395073   0.849137   
..                 ...     ...       ...        ...        ...        ...   
42         Carlos Ruiz    2019      2579  -1.488089  -1.499588  -1.050745   
249      Ivan De Jesus    2019      9886  -1.294609  -1.965156  -1.332732   
468      Austin Slater    2019     16153  -1.178522  -2.036782  -1.297483   
306         Tyler Holt    2019     11391  -1.565480  -1.929344  -1.367980   
376    Kelby Tomlinson    2019     13005  -1.642872  -2.000969  -1.332732   

     weightSLG   weightR  weightHRFB  weightOPS  HRsPredicted  linearSvrHR 

In [26]:

allData = pd.read_csv('all-np.csv')


# In[238]:


# Some preprocessing on the column names
allData.astype({'Season': 'category'}).dtypes # set season to category instead of int
allData.set_index(['playerid', 'Name'])

# drop NaN values, which only come into play for pitchers running the bases (a very atypical occurrence)
allData = allData.dropna()

# dictionary assigning number of years in our data set played to each player
playerYears = dict() # dict of {'player ID': 'years in our data set that they played (1, 2, or 3)'}
pid, yearsPlayed = np.unique(allData['playerid'], return_counts=True)
for i in range(len(pid)):
    playerYears[pid[i]] = yearsPlayed[i]

# dictionary assigning names to player IDs
playerNames = dict() # dict where key is player ID, value is player name
# Assign each player ID to a name for future purposes
for i, r in allData.iterrows():
    playerNames[r['playerid']] = r['Name']

# assigns weights to each player and the data for that year. If played 2016-18, keep the weights of 0.6, 0.3, 0.1. If played two years, 0.7 and 0.3. If only played one year, assign 1 for now
for index, row in allData.iterrows():
    if playerYears[row['playerid']] == 3:
        continue
    elif playerYears[row['playerid']] == 2: # played in 16-17 or 17-18
        played1617 = False
        if row['Season'] == 2016:
            played1617 = True
            allData.set_value(index, 'weight', 0.3)
        elif row['Season'] == 2017:
            if played1617:
                allData.set_value(index, 'weight', 0.7)
            else:
                allData.set_value(index, 'weight', 0.3)
        else:
            allData.set_value(index, 'weight', 0.7)
    else:
        allData.set_value(index, 'weight', 1)


# sort all the rows by player ID first, then season. Makes it easier to figure out which player played which seasons
allData = allData.sort_values(by=['playerid', 'Season'])

# scale all relevant stats
allData[['wOBA', 'wRC+', 'SLG', 'wRAA', 'OBP', 'ISO', 'HR']] = preprocessing.scale(allData[['wOBA', 'wRC+', 'SLG', 'wRAA', 'OBP', 'ISO', 'HR']])

# assign X and Y sets for regression
X = allData[['wOBA', 'wRC+', 'SLG', 'wRAA', 'OBP', 'ISO', 'HR']].values
Y = allData[['OPS']].values


# add newly calculated weighted stats to our dataframe as columns
allData['weightwOBA'] = allData['weight']*allData['wOBA']
allData['weightwRC+'] = allData['weight']*allData['wRC+']
allData['weightSLG'] = allData['weight']*allData['SLG']
allData['weightwRAA'] = allData['weight']*allData['wRAA']
allData['weightOBP'] = allData['weight']*allData['OBP']
allData['weightISO'] = allData['weight']*allData['ISO']
allData['weightHR'] = allData['weight']*allData['HR']

# sum up weighted averages by player, so that all years are combined
weightedOPSStats2019 = allData.groupby('playerid', as_index=False).sum()
weightedOPSStats2019 = weightedOPSStats2019[['playerid', 'weightwOBA', 'weightwRC+', 'weightSLG', 'weightwRAA', 'weightOBP', 'weightISO', 'weightHR']]

# we lost player names when doing the group by and sum, so get the player names and put it back in our new data frame
names = []
for i in pid:
    names.append(playerNames[i])
weightedOPSStats2019['Name'] = names
weightedOPSStats2019['Season'] = 2019

# move name and season columns to the front of the dataframe
cols = weightedOPSStats2019.columns.tolist()
cols = cols[-2:] + cols[:-2]
weightedOPSStats2019 = weightedOPSStats2019[cols]


# In[239]:


# X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
regressor = LinearRegression()
regressor.fit(X, Y)


# In[240]:


X_2019 = weightedOPSStats2019[['weightwOBA', 'weightwRC+', 'weightSLG', 'weightwRAA', 'weightOBP', 'weightISO', 'weightHR']].values
y_pred = regressor.predict(X_2019)


# In[241]:


y_pred_list = [] # list of y_pred so we can add it to a dataframe
for i in range(len(y_pred)):
    y_pred_list.append(y_pred[i][0])

weightedOPSStats2019['opsPredicted'] = y_pred_list

# X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
regr = LinearSVR(random_state=0)
regr.fit(X, Y)

# In[240]:

y_pred = regr.predict(X_2019)
weightedOPSStats2019['linearSvrOPS'] = y_pred

regr = SVR()
regr.fit(X, Y)

# In[240]:

y_pred = regr.predict(X_2019)
weightedOPSStats2019['svrOPS'] = y_pred


weightedOPSStats2019 = weightedOPSStats2019.sort_values(by=['linearSvrOPS'], ascending=False)
print(weightedOPSStats2019)
weightedOPSStats2019.to_csv('OPSPredictions.csv')

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


                Name  Season  playerid  weightwOBA  weightwRC+  weightSLG  \
263       Mike Trout    2019     10155    3.182341    3.421117   2.800025   
153    J.D. Martinez    2019      6184    2.711331    2.661648   3.049516   
4        David Ortiz    2019       745    2.587522    2.525024   2.792856   
399     Mookie Betts    2019     13611    2.318373    2.271868   2.149055   
389        Max Muncy    2019     13301    2.264544    2.484840   2.247991   
..               ...     ...       ...         ...         ...        ...   
385    Tyler Goeddel    2019     13180   -2.122585   -2.136266  -1.924526   
165         JB Shuck    2019      6677   -2.230244   -2.176450  -1.809818   
5    A.J. Pierzynski    2019       746   -2.337904   -2.377368  -1.738125   
458     Victor Reyes    2019     15487   -2.526308   -2.497918  -1.967542   
431   Pedro Severino    2019     14523   -2.741627   -2.779203  -2.555422   

     weightwRAA  weightOBP  weightISO  weightHR  opsPredicted  linearSvrOPS

In [36]:

allData = pd.read_csv('all-np.csv')


# In[238]:


# Some preprocessing on the column names
allData.astype({'Season': 'category'}).dtypes # set season to category instead of int
allData.set_index(['playerid', 'Name'])

# drop NaN values, which only come into play for pitchers running the bases (a very atypical occurrence)
allData = allData.dropna()

# dictionary assigning number of years in our data set played to each player
playerYears = dict() # dict of {'player ID': 'years in our data set that they played (1, 2, or 3)'}
pid, yearsPlayed = np.unique(allData['playerid'], return_counts=True)
for i in range(len(pid)):
    playerYears[pid[i]] = yearsPlayed[i]

# dictionary assigning names to player IDs
playerNames = dict() # dict where key is player ID, value is player name
# Assign each player ID to a name for future purposes
for i, r in allData.iterrows():
    playerNames[r['playerid']] = r['Name']

# assigns weights to each player and the data for that year. If played 2016-18, keep the weights of 0.6, 0.3, 0.1. If played two years, 0.7 and 0.3. If only played one year, assign 1 for now
for index, row in allData.iterrows():
    if playerYears[row['playerid']] == 3:
        continue
    elif playerYears[row['playerid']] == 2: # played in 16-17 or 17-18
        played1617 = False
        if row['Season'] == 2016:
            played1617 = True
            allData.set_value(index, 'weight', 0.3)
        elif row['Season'] == 2017:
            if played1617:
                allData.set_value(index, 'weight', 0.7)
            else:
                allData.set_value(index, 'weight', 0.3)
        else:
            allData.set_value(index, 'weight', 0.7)
    else:
        allData.set_value(index, 'weight', 1)


# sort all the rows by player ID first, then season. Makes it easier to figure out which player played which seasons
allData = allData.sort_values(by=['playerid', 'Season'])

# scale relevants stats
allData[['wRC', 'HR',  'PA', 'AB', 'R', 'H', '2B']] = preprocessing.scale(allData[['wRC', 'HR',  'PA', 'AB', 'R', 'H', '2B']])

# assign X and Y sets for regression
X = allData[['wRC', 'HR',  'PA', 'AB', 'R', 'H', '2B']].values
Y = allData[['RBI']].values

# add newly calculated weighted stats to our dataframe as columns
allData['weightwRC'] = allData['weight']*allData['wRC']
allData['weightHR'] = allData['weight']*allData['HR']
allData['weightPA'] = allData['weight']*allData['PA']
allData['weightAB'] = allData['weight']*allData['AB']
allData['weightR'] = allData['weight']*allData['R']
allData['weightH'] = allData['weight']*allData['H']
allData['weight2B'] = allData['weight']*allData['2B']


# sum up weighted averages by player, so that all years are combined
weightedRBIStats2019 = allData.groupby('playerid', as_index=False).sum()
weightedRBIStats2019 = weightedRBIStats2019[['playerid', 'weightwRC', 'weightHR', 'weightPA', 'weightAB', 'weightR' ,'weightH', 'weight2B']]

# we lost player names when doing the group by and sum, so get the player names and put it back in our new data frame
names = []
for i in pid:
    names.append(playerNames[i])
weightedRBIStats2019['Name'] = names
weightedRBIStats2019['Season'] = 2019

# move name and season columns to the front of the dataframe
cols = weightedRBIStats2019.columns.tolist()
cols = cols[-2:] + cols[:-2]
weightedRBIStats2019 = weightedRBIStats2019[cols]


# In[13]:


# X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
regressor = LinearRegression()
regressor.fit(X, Y)


# In[14]:


X_2019 = weightedRBIStats2019[['weightwRC', 'weightHR', 'weightPA', 'weightAB', 'weightR' ,'weightH', 'weight2B']].values
y_pred = regressor.predict(X_2019)


# In[15]:


y_pred_list = [] # list of y_pred so we can add it to a dataframe
for i in range(len(y_pred)):
    y_pred_list.append(y_pred[i][0])

weightedRBIStats2019['RBIsPredicted'] = y_pred_list

# X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
regr = LinearSVR(random_state=0)
regr.fit(X, Y)

# In[240]:

y_pred = regr.predict(X_2019)
weightedRBIStats2019['linearSvrRBI'] = y_pred

regr = SVR()
regr.fit(X, Y)

# In[240]:

y_pred = regr.predict(X_2019)
weightedRBIStats2019['svrRBI'] = y_pred


weightedRBIStats2019 = weightedRBIStats2019.sort_values(by=['linearSvrRBI'], ascending=False)
print(weightedRBIStats2019)
weightedRBIStats2019.to_csv('RBIPredictions.csv')



                  Name  Season  playerid  weightwRC  weightHR  weightPA  \
4          David Ortiz    2019       745   2.403585  2.303024  1.147429   
107  Giancarlo Stanton    2019      4949   1.924209  2.825340  1.488313   
220        Khris Davis    2019      9112   1.525903  3.096543  1.299374   
242      Nolan Arenado    2019      9777   2.259068  2.303024  1.486992   
153      J.D. Martinez    2019      6184   2.262592  2.654583  0.895070   
..                 ...     ...       ...        ...       ...       ...   
442    Victor Caratini    2019     14968  -1.473725 -1.313013 -1.666844   
271      Rafael Ortega    2019     10323  -1.544221 -1.413459 -1.653631   
376    Kelby Tomlinson    2019     13005  -1.332732 -1.413459 -1.521506   
306         Tyler Holt    2019     11391  -1.367980 -1.513904 -1.613994   
458       Victor Reyes    2019     15487  -1.685214 -1.413459 -1.541325   

     weightAB   weightR   weightH  weight2B  RBIsPredicted  linearSvrRBI  \
4    0.984012  0.943859

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [37]:
weightedRBIStats2019.reset_index()
dfa = weightedRBIStats2019.join(weightedAvgStats2019, how='outer', lsuffix='playerid', rsuffix='playerid')
dfa = dfa.join(weightedHRStats2019, how='outer', lsuffix='playerid', rsuffix='playerid')
dfa = dfa.join(weightedOPSStats2019, how='outer', lsuffix='playerid', rsuffix='playerid')
dfa = dfa.join(weightedRunStats2019, how='outer', lsuffix='playerid', rsuffix='playerid')
print(dfa)
dfa.to_csv('allPredictions2019.csv')

            Nameplayerid  Seasonplayerid  playeridplayerid  weightwRCplayerid  \
0        Victor Martinez            2019               393          -0.268234   
1             Juan Uribe            2019               454          -1.438476   
2         Carlos Beltran            2019               589           0.204937   
3          Adrian Beltre            2019               639           0.334512   
4            David Ortiz            2019               745           2.403585   
..                   ...             ...               ...                ...   
500           Dae-Ho Lee            2019             18722          -0.733511   
501         Yuli Gurriel            2019             19198           0.560101   
502  Lourdes Gurriel Jr.            2019             19238          -0.909752   
503        Shohei Ohtani            2019             19755           0.253441   
504            Juan Soto            2019             20123           1.099399   

     weightHRplayerid  weig

PermissionError: [Errno 13] Permission denied: 'allPredictions2019.csv'

In [38]:
dfa.to_csv('allPredictions2019.csv')