In [1]:
# --- Imports (standard library, then third-party) ---------------------------
from datetime import datetime

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
# `display` and `HTML` belong to the public `IPython.display` module; importing
# them through `IPython.core.display` is deprecated.
from IPython.display import display, HTML
from pybaseball import batting_stats, pitching_stats, cache, playerid_lookup, statcast_batter, statcast_pitcher, statcast
from sqlalchemy import MetaData, text, Column, Integer, String, ForeignKey, Table, create_engine, Float, Boolean, DateTime
from sqlalchemy.orm import relationship, backref, sessionmaker
from sqlalchemy.ext.declarative import declarative_base

# --- Notebook display options ------------------------------------------------
# Widen the notebook container and show every DataFrame column.
display(HTML("<style>.container { width:90% !important; }</style>"))
pd.options.display.max_columns = None

# --- pybaseball cache --------------------------------------------------------
# Turn on pybaseball's cache, switch its storage format to CSV, and persist
# that configuration.
cache.enable()
cache.config.cache_type='csv'
cache.config.save()

# --- Database handle ---------------------------------------------------------
# SQLite database file `fantasy_data.db`, resolved relative to the working
# directory; `engine` is used by later cells to read the `players` table.
meta = MetaData()
engine = create_engine('sqlite:///fantasy_data.db', echo=False)

In [3]:
# League / auction configuration used by the cells below.
n_teams = 13

# Per-team roster slots. The hitter and pitcher counts are named once here so
# the per-team total and the league-wide totals below cannot drift apart.
tm_hitters = 14
tm_pitchers = 9
tm_players = tm_hitters + tm_pitchers  # 23 roster spots per team

# Auction budget per team.
tm_dollars = 260

# NOTE(review): presumably the share of the budget allocated to hitters, with
# pitchers receiving the remainder -- confirm against where these are consumed.
player_split = .65
pitcher_split = 1 - player_split

# League-wide totals derived from the per-team values.
tot_dollars = n_teams * tm_dollars
tot_players = n_teams * tm_players
tot_hitters = n_teams * tm_hitters
tot_pitchers = n_teams * tm_pitchers

def owners(conv):
    """Load the `players` table and build a per-owner summary.

    Reads the whole `players` table through the module-level `engine`, groups
    it by `Owner` (rows with a missing `Owner` are excluded by pandas' default
    groupby behaviour) and derives spending and standings columns.

    INPUT:
        conv: multiplier applied to each owner's summed `z` to express it in
              dollars when computing `Value`
    OUTPUT:
        (players_df, owners_df)
        players_df: every row of the `players` table, sorted by `z` descending
        owners_df: one row per owner with the aggregated stats plus
            'Drafted'        number of players drafted
            '$/unit'         Paid / z
            '$ Left'         tm_dollars - Paid
            '$ Left / Plyr'  $ Left / remaining roster spots
            'Value'          z * conv - Paid
            'BA'             H / AB
            'Pts'            sum of the owner's per-category ranks
            'Rank'           standing by 'Pts' (1 = most points)
    """
    df = pd.read_sql('players', engine)

    summed_cols = ['Paid', 'z', 'H', 'AB', 'HR', 'R', 'RBI', 'SB', 'W', 'Sv+Hld', 'SO']
    agg_spec = {'Name': 'count', **{col: 'sum' for col in summed_cols}}
    owners_df = (
        df.groupby('Owner')
        .agg(agg_spec)
        .reset_index()
        .rename(columns={'Name': 'Drafted'})
    )

    owners_df['$/unit'] = owners_df['Paid'] / owners_df['z']
    owners_df['$ Left'] = tm_dollars - owners_df['Paid']
    owners_df['$ Left / Plyr'] = owners_df['$ Left'] / (tm_players - owners_df['Drafted'])
    owners_df['Value'] = (owners_df['z'] * conv) - owners_df['Paid']
    owners_df['BA'] = owners_df['H'] / owners_df['AB']

    # Within each category the largest total gets the largest rank, so a
    # higher 'Pts' sum is better. skipna=False keeps a missing category
    # (e.g. BA when AB is 0) propagating to 'Pts' instead of being ignored.
    scoring_cats = ['BA', 'HR', 'R', 'RBI', 'SB', 'W', 'Sv+Hld', 'SO']
    owners_df['Pts'] = owners_df[scoring_cats].rank().sum(axis=1, skipna=False)

    # Rank descending so that the owner with the most points is ranked 1;
    # an ascending rank would place the leader last.
    owners_df['Rank'] = owners_df['Pts'].rank(ascending=False)
    return df.sort_values('z', ascending=False), owners_df

# Multiplier that owners() applies to summed z when computing each owner's
# 'Value'. NOTE(review): the origin of this exact figure is not shown here.
conv = 4.818971892407433
df, owners_df = owners(conv)

# Per-player gap between 'Value' and 'Dollars'.
df['diff'] = df['Value'].sub(df['Dollars'])

In [46]:
# Load the exit-velocity export, order it by brl_pa (highest first) and attach
# each player's position / z / Value from `df`. The left join keeps rows whose
# playerid has no match in `df`.
sc = (
    pd.read_csv('data/statcast-exit_velocity.csv')
    .sort_values('brl_pa', ascending=False)
    .merge(df[['playerid', 'Name', 'Primary_Pos', 'z', 'Value']], on='playerid', how='left')
)

# Rows without a match in `df` have no Name; fall back to the name columns
# from the CSV. Assign the result back to the column instead of calling
# fillna(inplace=True) on `sc.Name`: the chained in-place form warns in
# pandas 2.x and does not update `sc` under Copy-on-Write.
sc['Name'] = sc['Name'].fillna(sc['first_name'] + ' ' + sc['last_name'])

# Drop rows whose primary position is SP or RP (rows with a missing
# Primary_Pos are kept). Copy so the column added next is written to an
# independent frame rather than a slice of the merged one.
sc = sc[~sc['Primary_Pos'].isin(['SP', 'RP'])].copy()

# Percentile rank of brl_pa among the remaining rows.
sc['brl_pa_rank'] = sc['brl_pa'].rank(pct=True)

In [11]:
from sklearn.cluster import OPTICS, DBSCAN

In [110]:
# Feature matrix handed to the clustering models in the next cell.
X = sc.loc[:, ['anglesweetspotpercent', 'ev95percent', 'brl_pa']]

In [121]:
# Fit both density-based clusterers on the same feature matrix.
optics = OPTICS(min_samples=5).fit(X)
dbscan = DBSCAN(eps=1, min_samples=5).fit(X)

# Store each model's per-row labels on `sc` ...
for label_col, fitted_model in (('optics', optics), ('dbscan', dbscan)):
    sc[label_col] = fitted_model.labels_

# ... then report how many distinct labels each one produced.
for label_col in ('optics', 'dbscan'):
    print(len(sc[label_col].value_counts()))

57
39


In [122]:
# Number of rows per DBSCAN label (scikit-learn labels noise points as -1).
sc['dbscan'].value_counts()

-1     937
 10    171
 0     131
 11     40
 6      22
 4      18
 26     10
 2      10
 15      8
 27      8
 36      8
 35      8
 7       8
 33      8
 29      7
 1       7
 24      7
 21      7
 9       7
 32      7
 12      7
 3       6
 19      6
 31      6
 16      5
 14      5
 13      5
 17      5
 37      5
 20      5
 8       5
 22      5
 34      5
 5       5
 23      5
 25      5
 28      5
 18      4
 30      4
Name: dbscan, dtype: int64

In [128]:
# Average 'Value' within each DBSCAN label, lowest first.
sc.groupby('dbscan')['Value'].agg('mean').sort_values()

dbscan
 19   -34.575109
 35   -33.476452
 25   -30.771144
 34   -26.356247
 22   -26.275456
 27   -25.572614
 32   -23.803004
 5    -18.850637
 29   -18.264357
 28   -17.657460
 37   -15.726575
 16   -12.817111
 33   -12.561029
 24   -11.939041
 36   -11.823249
 15   -11.722316
 10   -11.554837
 23   -10.199643
-1    -10.003129
 31    -9.637822
 11    -9.063883
 12    -6.863034
 30    -6.860451
 20    -6.713214
 18    -6.654697
 26    -6.361071
 7     -3.417392
 14    -2.898107
 0     -1.775202
 4     -1.514089
 21    -1.096644
 6     -0.446365
 13     0.763880
 1      1.083382
 17     1.786212
 3      2.002396
 2      8.827477
 9      9.385388
 8     17.983151
Name: Value, dtype: float64

In [147]:
# DBSCAN label to inspect.
cluster_id = 2
mask = (sc['dbscan'] == cluster_id)

# Filter once and reuse the result for every trace attribute.
cluster_df = sc[mask]

# Hover text: name, year, Value and playerid on separate lines. A missing
# playerid is rendered as an empty string so it cannot turn the whole
# concatenated label into NaN.
playerid_text = cluster_df['playerid'].astype(str).where(cluster_df['playerid'].notna(), '')
hover_text = (
    cluster_df['Name'] + '<br>'
    + cluster_df['year'].astype(str) + '<br>'
    + cluster_df['Value'].astype(str) + '<br>'
    + playerid_text
)

fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=cluster_df['brl_pa'],
        y=cluster_df['anglesweetspotpercent'],
        mode='markers',
        marker=dict(color=cluster_df['Value'], colorscale='bluered'),
        text=hover_text,
    )
)
# Label the figure so it can be read on its own.
fig.update_layout(
    title=f'DBSCAN cluster {cluster_id}',
    xaxis_title='brl_pa',
    yaxis_title='anglesweetspotpercent',
)
fig.show()

In [24]:
from sklearn.preprocessing import MinMaxScaler
from scipy.stats.stats import pearsonr
def scale_data(df, cols):
    """
    Min-max scale a subset of columns with scikit-learn's MinMaxScaler
    (default settings).

    INPUT:
        df: original dataframe
        cols: list of column labels in `df` to scale
    OUTPUT:
        df: new dataframe holding only the scaled `cols`, with the same index
            as `df` and the same flat column labels as `df[cols]`
    """
    subset = df[cols]
    scaled_values = MinMaxScaler().fit_transform(subset)
    # Reuse the subset's own column Index. Wrapping the label list in another
    # list (columns=[[...]]) makes pandas build a single-level MultiIndex,
    # whose labels are 1-tuples instead of the original column names.
    return pd.DataFrame(scaled_values, index=subset.index, columns=subset.columns)

def add_distance_metrics(h, player_id, col_list, n_neighbors=10):
    """
    Find the rows of `h` closest to one player in min-max-scaled feature space.

    INPUT:
        h: dataframe with 'playerid', 'Name', 'Value' and every column in
           `col_list`
        player_id: 'playerid' value of the reference player; must identify
                   exactly one row of `h`
        col_list: list of column names used as the feature space
        n_neighbors: number of rows to return, reference player included
                     (default 10)
    OUTPUT:
        df: indexed by playerid, with 'Name', 'Value', the unscaled `col_list`
            columns and 'eucl_dist' (Euclidean distance to the reference player
            on the scaled features), sorted by 'eucl_dist' ascending
    RAISES:
        KeyError: `player_id` is not present in `h`
        ValueError: `player_id` matches more than one row of `h`
    """
    scaled_df = scale_data(h.set_index('playerid'), col_list)

    reference = scaled_df.loc[player_id]
    if isinstance(reference, pd.DataFrame):
        # Several rows share this id (e.g. multiple seasons of one player), so
        # there is no single reference point to measure from.
        raise ValueError(
            f"playerid {player_id!r} matches {len(reference)} rows; "
            "filter `h` to one row per player first"
        )

    result = h.loc[:, ['playerid', 'Name', 'Value'] + col_list].set_index('playerid')

    # `scaled_df` and `result` are both built from `h` in the same row order,
    # so the distances are assigned positionally. This computes every row in
    # one vectorized step and avoids label-based writes, which mis-assign
    # values when an index label is duplicated.
    deltas = scaled_df.to_numpy(dtype=float) - reference.to_numpy(dtype=float)
    result['eucl_dist'] = np.linalg.norm(deltas, axis=1)

    return result.sort_values('eucl_dist').iloc[:n_neighbors]

In [175]:
# Reuse the clustering feature names as the distance feature space.
cols = list(X.columns)

In [176]:
# 2021 rows only, and only those with a playerid (it becomes the index inside
# add_distance_metrics); neighbours are measured from playerid '20003'.
season_2021 = sc[sc['year'].eq(2021) & sc['playerid'].notna()]
add_distance_metrics(season_2021, '20003', cols)

Unnamed: 0_level_0,Name,Value,anglesweetspotpercent,ev95percent,brl_pa,eucl_dist
playerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
20003,Keston Hiura,-20.4997,34.3,41.4,7.6,0.0
18607,Jared Walsh,11.264866,33.6,41.2,7.4,0.02498
12144,Max Kepler,4.859777,35.3,42.5,7.3,0.042449
11442,Gary Sanchez,-0.121855,34.4,42.0,8.2,0.047523
13510,Jose Ramirez,31.398757,33.4,42.3,8.2,0.055319
12533,Marcus Semien,22.290029,35.2,41.3,6.9,0.058972


In [25]:
# Rebinds `cols` (previously derived from X.columns) to the five stat columns
# used as the feature space for the add_distance_metrics call on `df`.
cols = ['HR', 'R', 'RBI', 'SB', 'BA']

In [31]:
# Restrict to players with no Owner, then list the rows nearest to
# playerid '11739' on the columns currently in `cols`.
undrafted = df[df['Owner'].isna()]
add_distance_metrics(undrafted, '11739', cols)

Unnamed: 0_level_0,Name,Value,HR,R,RBI,SB,BA,eucl_dist
playerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
11739,J.T. Realmuto,25.364503,21.0,76.0,81.0,10.0,0.258941,0.0
16997,Gleyber Torres,13.626992,21.0,74.0,76.0,12.0,0.263818,0.073975
5760,Avisaíl García,10.418367,22.0,68.0,75.0,8.0,0.260638,0.104101
18314,Dansby Swanson,12.661707,23.0,82.0,77.0,10.0,0.248169,0.105489
12155,Eddie Rosario,14.347798,24.0,68.0,82.0,8.0,0.266899,0.124373
19566,Nathaniel Lowe,2.591434,21.0,77.0,76.0,5.0,0.260626,0.127749
19363,Austin Hays,9.770525,23.0,72.0,73.0,6.0,0.256731,0.134266
15112,Ryan McMahon,2.35307,23.0,73.0,80.0,5.0,0.254296,0.135288
3473,Anthony Rizzo,4.087522,24.0,76.0,78.0,5.0,0.258258,0.14325
11477,Christian Yelich,21.388996,24.0,87.0,79.0,14.0,0.262925,0.158454


In [60]:
# Players with no Owner whose 'Pos' string contains 'SP', ordered by 'diff'
# (Value minus Dollars) descending; first 50 rows shown.
report_cols = [
    'playerid', 'Name', 'Primary_Pos', 'Pos', 'z', 'Dollars', 'Value', 'diff',
    'HR', 'R', 'RBI', 'SB', 'BA', 'PA',
    'W', 'SO', 'ERA', 'WHIP', 'Sv+Hld',
]
unowned_sp = (df['Owner'].isna()) & (df['Pos'].str.contains('SP'))
df[unowned_sp][report_cols].sort_values('diff', ascending=False).head(50)

Unnamed: 0,playerid,Name,Primary_Pos,Pos,z,Dollars,Value,diff,HR,R,RBI,SB,BA,PA,W,SO,ERA,WHIP,Sv+Hld
677,13164,Eduardo Rodriguez,SP,SP,3.039927,4.0,14.649325,10.649325,,,,,,,11.0,186.0,3.657401,1.236066,0.0
696,14444,Hyun Jin Ryu,SP,SP,2.272612,1.0,10.951651,9.951651,,,,,,,12.0,143.0,3.901852,1.216318,0.0
650,15873,Sean Manaea,SP,SP,3.43177,8.0,16.537602,8.537602,,,,,,,11.0,169.0,3.656705,1.165906,0.0
638,8700,Justin Verlander,SP,SP,5.075417,16.0,24.458293,8.458293,,,,,,,11.0,191.0,3.56044,1.042386,0.0
669,13781,Alex Wood,SP,SP,1.756188,1.0,8.463021,7.463021,,,,,,,9.0,148.0,3.759474,1.239683,0.0
701,23550,Aaron Ashby,SP,SP,1.515641,0.0,7.303834,7.303834,,,,,,,7.0,111.0,3.554844,1.2917,9.0
685,14120,Lance McCullers Jr.,SP,SP,1.868237,2.0,9.002983,7.002983,,,,,,,9.0,152.0,3.608377,1.267227,0.0
708,12703,Trevor Bauer,SP,SP,2.253248,4.0,10.858337,6.858337,,,,,,,8.0,145.0,3.696099,1.149897,0.0
706,16511,Jordan Montgomery,SP,SP,1.385593,1.0,6.677131,5.677131,,,,,,,9.0,158.0,3.991644,1.253415,0.0
740,16269,John Means,SP,SP,1.579859,2.0,7.613295,5.613295,,,,,,,10.0,147.0,4.271186,1.177196,0.0


In [43]:
# Category totals over rows with positive z whose Primary_Pos is not SP/RP.
positive_z_hitters = df[(df['z'] > 0) & (~df['Primary_Pos'].isin(['SP', 'RP']))]
for stat in ('HR', 'R', 'RBI', 'SB', 'H'):
    print(stat, positive_z_hitters[stat].sum())

HR 3777.0
R 12269.0
RBI 12205.0
SB 1274.0
H 21085.0


In [48]:
# Category totals over rows with positive z whose Primary_Pos is SP or RP.
positive_z_pitchers = df[(df['z'] > 0) & (df['Primary_Pos'].isin(['SP', 'RP']))]
for stat in ('ER', 'HA', 'BB', 'W', 'SO', 'Sv+Hld'):
    print(stat, positive_z_pitchers[stat].sum())

ER 6252.0
HA 13235.0
BB 4750.0
W 957.0
SO 16328.0
Sv+Hld 843.0
