In [4]:
# --- Imports -----------------------------------------------------------------
from datetime import datetime

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
# `display` and `HTML` are part of the public IPython.display API; importing
# them from IPython.core.display is deprecated.
from IPython.display import display, HTML
from pybaseball import batting_stats, pitching_stats, cache, playerid_lookup, statcast_batter, statcast_pitcher, statcast
from sqlalchemy import MetaData, text, Column, Integer, String, ForeignKey, Table, create_engine, Float, Boolean, DateTime
from sqlalchemy.orm import relationship, backref, sessionmaker
from sqlalchemy.ext.declarative import declarative_base

# --- Notebook display options --------------------------------------------------
# Stretch the notebook's `.container` element to 90% of the browser width.
display(HTML("<style>.container { width:90% !important; }</style>"))
# Show every column when a DataFrame is rendered.
pd.options.display.max_columns = None

# --- pybaseball cache ----------------------------------------------------------
# Turn the pybaseball cache on, store entries as CSV, and persist that config.
cache.enable()
cache.config.cache_type='csv'
cache.config.save()

# --- Database ------------------------------------------------------------------
# SQLite file in the working directory; `echo=False` keeps SQL logging off.
meta = MetaData()
engine = create_engine('sqlite:///fantasy_data.db', echo=False)

In [7]:
# League format. Roster size is derived from the hitter/pitcher slots so the
# per-team and league-wide totals below cannot drift apart.
n_teams = 12
tm_hitters = 14   # hitter slots per team
tm_pitchers = 9   # pitcher slots per team
tm_players = tm_hitters + tm_pitchers
tm_dollars = 260  # budget per team

# NOTE(review): presumably the share of the budget allotted to hitters vs.
# pitchers -- neither value is used in the visible cells; confirm.
player_split = .6
pitcher_split = 1 - player_split

# League-wide totals.
tot_dollars = n_teams * tm_dollars
tot_players = n_teams * tm_players
tot_hitters = n_teams * tm_hitters
tot_pitchers = n_teams * tm_pitchers

def owners(conv):
    """Read the ``players`` table and summarise it per owner.

    Parameters
    ----------
    conv : float
        Dollars per unit of ``z``; an owner's summed ``z`` times ``conv``
        minus what they paid becomes the ``Value`` column.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The full ``players`` table sorted by ``z`` (descending), and a frame
        with one row per ``Owner`` holding draft counts, summed stats, budget
        figures, team batting average and rank-based points.
    """
    players = pd.read_sql('players', engine)

    # 'Name' is counted (players drafted); every other column is summed.
    summed_cols = ['Paid', 'z', 'H', 'AB', 'HR', 'R', 'RBI', 'SB', 'W', 'Sv+Hld', 'SO']
    agg_spec = {'Name': 'count'}
    agg_spec.update({col: 'sum' for col in summed_cols})
    owners_df = (
        players.groupby('Owner')
        .agg(agg_spec)
        .reset_index()
        .rename(columns={'Name': 'Drafted'})
    )

    # Budget figures, relative to the league-wide per-team dollars and roster size.
    dollars_left = tm_dollars - owners_df['Paid']
    owners_df['$/unit'] = owners_df['Paid'] / owners_df['z']
    owners_df['$ Left'] = dollars_left
    owners_df['$ Left / Plyr'] = dollars_left / (tm_players - owners_df['Drafted'])
    owners_df['Value'] = owners_df['z'] * conv - owners_df['Paid']
    owners_df['BA'] = owners_df['H'] / owners_df['AB']

    # Points: the sum of each owner's ascending rank in every category, so a
    # larger stat total earns more points.
    categories = ['BA', 'HR', 'R', 'RBI', 'SB', 'W', 'Sv+Hld', 'SO']
    owners_df['Pts'] = sum(owners_df[cat].rank() for cat in categories)
    # NOTE(review): rank() is ascending, so the owner with the most points gets
    # the largest 'Rank' number -- confirm that is the intended orientation.
    owners_df['Rank'] = owners_df['Pts'].rank()

    return players.sort_values('z', ascending=False), owners_df

# Dollars per unit of z: owners() multiplies each owner's summed z by this
# value when computing the 'Value' column.
# NOTE(review): hard-coded -- presumably derived in code not shown here; confirm.
conv = 5.277402144368934
# df: the players table sorted by z (descending); owners_df: one summary row per Owner.
df, owners_df = owners(conv)

In [46]:
# Load the Statcast exit-velocity export, highest barrels-per-PA first.
sc = pd.read_csv('data/statcast-exit_velocity.csv')
sc = sc.sort_values('brl_pa', ascending=False)
# Left join keeps every Statcast row, including players missing from `df`.
sc = sc.merge(df[['playerid', 'Name', 'Primary_Pos', 'z', 'Value']], on='playerid', how='left')
# Fall back to the Statcast first/last name when the join found no Name.
# Assign the result back: a chained `sc.Name.fillna(..., inplace=True)` warns in
# recent pandas and has no effect on `sc` under Copy-on-Write.
sc['Name'] = sc['Name'].fillna(sc['first_name'] + ' ' + sc['last_name'])
# Drop pitchers. `.copy()` makes the result an independent frame so the column
# assignments below (and in later cells) do not hit SettingWithCopyWarning.
sc = sc[~sc['Primary_Pos'].isin(['SP', 'RP'])].copy()
# Percentile rank of barrels-per-PA among the remaining rows.
sc['brl_pa_rank'] = sc['brl_pa'].rank(pct=True)

In [11]:
from sklearn.cluster import OPTICS, DBSCAN

In [110]:
# Feature matrix for clustering: the three batted-ball columns, on their raw scales.
X = sc.loc[:, ['anglesweetspotpercent', 'ev95percent', 'brl_pa']]

In [121]:
# Fit two density-based clusterers on the same features and store each model's
# labels on `sc`.
# NOTE(review): X is not rescaled, so DBSCAN's eps=1 is measured in raw feature units.
optics = OPTICS(min_samples=5).fit(X)
sc['optics'] = optics.labels_

dbscan = DBSCAN(eps=1, min_samples=5).fit(X)
sc['dbscan'] = dbscan.labels_

# Number of distinct labels per model (includes the -1 noise label when present).
print(sc['optics'].nunique())
print(sc['dbscan'].nunique())

57
39


In [122]:
# Rows per DBSCAN label, largest group first (-1 is DBSCAN's noise label).
sc['dbscan'].value_counts()

-1     937
 10    171
 0     131
 11     40
 6      22
 4      18
 26     10
 2      10
 15      8
 27      8
 36      8
 35      8
 7       8
 33      8
 29      7
 1       7
 24      7
 21      7
 9       7
 32      7
 12      7
 3       6
 19      6
 31      6
 16      5
 14      5
 13      5
 17      5
 37      5
 20      5
 8       5
 22      5
 34      5
 5       5
 23      5
 25      5
 28      5
 18      4
 30      4
Name: dbscan, dtype: int64

In [128]:
# Mean 'Value' per DBSCAN label, lowest first.
(
    sc.groupby('dbscan')['Value']
    .mean()
    .sort_values()
)

dbscan
 19   -34.575109
 35   -33.476452
 25   -30.771144
 34   -26.356247
 22   -26.275456
 27   -25.572614
 32   -23.803004
 5    -18.850637
 29   -18.264357
 28   -17.657460
 37   -15.726575
 16   -12.817111
 33   -12.561029
 24   -11.939041
 36   -11.823249
 15   -11.722316
 10   -11.554837
 23   -10.199643
-1    -10.003129
 31    -9.637822
 11    -9.063883
 12    -6.863034
 30    -6.860451
 20    -6.713214
 18    -6.654697
 26    -6.361071
 7     -3.417392
 14    -2.898107
 0     -1.775202
 4     -1.514089
 21    -1.096644
 6     -0.446365
 13     0.763880
 1      1.083382
 17     1.786212
 3      2.002396
 2      8.827477
 9      9.385388
 8     17.983151
Name: Value, dtype: float64

In [147]:
# Scatter one DBSCAN cluster: barrels-per-PA vs. sweet-spot %, coloured by Value.
cluster_id = 2
mask = (sc['dbscan'] == cluster_id)
# Select the cluster once instead of re-filtering `sc` for every argument.
cluster_df = sc[mask]

# Hover text: name, year, value and player id on separate lines.
hover_text = (
    cluster_df['Name']
    + '<br>' + cluster_df['year'].astype(str)
    + '<br>' + cluster_df['Value'].astype(str)
    + '<br>' + cluster_df['playerid']
)

fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=cluster_df['brl_pa'],
        y=cluster_df['anglesweetspotpercent'],
        mode='markers',
        marker=dict(
            color=cluster_df['Value'],
            colorscale='bluered',
            # Show the colour scale so the marker colours can be read as Value.
            showscale=True,
            colorbar=dict(title=dict(text='Value')),
        ),
        text=hover_text,
    )
)
# Label the figure so it stands on its own when the notebook is skimmed.
fig.update_layout(
    title=f'DBSCAN cluster {cluster_id}',
    xaxis_title='brl_pa',
    yaxis_title='anglesweetspotpercent',
)
fig.show()

In [174]:
from sklearn.preprocessing import MinMaxScaler
from scipy.stats.stats import pearsonr
def scale_data(df, cols):
    """Min-max scale a subset of columns.

    INPUT:
        df: original dataframe
        cols: list of column names in `df` to scale
    OUTPUT:
        DataFrame of the scaled values, with the same index as `df` and one
        column per entry in `cols` (same names, same order)
    """
    cols = list(cols)
    scaler = MinMaxScaler().fit(df[cols])
    scaled_values = scaler.transform(df[cols])
    # Pass the names as a flat list: wrapping them in another list (as in
    # `columns=[names]`) builds a one-level MultiIndex instead of plain labels.
    return pd.DataFrame(scaled_values, index=df.index, columns=cols)

def add_distance_metrics(h, player_id, col_list, top_n=6):
    """Find the rows closest to one player in min-max-scaled feature space.

    INPUT:
        h: dataframe with 'playerid', 'Name', 'Value' and every column in `col_list`
        player_id: value of 'playerid' identifying the reference player
        col_list: list of feature columns used to measure distance
        top_n: number of rows to return (default 6; the reference player is
            itself included, at distance 0)
    OUTPUT:
        DataFrame indexed by playerid with 'Name', 'Value', the raw feature
        columns and 'eucl_dist', sorted by ascending 'eucl_dist' and truncated
        to `top_n` rows
    RAISES:
        KeyError: if `player_id` is not present in h['playerid']
    """
    indexed = h.set_index('playerid')
    # scale_data returns exactly the `col_list` columns, in order, so the
    # scaled frame can be used positionally below.
    scaled_df = scale_data(indexed, col_list)

    if player_id not in scaled_df.index:
        raise KeyError(f"playerid {player_id!r} not found in the supplied dataframe")

    reference = scaled_df.loc[player_id]
    # A duplicated playerid makes .loc return several rows; use the first one
    # so the reference is always a single feature vector.
    if isinstance(reference, pd.DataFrame):
        reference = reference.iloc[0]

    result = indexed.loc[:, ['Name', 'Value'] + col_list].copy()
    # Row-wise Euclidean distance to the reference player, computed in one
    # vectorised call rather than one .loc/.at round trip per row.
    result['eucl_dist'] = np.linalg.norm(
        scaled_df.to_numpy() - reference.to_numpy(), axis=1
    )
    return result.sort_values('eucl_dist').iloc[:top_n]

In [175]:
# Feature column names, reused as the distance features below.
cols = list(X.columns)

In [176]:
# Nearest neighbours of playerid '20003' among 2021 rows that have a playerid.
add_distance_metrics(
    sc.loc[sc['year'].eq(2021) & sc['playerid'].notna()],
    '20003',
    cols,
)

Unnamed: 0_level_0,Name,Value,anglesweetspotpercent,ev95percent,brl_pa,eucl_dist
playerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
20003,Keston Hiura,-20.4997,34.3,41.4,7.6,0.0
18607,Jared Walsh,11.264866,33.6,41.2,7.4,0.02498
12144,Max Kepler,4.859777,35.3,42.5,7.3,0.042449
11442,Gary Sanchez,-0.121855,34.4,42.0,8.2,0.047523
13510,Jose Ramirez,31.398757,33.4,42.3,8.2,0.055319
12533,Marcus Semien,22.290029,35.2,41.3,6.9,0.058972
