In [1]:
from bs4 import BeautifulSoup, Comment
import numpy as np
import pandas as pd
import sqlite3


def extract_table(table_str, header_row=1, start_of_rows=2, get_url=False):
    """Extract table from html."""
    columns = [t['data-stat'] for t in table_str.findAll('tr')[header_row].findAll('th')]
    rows = [
        [r.find('th').text] +
        [t.text for t in r.findAll('td')] for r in table_str.findAll('tr')[start_of_rows:]
    ]
    df = pd.DataFrame(rows, columns=columns)
    if get_url:
        extra_rows = [[t.find('a')['href'] if t.find('a') else np.nan for t in r.findAll('td')]
                      for r in table_str.findAll('tr')[start_of_rows:]]
        extra_df = pd.DataFrame(extra_rows, columns=[c+'_url' for c in columns[1:]])
        df = pd.concat([df, extra_df], axis=1)
    return df


def find_table(soup, table_name):
    """Find html for table, even if in a comment."""
    tables = soup.findAll('table', {"id": table_name})
    if tables:
        return tables[0]
    comments = soup.findAll(text=lambda text: isinstance(text, Comment))
    table_comment = next(c for c in comments if 'id="{}"'.format(table_name) in c)
    table_soup = BeautifulSoup(str(table_comment), "lxml")
    return table_soup.findAll('table', {"id": table_name})[0]


def create_insert_table_sql(table_name, col_mappings):
    """Create sql for creating and inserting a table."""
    col_types = ', '.join([k + ' ' + v for k, v in col_mappings.items()])
    create_sql = 'CREATE TABLE {} ({})'.format(table_name, col_types)
    placeholders = ', '.join(['?' for _ in col_mappings.items()])
    insert_sql = 'INSERT INTO {} VALUES ({})'.format(table_name, placeholders)
    return create_sql, insert_sql


def save_df_to_sqlite(sqlite_db, df, cols, table_name):
    """Helper to save df to sqlite."""
    with sqlite3.connect(sqlite_db) as conn:
        cur = conn.cursor()
        create_sql, insert_sql = create_insert_table_sql(table_name, cols)
        cur.execute(create_sql)
        cur.executemany(insert_sql, df[list(cols.keys())].values)


In [3]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
import tqdm
import sys
import numpy as np
import pandas as pd
#sys.path.insert(0, "/Users/harrisonchase/workplace/sports/")

#from clean_sports_work.sports_reference.api import find_table, extract_table, create_insert_table_sql

In [606]:
all_dfs = []
for year in tqdm.tqdm(range(1950, 2020)):
    url = 'https://www.basketball-reference.com/leagues/NBA_{}_advanced.html'.format(year)
    html = urlopen(url)

    # create the BeautifulSoup object
    soup = BeautifulSoup(html, "lxml")

    table_str = find_table(soup, 'advanced_stats')

    yr2018 = extract_table(table_str, header_row=0, get_url=True, start_of_rows=1)
    drop_cols = yr2018.isnull().mean()[lambda x: x == 1].index
    for col in drop_cols:
        del yr2018[col]
    yr2018['year'] = year
    all_dfs.append(yr2018)

100%|██████████| 70/70 [03:07<00:00,  3.68s/it]


In [607]:
all_stats = pd.concat(all_dfs)

In [619]:
all_stats = all_stats.dropna(subset=['player_url', 'mp', 'age'])

In [615]:
all_stats = all_stats.replace('', np.nan)

In [620]:
float_cols = ['bpm', 'ts_pct', 'per', 'usg_pct', 'obpm', 'dbpm', 
              'fg3a_per_fga_pct', 'fta_per_fga_pct', 'orb_pct', 'drb_pct',
             'trb_pct', 'ast_pct', 'stl_pct', 'blk_pct', 'tov_pct', 'ws', 'ows', 'dws']
for col in float_cols:
    all_stats[col] = all_stats[col].astype(float)
    
int_cols = ['mp', 'age']
for col in int_cols:
    all_stats[col] = all_stats[col].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [621]:
all_stats['id'] = all_stats['player_url'] + '___' + all_stats['year'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [622]:
multiple_ids = all_stats['id'].value_counts()[lambda x: x> 1].index

In [979]:
base_all_stats = pd.concat([
    all_stats[~all_stats['id'].isin(multiple_ids)],
    all_stats[all_stats['id'].isin(multiple_ids) & (all_stats['team_id'] == 'TOT')],
])

In [980]:
most_recent_year = base_all_stats[base_all_stats['year'] == base_all_stats['year'].max()]
most_recent_year['year'] += 1
most_recent_year['age'] +=1
most_recent_year['id'] = most_recent_year['player_url'] + '___' + most_recent_year['year'].astype(str)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [981]:
all_stats = pd.concat([base_all_stats, most_recent_year])

In [982]:
target = all_stats[['player_url', 'year', 'ws', 'id', 'bpm']]

In [983]:
def get_stats(all_stats, year_change):
    all_prior_stats = all_stats.copy()
    all_prior_stats['year'] += year_change
    all_prior_stats['id'] = all_prior_stats['player_url'] + '___' + all_prior_stats['year'].astype(str)
    cols = ['mp', 'bpm', 'ts_pct', 'per', 'usg_pct', 'obpm', 'dbpm',
            'fg3a_per_fga_pct', 'fta_per_fga_pct', 'orb_pct', 'drb_pct',
             'trb_pct', 'ast_pct', 'stl_pct', 'blk_pct', 'tov_pct', 'ws', 'ows', 'dws']
    return all_prior_stats[['id'] + cols].rename(columns={col: '{}___{}'.format(col, year_change) for col in cols})

In [984]:
def merge_dfs(target_df, *args):
    res = target_df
    for arg in args:
        cols = arg.columns
        intersection = set(res.columns).intersection(cols)
        if intersection != {'id'}:
            raise ValueError
        else:
            res = res.merge(arg, how='left', on='id')
    return res

In [985]:
all_stats['pos'] = all_stats['pos'].str.split('-').str[0]

In [986]:
X_df1 = pd.concat([all_stats[['id', 'age', 'mp']], pd.get_dummies(all_stats['pos'])], axis=1)

In [987]:
year_in_league = target.merge(target.groupby('player_url')['year'].min().to_frame('min_year').reset_index())
year_in_league['years_pro'] = year_in_league['year'] - year_in_league['min_year']
year_in_league = year_in_league[['id', 'years_pro']]

In [988]:
all_df = merge_dfs(
    target, 
    X_df1, 
    year_in_league,
    get_stats(all_stats, 1),
    get_stats(all_stats, 2),
    get_stats(all_stats, 3),
).dropna(subset=['bpm'])

In [2266]:
all_df['lot_of_min'] = (all_df['mp___1'].fillna(0) + all_df['mp___2'].fillna(0)) > 1000

In [2267]:
all_df['ws_diff'] = all_df['ws___1'] - all_df['ws___2']
all_df['bpm_diff'] = all_df['bpm___1'] - all_df['bpm___2']
all_df['mp_diff'] = all_df['mp___1'] - all_df['mp___2']
all_df['bpm_int'] = all_df['bpm___1'] * all_df['bpm___2']
all_df['bpm_diff_2'] = all_df['bpm___2'] - all_df['bpm___3']
all_df['bpm_age_int'] = all_df['bpm___1'] * all_df['age']

In [2268]:
all_df['diff'] = all_df['bpm'] - all_df['bpm___1']

In [2269]:
train_df = all_df[
    (all_df['year'] > 1970) 
    & (all_df['year'] < 2019) 
    & (all_df['years_pro'] > 0) 
    &(all_df['diff'].notnull())
]

In [2270]:
import lightgbm as lgb
lgb_params = {
    'boosting_type': 'gbdt',
    'metric': ['rmse'],
    #'num_leaves': 20,
    'learning_rate': 0.05,
    #'feature_fraction': 0.6,
    #'bagging_fraction': 0.6,
    #'bagging_freq': 1,
    #'bagging_freq': 5,
    #'colsample_bytree': .4,
    #'min_data_in_leaf': 2,
    #'reg_alpha': 1,
    #'reg_lambda': 1,
    #'max_depth': 4,
    'verbose': 0
}

In [2271]:
y = train_df['diff']
drop_cols =['id', 'ws', 'bpm', 'player_url', 'year', 'mp', 'diff']
X = train_df.drop(drop_cols, 1)
X_all = all_df.drop(drop_cols, 1)

In [2272]:
weights = np.log10(train_df['mp']+1) #* 0 + 1

In [2273]:
lgb_data = lgb.Dataset(X, y, weight=weights)
out = lgb.cv(lgb_params, lgb_data, num_boost_round=10000, nfold=5,
             early_stopping_rounds=10,stratified=False)
cv_loss = out['rmse-mean'][-1]
cv_num_rounds = len(out['rmse-mean'])

cv_loss, cv_num_rounds

(2.39418222760402, 152)

In [2274]:
bst = lgb.LGBMRegressor(n_estimators=cv_num_rounds, **lgb_params)
bst.fit(X, y, sample_weight=weights)

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.05, max_depth=-1,
              metric=['rmse'], min_child_samples=20, min_child_weight=0.001,
              min_split_gain=0.0, n_estimators=152, n_jobs=-1, num_leaves=31,
              objective=None, random_state=None, reg_alpha=0.0, reg_lambda=0.0,
              silent=True, subsample=1.0, subsample_for_bin=200000,
              subsample_freq=0, verbose=0)

In [2275]:
preds = bst.predict(X_all)

In [2276]:
pred_df = all_df.copy()
pred_df['pred'] = preds + pred_df['bpm___1']

In [2277]:
recent = pred_df[(pred_df['year'] > 2018) & (pred_df['year'] < 2020)].sort_values('pred')
recent = recent[(recent['years_pro'] > 0)]

In [2278]:
(((recent['bpm'] - recent['pred'])**2)**.5).mean()

2.1676610734861486

In [2279]:
recent.merge(recent1[['id', 'pred']], on='id')

Unnamed: 0,player_url,year,ws,id,bpm,age,mp,C,F,G,...,lot_of_min,ws_diff,bpm_diff,mp_diff,bpm_int,bpm_diff_2,bpm_age_int,diff,pred_x,pred_y
0,/players/b/bouchch01.html,2019,0.7,/players/b/bouchch01.html___2019,1.1,26,163,0,0,0,...,False,,,,,,-1534.0,60.1,-27.145053,-8.270845
1,/players/p/pattoju01.html,2019,0.0,/players/p/pattoju01.html___2019,-0.2,21,21,1,0,0,...,False,,,,,,69.3,-3.5,-8.730375,-3.391247
2,/players/d/doziepj01.html,2019,0.1,/players/d/doziepj01.html___2019,-1.1,22,51,0,0,0,...,False,,,,,,-748.0,32.9,-7.733395,-7.873882
3,/players/l/lemonwa01.html,2019,0.1,/players/l/lemonwa01.html___2019,-1.4,26,167,0,0,0,...,False,,,,,,-304.2,10.3,-6.583206,-6.481689
4,/players/h/hamilda02.html,2019,-0.1,/players/h/hamilda02.html___2019,-5.8,23,204,0,0,0,...,False,,,,,,-36.8,-4.2,-6.365367,-3.047119
5,/players/r/richama01.html,2019,-0.1,/players/r/richama01.html___2019,-10.8,23,103,0,0,0,...,False,-0.2,-0.9,126.0,35.20,,-147.2,-4.4,-5.738711,-4.580231
6,/players/l/lydonty01.html,2019,0.2,/players/l/lydonty01.html___2019,-2.5,22,94,0,0,0,...,False,,,,,,-143.0,4.0,-5.211629,-5.086721
7,/players/d/dellama01.html,2019,0.5,/players/d/dellama01.html___2019,-5.5,28,812,0,0,0,...,True,-0.7,-1.2,-1274.0,27.73,-3.1,-165.2,0.4,-5.200312,-4.584083
8,/players/r/rosede01.html,2019,3.0,/players/r/rosede01.html___2019,-0.6,30,1392,0,0,0,...,True,-3.2,-5.9,-1662.0,14.82,1.4,-234.0,7.2,-5.016934,-5.683634
9,/players/m/morrimo01.html,2019,6.2,/players/m/morrimo01.html___2019,0.2,23,1970,0,0,0,...,False,,,,,,46.0,-1.8,-4.937921,-1.032753


In [1820]:
def scorer(row, metric):
    row = row.fillna(0)
    summed = row['{}___1'.format(metric)] * row['mp___1']*3 + row['{}___2'.format(metric)] * row['mp___2']*2 + row['{}___3'.format(metric)] * row['mp___3']
    d =(row['mp___1']*3 + row['mp___2']*2 + row['mp___3'])
    if d > 0:
        return summed / d
    return -2

In [1286]:
def scorer_bpm(row):
    return scorer(row, 'bpm')
def scorer_ws(row):
    return scorer(row, 'ws')
def max_bpm(row):
    return row[['bpm___1', 'bpm___2', 'bpm___3']].max()

In [1289]:
recent['delta'] = recent['pred'] - recent.apply(scorer_ws, axis=1)
recent['delta'] = recent['pred'] - recent.apply(scorer_bpm, axis=1)
#recent['delta'] = recent['pred'] - recent.apply(max_bpm, axis=1)* 1

In [1290]:
recent.sort_values('delta', ascending=False)[lambda x: x['pred'] > .5]

Unnamed: 0,player_url,year,ws,id,bpm,age,mp,C,F,G,...,lot_of_min,ws_diff,bpm_diff,mp_diff,bpm_int,bpm_diff_2,bpm_age_int,diff,pred,delta
21164,/players/f/foxde01.html,2020,5.6,/players/f/foxde01.html___2020,1.1,22,2546,0,0,0,...,True,6.2,5.5,520.0,-4.84,,24.2,0.0,1.221688,2.028103
21384,/players/s/sabondo01.html,2020,7.6,/players/s/sabondo01.html___2020,3.7,23,1838,1,0,0,...,True,2.9,4.0,28.0,-1.11,4.5,85.1,0.0,2.567970,1.501446
21035,/players/a/antetgi01.html,2020,14.4,/players/a/antetgi01.html___2020,10.8,25,2358,0,0,0,...,True,2.5,5.0,-398.0,62.64,-1.8,270.0,0.0,9.740313,1.316309
21337,/players/n/nurkiju01.html,2020,7.8,/players/n/nurkiju01.html___2020,5.1,25,1974,1,0,0,...,True,3.1,4.8,-114.0,1.53,0.5,127.5,0.0,4.029698,1.316076
21383,/players/r/russeda01.html,2020,5.0,/players/r/russeda01.html___2020,3.4,23,2448,0,0,0,...,True,4.6,3.8,1214.0,-1.36,0.1,78.2,0.0,3.086084,1.100633
21075,/players/b/bookede01.html,2020,3.5,/players/b/bookede01.html___2020,0.8,23,2242,0,0,0,...,True,1.1,0.4,377.0,0.32,2.7,18.4,0.0,0.914783,0.869750
21029,/players/a/allenja01.html,2020,7.6,/players/a/allenja01.html___2020,2.5,21,2096,1,0,0,...,True,3.4,2.3,655.0,0.50,,52.5,0.0,2.628878,0.851735
21127,/players/d/davisan02.html,2020,9.5,/players/d/davisan02.html___2020,8.5,26,1850,1,0,0,...,True,-4.2,3.3,-877.0,44.20,1.5,221.0,0.0,7.033087,0.793633
21141,/players/d/doncilu01.html,2020,4.9,/players/d/doncilu01.html___2020,4.1,20,2318,0,0,0,...,True,,,,,,82.0,0.0,4.873589,0.773589
21023,/players/a/adebaba01.html,2020,6.8,/players/a/adebaba01.html___2020,3.0,22,1913,1,0,0,...,True,2.6,2.8,545.0,0.60,,66.0,0.0,2.829432,0.733361


In [1291]:
recent.sort_values('pred', ascending=False)

Unnamed: 0,player_url,year,ws,id,bpm,age,mp,C,F,G,...,lot_of_min,ws_diff,bpm_diff,mp_diff,bpm_int,bpm_diff_2,bpm_age_int,diff,pred,delta
21194,/players/h/hardeja01.html,2020,15.2,/players/h/hardeja01.html___2020,11.7,30,2867,0,0,0,...,True,-0.2,0.8,316.0,127.53,0.8,351.0,0.0,11.084683,-0.086981
21035,/players/a/antetgi01.html,2020,14.4,/players/a/antetgi01.html___2020,10.8,25,2358,0,0,0,...,True,2.5,5.0,-398.0,62.64,-1.8,270.0,0.0,9.740313,1.316309
21246,/players/j/jokicni01.html,2020,11.8,/players/j/jokicni01.html___2020,9.5,24,2504,1,0,0,...,True,1.1,2.4,61.0,67.45,-1.3,228.0,0.0,8.848674,0.316269
21240,/players/j/jamesle01.html,2020,7.2,/players/j/jamesle01.html___2020,8.1,35,1937,0,0,0,...,True,-6.8,-1.5,-1089.0,77.76,1.2,283.5,0.0,7.489493,-1.287057
21127,/players/d/davisan02.html,2020,9.5,/players/d/davisan02.html___2020,8.5,26,1850,1,0,0,...,True,-4.2,3.3,-877.0,44.20,1.5,221.0,0.0,7.033087,0.793633
21442,/players/w/westbru01.html,2020,6.8,/players/w/westbru01.html___2020,6.5,31,2630,0,0,0,...,True,-3.3,-1.7,-284.0,53.30,-7.4,201.5,0.0,6.234210,-2.408999
21417,/players/t/townska01.html,2020,10.4,/players/t/townska01.html___2020,6.8,24,2545,1,0,0,...,True,-3.6,1.3,-373.0,37.40,0.6,163.2,0.0,5.731027,-0.260307
21232,/players/i/irvinky01.html,2020,9.1,/players/i/irvinky01.html___2020,6.4,27,2214,0,0,0,...,True,0.2,0.2,283.0,39.68,3.7,172.8,0.0,5.669528,0.084625
21125,/players/c/curryst01.html,2020,9.7,/players/c/curryst01.html___2020,6.3,31,2331,0,0,0,...,True,0.6,-2.3,700.0,54.18,1.3,195.3,0.0,5.515595,-1.570925
21177,/players/g/goberru01.html,2020,14.4,/players/g/goberru01.html___2020,7.0,27,2577,1,0,0,...,True,6.3,2.7,761.0,30.10,-1.5,189.0,0.0,5.306386,-0.765053


In [None]:
studs = all_df[(all_df['years_pro'] == 1) & (all_df['lot_of_min']) & (all_df['bpm___1'] > 2.0)][lambda x: x['year']!=2020]

studs['diff'] = studs['bpm'] - studs['bpm___1']

studs.sort_values('year')[['player_url', 'diff']]

In [2207]:
all_df['lot_of_min'] = (
    all_df['bpm___1'].notnull()
    #& ((all_df['mp___1'].fillna(0) + all_df['mp___2'].fillna(0)) > 1000)
    & (all_df['mp___1'].fillna(0) > 1000)
    &(all_df['years_pro'] < 3)
    
)

In [2208]:
train_df = all_df[
    (all_df['year'] > 1970) 
    & (all_df['year'] < 2000) 
    & (all_df['years_pro'] > 0) 
]

In [2209]:
high_train = train_df[train_df['lot_of_min']]
low_train = train_df[~train_df['lot_of_min']]
high_all = all_df[all_df['lot_of_min']]
low_all = all_df[~all_df['lot_of_min']]

In [2210]:
y_high = high_train['diff']
X_high = high_train.drop(drop_cols, 1)
X_all_high = high_all.drop(drop_cols, 1)

In [2211]:
y_low = low_train['bpm']
X_low = low_train.drop(drop_cols, 1)
X_all_low = low_all.drop(drop_cols, 1)

In [2212]:
X_high.shape

(1181, 73)

In [2213]:
weights = np.log10(high_train['mp']+1) 
lgb_data = lgb.Dataset(X_high, y_high, weight=weights)
out = lgb.cv(lgb_params, lgb_data, num_boost_round=10000, nfold=5,
             early_stopping_rounds=10,stratified=False)
cv_loss = out['rmse-mean'][-1]
cv_num_rounds = len(out['rmse-mean'])

cv_loss, cv_num_rounds

(2.0048456528381964, 23)

In [2214]:
bst_high = lgb.LGBMRegressor(n_estimators=cv_num_rounds, **lgb_params).fit(X_high, y_high, sample_weight=weights)

In [2215]:
weights = np.log10(low_train['mp']+1) 
lgb_data = lgb.Dataset(X_low, y_low)
out = lgb.cv(lgb_params, lgb_data, num_boost_round=10000, nfold=5,
             early_stopping_rounds=10,stratified=False)
cv_loss = out['rmse-mean'][-1]
cv_num_rounds = len(out['rmse-mean'])

cv_loss, cv_num_rounds

(3.330401118917867, 63)

In [2216]:
bst_low = lgb.LGBMRegressor(n_estimators=cv_num_rounds, **lgb_params).fit(X_low, y_low, sample_weight=weights)

In [2217]:
preds_high = bst_high.predict(X_all_high)
high_all['pred'] = preds_high + high_all['bpm___1']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [2218]:
preds_low = bst_low.predict(X_all_low)
low_all['pred'] = preds_low 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [2219]:
recent.shape

(7415, 82)

In [2220]:
all_joined = pd.concat([high_all, low_all])

In [2221]:
recent = all_joined[(all_joined['year'] > 2000) & (all_joined['year'] < 2020)].sort_values('pred')
recent = recent[(recent['years_pro'] > 0)]

In [2222]:
recent.shape

(7415, 81)

In [2223]:
recent['error'] = (recent['bpm'] - recent['pred'])**2

In [2224]:
((recent['error'])**.5).mean()

1.9659955980914825

In [2225]:
#recent1 = recent.copy()

In [1882]:
huh = recent[['id', 'bpm', 'bpm___1', 'mp___1','error']].merge(recent1[['id', 'error']], on='id')

In [1883]:
huh[huh['error_x']!=huh['error_y']]

Unnamed: 0,id,bpm,bpm___1,mp___1,error_x,error_y
0,/players/b/bouchch01.html___2019,1.1,-59.0,1.0,90.533179,49.131705
1,/players/d/doziepj01.html___2019,-1.1,-34.0,3.0,48.980944,25.296974
2,/players/l/lemonwa01.html___2019,-1.4,-11.7,35.0,33.726147,13.947940
3,/players/h/houseda01.html___2018,-2.7,-20.8,1.0,16.832049,9.060543
4,/players/m/milleda01.html___2018,-1.0,,,31.905745,7.302324
5,/players/d/drewla02.html___2018,-10.7,,,19.772226,48.968067
6,/players/b/brownbo02.html___2018,-9.3,-7.6,123.0,9.383190,19.700612
7,/players/n/niangge01.html___2018,-4.0,-12.0,93.0,4.645335,0.856790
8,/players/m/montelu01.html___2018,-25.9,,,415.673773,454.536506
9,/players/w/weartr01.html___2018,-5.1,,,0.136955,1.953618


In [1591]:
recent[recent['id'] == '/players/c/curryse01.html___2019']

Unnamed: 0,player_url,year,ws,id,bpm,age,mp,C,F,G,...,lot_of_min,ws_diff,bpm_diff,mp_diff,bpm_int,bpm_diff_2,bpm_age_int,diff,pred,error
18503,/players/c/curryse01.html,2019,2.4,/players/c/curryse01.html___2019,-1.3,28,1399,0,0,0,...,True,,,,,2.9,,,,
