### Load libraries

In [5]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
import tqdm
import sys
import numpy as np
import pandas as pd

### Load helpers

In [6]:
# This may need to change
sys.path.insert(0, "/Users/harrisonchase/workplace/sports/")

from clean_sports_work.sports_reference.api import find_table, extract_table, create_insert_table_sql

### Get data for each year

In [11]:
all_dfs = []
for year in tqdm.tqdm(range(1950, 2020)):
    url = 'https://www.basketball-reference.com/leagues/NBA_{}_advanced.html'.format(year)
    html = urlopen(url)

    # create the BeautifulSoup object
    soup = BeautifulSoup(html, "lxml")

    table_str = find_table(soup, 'advanced_stats')

    yr2018 = extract_table(table_str, header_row=0, get_url=True, start_of_rows=1)
    drop_cols = yr2018.isnull().mean()[lambda x: x == 1].index
    for col in drop_cols:
        del yr2018[col]
    yr2018['year'] = year
    all_dfs.append(yr2018)

100%|██████████| 70/70 [03:47<00:00,  5.48s/it]


In [12]:
all_draft_dfs = []
for year in tqdm.tqdm(range(1950, 2020)):
    url = 'https://www.basketball-reference.com/draft/NBA_{}.html'.format(year)
    html = urlopen(url)

    # create the BeautifulSoup object
    soup = BeautifulSoup(html, "lxml")

    table_str = find_table(soup, 'stats')

    yr2018 = extract_table(table_str, header_row=1, get_url=True, start_of_rows=2)
    drop_cols = yr2018.isnull().mean()[lambda x: x == 1].index
    for col in drop_cols:
        del yr2018[col]
    yr2018['year'] = year
    all_draft_dfs.append(yr2018)

100%|██████████| 70/70 [02:05<00:00,  1.34s/it]


### Rough cleaning of data

In [49]:
all_stats = pd.concat(all_dfs)
all_stats = all_stats.replace('', np.nan)
all_stats = all_stats.dropna(subset=['player_url', 'mp', 'age'])
float_cols = ['bpm', 'ts_pct', 'per', 'usg_pct', 'obpm', 'dbpm', 
              'fg3a_per_fga_pct', 'fta_per_fga_pct', 'orb_pct', 'drb_pct',
             'trb_pct', 'ast_pct', 'stl_pct', 'blk_pct', 'tov_pct', 'ws', 'ows', 'dws']
for col in float_cols:
    all_stats[col] = all_stats[col].astype(float)
    
int_cols = ['mp', 'age']
for col in int_cols:
    all_stats[col] = all_stats[col].astype(int)

In [50]:
all_stats['pos'] = all_stats['pos'].str.split('-').str[0]

### Draft stats

In [51]:
all_draft = pd.concat(all_draft_dfs).dropna(subset=['player_url'])
all_draft = all_draft.replace('', np.nan)

In [52]:
all_draft['pick_overall'] = all_draft['pick_overall'].astype(float)

In [53]:
all_stats = all_stats.merge(all_draft[['player_url', 'pick_overall', 'college_name']], how='left', on='player_url')

### Create id for each (player, year)

In [54]:
all_stats['id'] = all_stats['player_url'] + '___' + all_stats['year'].astype(str)

### Deduplicate ids

Occurs if player played on multiple teams in one season, take row where team == 'TOT'

In [55]:
multiple_ids = all_stats['id'].value_counts()[lambda x: x> 1].index

In [67]:
changed = all_stats[all_stats['id'].isin(multiple_ids) & (all_stats['team_id'] == 'TOT')]
first_team = all_stats.iloc[changed.index + 1]
if not all(first_team['player_url'].values == changed['player_url'].values):
    raise ValueError
changed['started_team'] = first_team['team_id'].values
no_change = all_stats[~all_stats['id'].isin(multiple_ids)]
no_change['started_team'] = no_change['team_id']
base_all_stats = pd.concat([
    no_change,
    changed,
])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


### Add in row for next year

Be careful not to use this in your model evaluation!!! Is only present for ease of predicting next year

In [76]:
most_recent_year = base_all_stats[base_all_stats['year'] == base_all_stats['year'].max()]
most_recent_year['year'] += 1
most_recent_year['age'] +=1
most_recent_year['id'] = most_recent_year['player_url'] + '___' + most_recent_year['year'].astype(str)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [112]:
def get_team(url):
    html = urlopen(url)

    # create the BeautifulSoup object
    soup = BeautifulSoup(html, "lxml")

    team_p = [x for x in soup.findAll('div', {'id': 'meta'})[0].findAll('p') if x.find('strong') and (x.find('strong').text == 'Team')]

    if len(team_p) == 0:
        return np.nan
    elif len(team_p) == 1:
        return team_p[0].find('a')['href'][7:10]
    else:
        raise ValueError

In [116]:
teams_now = []
for url_suffix in tqdm.tqdm(most_recent_year['player_url']):
    url = 'https://www.basketball-reference.com' + url_suffix
    teams_now.append(get_team(url))


  0%|          | 0/530 [00:00<?, ?it/s][A
  0%|          | 1/530 [00:03<27:02,  3.07s/it][A
  0%|          | 2/530 [00:06<26:57,  3.06s/it][A
  1%|          | 3/530 [00:07<21:14,  2.42s/it][A
  1%|          | 4/530 [00:08<19:09,  2.19s/it][A
  1%|          | 5/530 [00:11<20:03,  2.29s/it][A
  1%|          | 6/530 [00:13<20:22,  2.33s/it][A
  1%|▏         | 7/530 [00:15<20:15,  2.32s/it][A
  2%|▏         | 8/530 [00:18<20:18,  2.33s/it][A
  2%|▏         | 9/530 [00:19<16:22,  1.89s/it][A
  2%|▏         | 10/530 [00:20<13:59,  1.61s/it][A
  2%|▏         | 11/530 [00:21<12:30,  1.45s/it][A
  2%|▏         | 12/530 [00:22<11:22,  1.32s/it][A
  2%|▏         | 13/530 [00:23<11:48,  1.37s/it][A
  3%|▎         | 14/530 [00:25<12:12,  1.42s/it][A
  3%|▎         | 15/530 [00:26<11:49,  1.38s/it][A
  3%|▎         | 16/530 [00:27<10:28,  1.22s/it][A
  3%|▎         | 17/530 [00:28<10:46,  1.26s/it][A
  3%|▎         | 18/530 [00:29<09:38,  1.13s/it][A
  4%|▎         | 19/530 [00:3

 29%|██▉       | 156/530 [03:23<09:43,  1.56s/it][A
 30%|██▉       | 157/530 [03:25<08:51,  1.42s/it][A
 30%|██▉       | 158/530 [03:26<07:58,  1.29s/it][A
 30%|███       | 159/530 [03:27<07:53,  1.28s/it][A
 30%|███       | 160/530 [03:28<07:01,  1.14s/it][A
 30%|███       | 161/530 [03:29<07:07,  1.16s/it][A
 31%|███       | 162/530 [03:30<07:58,  1.30s/it][A
 31%|███       | 163/530 [03:32<08:34,  1.40s/it][A
 31%|███       | 164/530 [03:33<07:23,  1.21s/it][A
 31%|███       | 165/530 [03:34<07:24,  1.22s/it][A
 31%|███▏      | 166/530 [03:35<07:35,  1.25s/it][A
 32%|███▏      | 167/530 [03:37<07:28,  1.24s/it][A
 32%|███▏      | 168/530 [03:37<06:38,  1.10s/it][A
 32%|███▏      | 169/530 [03:39<07:22,  1.23s/it][A
 32%|███▏      | 170/530 [03:40<07:34,  1.26s/it][A
 32%|███▏      | 171/530 [03:42<08:09,  1.36s/it][A
 32%|███▏      | 172/530 [03:44<09:32,  1.60s/it][A
 33%|███▎      | 173/530 [03:46<09:42,  1.63s/it][A
 33%|███▎      | 174/530 [03:47<08:17,  1.40s/

 58%|█████▊    | 310/530 [06:47<04:42,  1.28s/it][A
 59%|█████▊    | 311/530 [06:49<04:56,  1.35s/it][A
 59%|█████▉    | 312/530 [06:51<05:26,  1.50s/it][A
 59%|█████▉    | 313/530 [06:52<05:36,  1.55s/it][A
 59%|█████▉    | 314/530 [06:54<05:48,  1.61s/it][A
 59%|█████▉    | 315/530 [06:57<07:00,  1.96s/it][A
 60%|█████▉    | 316/530 [06:59<06:48,  1.91s/it][A
 60%|█████▉    | 317/530 [07:02<07:45,  2.19s/it][A
 60%|██████    | 318/530 [07:03<06:23,  1.81s/it][A
 60%|██████    | 319/530 [07:04<06:13,  1.77s/it][A
 60%|██████    | 320/530 [07:05<05:31,  1.58s/it][A
 61%|██████    | 321/530 [07:07<05:06,  1.47s/it][A
 61%|██████    | 322/530 [07:08<05:07,  1.48s/it][A
 61%|██████    | 323/530 [07:09<04:40,  1.35s/it][A
 61%|██████    | 324/530 [07:10<04:25,  1.29s/it][A
 61%|██████▏   | 325/530 [07:11<03:56,  1.16s/it][A
 62%|██████▏   | 326/530 [07:12<03:37,  1.07s/it][A
 62%|██████▏   | 327/530 [07:14<04:11,  1.24s/it][A
 62%|██████▏   | 328/530 [07:15<04:25,  1.32s/

 88%|████████▊ | 464/530 [10:26<02:22,  2.15s/it][A
 88%|████████▊ | 465/530 [10:27<02:03,  1.90s/it][A
 88%|████████▊ | 466/530 [10:29<02:07,  1.99s/it][A
 88%|████████▊ | 467/530 [10:31<01:57,  1.87s/it][A
 88%|████████▊ | 468/530 [10:32<01:44,  1.69s/it][A
 88%|████████▊ | 469/530 [10:34<01:43,  1.69s/it][A
 89%|████████▊ | 470/530 [10:35<01:41,  1.69s/it][A
 89%|████████▉ | 471/530 [10:38<01:50,  1.87s/it][A
 89%|████████▉ | 472/530 [10:40<02:00,  2.08s/it][A
 89%|████████▉ | 473/530 [10:42<01:45,  1.85s/it][A
 89%|████████▉ | 474/530 [10:43<01:33,  1.67s/it][A
 90%|████████▉ | 475/530 [10:45<01:31,  1.65s/it][A
 90%|████████▉ | 476/530 [10:46<01:21,  1.50s/it][A
 90%|█████████ | 477/530 [10:48<01:26,  1.63s/it][A
 90%|█████████ | 478/530 [10:49<01:21,  1.56s/it][A
 90%|█████████ | 479/530 [10:50<01:17,  1.52s/it][A
 91%|█████████ | 480/530 [10:51<01:07,  1.35s/it][A
 91%|█████████ | 481/530 [10:53<01:07,  1.39s/it][A
 91%|█████████ | 482/530 [10:54<00:59,  1.24s/

In [119]:
most_recent_year['team_id'] = teams_now
most_recent_year['started_team'] = teams_now

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [122]:
most_recent_year = most_recent_year.dropna(subset=['started_team'])

In [123]:
for col in most_recent_year.columns:
    if col not in ['player', 'pos', 'age', 'team_id', 'id', 'player_url', 'pick_overall', 'college_name', 'started_team', 'year']:
        most_recent_year[col] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [125]:
all_stats = pd.concat([base_all_stats, most_recent_year])

### Save data

In [126]:
all_stats.to_msgpack('all_stats.mp')