### Load libraries

In [None]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
import tqdm
import sys
import numpy as np
import pandas as pd

### Load helpers

In [None]:
# Put the directory that holds the local `clean_sports_work` package at the
# front of the import search path so the helper import below resolves.
# NOTE: this is a machine-specific absolute path -- change it to the matching
# directory on your own machine before running.
sys.path.insert(0, "/Users/harrisonchase/workplace/sports/")

from clean_sports_work.sports_reference.api import find_table, extract_table, create_insert_table_sql

### Get data for each year

In [None]:
# Download the "advanced" stats page for every season in range(1950, 2020)
# and collect one DataFrame per season in `all_dfs`.
all_dfs = []
for year in tqdm.tqdm(range(1950, 2020)):
    url = 'https://www.basketball-reference.com/leagues/NBA_{}_advanced.html'.format(year)

    # Parse inside a `with` block so the HTTP response is closed once the
    # page has been read, instead of leaking one open connection per season.
    with urlopen(url) as html:
        # create the BeautifulSoup object
        soup = BeautifulSoup(html, "lxml")

    table_str = find_table(soup, 'advanced_stats')

    season_df = extract_table(table_str, header_row=0, get_url=True, start_of_rows=1)

    # Drop columns whose values are null in every row of this season's table.
    drop_cols = season_df.isnull().mean()[lambda x: x == 1].index
    season_df = season_df.drop(columns=drop_cols)

    # Tag each row with the season it was scraped from.
    season_df['year'] = year
    all_dfs.append(season_df)

### Rough cleaning of data

In [None]:
# Stack the per-season frames and coerce the stat columns to numeric dtypes.
float_cols = ['bpm', 'ts_pct', 'per', 'usg_pct', 'obpm', 'dbpm',
              'fg3a_per_fga_pct', 'fta_per_fga_pct', 'orb_pct', 'drb_pct',
              'trb_pct', 'ast_pct', 'stl_pct', 'blk_pct', 'tov_pct', 'ws', 'ows', 'dws']
int_cols = ['mp', 'age']

all_stats = pd.concat(all_dfs)

# Discard rows that lack a player link, minutes played, or age.
all_stats = all_stats.dropna(subset=['player_url', 'mp', 'age'])

# Blank strings mean "missing"; turn them into NaN so the float cast works.
all_stats = all_stats.replace('', np.nan)

# The replace above can introduce fresh NaNs in the integer columns, and
# astype(int) raises on NaN, so drop those rows before casting.
all_stats = all_stats.dropna(subset=int_cols)

# Cast all numeric columns in one pass.
dtype_map = {col: float for col in float_cols}
dtype_map.update({col: int for col in int_cols})
all_stats = all_stats.astype(dtype_map)

In [None]:
# Keep only the text before the first '-' in `pos` (e.g. 'PF-C' -> 'PF').
all_stats['pos'] = all_stats['pos'].str.split('-').str.get(0)

### Create id for each (player, year)

In [None]:
# Row key of the form '<player_url>___<year>'.
all_stats['id'] = all_stats['player_url'] + ('___' + all_stats['year'].astype(str))

### Deduplicate ids

Duplicate ids occur when a player played for multiple teams in one season. In that case, keep only the row where `team_id == 'TOT'`.

In [None]:
# Ids that appear on more than one row.
id_counts = all_stats['id'].value_counts()
multiple_ids = id_counts[id_counts > 1].index

In [None]:
# Rows whose id is unique pass through untouched; for repeated ids only the
# row with team_id == 'TOT' is kept.
has_repeated_id = all_stats['id'].isin(multiple_ids)
is_tot_row = all_stats['team_id'] == 'TOT'

unique_rows = all_stats[~has_repeated_id]
tot_rows = all_stats[has_repeated_id & is_tot_row]
base_all_stats = pd.concat([unique_rows, tot_rows])

### Add in row for next year

Be careful not to use these rows in your model evaluation! They are copies of the most recent season with `year` and `age` incremented by one, and are present only to make predicting next year easier.

In [None]:
# Build placeholder rows for the season after the latest one in the data:
# copy the latest season's rows, then bump `year` and `age` and rebuild `id`.
# The explicit .copy() makes this an independent frame, so the in-place
# edits below do not trigger pandas' SettingWithCopyWarning.
most_recent_year = base_all_stats[base_all_stats['year'] == base_all_stats['year'].max()].copy()
most_recent_year['year'] += 1
most_recent_year['age'] += 1
# Recompute the key so it reflects the incremented year.
most_recent_year['id'] = most_recent_year['player_url'] + '___' + most_recent_year['year'].astype(str)


In [None]:
# Final table: the deduplicated rows followed by the next-year placeholder rows.
all_stats = pd.concat(objs=[base_all_stats, most_recent_year], axis=0)

### Save data

In [None]:
# Write the final table to 'all_stats.mp' (relative to the working directory).
# NOTE(review): DataFrame.to_msgpack was deprecated in pandas 0.25 and removed
# in pandas 1.0, so this cell needs an older pandas; switching formats would
# also require updating whatever reads this file -- confirm before changing.
all_stats.to_msgpack('all_stats.mp')