### Load libraries

In [1]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
import tqdm
import sys
import numpy as np
import pandas as pd
import re

### Load helpers

In [2]:
# This may need to change
sys.path.insert(0, "/Users/harrisonchase/workplace/sports/")

from clean_sports_work.sports_reference.api import find_table, extract_table, create_insert_table_sql

### Get data for each year

In [3]:
all_dfs = []
for year in tqdm.tqdm(range(1950, 2020)):
    url = 'https://www.basketball-reference.com/leagues/NBA_{}_advanced.html'.format(year)
    html = urlopen(url)

    # create the BeautifulSoup object
    soup = BeautifulSoup(html, "lxml")

    table_str = find_table(soup, 'advanced_stats')

    yr2018 = extract_table(table_str, header_row=0, get_url=True, start_of_rows=1)
    drop_cols = yr2018.isnull().mean()[lambda x: x == 1].index
    for col in drop_cols:
        del yr2018[col]
    yr2018['year'] = year
    all_dfs.append(yr2018)

100%|██████████| 70/70 [04:34<00:00,  5.01s/it]


In [4]:
all_draft_dfs = []
for year in tqdm.tqdm(range(1950, 2020)):
    url = 'https://www.basketball-reference.com/draft/NBA_{}.html'.format(year)
    html = urlopen(url)

    # create the BeautifulSoup object
    soup = BeautifulSoup(html, "lxml")

    table_str = find_table(soup, 'stats')

    yr2018 = extract_table(table_str, header_row=1, get_url=True, start_of_rows=2)
    drop_cols = yr2018.isnull().mean()[lambda x: x == 1].index
    for col in drop_cols:
        del yr2018[col]
    yr2018['year'] = year
    all_draft_dfs.append(yr2018)

100%|██████████| 70/70 [02:08<00:00,  1.47s/it]


In [5]:
mvp_dfs = []
mip_dfs = []
roy_dfs = []
for year in tqdm.tqdm(range(1960, 2020)):
    url = 'https://www.basketball-reference.com/awards/awards_{}.html'.format(year)
    html = urlopen(url)

    # create the BeautifulSoup object
    soup = BeautifulSoup(html, "lxml")
    try:
        table_str = find_table(soup, 'mvp')

        yr2018 = extract_table(table_str, header_row=1, get_url=True, start_of_rows=2)
        drop_cols = yr2018.isnull().mean()[lambda x: x == 1].index
        for col in drop_cols:
            del yr2018[col]
        yr2018['year'] = year
        mvp_dfs.append(yr2018)
    except:
        pass
    try:
        table_str = find_table(soup, 'mip')

        yr2018 = extract_table(table_str, header_row=1, get_url=True, start_of_rows=2)
        drop_cols = yr2018.isnull().mean()[lambda x: x == 1].index
        for col in drop_cols:
            del yr2018[col]
        yr2018['year'] = year
        mip_dfs.append(yr2018)
    except:
        pass
    try:
        table_str = find_table(soup, 'roy')

        yr2018 = extract_table(table_str, header_row=1, get_url=True, start_of_rows=2)
        drop_cols = yr2018.isnull().mean()[lambda x: x == 1].index
        for col in drop_cols:
            del yr2018[col]
        yr2018['year'] = year
        roy_dfs.append(yr2018)
    except: 
        pass

100%|██████████| 60/60 [01:37<00:00,  1.66s/it]


In [6]:
playoff_dfs = []
for year in tqdm.tqdm(range(1950, 2020)):
    url = 'https://www.basketball-reference.com/playoffs/NBA_{}_advanced.html'.format(year)
    html = urlopen(url)

    # create the BeautifulSoup object
    soup = BeautifulSoup(html, "lxml")

    table_str = find_table(soup, 'advanced_stats')

    yr2018 = extract_table(table_str, header_row=0, get_url=True, start_of_rows=1)
    drop_cols = yr2018.isnull().mean()[lambda x: x == 1].index
    for col in drop_cols:
        del yr2018[col]
    yr2018['year'] = year
    playoff_dfs.append(yr2018)

100%|██████████| 70/70 [02:46<00:00,  2.19s/it]


### Rough cleaning of data

In [7]:
all_stats = pd.concat(all_dfs)
all_stats = all_stats.replace('', np.nan)
all_stats = all_stats.dropna(subset=['player_url', 'mp', 'age'])
float_cols = ['bpm', 'ts_pct', 'per', 'usg_pct', 'obpm', 'dbpm', 
              'fg3a_per_fga_pct', 'fta_per_fga_pct', 'orb_pct', 'drb_pct',
             'trb_pct', 'ast_pct', 'stl_pct', 'blk_pct', 'tov_pct', 'ws', 'ows', 'dws']
for col in float_cols:
    all_stats[col] = all_stats[col].astype(float)
    
int_cols = ['mp', 'age']
for col in int_cols:
    all_stats[col] = all_stats[col].astype(int)

In [8]:
playoff_stats = pd.concat(playoff_dfs)
playoff_stats = playoff_stats.replace('', np.nan)
playoff_stats = playoff_stats.dropna(subset=['player_url', 'mp', 'age'])
float_cols = ['bpm', 'ts_pct', 'per', 'usg_pct', 'obpm', 'dbpm', 
              'fg3a_per_fga_pct', 'fta_per_fga_pct', 'orb_pct', 'drb_pct',
             'trb_pct', 'ast_pct', 'stl_pct', 'blk_pct', 'tov_pct', 'ws', 'ows', 'dws']
for col in float_cols:
    playoff_stats[col] = playoff_stats[col].astype(float)
    
int_cols = ['mp', 'age']
for col in int_cols:
    playoff_stats[col] = playoff_stats[col].astype(int)

In [9]:
all_stats['pos'] = all_stats['pos'].str.split('-').str[0]

In [10]:
mvp_df = pd.concat(mvp_dfs)
mip_df = pd.concat(mip_dfs)
roy_df = pd.concat(roy_dfs)

In [11]:
mvp_df['award_share'] = mvp_df['award_share'].astype(float)
mip_df['award_share'] = mip_df['award_share'].astype(float)
roy_df['award_share'] = roy_df['award_share'].astype(float)

### Draft stats

In [12]:
all_draft = pd.concat(all_draft_dfs).dropna(subset=['player_url'])
all_draft = all_draft.replace('', np.nan)
all_draft = all_draft.drop_duplicates(subset=['player_url'], keep='last')

In [13]:
all_draft['pick_overall'] = all_draft['pick_overall'].astype(float)

In [14]:
all_stats = all_stats.merge(all_draft[['player_url', 'pick_overall', 'college_name']], how='left', on='player_url')

### Create id for each (player, year)

In [15]:
all_stats['id'] = all_stats['player_url'] + '___' + all_stats['year'].astype(str)
playoff_stats['id'] = playoff_stats['player_url'] + '___' + playoff_stats['year'].astype(str)
mvp_df['id'] = mvp_df['player_url'] + '___' + mvp_df['year'].astype(str)
mip_df['id'] = mip_df['player_url'] + '___' + mip_df['year'].astype(str)
roy_df['id'] = roy_df['player_url'] + '___' + roy_df['year'].astype(str)

### Playoff stats

In [16]:
all_stats = all_stats.merge(playoff_stats[['id', 'mp', 'bpm']].rename(columns={'mp':'playoff_mp', 'bpm': 'playoff_bpm'}), how='left', on='id')

### Deduplicate ids

Occurs if player played on multiple teams in one season, take row where team == 'TOT'

In [17]:
multiple_ids = all_stats['id'].value_counts()[lambda x: x> 1].index

In [18]:
changed = all_stats[all_stats['id'].isin(multiple_ids) & (all_stats['team_id'] == 'TOT')]
first_team = all_stats.iloc[changed.index + 1]
if not all(first_team['player_url'].values == changed['player_url'].values):
    raise ValueError
changed['started_team'] = first_team['team_id'].values
no_change = all_stats[~all_stats['id'].isin(multiple_ids)]
no_change['started_team'] = no_change['team_id']
base_all_stats = pd.concat([
    no_change,
    changed,
])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


### Rename teams

In [19]:
team_ids = base_all_stats[base_all_stats['mp'].notnull()]['team_id'].unique()

In [20]:
team_renamer = dict()
for team_id in set(team_ids).difference({'TOT'}):
    url = 'https://www.basketball-reference.com/teams/{}/'.format(team_id)
    html = urlopen(url)

    # create the BeautifulSoup object
    soup = BeautifulSoup(html, "lxml")
    s = soup.find('script').text
    if (s is not None) and ('teams/' in s):
        regexp = re.compile("teams/(.*)/")
        s1 = regexp.search(s).group(1)
        team_renamer[team_id] = s1

In [21]:
base_all_stats['team_id'] = base_all_stats['team_id'].apply(team_renamer.get).fillna(base_all_stats['team_id'])
base_all_stats['started_team'] = base_all_stats['started_team'].apply(team_renamer.get).fillna(base_all_stats['started_team'])

base_all_stats['started_team_url'] = '/teams/' + base_all_stats['started_team'] + '/' + base_all_stats['year'].astype(str) +'.html'

### Add team data

In [22]:
team_ids = base_all_stats[base_all_stats['mp'].notnull()]['team_id'].unique()

In [23]:
team_dfs = []
for team_id in set(team_ids).difference({'TOT'}):
    url = 'https://www.basketball-reference.com/teams/{}/'.format(team_id)
    html = urlopen(url)

    # create the BeautifulSoup object
    soup = BeautifulSoup(html, "lxml")

    table_str = find_table(soup, team_id)

    yr2018 = extract_table(table_str, header_row=0, get_url=True, start_of_rows=1)

    drop_cols = set(yr2018.isnull().mean()[lambda x: x == 1].index)
    for col in drop_cols:
        del yr2018[col]

    yr2018['year'] = yr2018['lg_id_url'].str[-9:-5].astype(int)
    team_dfs.append(yr2018)

### Add new rosters

In [27]:
all_team_df = pd.concat(team_dfs).rename(columns={'team_name_url': 'started_team_url'})
team_urls = all_team_df[all_team_df['year'] == 2020]['started_team_url'].unique()

In [28]:
new_team_info = []
for team_url in team_urls:

    url = 'https://www.basketball-reference.com' + team_url

    html = urlopen(url)
    soup = BeautifulSoup(html, "lxml")

    table_str = find_table(soup, 'roster')
    yr2018 = extract_table(table_str, header_row=0, get_url=True, start_of_rows=1)

    yr2018['team_url'] = team_url
    new_team_info.append(yr2018[['player_url', 'team_url', 'years_experience', 'pos']])

In [29]:
most_recent_year = base_all_stats[base_all_stats['year'] == base_all_stats['year'].max()]
most_recent_year['year'] += 1
most_recent_year['age'] +=1
most_recent_year['id'] = most_recent_year['player_url'] + '___' + most_recent_year['year'].astype(str)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [30]:
new_team_df = pd.concat(new_team_info)
new_team_df['year'] = 2020
new_team_df['id'] = new_team_df['player_url'] + '___' + new_team_df['year'].astype(str)
new_team_df['pos'] = new_team_df['pos'].str.split('-').str[0]

In [31]:
most_recent_year =new_team_df.drop(['pos'], 1).merge(most_recent_year.drop(['player_url', 'year'], 1), how='left', on='id')

In [32]:
most_recent_year = most_recent_year[most_recent_year['years_experience'] != 'R']

In [33]:
most_recent_year['team_id'] = most_recent_year['team_url'].str[7:10]
most_recent_year['started_team'] = most_recent_year['team_id']
most_recent_year['team_id_url'] = most_recent_year['team_url']

In [34]:
most_recent_year['team_id'] = most_recent_year['team_id'].apply(team_renamer.get).fillna(most_recent_year['team_id'])
most_recent_year['started_team'] = most_recent_year['started_team'].apply(team_renamer.get).fillna(most_recent_year['started_team'])

most_recent_year['started_team_url'] = '/teams/' + most_recent_year['started_team'] + '/' + most_recent_year['year'].astype(str) +'.html'

In [35]:
for col in most_recent_year.columns:
    if col not in ['player', 'pos', 'age', 'team_id', 'id', 'player_url', 'pick_overall', 'college_name', 'started_team', 'year']:
        most_recent_year[col] = np.nan

In [36]:
for i, row in most_recent_year.iterrows():
    if pd.isnull(row['age']):
        sub = base_all_stats[base_all_stats['player_url'] == row['player_url']].sort_values('year')
        if sub.empty:
            raise ValueError
        else:
            sub_row = sub.iloc[-1]
        for col in ['pos', 'player', 'pick_overall', 'college_name']:
            most_recent_year.loc[i, col] = sub_row[col]
        most_recent_year.loc[i, 'age'] = sub_row['age'] + 2020 - sub_row['year']

In [37]:
all_stats = pd.concat([base_all_stats, most_recent_year])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


### Team info

In [38]:
all_team_df = pd.concat(team_dfs).rename(columns={'team_name_url': 'started_team_url'})
all_team_df['team_url'] = all_team_df['started_team_url']
all_team_df['team_id'] = all_team_df['started_team_url'].str[7:10]
all_team_df['team_id'] = all_team_df['team_id'].apply(team_renamer.get).fillna(all_team_df['team_id'])
all_team_df['started_team_url'] = '/teams/' + all_team_df['team_id'] + '/' + all_team_df['year'].astype(str) + '.html'

In [39]:
all_team_df['made_playoffs'] = all_team_df['rank_team_playoffs'].notnull()

In [40]:
all_team_df = all_team_df.replace('', np.nan)

In [41]:
for col in ['win_loss_pct', 'srs', 'pace_rel', 'off_rtg_rel', 'def_rtg_rel', 'made_playoffs']:
    all_team_df[col] = all_team_df[col].astype(float)

In [42]:
filtered_team_df = all_team_df[['started_team_url', 'win_loss_pct', 'srs', 'pace_rel', 'off_rtg_rel', 'def_rtg_rel', 'made_playoffs']]

In [43]:
all_stats = all_stats.merge(filtered_team_df, how='left', on='started_team_url')

### Coach info

In [44]:
coaches_by_year = all_team_df[['started_team_url', 'coaches_url', 'year']].fillna(method='bfill')

In [45]:
coaches_by_year['team_id'] = coaches_by_year['started_team_url'].str[7:10]
coaches_by_year['team_id'] = coaches_by_year['team_id'].apply(team_renamer.get).fillna(coaches_by_year['team_id'])

In [46]:
p_year = coaches_by_year.copy()
p_year['year'] += 1

In [47]:
merged = coaches_by_year.merge(p_year, how='left', on=['team_id', 'year'], suffixes=('', '_p'))
merged = merged.iloc[::-1].reset_index(drop=True)

In [48]:
merged['year_coaching'] = 0

In [49]:
prev = np.nan
p_t = np.nan
yr_coaching = 0
for i, row in merged.iterrows():
    if pd.isnull(prev):
        merged.loc[i, 'year_coaching'] = 0
        yr_coaching = 0
    elif (row['coaches_url'] == row['coaches_url_p']) and (p_t == row['team_id']):
        yr_coaching +=1
        merged.loc[i, 'year_coaching'] = yr_coaching
    else:
        merged.loc[i, 'year_coaching'] = 0
        yr_coaching = 0
    prev = row['coaches_url']
    p_t = row['team_id']

In [50]:
new = {
    'CLE', 'LAL', 'MEM', 'MIN', 'PHO', 'SAC'
}

In [51]:
for n in new:
    mask = (merged['team_id'] == n) & (merged['year'] == 2020)
    if sum(mask) != 1:
        raise ValueError
    merged.loc[mask, 'year_coaching'] = 0

In [52]:
all_stats_team = all_stats.merge(merged[['started_team_url', 'year_coaching']], how='left', on='started_team_url')

In [53]:
all_stats_team = all_stats_team.merge(
    mvp_df[['id', 'award_share']].rename(columns={'award_share': 'mvp_award_share'}),
    how='left', on='id'
)
all_stats_team = all_stats_team.merge(
    mip_df[['id', 'award_share']].rename(columns={'award_share': 'mip_award_share'}),
    how='left', on='id'
)
all_stats_team = all_stats_team.merge(
    roy_df[['id', 'award_share']].rename(columns={'award_share': 'roy_award_share'}),
    how='left', on='id'
)

### Save data

In [54]:
all_stats_team.to_msgpack('all_stats.mp')