In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re

In [2]:
# Player season-totals page for the season labelled 2015 on basketball-reference.
url = (
    'https://www.basketball-reference.com'
    '/leagues/NBA_2015_totals.html'
)

In [3]:
# Download and parse the page.
# - timeout: without one, a stalled connection blocks the kernel indefinitely.
# - raise_for_status(): surfaces an HTTP error (e.g. a 404 or a rate-limit
#   response) here, instead of as a parsing failure in a later cell.
req0 = requests.get(url=url, timeout=30)
req0.raise_for_status()
soup = BeautifulSoup(req0.content, 'lxml')

In [4]:
# Every <tr> on the page except the first one; the first row is read
# separately as the column headers.
data_ = soup.find_all('tr')[1:]

In [5]:
# Column names are the text of the <th> cells in the first table row.
headers = soup.find_all('tr')[0].find_all('th')
headers_columns = [th.get_text() for th in headers]

In [6]:
# One inner list per table row holding the text of each <td> cell.
# Rows without any <td> yield an empty list.
player_data = [
    [cell.get_text() for cell in row.find_all('td')]
    for row in data_
]

In [7]:
# Build the stats table in one chain:
#   1. the first header is skipped (presumably because the leading cell of
#      each data row is not a <td> -- TODO confirm against the page markup),
#   2. rows whose 'Player' value is the literal header text are removed,
#   3. rows containing any missing value are dropped.
df = (
    pd.DataFrame(player_data, columns=headers_columns[1:])
    .loc[lambda frame: frame['Player'] != 'Player']
    .reset_index(drop=True)
    .dropna()
)

In [8]:
# Normalise column names: '%' becomes '_pct', everything lower-cased.
df.columns = [name.replace('%', '_pct').lower() for name in df.columns]

In [9]:
# For every row that contains at least one link, keep the href of the first
# link (e.g. data_[24].find_all('a')[0].get('href')).
playerid_v0 = [
    anchors[0].get('href')
    for anchors in (row.find_all('a') for row in data_)
    if len(anchors) > 0
]

# Reduce each href to the player id: strip 'players/' and '.html', then take
# the third '/'-separated piece (hrefs presumably look like
# '/players/<letter>/<id>.html' -- TODO confirm).
playerid_v1 = [
    href.replace('players/', '').replace('.html', '').split('/')[2]
    for href in playerid_v0
]

In [10]:
# Tag every row with the season and attach the scraped player ids.
# The id list is matched to the rows by position, so lengths must agree.
df = df.assign(year=2015, playerid=playerid_v1)


In [11]:
# Sanity check: number of table rows next to number of scraped player ids.
len(df), len(playerid_v1)

(651, 651)

In [12]:
# Count how many rows each (player, playerid, pos, age) combination has.
# NOTE(review): bref_adv() groups on the same keys without 'pos' -- confirm
# which key set is intended.
size_df = (
    df.groupby(['player', 'playerid', 'pos', 'age'])
    .size()
    .reset_index(name='freq')
)

In [13]:
# Preview the per-player row counts.
size_df.head()

Unnamed: 0,player,playerid,pos,age,freq
0,A.J. Price,priceaj01,PG,28,4
1,Aaron Brooks,brookaa01,PG,30,1
2,Aaron Gordon,gordoaa01,PF,19,1
3,Adreian Payne,paynead01,PF,23,3
4,Al Horford,horfoal01,C,28,1


In [14]:
# Attach each row's 'freq' (rows per player) back onto the stats table.
df0 = df.merge(
    size_df,
    on=['player', 'playerid', 'pos', 'age'],
    how='left',
)

In [15]:
# Split the table: players listed on several rows vs. players listed once.
more2_df = df0[df0['freq'].gt(1)].reset_index(drop=True)
only1_df = df0[df0['freq'].eq(1)].reset_index(drop=True)

In [16]:
# Of the multi-row players, keep only the row whose team is 'TOT'.
more2_df = more2_df[more2_df['tm'].eq('TOT')].reset_index(drop=True)

In [17]:
# One row per player: single-row players followed by the 'TOT' rows of
# multi-row players.
# The single-row part is rebuilt from df0 rather than read from only1_df, so
# re-running this cell does not append the 'TOT' rows a second time.
only1_df = pd.concat(
    [df0.loc[df0['freq'] == 1], more2_df],
    axis=0,
    ignore_index=True,
)

In [18]:
# Preview the combined table.
only1_df.head()

Unnamed: 0,player,pos,age,tm,g,gs,mp,fg,fga,fg_pct,...,trb,ast,stl,blk,tov,pf,pts,year,playerid,freq
0,Quincy Acy,PF,24,NYK,68,22,1287,152,331,0.459,...,301,68,27,22,60,147,398,2015,acyqu01,1
1,Jordan Adams,SG,20,MEM,30,0,248,35,86,0.407,...,28,16,16,7,14,24,94,2015,adamsjo01,1
2,Steven Adams,C,21,OKC,70,67,1771,217,399,0.544,...,523,66,38,86,99,222,537,2015,adamsst01,1
3,Jeff Adrien,PF,28,MIN,17,0,215,19,44,0.432,...,77,15,4,9,9,30,60,2015,adrieje01,1
4,Alexis Ajinca,C,26,NOP,68,8,957,181,329,0.55,...,315,47,21,51,69,151,443,2015,ajincal01,1


In [41]:
def bref_adv(year, table_type):
    """
    Scrape one season of player statistics from basketball-reference.com.

    A player who appears on several rows of the table is reduced to the
    single row whose team is 'TOT', so the result holds one row per player.

    Parameters:
    -----------
    year: int
        NBA season to scrape
        Example:
            1976-77 nba season is 1977
            2009-10 nba season is 2010
    table_type: str
        options -> {'totals', 'advanced', 'per_game', 'per_minute', 'per_poss'}

    Returns:
    --------
    pandas.DataFrame
        One row per player. Columns are the page's header names lower-cased,
        with '%' replaced by '_pct' and '/' by '_', followed by 'year',
        'playerid' and 'freq' (the number of rows the player had on the page).
        Scraped values are kept as text. Players with a single row come
        first, followed by the 'TOT' rows of multi-row players.

    Raises:
    -------
    requests.HTTPError
        If the page request returns an error status.
    """
    url = 'https://www.basketball-reference.com/leagues/NBA_{}_{}.html'.format(year, table_type)

    # timeout: a stalled connection must not hang the caller forever.
    # raise_for_status: fail here on an HTTP error instead of on the empty
    # row list further down.
    req0 = requests.get(url=url, timeout=30)
    req0.raise_for_status()
    soup = BeautifulSoup(req0.content, 'lxml')

    # every table row (tr) on the page; the first one carries the headers
    rows = soup.find_all('tr')
    headers_columns = [th.get_text() for th in rows[0].find_all('th')]

    # Collect cell text and player id from the SAME row in one pass, so the
    # ids stay aligned with the data even when some rows are dropped later.
    player_data = []
    playerid = []
    for row in rows[1:]:
        player_data.append([td.get_text() for td in row.find_all('td')])
        link = row.find('a')  # first link in the row, if any
        href = link.get('href') if link is not None else None
        # keep only the last path component of the href, without '.html'
        playerid.append(href.rsplit('/', 1)[-1].replace('.html', '') if href else None)

    # the first header has no matching <td> in the data rows, hence [1:]
    df = pd.DataFrame(player_data, columns=headers_columns[1:])
    df.columns = (
        df.columns
        .str.replace('%', '_pct', regex=False)
        .str.replace('/', '_', regex=False)
        .str.lower()
    )
    df['year'] = year
    df['playerid'] = playerid

    # Drop rows that repeat the header text and rows with any missing value
    # (rows without <td> cells or without a link).
    df = df.loc[df['player'] != 'Player'].dropna().reset_index(drop=True)

    # number of rows each player has on the page
    df['freq'] = df.groupby(['player', 'playerid', 'age'])['player'].transform('size')

    # Both masks come from the frame built in THIS call; the earlier version
    # read the notebook-level `df0`, which belongs to a different table.
    # NOTE(review): relies on the page exposing a 'Tm' column and labelling
    # the combined row 'TOT' -- verify against the current page markup.
    only1_df = df.loc[df['freq'] == 1]
    more2_df = df.loc[(df['freq'] > 1) & (df['tm'] == 'TOT')]

    only1_df = pd.concat([only1_df, more2_df], axis=0).reset_index(drop=True)

    # '*' is removed as a literal character; as a regular expression a bare
    # '*' is invalid, so regex=False is required on pandas versions where
    # str.replace defaults to regex matching.
    only1_df['player'] = only1_df['player'].str.replace('*', '', regex=False).str.strip()

    return only1_df

In [48]:
# Example call: advanced table for the 1991-92 season.
A = bref_adv(year=1992, table_type='advanced')