Python scraper for Basketball Reference (basketball-reference.com)

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re

In [None]:
def bref_scrape(year, table_type):
    """
    Scrape one season of player statistics from basketball-reference.com.

    The page ``/leagues/NBA_<year>_<table_type>.html`` is downloaded, every
    table row that links to a player is turned into a dataframe row, and
    players that appear more than once in the season are collapsed to
    their ``TOT`` row.

    Parameters:
    -----------
    year: int
        NBA season to scrape
        Example:
            1976-77 nba season is 1977
            2009-10 nba season is 2010
    table_type: str
        options -> {'totals', 'advanced', 'per_game', 'per_minute', 'per_poss'}

    Returns:
    --------
    dataframe
        Column names are lower-cased with '%' replaced by '_pct' and '/'
        replaced by '_'. The columns 'player', 'pos', 'age', 'tm', 'year',
        'playerid' and 'freq' keep their names; every other column gets
        the suffix '_<table_type>'. All scraped values are strings.

    Raises:
    -------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the downloaded page contains no table rows at all.
    """
    url = 'https://www.basketball-reference.com/leagues/NBA_{}_{}.html'.format(year, table_type)

    # A timeout keeps a stalled connection from hanging the notebook, and
    # the status check surfaces a bad year/table_type as an HTTP error
    # instead of an obscure parsing failure further down.
    response = requests.get(url=url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'lxml')

    table_rows = soup.find_all('tr')
    if not table_rows:
        raise ValueError('no table rows found at {}'.format(url))

    # Header names come from the <th> tags of the first row. The first one
    # is skipped because body values are read from <td> tags only
    # (presumably the leading column is rendered as <th> -- verify).
    column_names = [th.get_text() for th in table_rows[0].find_all('th')][1:]

    # Collect the cell values and the player id of each row in a single
    # pass so the two can never get out of step. Rows without any link
    # (e.g. header rows repeated inside the table body) are skipped.
    records = []
    player_ids = []
    for row in table_rows[1:]:
        link = row.find('a', href=True)
        if link is None:
            continue
        records.append([td.get_text() for td in row.find_all('td')])
        # '/players/a/abdulma02.html' -> 'abdulma02'
        href = link['href']
        player_ids.append(href.replace('players/', '').replace('.html', '').split('/')[2])

    df = pd.DataFrame(records, columns=column_names)
    df['year'] = year
    df['playerid'] = player_ids

    # Drop stray header rows and rows that came out shorter than the
    # header (pandas pads those with missing values).
    df = df.loc[df['Player'] != 'Player'].dropna().reset_index(drop=True)

    # regex=False: these are literal characters, not patterns.
    df.columns = (
        df.columns
        .str.replace('%', '_pct', regex=False)
        .str.replace('/', '_', regex=False)
        .str.lower()
    )

    # Count how many rows each (player, playerid, age) combination has.
    keys = ['player', 'playerid', 'age']
    freq_df = df.groupby(keys).size().reset_index(name='freq')
    df = pd.merge(df, freq_df, how='left', on=keys)

    # Players listed once are kept as-is; players listed several times
    # are reduced to the row whose team is 'TOT'.
    listed_once = df.loc[df['freq'] == 1]
    season_totals = df.loc[(df['freq'] > 1) & (df['tm'] == 'TOT')]
    result = pd.concat([listed_once, season_totals], axis=0).reset_index(drop=True)

    # '*' must be treated literally; on its own it is not a valid regex.
    result['player'] = result['player'].str.replace('*', '', regex=False).str.strip()

    # Drop the unnamed spacer columns (header text is a non-breaking
    # space). Checking membership avoids depending on which exception
    # type DataFrame.drop raises for a missing label.
    if '\xa0' in result.columns:
        result = result.drop(columns='\xa0')
    else:
        print('no empty column names')

    # rename most columns to show which table they belong
    #     example: g_table_type, mp_table_type, ...
    static = {'player', 'pos', 'age', 'tm', 'year', 'playerid', 'freq'}
    renamed = {
        name: name + '_' + table_type
        for name in result.columns
        if name not in static
    }
    result = result.rename(columns=renamed)

    return result

In [None]:
# Example run: scrape the 'totals' table for year=1992 (the 1991-92 season).
A = bref_scrape(year=1992, table_type='totals')

In [None]:
# Show the first rows of the scraped dataframe.
A.head()