In [1]:
# imports used for webscraping
from bs4 import BeautifulSoup
from multiprocess import Pool
import sys, requests, pandas, numpy

# library-wide display configuration: None lifts pandas' cap on how many
# columns / rows are shown when a DataFrame is displayed
pandas.set_option("display.max_columns", None)
pandas.set_option("display.max_rows", None)

In [2]:
# define a function to simplify pulling all player data, based on the LeBron test
def scrape_profile(url):
    '''
    Scrape one player's profile page and return their season rows and awards.

    The page at ``url`` is downloaded with requests and parsed with
    BeautifulSoup. The player's name is read from the ``span`` inside the
    ``h1`` of the element with id ``meta``. Season rows come from the tables
    with id ``totals`` (regular season) and ``playoffs_totals`` (playoffs).
    Award names are the text of every link inside the element with id
    ``bling``.

    Parameters:
        url: address of the player's profile page.

    Returns:
        A tuple ``(player_reg, player_po, player_awards)`` where
        * ``player_reg`` is a list with one entry per ``<tr>`` of the
          ``totals`` table body, each entry being
          ``["reg", player_name, <row header text>, <non-empty cell texts>...]``;
        * ``player_po`` is the same for ``playoffs_totals`` with the label
          ``"post"``;
        * ``player_awards`` is ``["awards", player_name, <award texts>...]``.
        A table or awards element that is missing from the page yields an
        empty list (or, for awards, just the two leading entries) instead of
        an error.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if the server does not respond within 30 seconds.
        AttributeError: if the page has no ``id="meta"`` / ``h1`` / ``span``
            name header.
    '''

    def season_rows(table, label, name):
        # Convert every <tr> of the table body into
        # [label, name, row header, *cells]; a missing table gives [].
        if table is None:
            return []
        body = table.find("tbody")
        if body is None:
            return []
        rows = []
        for row in body.find_all("tr"):
            season = [label, name, row.find("th").text]
            # NOTE(review): empty cells are dropped rather than kept as
            # blanks, so a row with a missing stat is shorter and its later
            # values sit one position to the left - confirm this is intended.
            season.extend(cell.text for cell in row.find_all("td") if cell.text != "")
            rows.append(season)
        return rows

    # A timeout keeps a stalled connection from hanging a pool worker forever.
    player_page = requests.get(url, timeout=30)
    # Fail with the HTTP status instead of an opaque parse error further down.
    player_page.raise_for_status()
    player_soup = BeautifulSoup(player_page.content, 'html.parser')
    player_name = player_soup.find(id='meta').find("h1").find("span").text

    player_awards = ["awards", player_name]
    bling = player_soup.find(id='bling')
    if bling is not None:
        player_awards.extend(link.text for link in bling.find_all('a'))

    # find() returns None when the table is absent; season_rows handles that.
    player_reg = season_rows(player_soup.find(id="totals"), "reg", player_name)
    player_po = season_rows(player_soup.find(id="playoffs_totals"), "post", player_name)

    return player_reg, player_po, player_awards

In [None]:

# Collect the profile URL of every player listed on the per-letter index pages.
BASE_URL = 'https://www.basketball-reference.com'

links = []

for char in 'abcdefghijklmnopqrstuvwxyz':  # one index page per letter of the alphabet
    # A timeout keeps a stalled connection from hanging the cell forever.
    page = requests.get(f'{BASE_URL}/players/{char}', timeout=30)
    soup = BeautifulSoup(page.content, 'html.parser')

    # The first <tr> of each page is skipped.
    for row in soup.find_all('tr')[1:]:
        anchor = row.find('a', href=True)
        if anchor is None:
            # find() returns None for a row without a link; subscripting it
            # would raise TypeError.
            continue
        # Strip a leading '/' so a root-relative href does not produce
        # '...com//players/...'.
        links.append(f"{BASE_URL}/{anchor['href'].lstrip('/')}")

In [None]:
%%time
# scrape every profile URL in parallel: a pool of 10 worker processes maps
# scrape_profile over links, and records keeps the results in the same order

with Pool(10) as worker_pool:
    records = worker_pool.map(scrape_profile, links)

In [None]:
# Save records into three separate pandas dataframes: one for the regular
# season, one for the post season and one for awards.

def fetch_stat_columns(url, limit=31):
    """Return the first ``limit`` header texts of the ``totals`` table at ``url``.

    Header cells whose text is a lone non-breaking space are skipped. Returns
    an empty list when the page has no element with id ``totals``.
    """
    page = requests.get(url, timeout=30)
    soup = BeautifulSoup(page.content, 'html.parser')
    totals = soup.find(id="totals")
    if totals is None:
        return []
    # NOTE(review): find_all("th") also matches the row-header cells in the
    # table body; this assumes the column headers come first - confirm.
    names = [th.text for th in totals.find_all("th") if th.text != "\xa0"]
    return names[:limit]


def build_season_frame(rows, col_names):
    """Build one DataFrame from season rows, labelling columns by position.

    Rows shorter than the widest row are padded with None. Positions beyond
    ``col_names`` are labelled ``extra_<position>`` so no scraped value is
    dropped.
    """
    width = max(len(col_names), max((len(row) for row in rows), default=0))
    labels = list(col_names) + [f"extra_{i}" for i in range(len(col_names), width)]
    padded = [list(row) + [None] * (width - len(row)) for row in rows]
    return pandas.DataFrame(padded, columns=labels)


# The header names never leave scrape_profile, so read them here from the
# first profile page that has a totals table.
stat_columns = next(
    (names for names in (fetch_stat_columns(link) for link in links) if names),
    [],
)
col_names = ["type", "Name"] + stat_columns

# Flatten the per-player results once instead of growing frames in a loop.
reg_rows = [row for reg, _, _ in records for row in reg]
po_rows = [row for _, po, _ in records for row in po]

reg_season = build_season_frame(reg_rows, col_names)
po_season = build_season_frame(po_rows, col_names)

# NOTE(review): each player's awards list becomes a one-column frame and the
# frames are stacked vertically; confirm whether one row per player was
# intended instead.
award_frames = [pandas.DataFrame(aw) for _, _, aw in records]
awards = pandas.concat(award_frames, sort=False) if award_frames else pandas.DataFrame()

# Preview only: display.max_rows is unlimited, so showing the full frame
# would print every scraped season.
reg_season.head()