<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [1]:
import numpy as np
import urllib.request
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.chrome.options import Options as ChromeOptions
import lxml.html
from lxml import etree
import re
import time
import pandas as pd
from functools import reduce
from operator import itemgetter

In [56]:
# Modify dataframe to have appropriate data types
def ConvertDataFrame(df):
    """Return a typed, de-duplicated copy of the scraped player-bio frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the text columns 'name', 'college', 'country', 'draft'
        and 'nationality' plus 'actual_draft_year', 'draft_round' and
        'draft_number'. Every column that is not one of the five text
        columns is treated as numeric.

    Returns
    -------
    pandas.DataFrame
        A new frame in which the numeric columns are float (values that
        cannot be parsed become NaN), the text columns are strings, and only
        the first row of each ('name', 'actual_draft_year') pair is kept.
        The original row index labels are preserved, so they may have gaps.

    Raises
    ------
    KeyError
        If any of the required columns listed above is missing.

    Notes
    -----
    The input frame is not modified, so the function can safely be re-run on
    the same raw frame.
    """
    text_cols = ['name', 'college', 'country', 'draft', 'nationality']
    draft_cols = ['actual_draft_year', 'draft_round', 'draft_number']

    # Work on a copy so the caller's frame is left untouched
    df = df.copy()

    # Convert column-by-column (not row-by-row): far fewer to_numeric calls and
    # each column's dtype no longer depends on what else is in the row.
    # The explicit float cast keeps columns without missing values from
    # silently becoming integer columns.
    numeric_cols = df.columns.drop(text_cols)
    df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce').astype('float')

    df[text_cols] = df[text_cols].astype('str')

    # Already float after the conversion above; selecting them explicitly keeps
    # the KeyError for a frame that lacks any of the draft columns
    df[draft_cols] = df[draft_cols].astype('float')

    # A player listed more than once (same name and draft year) keeps only the
    # first row encountered; later rows are dropped rather than averaged
    return df.drop_duplicates(subset=['name', 'actual_draft_year'], keep='first')


In [52]:
# Clean the text nodes of one bio-table row and keep only the requested columns
# (returns None when the row is too short to contain them)
def _parse_bio_row(cells, col_list):
    """Turn the raw text nodes of one table row into a bio record.

    Parameters
    ----------
    cells : list of str
        Text nodes found under the row's <td> elements.
    col_list : sequence of int
        Indices (into the cleaned cell list) of the columns to keep. The
        fix-ups below address the kept values by position (1 is parsed as a
        feet-inches height, 2 is blanked when it is a single space, 4 is
        compared with 'USA', 5-7 are cleared for undrafted players), so at
        least eight columns in that order are expected.

    Returns
    -------
    list or None
        The selected values followed by 'drafted'/'undrafted' and
        'domestic'/'foreign', or None if the row has too few cells.
    """
    # Drop layout whitespace ("\n" followed by spaces), empty nodes and '%' signs
    cells = [re.sub('\n +', '', c) for c in cells]
    data = [c.strip('%') for c in cells if c != '' and c != '\n']

    # max() rather than col_list[-1] so an unsorted col_list is handled too
    needed = max(col_list) + 1
    if len(data) < needed:
        return None
    if len(data) > needed:
        # Collapse everything between index 6 and the last 13 entries into one
        # value. NOTE(review): presumably re-joins a field (index 6) whose text
        # was split over several nodes, with 13 columns following it - confirm
        # against the current page layout.
        data[6:-13] = [' '.join(data[6:-13])]
    data = list(itemgetter(*col_list)(data))

    # Height arrives as "feet-inches"; store total inches, or '' if unparseable
    ft_in = data[1].split('-')
    if len(ft_in) == 2 and ft_in[0] != '':
        data[1] = float(ft_in[0])*12. + float(ft_in[1])
    else:
        data[1] = ''

    # A lone space stands for a missing value in this column
    if data[2] == ' ':
        data[2] = ''

    # Undrafted players carry no draft year/round/number
    if data[5].lower() == 'undrafted':
        data.append('undrafted')
        data[5] = ''
        data[6] = ''
        data[7] = ''
    else:
        data.append('drafted')

    # An empty country is counted as domestic, like 'USA'
    data.append('domestic' if data[4] in ('USA', '') else 'foreign')
    return data


# Scrape player bio tables from NBA.com webpages, grabbing only
# the specified columns (by index) and for the specified seasons
def FetchStatsTables(urls, years, col_list):
    """Scrape the player-bio table from each URL with a headless Firefox.

    Parameters
    ----------
    urls : sequence of str
        Pages to load, one per season.
    years : sequence
        Season label for each URL (same order). It is only used in the
        progress messages; it is not added to the returned rows.
    col_list : sequence of int
        Column indices to keep from each cleaned table row
        (see _parse_bio_row for the positional expectations).

    Returns
    -------
    numpy.ndarray
        np.array() of the collected rows. Each row holds the selected columns
        followed by 'drafted'/'undrafted' and 'domestic'/'foreign'. Rows mix
        floats and text, so NumPy stores the values as strings. The array is
        empty if no row was collected.

    Raises
    ------
    selenium.common.exceptions.TimeoutException
        If the pagination control does not appear within 30 seconds (this
        wait is not retried).

    Notes
    -----
    The browser is always closed, even when an exception interrupts the loop.
    """
    # Create a headless Firefox browser instance
    opt = FirefoxOptions()
    opt.add_argument("--headless")
    driver = webdriver.Firefox(options=opt)

    pagination_xpath = "//select[contains(@class, 'stats-table-pagination__select')]"
    rows_xpath = "//*[@class='nba-stat-table__overflow']//table/tbody/tr"
    max_attempts = 3

    arr = []
    try:
        for i, url in enumerate(urls):
            year = years[i]
            print("Fetching player bios from the", year, "season...")

            # Load the page and switch the table pagination to show every row
            driver.get(url)
            wait = WebDriverWait(driver, 30)
            wait.until(EC.presence_of_element_located((By.XPATH, pagination_xpath)))
            # find_element(By...) works across Selenium versions; the
            # find_element_by_* helpers no longer exist in Selenium 4.3+
            sel = Select(driver.find_element(By.CLASS_NAME, 'stats-table-pagination__select'))
            sel.select_by_visible_text("All")
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);var lenOfPage=document.body.scrollHeight;return lenOfPage;")

            # Wait for table rows, refreshing the page after each timeout.
            # NOTE(review): a refresh may reset the pagination selector to its
            # default, in which case only the first page of rows would be
            # scraped after a retry - confirm and re-select "All" if needed.
            for attempt in range(1, max_attempts + 1):
                try:
                    wait.until(EC.presence_of_element_located((By.XPATH, rows_xpath)))
                    break
                except TimeoutException:
                    print('\nRefreshing NBA bios page due to timeout (retry #', attempt, ')...')
                    driver.refresh()
                    time.sleep(1)
            else:
                # Keep going (best effort), but say so instead of failing silently
                print("Warning: no table rows appeared for the", year, "season after", max_attempts, "attempts.")

            root = lxml.html.fromstring(driver.page_source)

            # Process the table text row by row; rows that are too short are skipped
            counter = 0
            for result in root.xpath(rows_xpath):
                record = _parse_bio_row(result.xpath("./td//text()"), col_list)
                if record is None:
                    continue
                arr.append(record)
                counter += 1

            print("Fetched bios for", counter, "NBA players.")
    finally:
        # Always release the browser process, even if scraping failed part-way
        driver.quit()

    return np.array(arr)


In [53]:
# Seasons to fetch: 2000-01 through 2019-20.
# ya / yb hold the two-digit start / end year of every season, and
# years labels each season by the calendar year in which it ends.
ya = ["{:02d}".format(n) for n in range(20)]
yb = ["{:02d}".format(n) for n in range(1, 21)]
years = [2000 + int(y) for y in yb]


In [None]:
# Build one NBA.com player-bio URL per season, for example
# https://stats.nba.com/players/bio/?Season=2000-01&SeasonType=Regular%20Season
url_template = "https://stats.nba.com/players/bio/?Season=20{0}-{1}&SeasonType=Regular%20Season"
urls = [url_template.format(start, yb[i]) for i, start in enumerate(ya)]

# Scrape every season into a 2D array, keeping only the listed table columns
np_arr = FetchStatsTables(urls, years, [0, 3, 4, 5, 6, 7, 8, 9])

# Name the eight scraped columns plus the two labels FetchStatsTables appends,
# then load everything into a Pandas dataframe
bio_columns = [
    'name', 'height', 'weight', 'college', 'country',
    'actual_draft_year', 'draft_round', 'draft_number',
    'draft', 'nationality',
]
df = pd.DataFrame(np_arr, columns=bio_columns)


In [57]:
# Coerce the scraped values to proper dtypes and drop repeated
# (name, actual_draft_year) rows via ConvertDataFrame.
# NOTE(review): df is rebound to the converted frame, so re-running this cell
# feeds already-converted data back through the conversion.
df = ConvertDataFrame(df)
# Plain-text dump of the frame (long frames are shown with the middle rows elided)
print(df)

                  name  height  weight       college   country  \
0           A.C. Green    81.0   225.0  Oregon State       USA   
1          A.J. Guyton    73.0   180.0       Indiana       USA   
2          Aaron McKie    77.0   209.0        Temple       USA   
3       Aaron Williams    81.0   225.0        Xavier       USA   
4           Adam Keefe    81.0   230.0      Stanford       USA   
...                ...     ...     ...           ...       ...   
9339   Vincent Poirier    84.0   235.0          None    France   
9340     Vlatko Cancar    80.0   236.0          None  Slovenia   
9343    Wenyen Gabriel    81.0   205.0          None     Sudan   
9354  Zach Norvell Jr.    77.0   205.0          None       USA   
9355    Zylan Cheatham    77.0   220.0          None       USA   

      actual_draft_year  draft_round  draft_number      draft nationality  
0                1985.0          1.0          23.0    drafted    domestic  
1                2000.0          2.0          32.0    d

In [58]:
# Write NBA player bios dataframe to a .csv file
# (relative path, so the file is created in the current working directory).
# NOTE(review): to_csv is called without index=False, so the row index is
# presumably written as the first column - confirm that this is wanted.
df.to_csv("NBAPlayerBios.csv")
