In [1]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
import time

In [2]:
# url that we are scraping
base_url = "http://www.basketball-reference.com/draft/NBA_{year}.html"

In [3]:
# initialize large DataFrame to append individual draft classes into
draft_df = pd.DataFrame()

In [4]:
# create a list of strings that indicate second-round status
second_round_numbers = list(range(31,61))
second_round_numbers = [str(x) for x in second_round_numbers]

In [5]:
for year in range(2005, 2015): 
    # insert year into url to scrape by year
    url = base_url.format(year=year)
    
    # get the html for the url we use and create the soup object to parse the html
    html = urlopen(url) 
    soup = BeautifulSoup(html, 'html5lib') 
    
    # parse for the headers to have our DataFrame labels
    column_headers = [th.getText() for th in soup.findAll('tr', limit=2)[1].findAll('th')]
    column_headers.remove('Rk') # Rk does not get caught by player data
    
    # get our player data
    data_rows = soup.findAll('tr')[2:] 
    player_data = [[td.getText() for td in data_rows[i].findAll('td')] for i in range(len(data_rows))]
    
    # replace empty string values with 0 
    for i in player_data:
        i[:] = [0 if x=='' else x for x in i]
    
    # remove empty data that was grabbed from scraping empty rows
    counter = 0
    length = len(player_data)
    while(counter<length):
        if(player_data[counter]==[]):
            player_data.remove(player_data[counter])
            # as an element is removed
            # so decrease the length by 1
            length = length - 1  
            # run loop again to check element
            # at same index, when item removed 
            # next item will shift to the left 
            continue
        counter = counter + 1
    
    # only access 2nd rounders
    player_data = [x for x in player_data if x[0] in second_round_numbers]
        
    # insert data into pandas DataFrame
    year_df = pd.DataFrame(player_data, columns=column_headers)
    
    # separate draft classes by inserting new column
    year_df.insert(0, 'Draft Year', year)
    
    # Append to the big dataframe
    draft_df = draft_df.append(year_df, ignore_index=True)
    
    #sleep in order to not overload servers and get blacklisted
    time.sleep(45)

In [6]:
# Convert data to proper data types (strings to int/float)
draft_df = draft_df.infer_objects()

# Get rid of the rows full of null values
draft_df = draft_df[draft_df.Player.notnull()]

# Replace NaNs with 0s
draft_df = draft_df.fillna(0)

# Rename Columns
draft_df.rename(columns={'WS/48':'WS_per_48'}, inplace=True)

# Change % symbol
draft_df.columns = draft_df.columns.str.replace('%', '_Perc')

# Add per_G to per game stats
draft_df.columns.values[15:19] = [draft_df.columns.values[15:19][col] + 
                                  "_per_G" for col in range(4)]

draft_df.infer_objects()

Unnamed: 0,Draft Year,Pk,Tm,Player,College,Yrs,G,MP,PTS,TRB,...,3P_Perc,FT_Perc,MP.1,PTS_per_G,TRB_per_G,AST_per_G,WS_per_G,WS_per_48,BPM,VORP
0,2005,31,ATL,Salim Stoudamire,University of Arizona,3,157,2672,1260,214,...,.366,.882,17.0,8.0,1.4,1.0,2.2,.040,-5.1,-2.1
1,2005,32,LAC,Daniel Ewing,Duke University,2,127,1683,431,158,...,.295,.780,13.3,3.4,1.2,1.4,0.8,.024,-3.7,-0.7
2,2005,33,NOH,Brandon Bass,Louisiana State University,12,758,16410,6575,3448,...,.207,.832,21.6,8.7,4.5,0.8,42.8,.125,-1.1,3.7
3,2005,34,UTA,C.J. Miles,0,14,838,17121,8044,1999,...,.359,.810,20.4,9.6,2.4,1.1,30.0,.084,-1.0,4.3
4,2005,35,POR,Ricky Sanchez,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,2005,36,MIL,Ersan Ilyasova,0,11,745,17823,7885,4330,...,.365,.773,23.9,10.6,5.8,1.1,44.9,.121,-0.2,8.3
6,2005,37,LAL,Ronny Turiaf,Gonzaga University,10,473,8020,2209,1757,...,.000,.636,17.0,4.7,3.7,1.3,19.2,.115,1.0,6.0
7,2005,38,ORL,Travis Diener,Marquette University,5,179,2660,854,243,...,.353,.847,14.9,4.8,1.4,2.4,5.6,.102,-1.8,0.1
8,2005,39,LAL,Von Wafer,Florida State University,6,200,2475,1054,234,...,.325,.751,12.4,5.3,1.2,0.7,3.3,.064,-3.2,-0.7
9,2005,40,GSW,Monta Ellis,0,12,833,29011,14858,2874,...,.314,.772,34.8,17.8,3.5,4.6,41.9,.069,-0.3,12.3


In [7]:
# uncomment to download the csv file
draft_df.to_csv("second_rounders_2005-14.csv")