In [33]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd


In [34]:
# url that we are scraping
base_url = "http://www.basketball-reference.com/draft/NBA_{year}.html"

In [35]:
draft_df = pd.DataFrame()

In [36]:
second_round_numbers = list(range(31,61))
second_round_numbers = [str(x) for x in second_round_numbers]
#second_round_numbers

In [37]:
for year in range(2004, 2015):  # for each year
    url = base_url.format(year=year)  # get the url
    
    html = urlopen(url)  # get the html
    soup = BeautifulSoup(html, 'html5lib') # create our BS object
    
    column_headers = [th.getText() for th in soup.findAll('tr', limit=2)[1].findAll('th')]
    column_headers.remove('Rk') #Rk does not get caught by player data
    
    # get our player data
    data_rows = soup.findAll('tr')[2:] 
    player_data = [[td.getText() for td in data_rows[i].findAll('td')] for i in range(len(data_rows))]
    
    #replace '' with 0 
    for i in player_data:
        i[:] = [0 if x=='' else x for x in i]
    
    #remove empty data
    counter = 0
    length = len(player_data)
    while(counter<length):
        if(player_data[counter]==[]):
            player_data.remove(player_data[counter])
            # as an element is removed
            # so decrease the length by 1
            length = length - 1  
            # run loop again to check element
            # at same index, when item removed 
            # next item will shift to the left 
            continue
        counter = counter + 1
    
    #only access 2nd rounders
    player_data = [x for x in player_data if x[0] in second_round_numbers]
    
    #print(second_round_data)
        
    # Turn yearly data into a DatFrame
    year_df = pd.DataFrame(player_data, columns=column_headers)
    # create and insert the Draft_Yr column
    year_df.insert(0, 'Draft Year', year)
    
    # Append to the big dataframe
    draft_df = draft_df.append(year_df, ignore_index=True)

In [38]:
# Convert data to proper data types
draft_df = draft_df.infer_objects()

# Get rid of the rows full of null values
draft_df = draft_df[draft_df.Player.notnull()]

# Replace NaNs with 0s
draft_df = draft_df.fillna(0)

# Rename Columns
draft_df.rename(columns={'WS/48':'WS_per_48'}, inplace=True)

# Change % symbol
draft_df.columns = draft_df.columns.str.replace('%', '_Perc')
# Add per_G to per game stats
draft_df.columns.values[15:19] = [draft_df.columns.values[15:19][col] + 
                                  "_per_G" for col in range(4)]

draft_df.infer_objects()

Unnamed: 0,Draft Year,Pk,Tm,Player,College,Yrs,G,MP,PTS,TRB,...,3P_Perc,FT_Perc,MP.1,PTS_per_G,TRB_per_G,AST_per_G,WS_per_G,WS_per_48,BPM,VORP
0,2004,31,CHI,Jackson Vroman,Iowa State University,2,87,1111,285,261,...,.000,.580,12.8,3.3,3.0,0.6,0.0,.001,-3.9,-0.5
1,2004,32,WAS,Peter John Ramos,0,1,6,20,11,4,...,0,.500,3.3,1.8,0.7,0.0,0.0,-.108,-18.3,-0.1
2,2004,33,LAC,Lionel Chalmers,Xavier University,1,36,433,111,31,...,.245,.625,12.0,3.1,0.9,1.4,-0.2,-.025,-6.3,-0.5
3,2004,34,ATL,Donta Smith,Southeastern Illinois College,2,61,560,165,66,...,.308,.650,9.2,2.7,1.1,0.8,0.5,.043,-3.2,-0.2
4,2004,35,SEA,Andre Emmett,Texas Tech University,2,14,73,20,8,...,.000,.615,5.2,1.4,0.6,0.1,0.1,.040,-6.8,-0.1
5,2004,36,ORL,Antonio Burks,University of Memphis,2,81,789,186,49,...,.235,.571,9.7,2.3,0.6,1.3,0.1,.004,-6.6,-0.9
6,2004,37,ATL,Royal Ivey,University of Texas at Austin,10,492,6162,1641,557,...,.361,.706,12.5,3.3,1.1,1.0,4.5,.035,-3.8,-2.8
7,2004,38,CHI,Chris Duhon,Duke University,9,606,15526,3946,1364,...,.363,.784,25.6,6.5,2.3,4.4,25.0,.077,-1.5,2.0
8,2004,39,TOR,Albert Miralles,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,2004,40,BOS,Justin Reed,University of Mississippi,3,136,1436,472,186,...,.000,.752,10.6,3.5,1.4,0.5,0.5,.016,-5.9,-1.4
