In [1]:
from urllib2 import urlopen
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Combined draft data for all scraped years; starts out empty.
draft_df = pd.DataFrame()
# Draft page address; the {year} placeholder is filled in with str.format.
url_template = "http://www.basketball-reference.com/draft/NBA_{year}.html"


In [3]:
# Scrape one draft table per year (2000-2010 inclusive), then combine them all
# in a single concat. Collecting the per-year frames in a list avoids
# re-copying the growing frame on every iteration.
yearly_frames = []
for year in range(2000, 2011):  # for each year
    url = url_template.format(year=year)  # get the url

    html = urlopen(url)  # get the html
    try:
        soup = BeautifulSoup(html, "lxml")  # create our BS object
    finally:
        html.close()  # release the connection even if parsing fails

    # Look the table rows up once: row 1 holds the column headers and the
    # player rows start at row 2.
    table_rows = soup.findAll('tr')
    column_headers = [th.getText() for th in table_rows[1].findAll('th')]

    # get our player data (rows without any <td> cells yield empty lists,
    # which become all-null rows in the DataFrame)
    player_data = [[td.getText() for td in row.findAll('td')]
                   for row in table_rows[2:]]

    # Turn yearly data into a DataFrame
    year_df = pd.DataFrame(player_data, columns=column_headers)
    # create and insert the Draft_Yr column
    year_df.insert(0, 'Draft_Yr', year)

    yearly_frames.append(year_df)

# Build the big dataframe in one step
draft_df = pd.concat(yearly_frames, ignore_index=True)


In [10]:
# Preview the first five rows of the combined draft data.
draft_df.head(n=5)

Unnamed: 0,Draft_Yr,Pk,Tm,Player,College,Yrs,G,MP,PTS,TRB,...,3P_Perc,FT_Perc,MP_per_G,PTS_per_G,TRB_per_G,AST_per_G,WS,WS_per_48,BPM,VORP
0,2000,1,NJN,Kenyon Martin,University of Cincinnati,15,757,23134,9325,5159,...,0.234,0.629,30.6,12.3,6.8,1.9,48.0,0.1,0.7,15.5
1,2000,2,VAN,Stromile Swift,Louisiana State University,9,547,10804,4582,2535,...,0.074,0.699,19.8,8.4,4.6,0.5,21.3,0.095,-1.9,0.2
2,2000,3,LAC,Darius Miles,,7,446,11730,4507,2190,...,0.168,0.59,26.3,10.1,4.9,1.9,9.5,0.039,-1.2,2.3
3,2000,4,CHI,Marcus Fizer,Iowa State University,6,289,6032,2782,1340,...,0.191,0.691,20.9,9.6,4.6,1.2,2.7,0.022,-4.7,-4.1
4,2000,5,ORL,Mike Miller,University of Florida,16,1003,27598,10928,4330,...,0.407,0.769,27.5,10.9,4.3,2.6,60.6,0.105,0.9,20.4


In [5]:
# Preview the last five rows of the combined draft data.
draft_df.tail(n=5)


Unnamed: 0,Draft_Yr,Rk,Pk,Tm,Player,College,Yrs,G,MP,PTS,...,3P%,FT%,MP.1,PTS.1,TRB,AST,WS,WS/48,BPM,VORP
666,2010,56,56,MIN,Hamady N'Diaye,Rutgers University,3.0,33.0,157.0,20.0,...,,0.462,4.8,0.6,0.8,0.1,0.0,0.013,-4.9,-0.1
667,2010,57,57,IND,Ryan Reid,Florida State University,1.0,5.0,17.0,8.0,...,,,3.4,1.6,0.4,0.0,0.1,0.142,-5.5,0.0
668,2010,58,58,LAL,Derrick Caracter,University of Texas at El Paso,1.0,41.0,215.0,81.0,...,,0.739,5.2,2.0,1.0,0.2,0.3,0.057,-4.9,-0.2
669,2010,59,59,ORL,Stanley Robinson,University of Connecticut,,,,,...,,,,,,,,,,
670,2010,60,60,PHO,Dwayne Collins,University of Miami,,,,,...,,,,,,,,,,


In [6]:
# Clean the Data

# Convert data to proper data types.
# pd.to_numeric replaces the removed DataFrame.convert_objects: cells that
# cannot be parsed become NaN, and a text column in which nothing parses
# (e.g. names) is left untouched.
def _to_numeric_if_possible(column):
    """Return `column` as numeric if it is non-numeric but has parseable values."""
    if column.dtype.kind in 'biufc':
        return column  # already numeric
    converted = pd.to_numeric(column, errors='coerce')
    return column if converted.isnull().all() else converted

draft_df = draft_df.apply(_to_numeric_if_possible)

# Get rid of the rows full of null values
draft_df = draft_df[draft_df.Player.notnull()]

# Replace NaNs with 0s
draft_df = draft_df.fillna(0)

# Rename Columns: build the new labels as a plain list and assign them in one
# go instead of mutating the Index's underlying array in place.
new_columns = ['WS_per_48' if name == 'WS/48' else name
               for name in draft_df.columns]
# Change % symbol
new_columns = [name.replace('%', '_Perc') for name in new_columns]
# Add per_G to per game stats: positions 15-18 are the second MP/PTS/TRB/AST
# group, which would otherwise duplicate the totals' column names.
new_columns[15:19] = [name + "_per_G" for name in new_columns[15:19]]
draft_df.columns = new_columns

# Changing the Data Types to int. Assigning through draft_df[...] replaces the
# columns outright, so the new dtype is kept (a .loc[:, ...] assignment may
# write the values back into the existing float columns instead).
count_columns = list(draft_df.loc[:, 'Yrs':'AST'].columns)
draft_df[count_columns] = draft_df[count_columns].astype(int)

# Delete the 'Rk' column
draft_df = draft_df.drop('Rk', axis='columns')



In [7]:
# Show the dtype of every column to check the result of the cleaning step.
draft_df.dtypes


Draft_Yr       int64
Pk           float64
Tm            object
Player        object
College       object
Yrs            int64
G              int64
MP             int64
PTS            int64
TRB            int64
AST            int64
FG_Perc      float64
3P_Perc      float64
FT_Perc      float64
MP_per_G     float64
PTS_per_G    float64
TRB_per_G    float64
AST_per_G    float64
WS           float64
WS_per_48    float64
BPM          float64
VORP         float64
dtype: object

In [8]:
# Pk shows up as float64 in the dtypes listing; store it as an integer instead.
pick_numbers = draft_df['Pk']
draft_df['Pk'] = pick_numbers.astype(int)


In [9]:
# Count missing values per column. Every count should be 0 here because the
# cleaning step already replaced NaNs with 0.
draft_df.isnull().sum()


Draft_Yr     0
Pk           0
Tm           0
Player       0
College      0
Yrs          0
G            0
MP           0
PTS          0
TRB          0
AST          0
FG_Perc      0
3P_Perc      0
FT_Perc      0
MP_per_G     0
PTS_per_G    0
TRB_per_G    0
AST_per_G    0
WS           0
WS_per_48    0
BPM          0
VORP         0
dtype: int64