# Project Luther

Kenny Leung - kenleung11@gmail.com

Part 1/8 - Data scraping NBA draft statistics from 1966 to 2016, and rookie year data since 2003

This notebook documents the process of scraping NBA draft and rookie year data from https://www.basketball-reference.com/.

In [1]:
# import packages
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import pickle

# Data Scraping

The data scraped from this website is the statistics of all the drafted players since 1966. The data consists of all major stats categories in basketball and are averaged over all the seasons the player has played to date. Further below in the notebook I will scrape the statistics for each drafted player's rookie year, which will contain the dependent variable (Box Plus/Minus "BPM") I wish to predict with my regression model.

I want to test scraping data for one season to make sure the code works as intended.

In [2]:
# Trial run: scrape one draft class (2016) to check the parsing logic.
url = 'https://www.basketball-reference.com/draft/NBA_2016.html'

response = requests.get(url)
page = response.text
soup = BeautifulSoup(page, "lxml")

# column names live in the second of the two header rows
header_row = soup.findAll('tr', limit=2)[1]
column_headers = [th.getText() for th in header_row.findAll('th')]
# remove the first column header (presumably it has no matching <td> in the data rows)
del column_headers[0]

# every row after the two header rows is treated as player data
data_rows = soup.findAll('tr')[2:]
player_data = [[cell.getText() for cell in row.findAll('td')] for row in data_rows]

# create dataframe with columns and data
df = pd.DataFrame(player_data, columns=column_headers)

In [18]:
# preview the first rows to check that the headers line up with the data
df.head()

Unnamed: 0,Pk,Tm,Player,College,Yrs,G,MP,PTS,TRB,AST,...,3P%,FT%,MP.1,PTS.1,TRB.1,AST.1,WS,WS/48,BPM,VORP
0,1,PHI,Ben Simmons,Louisiana State University,,,,,,,...,,,,,,,,,,
1,2,LAL,Brandon Ingram,Duke University,1.0,79.0,2279.0,740.0,316.0,166.0,...,0.294,0.621,28.8,9.4,4.0,2.1,-0.3,-0.007,-3.8,-1.1
2,3,BOS,Jaylen Brown,University of California,1.0,78.0,1341.0,515.0,220.0,64.0,...,0.341,0.685,17.2,6.6,2.8,0.8,1.5,0.053,-4.0,-0.7
3,4,PHO,Dragan Bender,,1.0,43.0,574.0,146.0,103.0,23.0,...,0.277,0.364,13.3,3.4,2.4,0.5,-0.3,-0.029,-4.3,-0.3
4,5,MIN,Kris Dunn,Providence College,1.0,78.0,1333.0,293.0,166.0,188.0,...,0.288,0.61,17.1,3.8,2.1,2.4,0.1,0.004,-2.2,-0.1


# Scraping Draft Data from 1966 to 2016

Now, to scrape all draft data for all drafts since 1966.

In [21]:
# Scrape every draft class from 1966 through 2016 into a single dataframe.
url_template = "https://www.basketball-reference.com/draft/NBA_{year}.html"
yearly_frames = []  # one dataframe per draft year, concatenated once after the loop

for year in range(1966, 2017):  # for each year
    url = url_template.format(year=year)  # get the url

    response = requests.get(url, timeout=30)
    # stop on an error / rate-limit response instead of parsing it as draft data
    response.raise_for_status()
    page = response.text

    soup = BeautifulSoup(page, 'html5lib')

    # get column headers from the second header row and drop the first column
    column_headers = [th.getText() for th in soup.findAll('tr', limit=2)[1].findAll('th')]
    column_headers.pop(0)

    # get player data, skipping the first 2 header rows
    data_rows = soup.findAll('tr')[2:]
    player_data = [[td.getText() for td in row.findAll('td')] for row in data_rows]

    # turn yearly data into a dataframe
    year_df = pd.DataFrame(player_data, columns=column_headers)
    # create and insert the Draft_Yr column
    year_df.insert(0, 'Draft_Yr', year)

    yearly_frames.append(year_df)

    print(year)

    # random pause between requests
    time.sleep(.5+2*random.random())

# Build the big dataframe in one step: DataFrame.append was removed in pandas 2.0,
# and appending inside the loop re-copies all previously collected rows every year.
draft_df = pd.concat(yearly_frames, ignore_index=True)

1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016


In [74]:
# save the raw scrape to csv; the cleaning section below reads this file back in
draft_df.to_csv('draft_raw.csv')

# Cleaning the data

Next, I cleaned the raw draft data by dropping unnecessary columns, changing the dtypes of the data, and checking for null values.

In [3]:
# read in csv file, using its first column as the dataframe index
draft_df_copy = pd.read_csv('draft_raw.csv',index_col=0)

In [4]:
# list the column names (the repeated MP/PTS/TRB/AST headers show up with a '.1' suffix)
draft_df_copy.columns

Index(['Draft_Yr', 'Pk', 'Tm', 'Player', 'College', 'Yrs', 'G', 'MP', 'PTS',
       'TRB', 'AST', 'FG%', '3P%', 'FT%', 'MP.1', 'PTS.1', 'TRB.1', 'AST.1',
       'WS', 'WS/48', 'BPM', 'VORP'],
      dtype='object')

In [5]:
# convert every column except the text ones to numeric values;
# anything that cannot be parsed becomes NaN (errors='coerce')
text_cols = ['Tm', 'Player', 'College']
cols = [col for col in draft_df_copy.columns if col not in text_cols]
for col in cols:
    # pd.to_numeric works on a whole Series at once, so there is no need to
    # route it through Series.apply, which calls it once per cell
    draft_df_copy[col] = pd.to_numeric(draft_df_copy[col], errors='coerce')

In [6]:
# drop the rows that have no player name (the all-null rows)
draft_df_copy = draft_df_copy.dropna(subset=['Player'])

In [7]:
# replace NaNs with 0s
# NOTE(review): this fills text columns too, so a player with no college ends up
# with College == 0 (see the LeBron James row in the outputs further down) —
# confirm that is intended
draft_df_copy = draft_df_copy.fillna(0)

In [8]:
# give WS/48 a name without the slash
draft_df_copy = draft_df_copy.rename(columns={'WS/48': 'WS_per_48'})

In [9]:
# swap the % symbol in column names for the suffix _Perc
draft_df_copy.columns = [name.replace('%', '_Perc') for name in draft_df_copy.columns]

In [10]:
# rename the second set of MP/PTS/TRB/AST columns (the ones carrying a '.1' suffix
# in the column listing above) to MPG/PPG/RPG/APG.
# Renaming by name instead of by position 14-17 keeps this cell correct if columns
# are ever added, dropped or reordered upstream, and makes re-running it a no-op.
per_game_names = {'MP.1': 'MPG', 'PTS.1': 'PPG', 'TRB.1': 'RPG', 'AST.1': 'APG'}
draft_df_copy.rename(columns=per_game_names, inplace=True)

In [11]:
# change the data types of the columns from Yrs through AST to int
# Assigning the converted block back through .loc writes into the existing columns
# and, on recent pandas versions, can leave them as float; astype with a
# per-column mapping returns a frame whose columns really are int.
int_cols = draft_df_copy.loc[:, 'Yrs':'AST'].columns
draft_df_copy = draft_df_copy.astype({col: int for col in int_cols})

In [12]:
# keep only players with a non-zero games-played count
draft_df_copy = draft_df_copy.loc[draft_df_copy['G'].ne(0)]

In [13]:
# keep only the 2003 draft onwards, since we only have college statistics from 2003 (see notebook 2/6)
draft_df_copy = draft_df_copy.loc[draft_df_copy['Draft_Yr'].gt(2002)]

In [14]:
# show the row(s) whose PTS value equals the column maximum
top_pts = draft_df_copy['PTS'].max()
draft_df_copy.loc[draft_df_copy['PTS'].eq(top_pts)]

Unnamed: 0,Draft_Yr,Pk,Tm,Player,College,Yrs,G,MP,PTS,TRB,...,3P_Perc,FT_Perc,MPG,PPG,RPG,APG,WS,WS_per_48,BPM,VORP
5711,2003,1.0,CLE,LeBron James,0,14,1061,41272,28787,7706,...,0.342,0.74,38.9,27.1,7.3,7.0,205.4,0.239,9.1,115.9


In [15]:
# display the frame with a fresh 0..n-1 index; the result is not assigned,
# so draft_df_copy itself keeps its original index labels
draft_df_copy.reset_index(drop=True)

Unnamed: 0,Draft_Yr,Pk,Tm,Player,College,Yrs,G,MP,PTS,TRB,...,3P_Perc,FT_Perc,MPG,PPG,RPG,APG,WS,WS_per_48,BPM,VORP
0,2003,1.0,CLE,LeBron James,0,14,1061,41272,28787,7706,...,0.342,0.740,38.9,27.1,7.3,7.0,205.4,0.239,9.1,115.9
1,2003,2.0,DET,Darko Milicic,0,10,468,8638,2813,1971,...,0.000,0.574,18.5,6.0,4.2,0.9,7.1,0.040,-1.6,0.8
2,2003,3.0,DEN,Carmelo Anthony,Syracuse University,14,976,35334,24156,6431,...,0.346,0.813,36.2,24.8,6.6,3.1,97.0,0.132,1.3,29.4
3,2003,4.0,TOR,Chris Bosh,Georgia Institute of Technology,13,893,31936,17189,7592,...,0.335,0.799,35.8,19.2,8.5,2.0,106.0,0.159,1.4,27.5
4,2003,5.0,MIA,Dwyane Wade,Marquette University,14,915,32352,21317,4396,...,0.287,0.768,35.4,23.3,4.8,5.7,117.2,0.174,5.1,57.7
5,2003,6.0,LAC,Chris Kaman,Central Michigan University,13,735,19642,8208,5582,...,0.042,0.743,26.7,11.2,7.6,1.3,24.6,0.060,-2.0,0.0
6,2003,7.0,CHI,Kirk Hinrich,University of Kansas,13,879,27015,9594,2576,...,0.375,0.800,30.7,10.9,2.9,4.8,52.5,0.093,0.2,14.7
7,2003,8.0,MIL,T.J. Ford,University of Texas at Austin,8,429,11882,4797,1331,...,0.289,0.815,27.7,11.2,3.1,5.8,16.9,0.068,-0.8,3.6
8,2003,9.0,NYK,Mike Sweetney,Georgetown University,4,233,3610,1516,1045,...,0.000,0.689,15.5,6.5,4.5,0.6,7.8,0.103,-2.1,-0.1
9,2003,10.0,WAS,Jarvis Hayes,University of Georgia,7,427,9898,3553,1330,...,0.356,0.798,23.2,8.3,3.1,1.1,10.2,0.049,-2.9,-2.2


In [21]:
# list the distinct team abbreviations present in the filtered data
draft_df_copy['Tm'].unique()

array(['CLE', 'DET', 'DEN', 'TOR', 'MIA', 'LAC', 'CHI', 'MIL', 'NYK',
       'WAS', 'GSW', 'SEA', 'MEM', 'ORL', 'BOS', 'PHO', 'NOH', 'UTA',
       'ATL', 'NJN', 'POR', 'LAL', 'MIN', 'SAS', 'DAL', 'IND', 'CHA',
       'PHI', 'SAC', 'HOU', 'NOK', 'OKC', 'BRK', 'CHH', 'CHO', 'NOP'], dtype=object)

In [16]:
# look at players who share a name: there are only two, Marcus D. Williams
# (drafted in 2006) and Marcus E. Williams (drafted in 2007)
name_is_shared = draft_df_copy['Player'].duplicated(keep=False)
duplicates = draft_df_copy[name_is_shared]
duplicates.sort_values('Player')

Unnamed: 0,Draft_Yr,Pk,Tm,Player,College,Yrs,G,MP,PTS,TRB,...,3P_Perc,FT_Perc,MPG,PPG,RPG,APG,WS,WS_per_48,BPM,VORP
5915,2006,22.0,NJN,Marcus Williams,University of Connecticut,4,203,3095,1128,361,...,0.321,0.767,15.2,5.6,1.8,2.8,0.0,-0.001,-4.6,-2.0
5990,2007,33.0,SAS,Marcus Williams,University of Arizona,2,13,39,14,12,...,0.0,0.0,3.0,1.1,0.9,0.2,-0.1,-0.084,-7.5,-0.1


In [17]:
# rename these duplicate players
# NOTE(review): both assignments are commented out, so the two rows keep the
# identical name 'Marcus Williams' when the data is saved — confirm that is intended
#draft_df_copy.loc[5915,'Player'] = 'Marcus D. Williams'
#draft_df_copy.loc[5990,'Player'] = 'Marcus E. Williams'

In [18]:
# save the cleaned draft data to csv (the original row labels are written as the index)
draft_df_copy.to_csv('draft_data.csv')

# Focusing on first-year data

Here, I am scraping rookie statistics for each drafted player. First, I will test the code by scraping the data for LeBron James's rookie year.

In [43]:
# Trial run on a single player page (LeBron James).
url = 'https://www.basketball-reference.com/players/j/jamesle01.html'

response = requests.get(url)
page = response.text
soup = BeautifulSoup(page, "lxml")

# column headers: the first 30 <th> cells of the first table container,
# with a "Player" column placed in front
stats_table = soup.findAll(class_="overthrow table_container")[0]
column_headers = ["Player"] + [th.getText() for th in stats_table.findAll('th')][0:30]

# player name is the text of the first <h1> on the page
Player = soup.find_all('h1')[0].getText()
# values come from the children of the second <tr> on the page,
# with the player name placed in front
player_data = [Player] + [cell.getText() for cell in soup.findAll('tr')[1]]

dic = dict(zip(column_headers, player_data))
df = pd.DataFrame(dic, columns=column_headers, index=[0])

In [44]:
# display the one-row dataframe built from the trial scrape
df

Unnamed: 0,Player,Season,Age,Tm,Lg,Pos,G,GS,MP,FG,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,LeBron James,2003-04,19,CLE,NBA,SG,79,79,39.5,7.9,...,0.754,1.3,4.2,5.5,5.9,1.6,0.7,3.5,1.9,20.9


# Scraping Rookie Year Stats

This url_list is a pickle object that I saved in the next notebook (Part 3/6) and contains a list of url extensions for each drafted player since 2003. I will loop through this list and scrape the rookie year statistics and advanced statistics (which contains my dependent variable, BPM).

In [61]:
# read the pickled list of player url paths back from disk
# (pickle can execute code while loading, so only load files you created yourself)
with open("draft_url_list.pkl", 'rb') as picklefile:
    url_list = pickle.loads(picklefile.read())

In [89]:
# loop through url_list to scrape rookie year stats for all drafted players
url_template2 = "https://www.basketball-reference.com{player}"
rookie_df = pd.DataFrame()
rookie_failed = []  # (url path, error) for every player that could not be scraped

for player in url_list:
    print(player)
    try:  # some players have no usable page; skip them but keep a record
        url = url_template2.format(player=player)

        response = requests.get(url, timeout=30)
        # treat error / rate-limit responses as failures instead of parsing them
        response.raise_for_status()
        page = response.text

        soup = BeautifulSoup(page, 'html5lib')

        # first 30 headers of the first table container, plus a leading Player column
        column_headers = [th.getText() for th in soup.findAll(class_="overthrow table_container")[0].findAll('th')][0:30]
        column_headers.insert(0,"Player")
        Player = soup.find_all('h1')[0].getText()
        # children of the second <tr> on the page, with the player name in front
        player_data = [x.getText() for x in soup.findAll('tr')[1]]
        player_data.insert(0,Player)

        dic = dict(zip(column_headers,player_data))
        player_df = pd.DataFrame(dic,columns=column_headers, index=[0])

        # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
        # The concat stays inside the try so a page with unexpected columns only
        # loses that one player, exactly as before.
        if rookie_df.empty:
            rookie_df = player_df
        else:
            rookie_df = pd.concat([rookie_df, player_df], ignore_index=True)
    except Exception as err:
        # `except Exception` rather than a bare except, so interrupting the kernel
        # still stops this long-running loop; the failure is recorded, not hidden
        rookie_failed.append((player, repr(err)))

    # pause after every request, including failed ones, to stay polite to the site
    time.sleep(.5+2*random.random())

print(len(rookie_failed), 'players could not be scraped')

/players/j/jamesle01.html
/players/m/milicda01.html
/players/a/anthoca01.html
/players/b/boshch01.html
/players/w/wadedw01.html
/players/k/kamanch01.html
/players/h/hinriki01.html
/players/f/fordtj01.html
/players/s/sweetmi01.html
/players/h/hayesja01.html
/players/p/pietrmi01.html
/players/c/collini01.html
/players/b/banksma01.html
/players/r/ridnolu01.html
/players/g/gainere01.html
/players/b/belltr01.html
/players/c/cabarza01.html
/players/w/westda01.html
/players/p/pavloal01.html
/players/j/jonesda02.html
/players/d/diawbo01.html
/players/p/planizo01.html
/players/o/outlatr01.html
/players/c/cookbr01.html
/players/d/delfica01.html
/players/e/ebind01.html
/players/p/perkike01.html
/players/b/barbole01.html
/players/h/howarjo01.html
/players/l/lampema01.html
/players/k/kaponja01.html
/players/w/waltolu01.html
/players/b/beaslje01.html
/players/s/schorso01.html
/players/s/szewcsz01.html
/players/a/austima01.html
/players/h/hansetr01.html
/players/b/blakest01.html
/players/v/vranesl01.

/players/t/tomican01.html
/players/d/dragigo01.html
/players/w/walkebi01.html
/players/h/hairsma01.html
/players/h/hardide01.html
/players/j/jacksda01.html
/players/d/dragita01.html
/players/l/leunema01.html
/players/t/taylomi01.html
/players/k/kaunsa01.html
/players/c/crawfjo01.html
/players/e/erdense01.html
/players/g/griffbl01.html
/players/t/thabeha01.html
/players/h/hardeja01.html
/players/e/evansty01.html
/players/r/rubiori01.html
/players/f/flynnjo01.html
/players/c/curryst01.html
/players/h/hilljo01.html
/players/d/derozde01.html
/players/j/jennibr01.html
/players/w/willite01.html
/players/h/hendege02.html
/players/h/hansbty01.html
/players/c/clarkea01.html
/players/d/dayeau01.html
/players/j/johnsja01.html
/players/h/holidjr01.html
/players/l/lawsoty01.html
/players/t/teaguje01.html
/players/m/maynoer01.html
/players/c/collida01.html
/players/c/clavevi01.html
/players/c/casspom01.html
/players/m/mulleby01.html
/players/b/beaubro01.html
/players/g/gibsota01.html
/players/c/carr

/players/e/ennisty01.html
/players/h/harriga01.html
/players/c/cabocbr01.html
/players/m/mcgarmi01.html
/players/a/adamsjo01.html
/players/h/hoodro01.html
/players/n/napiesh01.html
/players/c/capelca01.html
/players/h/hairspj02.html
/players/b/bogdabo01.html
/players/w/wilcocj01.html
/players/h/huestjo01.html
/players/a/anderky01.html
/players/i/inglida01.html
/players/m/mcdankj01.html
/players/h/harrijo01.html
/players/e/earlycl01.html
/players/s/stokeja01.html
/players/o/obryajo01.html
/players/d/daniede01.html
/players/d/dinwisp01.html
/players/g/grantje01.html
/players/r/robingl02.html
/players/j/jokicni01.html
/players/j/johnsni01.html
/players/t/tavarwa01.html
/players/b/brownma02.html
/players/p/poweldw01.html
/players/c/clarkjo01.html
/players/s/smithru01.html
/players/p/pattela01.html
/players/b/bairsca01.html
/players/b/brownal01.html
/players/a/antetth01.html
/players/m/micicva01.html
/players/g/gential01.html
/players/d/dangune01.html
/players/c/chrisse01.html
/players/m/ma

In [110]:
# display the scraped rookie-year stats
rookie_df

Unnamed: 0,Player,Season,Age,Tm,Lg,Pos,G,GS,MP,FG,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,LeBron James,2003-04,19,CLE,NBA,SG,79,79,39.5,7.9,...,.754,1.3,4.2,5.5,5.9,1.6,0.7,3.5,1.9,20.9
1,Darko Milicic,2003-04,18,DET,NBA,C,34,0,4.7,0.5,...,.583,0.3,0.9,1.3,0.2,0.2,0.4,0.4,1.0,1.4
2,Carmelo Anthony,2003-04,19,DEN,NBA,SF,82,82,36.5,7.6,...,.777,2.2,3.8,6.1,2.8,1.2,0.5,3.0,2.7,21.0
3,Chris Bosh,2003-04,19,TOR,NBA,C,75,63,33.5,4.4,...,.701,2.5,4.9,7.4,1.0,0.8,1.4,1.4,2.9,11.5
4,Dwyane Wade,2003-04,22,MIA,NBA,PG,61,56,34.9,6.1,...,.747,1.4,2.7,4.0,4.5,1.4,0.6,3.2,2.3,16.2
5,Chris Kaman,2003-04,21,LAC,NBA,C,82,61,22.5,2.4,...,.697,1.5,4.1,5.6,1.0,0.3,0.9,1.9,2.6,6.1
6,Kirk Hinrich,2003-04,23,CHI,NBA,PG,76,66,35.6,4.2,...,.804,0.6,2.9,3.4,6.8,1.3,0.3,2.7,3.6,12.0
7,T.J. Ford,2003-04,20,MIL,NBA,PG,55,55,26.8,2.8,...,.816,0.7,2.5,3.2,6.5,1.1,0.1,2.5,2.2,7.1
8,Mike Sweetney,2003-04,21,NYK,NBA,PF,42,1,11.8,1.6,...,.724,1.6,2.1,3.7,0.3,0.4,0.3,0.8,1.4,4.3
9,Jarvis Hayes,2003-04,22,WAS,NBA,SF,70,42,29.2,4.0,...,.786,1.0,2.8,3.8,1.5,1.0,0.2,1.6,2.2,9.6


In [111]:
# save the raw rookie-year stats to csv
rookie_df.to_csv('rookie_raw.csv')

# Scraping Rookie Year Advanced Stats

Here I am scraping the advanced statistics for a rookie. The data is contained in another table on each drafted player's page. I had issues reading the HTML with BeautifulSoup because, as it turns out, that table is commented out in the page source, so I need to strip the comment tags from the text for BeautifulSoup to read the HTML properly.

In [330]:
# Trial run: advanced stats for a single player page (LeBron James).
url = 'https://www.basketball-reference.com/players/j/jamesle01.html'

response = requests.get(url)
# strip the HTML comment markers so the commented-out tables get parsed
page = response.text.replace('<!--', '').replace('-->', '')
soup = BeautifulSoup(page, "lxml")

# the table used here is the fifth table container on the page
advanced_table = soup.findAll(class_="overthrow table_container")[4]

# column headers: its first 29 <th> cells, with a "Player" column placed in front
column_headers = ["Player"] + [th.getText() for th in advanced_table.findAll('th')][0:29]

# player name is the text of the first <h1> on the page
Player = soup.find_all('h1')[0].getText()
# values come from the children of the table's second <tr>, with the name in front
player_data = [Player] + [cell.getText() for cell in advanced_table.findAll('tr')[1]]

dic = dict(zip(column_headers, player_data))
df = pd.DataFrame(dic, columns=column_headers, index=[0])

In [363]:
# the second-to-last column (BPM for the player's rookie year) is the dependent
# variable I want to predict with the model
df

Unnamed: 0,Player,Season,Age,Tm,Lg,Pos,G,MP,PER,TS%,...,Unnamed: 12,OWS,DWS,WS,WS/48,Unnamed: 17,OBPM,DBPM,BPM,VORP
0,LeBron James,2003-04,19,CLE,NBA,SG,79,3122,18.3,0.488,...,,2.4,2.6,5.1,0.078,,2.2,-0.2,1.9,3.1


In [366]:
# loop through url_list to get rookie advanced stats for all drafted players
url_template2 = "https://www.basketball-reference.com{player}"
rookie_advanced_df = pd.DataFrame()
rookie_advanced_failed = []  # (url path, error) for every player that could not be scraped

for player in url_list:
    print(player)
    try:  # some players have no usable page; skip them but keep a record
        url = url_template2.format(player=player)

        response = requests.get(url, timeout=30)
        # treat error / rate-limit responses as failures instead of parsing them
        response.raise_for_status()
        # strip the HTML comment markers so the commented-out tables get parsed
        page = response.text.replace('<!--', '').replace('-->', '')

        soup = BeautifulSoup(page, 'html5lib')

        # first 29 headers of the fifth table container, plus a leading Player column
        column_headers = [th.getText() for th in soup.findAll(class_="overthrow table_container")[4].findAll('th')][0:29]
        column_headers.insert(0,"Player")
        Player = soup.find_all('h1')[0].getText()
        # children of that table's second <tr>, with the player name in front
        player_data = [tr.getText() for tr in soup.findAll(class_="overthrow table_container")[4].findAll('tr')[1]]
        player_data.insert(0,Player)

        dic = dict(zip(column_headers,player_data))
        player_df = pd.DataFrame(dic,columns=column_headers, index=[0])

        # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
        # The concat stays inside the try so a page with unexpected columns only
        # loses that one player, exactly as before.
        if rookie_advanced_df.empty:
            rookie_advanced_df = player_df
        else:
            rookie_advanced_df = pd.concat([rookie_advanced_df, player_df], ignore_index=True)
    except Exception as err:
        # `except Exception` rather than a bare except, so interrupting the kernel
        # still stops this long-running loop; the failure is recorded, not hidden
        rookie_advanced_failed.append((player, repr(err)))

    # pause after every request, including failed ones, to stay polite to the site
    time.sleep(.5+2*random.random())

print(len(rookie_advanced_failed), 'players could not be scraped')

/players/j/jamesle01.html
/players/m/milicda01.html
/players/a/anthoca01.html
/players/b/boshch01.html
/players/w/wadedw01.html
/players/k/kamanch01.html
/players/h/hinriki01.html
/players/f/fordtj01.html
/players/s/sweetmi01.html
/players/h/hayesja01.html
/players/p/pietrmi01.html
/players/c/collini01.html
/players/b/banksma01.html
/players/r/ridnolu01.html
/players/g/gainere01.html
/players/b/belltr01.html
/players/c/cabarza01.html
/players/w/westda01.html
/players/p/pavloal01.html
/players/j/jonesda02.html
/players/d/diawbo01.html
/players/p/planizo01.html
/players/o/outlatr01.html
/players/c/cookbr01.html
/players/d/delfica01.html
/players/e/ebind01.html
/players/p/perkike01.html
/players/b/barbole01.html
/players/h/howarjo01.html
/players/l/lampema01.html
/players/k/kaponja01.html
/players/w/waltolu01.html
/players/b/beaslje01.html
/players/s/schorso01.html
/players/s/szewcsz01.html
/players/a/austima01.html
/players/h/hansetr01.html
/players/b/blakest01.html
/players/v/vranesl01.

/players/t/tomican01.html
/players/d/dragigo01.html
/players/w/walkebi01.html
/players/h/hairsma01.html
/players/h/hardide01.html
/players/j/jacksda01.html
/players/d/dragita01.html
/players/l/leunema01.html
/players/t/taylomi01.html
/players/k/kaunsa01.html
/players/c/crawfjo01.html
/players/e/erdense01.html
/players/g/griffbl01.html
/players/t/thabeha01.html
/players/h/hardeja01.html
/players/e/evansty01.html
/players/r/rubiori01.html
/players/f/flynnjo01.html
/players/c/curryst01.html
/players/h/hilljo01.html
/players/d/derozde01.html
/players/j/jennibr01.html
/players/w/willite01.html
/players/h/hendege02.html
/players/h/hansbty01.html
/players/c/clarkea01.html
/players/d/dayeau01.html
/players/j/johnsja01.html
/players/h/holidjr01.html
/players/l/lawsoty01.html
/players/t/teaguje01.html
/players/m/maynoer01.html
/players/c/collida01.html
/players/c/clavevi01.html
/players/c/casspom01.html
/players/m/mulleby01.html
/players/b/beaubro01.html
/players/g/gibsota01.html
/players/c/carr

/players/e/ennisty01.html
/players/h/harriga01.html
/players/c/cabocbr01.html
/players/m/mcgarmi01.html
/players/a/adamsjo01.html
/players/h/hoodro01.html
/players/n/napiesh01.html
/players/c/capelca01.html
/players/h/hairspj02.html
/players/b/bogdabo01.html
/players/w/wilcocj01.html
/players/h/huestjo01.html
/players/a/anderky01.html
/players/i/inglida01.html
/players/m/mcdankj01.html
/players/h/harrijo01.html
/players/e/earlycl01.html
/players/s/stokeja01.html
/players/o/obryajo01.html
/players/d/daniede01.html
/players/d/dinwisp01.html
/players/g/grantje01.html
/players/r/robingl02.html
/players/j/jokicni01.html
/players/j/johnsni01.html
/players/t/tavarwa01.html
/players/b/brownma02.html
/players/p/poweldw01.html
/players/c/clarkjo01.html
/players/s/smithru01.html
/players/p/pattela01.html
/players/b/bairsca01.html
/players/b/brownal01.html
/players/a/antetth01.html
/players/m/micicva01.html
/players/g/gential01.html
/players/d/dangune01.html
/players/c/chrisse01.html
/players/m/ma

# Cleaning the data

I cleaned the rookie year data by dropping empty columns and dropping null values.

In [384]:
# list the column names; the two ' ' entries are the blank columns handled below
rookie_advanced_df.columns

Index(['Player', 'Season', 'Age', 'Tm', 'Lg', 'Pos', 'G', 'MP', 'PER', 'TS%',
       '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%',
       'USG%', ' ', 'OWS', 'DWS', 'WS', 'WS/48', ' ', 'OBPM', 'DBPM', 'BPM',
       'VORP'],
      dtype='object')

In [392]:
# label the two blank columns (positions 20 and 25) 'null' so they can be dropped
# A pandas Index is immutable: writing into columns.values changes the underlying
# array behind its back and can leave label lookups out of sync. Build the new
# list of names and assign it as a whole instead.
renamed_columns = rookie_advanced_df.columns.tolist()
for blank_position in (20, 25):
    renamed_columns[blank_position] = 'null'
rookie_advanced_df.columns = renamed_columns

In [396]:
# remove every column labelled 'null'
rookie_advanced_df = rookie_advanced_df.drop('null', axis='columns')

In [397]:
# check the remaining column names
rookie_advanced_df.columns

Index(['Player', 'Season', 'Age', 'Tm', 'Lg', 'Pos', 'G', 'MP', 'PER', 'TS%',
       '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%',
       'USG%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP'],
      dtype='object')

In [398]:
# display the rookie advanced stats
rookie_advanced_df

Unnamed: 0,Player,Season,Age,Tm,Lg,Pos,G,MP,PER,TS%,...,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP
0,LeBron James,2003-04,19,CLE,NBA,SG,79,3122,18.3,.488,...,13.9,28.2,2.4,2.6,5.1,.078,2.2,-0.2,1.9,3.1
1,Darko Milicic,2003-04,18,DET,NBA,C,34,159,6.1,.318,...,14.7,26.0,-0.6,0.5,-0.2,-0.049,-10.2,3.4,-6.8,-0.2
2,Carmelo Anthony,2003-04,19,DEN,NBA,SF,82,2995,17.6,.509,...,12.7,28.5,3.7,2.4,6.1,.098,1.2,-1.2,0.1,1.6
3,Chris Bosh,2003-04,19,TOR,NBA,C,75,2510,15.1,.513,...,11.3,18.3,2.5,3.8,6.2,.119,-2.0,1.7,-0.3,1.1
4,Dwyane Wade,2003-04,22,MIA,NBA,PG,61,2126,17.6,.530,...,17.3,25.0,2.2,2.3,4.6,.103,0.9,0.4,1.3,1.8
5,Chris Kaman,2003-04,21,LAC,NBA,C,82,1843,9.6,.502,...,23.8,15.7,-0.9,0.9,0.0,.001,-4.0,0.1,-3.8,-0.9
6,Kirk Hinrich,2003-04,23,CHI,NBA,PG,76,2706,13.1,.510,...,18.5,18.2,2.1,1.9,4.1,.072,0.8,-1.3,-0.5,1.0
7,T.J. Ford,2003-04,20,MIL,NBA,PG,55,1472,12.1,.443,...,24.0,17.9,-0.2,0.9,0.7,.022,-1.6,-0.9,-2.6,-0.2
8,Mike Sweetney,2003-04,21,NYK,NBA,PF,42,494,17.2,.544,...,16.2,18.6,0.8,0.7,1.4,.138,-0.8,-0.2,-1.0,0.1
9,Jarvis Hayes,2003-04,22,WAS,NBA,SF,70,2044,9.4,.456,...,13.0,18.4,-0.6,1.1,0.5,.012,-2.6,-1.1,-3.7,-0.9


In [399]:
# save the rookie advanced stats to csv
rookie_advanced_df.to_csv('rookie_advanced_raw.csv')