In [1]:
# imports and setup 
from bs4 import BeautifulSoup
# you can use either of these libraries to get html from a website
import requests
import urllib.request
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import re

import pandas as pd
import scipy as sc
import numpy as np
import time
import statsmodels.formula.api as sm

import matplotlib.pyplot as plt 
plt.style.use('ggplot')
%matplotlib inline 
plt.rcParams['figure.figsize'] = (10, 6) 
from string import ascii_lowercase

Below is a sample player page of the kind we pull our data from. The player's accomplishments/accolades come from the top right of the page, and the draft position comes from the top left. We then grab the first row of three different tables (Per Game, Per 100 Possessions, and Advanced), which gives us the player's rookie stats, and we grab the career row of each of these tables as well.

![LBJ](LBJ%20Page.png)

In [2]:
# Download the player-index page of every letter and keep both the individual
# pages (in `html`, keyed by a running counter) and their concatenation
# (in `all_letters`).
url = "https://www.basketball-reference.com/players"
all_letters = ""
html = {}
i = 0
for x in ascii_lowercase:
    if x == 'x':
        # The letter 'x' is deliberately skipped
        # (presumably it had no index page when this was written -- TODO confirm).
        continue
    link = '{}/{}/'.format(url, x)
    with urllib.request.urlopen(link) as response:
        page = response.read().decode('utf-8')
    html[i] = page
    all_letters += page
    i += 1

In [3]:
# Cache the combined index HTML on disk.
# The encoding is given explicitly because the file is read back with
# encoding='utf-8'; relying on the platform default could write a different
# encoding (or fail on characters the default codec cannot represent).
with open('all_letters.html', 'w', encoding='utf-8') as new_file:
    new_file.write(all_letters)

In [4]:
# Parse the cached index HTML. The file is opened in a `with` block so the
# handle is closed as soon as BeautifulSoup has consumed it.
with open('all_letters.html', encoding='utf-8') as letters_file:
    letters_soup = BeautifulSoup(letters_file, "html.parser")

In [5]:
# Build the absolute URL of every player's page: take each element whose class
# attribute is exactly "left " and whose data-stat is "player", and append the
# href of its first <a> to the site root.
url = "https://www.basketball-reference.com"
links = [
    url + str(cell.find("a").get("href"))
    for cell in letters_soup.find_all(class_="left ")
    if cell.get("data-stat") == "player"
]


In [None]:
# Scratch lists used while assembling one rookie / career row at a time.
rookie = []
career = []

# Labels that the career layouts replace with 'blank'.
_SEASON_ONLY = ('Age', 'Team', 'Position')


def _as_career_columns(season_columns):
    """Return a new list equal to ``season_columns`` with the season-only labels set to 'blank'."""
    return ['blank' if label in _SEASON_ONLY else label for label in season_columns]


# --- Per-game table -------------------------------------------------------
columns_pergame_r = [
    'Name', 'link', 'Year', 'Age', 'Team', 'League', 'Position',
    'Games Played', 'Games Started', 'MinPerGame',
    'FGPerGame', 'FGAPerGame', 'FG%PerGame',
    '3pPerGame', '3PAPerGame', '3p%PerGame',
    '2pPerGame', '2pAPerGame', '2p%PerGame', 'eFG%PerGame',
    'FTPerGame', 'FTAPerGame', 'FT%PerGame',
    'ORGPerGame', 'DRBPerGame', 'TotalRebPerGame',
    'AstPerGame', 'StealPerGame', 'BlkPerGame', 'TOVPerGame',
    'FoulsPerGame', 'PtsPerGame',
]
columns_pergame_c = _as_career_columns(columns_pergame_r)

# --- Per-100-possessions table ---------------------------------------------
columns_per100_r = [
    'Name', 'link', 'Year', 'Age', 'Team', 'League', 'Position',
    'Games Played', 'Games Started', 'TotalMin',
    'FGPer100Poss', 'FGAPer100Poss', 'FG%Per100Poss',
    '3pPer100Poss', '3PAPer100Poss', '3p%Per100Poss',
    '2pPer100Poss', '2pAPer100Poss', '2p%Per100Poss',
    'FTPer100Poss', 'FTAPer100Poss', 'FT%Per100Poss',
    'ORGPer100Poss', 'DRBPer100Poss', 'TotalRebPer100Poss',
    'AstPer100Poss', 'StealPer100Poss', 'BlkPer100Poss', 'TOVPer100Poss',
    'FoulsPer100Poss', 'PtsPer100Poss',
    'blank', 'ORtg', 'DRtg',
]
columns_per100_c = _as_career_columns(columns_per100_r)

# --- Advanced table ---------------------------------------------------------
columns_advanced_r = [
    'Name', 'link', 'Year', 'Age', 'Team', 'League', 'Position',
    'Games Played', 'TotalMin', 'PER', 'TS%', '3PAr', 'FTr',
    'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%',
    'blank', 'OWS', 'DWS', 'WS', 'WS/48',
    'blank', 'OBPM', 'DBPM', 'BPM', 'VORP',
]
columns_advanced_c = _as_career_columns(columns_advanced_r)

# Empty frames, one per (rookie | career) x (per game | per 100 | advanced).
df_pergame = pd.DataFrame(columns=columns_pergame_r)
df_per100 = pd.DataFrame(columns=columns_per100_r)
df_advanced = pd.DataFrame(columns=columns_advanced_r)
career_pergame = pd.DataFrame(columns=columns_pergame_c)
career_per100 = pd.DataFrame(columns=columns_per100_c)
career_advanced = pd.DataFrame(columns=columns_advanced_c)

def get_num(x):
    """Return the integer formed by concatenating every digit character of ``x``.

    Non-digit characters are ignored, so ``"4x MVP"`` gives ``4`` and
    ``"2016-17 All-NBA"`` gives ``201617``.

    Raises:
        ValueError: if ``x`` contains no digit at all.
    """
    digits = [ch for ch in x if ch.isdigit()]
    return int(''.join(digits))

# Award tallies: one row per player, initialised from `zeros`.
awards = []
columns_a = [
    'Name', 'link',
    'MVPs', 'All-Star Games', 'All-NBA Teams', 'All-Rookie Team', 'ROY',
]
career_awards = pd.DataFrame(columns=columns_a)
zeros = [0] * len(columns_a)

# Draft position: one row per player, initialised from `zeros1`.
columns_dp = ['Name', 'link', 'Draft_Pick']
draft_pick = pd.DataFrame(columns=columns_dp)
zeros1 = [0] * len(columns_dp)


In [1]:
# Get everything: visit every player page and fill the rookie, career, awards
# and draft-pick DataFrames created in the setup cell above.

# Position in `links` at which scraping starts. Use 0 for a complete run on a
# fresh kernel. Raise it only to resume an interrupted run in the SAME kernel,
# where the rows scraped so far are still held in the DataFrames.
START_INDEX = 0

# (element id of the table on the player page, rookie frame, career frame)
STAT_TABLES = [
    ("per_game", df_pergame, career_pergame),
    ("per_poss", df_per100, career_per100),
    ("advanced", df_advanced, career_advanced),
]


def _row_cells(row):
    """Return the text of every direct child of a table row, in document order."""
    return [cell.get_text() for cell in row]


def _rookie_and_career_rows(table, name, slug):
    """Build the rookie row and the career row for one stats table.

    Both rows start with ``[name, slug]``. The rookie row continues with the
    cells of the table's second <tr> (index 0 is presumably the header row --
    TODO confirm); it is ``None`` when the table has fewer than two rows. The
    career row continues with the cells of the first <tr> of every <tfoot>.
    """
    table_rows = table.find_all("tr")
    rookie_row = None
    if len(table_rows) > 1:
        rookie_row = [name, slug] + _row_cells(table_rows[1])

    career_row = [name, slug]
    for foot in table.find_all("tfoot"):
        foot_rows = foot.find_all("tr")
        if foot_rows:
            career_row.extend(_row_cells(foot_rows[0]))
    return rookie_row, career_row


def _award_count(text):
    """Return how many times an award was won according to its label.

    Multiple wins are written as "<n>x <award>"; a single win is written with
    the year or season instead, so any number above 30 is a year, not a
    count, and stands for exactly one win.
    """
    count = get_num(text)
    return 1 if count > 30 else count


chrome_options = Options()
chrome_options.add_argument("--headless")
# `options=` replaces the deprecated `chrome_options=` keyword.
driver = webdriver.Chrome(options=chrome_options)
start_time = time.time()
try:
    for i in range(START_INDEX, len(links)):
        url = links[i]
        # Player id taken from the URL: drop the 47-character
        # "https://www.basketball-reference.com/players/<letter>/" prefix and
        # the ".html" suffix.
        slug = url[47:-5]
        driver.get(url)
        players_soup = BeautifulSoup(driver.page_source, 'html.parser')

        heading = players_soup.find("h1")
        if heading is None:
            # No heading means this is not a regular player page; report it
            # instead of failing the whole run.
            print("No <h1> found, skipping {}".format(url))
            continue
        name = heading.get_text(strip=True)

        # Rookie and career rows of the three stats tables. A row is stored
        # only when it has exactly as many cells as the target frame has
        # columns, which filters out pages with a different table layout.
        for table_id, rookie_df, career_df in STAT_TABLES:
            for table in players_soup.find_all(id=table_id):
                rookie_row, career_row = _rookie_and_career_rows(table, name, slug)
                if rookie_row is not None and len(rookie_row) == len(rookie_df.columns):
                    rookie_df.loc[i] = rookie_row
                if len(career_row) == len(career_df.columns):
                    career_df.loc[i] = career_row

        # Awards: start from an all-zero row and fill in what the page lists.
        # The row is assembled first and written with one .loc assignment;
        # chained `frame[col].loc[i] = value` writes are not reliable.
        for info in players_soup.find_all(id="info"):
            award_row = dict(zip(columns_a, zeros))
            award_row['Name'] = name
            award_row['link'] = slug
            if info.find_all(id="bling"):
                for item in info.find_all("li"):
                    result = item.get_text()
                    if "x MVP" in result:
                        award_row['MVPs'] = get_num(result)
                    if re.search(r'\d\sMVP', result):
                        # "<season> MVP": a single MVP award
                        award_row['MVPs'] = 1
                    if "All Star" in result:
                        # A lone selection is labelled with the year, so it
                        # needs the same year guard as All-NBA below.
                        award_row['All-Star Games'] = _award_count(result)
                    if "All-NBA" in result:
                        award_row['All-NBA Teams'] = _award_count(result)
                    if "All-Rookie" in result:
                        award_row['All-Rookie Team'] = 1
                    if "ROY" in result:
                        award_row['ROY'] = 1
            career_awards.loc[i] = [award_row[column] for column in columns_a]

        # Draft position: the number in front of "overall", NaN when absent.
        for meta in players_soup.find_all(id="meta"):
            overall = re.search(r'\d{1,3}\w\w\soverall', meta.get_text())
            if overall:
                pick = re.search(r'\d{1,3}', overall.group(0)).group(0)
            else:
                pick = np.nan
            draft_pick.loc[i] = [name, slug, pick]
finally:
    # Always shut the browser down, even when a page raises.
    driver.quit()

In [41]:
# Save the rookie per-game rows collected by the scraping loop.
df_pergame.to_csv("per_game.csv")

In [42]:
# Save the rookie advanced-stats rows collected by the scraping loop.
df_advanced.to_csv('rookie_advanced.csv')

In [43]:
# Save the rookie per-100-possessions rows collected by the scraping loop.
df_per100.to_csv('per_100.csv')

In [44]:
# Save the career per-game rows collected by the scraping loop.
career_pergame.to_csv('career_pergame.csv')

In [45]:
# Save the career per-100-possessions rows collected by the scraping loop.
career_per100.to_csv('career_per100.csv')

In [46]:
# Save the career advanced-stats rows collected by the scraping loop.
career_advanced.to_csv('career_advanced.csv')

In [47]:
# Save the per-player award tallies collected by the scraping loop.
career_awards.to_csv('career_awards.csv')

In [48]:
# Save the per-player draft positions collected by the scraping loop.
draft_pick.to_csv('draft_pick.csv')

In [21]:
#Read in all the csv's
def _read_indexed(path):
    """Load one of the CSV files saved above, using its first column as the index."""
    return pd.read_csv(path, index_col=0)


# Career tables
career_pergame = _read_indexed('career_pergame.csv')
career_advanced = _read_indexed('career_advanced.csv')
career_per100 = _read_indexed('career_per100.csv')

# Rookie tables
rookie_pergame = _read_indexed('per_game.csv')
rookie_advanced = _read_indexed('rookie_advanced.csv')
rookie_per100 = _read_indexed('per_100.csv')

# Awards and draft position
awards = _read_indexed('career_awards.csv')
draft_pick = _read_indexed('draft_pick.csv')

# Split the awards into the ones earned as a rookie and the career ones.
identity_cols = ['Name', 'link']
rookie_awards = awards[identity_cols + ['ROY', 'All-Rookie Team']]
career_awards = awards[identity_cols + ['MVPs', 'All-Star Games', 'All-NBA Teams']]

In [22]:
# Join the three rookie stat tables, then the rookie awards and the draft
# position, into one row per player.
player_keys = ['Name', 'link']
season_keys = player_keys + ['Year', 'Age', 'Position', 'Games Played']

rookie3 = pd.merge(rookie_pergame, rookie_per100, on=season_keys + ['Games Started'])
rookie2 = pd.merge(rookie3, rookie_advanced, on=season_keys)
rookie1 = pd.merge(rookie2, rookie_awards, on=player_keys)
rookie = pd.merge(rookie1, draft_pick, on=player_keys)

In [23]:
# Players whose rookie season is one of the four listed seasons; only their
# (Name, link) identity is kept.
recent_seasons = ['2017-18', '2016-17', '2015-16', '2014-15']
is_recent_rookie = rookie['Year'].isin(recent_seasons)
young_players = rookie.loc[is_recent_rookie, ['Name', 'link']]

In [24]:
# Keep only the columns whose name does not start with "blank" (case-insensitive).
is_blank = rookie.columns.str.lower().str.startswith('blank')
cols = list(rookie.columns[~is_blank])
rookie = rookie[cols]

In [25]:
# Remove the team / league / total-minutes / games-started columns, including
# the suffixed duplicates produced by the merges.
redundant_columns = [
    'Team_x', 'League_x',
    'Team_y', 'League_y',
    'TotalMin_x', 'TotalMin_y',
    'Games Started', 'Team', 'League',
]
rookie = rookie.drop(columns=redundant_columns)

In [26]:
# Anti-join: remove every rookie whose (Name, link) appears in young_players.
# Only rows tagged 'left_only' (present in rookie, absent from young_players)
# are kept. Filtering on `_merge != 'both'` would also keep 'right_only' rows,
# so running this cell a second time would re-insert the excluded players as
# rows of NaN.
rookie = (
    rookie
    .merge(young_players, on=['Name','link'], how='outer', indicator=True)
    .query("_merge == 'left_only'")
    .drop('_merge', axis=1)
    .reset_index(drop=True)
)

In [27]:
# (Name, link) of every remaining rookie; the career tables are inner-joined
# against this frame so they cover the same players.
persons_of_interest = rookie[['Name','link']]

In [18]:
# Restrict the career per-game table to the players of interest, then attach
# the per-100, advanced and award columns.
player_keys = ['Name', 'link']
career_keys = player_keys + ['Games Played']

career3 = pd.merge(career_pergame, persons_of_interest, on=player_keys)
career2 = pd.merge(career3, career_per100, on=career_keys)
career1 = pd.merge(career2, career_advanced, on=career_keys)
career = pd.merge(career1, career_awards, on=player_keys)

In [19]:
# Keep only the columns whose name does not start with "blank"
# (case-insensitive), then remove the suffixed duplicates left by the merges.
is_blank = career.columns.str.lower().str.startswith('blank')
cols = list(career.columns[~is_blank])
career = career[cols]

duplicate_columns = [
    'Year_x', 'League_x', 'Games Started_x',
    'Year_y', 'League_y', 'Games Started_y',
    'TotalMin_x', 'TotalMin_y',
]
career = career.drop(columns=duplicate_columns)

In [20]:
# Save the final merged career and rookie tables.
career.to_csv('career_v2.csv')
rookie.to_csv('rookie_v2.csv')