In [1]:
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import time

import requests
from bs4 import BeautifulSoup
from bs4 import Comment
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.action_chains import ActionChains
import pandas as pd
import pickle

In [26]:
# Notebook display settings: show up to 500 columns/rows of a frame and
# render floats with 4 digits after the decimal point.
pd.options.display.max_columns = 500
pd.options.display.max_rows = 500
pd.options.display.precision = 4

In [170]:
# Column names for the per-season statistics rows, in the order the scraped
# values are zipped against them further down in this cell.
headers = (
    # season / team / position
    ["Season", "Team", "Pos"]
    # games and minutes
    + ["G", "GS", "MP"]
    # field goals (overall, three-point, two-point, effective)
    + ["FG", "FGA", "FG%"]
    + ["3P", "3PA", "3P%"]
    + ["2P", "2PA", "2P%"]
    + ["eFG%"]
    # free throws
    + ["FT", "FTA", "FT%"]
    # rebounds
    + ["ORB", "DRB", "TRB"]
    # remaining box-score columns
    + ["AST", "STL", "BLK", "TOV", "PF", "PTS"]
)

# Result containers: per-season stats start with the fixed column set above,
# the advanced-stats and salary frames start completely empty.
df = pd.DataFrame(columns=headers)
advanced_df, salary_df = pd.DataFrame(), pd.DataFrame()

# Scrape per-season statistics, advanced statistics and salaries for every
# active (bold) player on basketball-reference.com. After each letter of the
# alphabet the data gathered so far is merged and checkpointed to
# "final_<page>.p".

# XPath of the links of active players (bold entries) on a letter page.
PLAYER_LINK_XPATH = '//*[@id="players"]/tbody/tr/th/strong/a'

# (output column, index of the <td> cell in a row of the "advanced" table).
ADVANCED_COLUMNS = [
    ("PER", 6),
    ("TS%", 7),
    ("TRB%", 12),
    ("AST%", 13),
    ("STL%", 14),
    ("BLK%", 15),
    ("TOV%", 16),
    ("USG%", 17),
    ("OWS", 19),
    ("DWS", 20),
    ("WS/48", 22),
    ("BPM", 26),
    ("VORP", 27),
]
ADVANCED_HEADERS = ["Season", "Player"] + [name for name, _ in ADVANCED_COLUMNS]
SALARY_HEADERS = ["Player", "Season", "Salary"]


def _blank_to_zero(values):
    """Return ``values`` with every empty string replaced by ``'0.0'``."""
    return ["0.0" if value == "" else value for value in values]


def _parse_per_game_rows(stats_rows, player):
    """Turn the rows of a player's first stats table into record dicts.

    Parameters
    ----------
    stats_rows : list of bs4 ``<tr>`` tags
    player : str
        Player name stored under the "Player" key of every record.

    Returns
    -------
    list of dict
        One dict per kept row, keyed by ``headers`` plus "Player" and
        "All_star" ("1" when the row contains a ``<span>``, else "0").
    """
    records = []
    for row in stats_rows:
        cells = row.find_all("td")
        # Some players have "Did Not Play" rows (played overseas); skip them,
        # together with rows too short to hold the columns read below.
        if len(cells) < 3 or "Did Not Play" in cells[2].text:
            continue

        # Keep cell 1 and everything from cell 3 on (cells 0 and 2 are dropped).
        values = _blank_to_zero([cell.text for cell in cells[1:2] + cells[3:]])
        # The first two kept values (zipped to "Team" and "Pos") stay strings;
        # the rest are converted to floats.
        values = values[:2] + [float(value) for value in values[2:]]

        record = dict(zip(headers, [row.text.strip()[:7]] + values))
        record["Player"] = player
        # A <span> inside the row is what marks an all-star season.
        record["All_star"] = "1" if row.find("span") else "0"
        records.append(record)
    return records


def _parse_advanced_rows(table, player):
    """Extract the ``ADVANCED_COLUMNS`` values from each row of ``table``.

    Rows that do not have enough ``<td>`` cells (or no ``<th>`` season cell)
    are skipped instead of raising ``IndexError``.
    """
    required_cells = max(index for _, index in ADVANCED_COLUMNS) + 1
    records = []
    for row in table.find("tbody").find_all("tr"):
        cells = _blank_to_zero([cell.text for cell in row.find_all("td")])
        season_cells = row.find_all("th")
        if len(cells) < required_cells or not season_cells:
            continue
        record = {"Season": season_cells[0].text, "Player": player}
        for name, index in ADVANCED_COLUMNS:
            record[name] = float(cells[index])
        records.append(record)
    return records


def _parse_salary_rows(table, player):
    """Return one Player/Season/Salary record per usable row of ``table``."""
    records = []
    for row in table.find("tbody").find_all("tr"):
        cells = row.find_all("td")
        season_cells = row.find_all("th")
        if len(cells) < 3 or not season_cells:
            continue
        records.append(
            {
                "Player": player,
                "Season": season_cells[0].text,
                "Salary": cells[2].text,
            }
        )
    return records


def _parse_current_contract(contract_div, player):
    """Return the contract entry read from the second <th>/<span> pair.

    Returns an empty list when the block has fewer than two of either.
    """
    season_cells = contract_div.find_all("th")
    amounts = contract_div.find_all("span")
    if len(season_cells) < 2 or len(amounts) < 2:
        return []
    return [
        {
            "Player": player,
            "Season": season_cells[1].text,
            "Salary": amounts[1].text,
        }
    ]


def _parse_commented_tables(soup, player):
    """Parse the tables that the player page ships inside HTML comments.

    Returns
    -------
    (advanced_records, salary_records) : tuple of two lists of dict
    """
    advanced, salaries = [], []
    comments = soup.find_all(string=lambda text: isinstance(text, Comment))

    for comment in comments:
        # advanced statistics
        if "Advanced" in comment:
            table = BeautifulSoup(comment, "html.parser").find(
                "table", id="advanced"
            )
            if table is None:
                continue  # comment without the table - go to the next comment
            advanced.extend(_parse_advanced_rows(table, player))

        # all past salaries
        if "Salaries" in comment:
            table = BeautifulSoup(comment, "html.parser").find(
                "table", id="all_salaries"
            )
            if table is None:
                continue  # comment without the table - go to the next comment
            salaries.extend(_parse_salary_rows(table, player))

        # current contract
        if "contract" in comment:
            contract_div = BeautifulSoup(comment, "html.parser").find(
                "div", id="div_contract"
            )
            if contract_div is not None:
                salaries.extend(_parse_current_contract(contract_div, player))

    return advanced, salaries


# Rows are collected in plain lists and turned into DataFrames once per page;
# growing a DataFrame row by row is quadratic and DataFrame.append no longer
# exists in pandas 2.x.
season_records, advanced_records, salary_records = [], [], []

path = "/usr/local/bin/chromedriver"
# NOTE(review): newer Selenium releases expect a Service object instead of a
# positional driver path - confirm against the installed Selenium version.
driver = webdriver.Chrome(path)

url = "https://www.basketball-reference.com/players/"

try:
    driver.get(url)

    # loop through each letter of the alphabet index
    for page in range(1, 27):
        try:
            letter_button = driver.find_element(
                By.XPATH, '//*[@id="div_alphabet"]/ul/li[{}]/a'.format(page)
            )
            print(letter_button.text)
            letter_button.click()

            # wait until the footer at the end of the page is present
            WebDriverWait(driver, 3).until(
                EC.presence_of_element_located((By.ID, "footer"))
            )
            # remembered so the letter page can be restored after a failure
            letter_url = driver.current_url

            # active players are shown in bold text
            player_list = driver.find_elements(
                By.XPATH, '//*[@id="players"]/tbody/tr/th/strong'
            )
            print(len(player_list))

            for player in range(len(player_list)):
                # Each player is handled on its own so that one broken page
                # does not abandon the remaining players of this letter.
                try:
                    time.sleep(1)

                    # Elements go stale after navigating away, so the links
                    # are looked up again on every iteration.
                    player_name = driver.find_elements(
                        By.XPATH, PLAYER_LINK_XPATH
                    )[player]
                    current_player = player_name.text

                    # scroll down to the location of the link
                    driver.execute_script(
                        "arguments[0].scrollIntoView();", player_name
                    )
                    time.sleep(1)

                    # go to the player page and fetch its HTML
                    player_name.click()
                    player_url = driver.current_url

                    response = requests.get(player_url, timeout=30)
                    response.raise_for_status()
                    soup_player = BeautifulSoup(response.text, "html.parser")

                    player_stats = soup_player.find("tbody")
                    stats_rows = (
                        player_stats.find_all("tr")
                        if player_stats is not None
                        else []
                    )

                    # players with at most one stats row are skipped entirely
                    if len(stats_rows) > 1:
                        season_records.extend(
                            _parse_per_game_rows(stats_rows, current_player)
                        )
                        player_advanced, player_salaries = _parse_commented_tables(
                            soup_player, current_player
                        )
                        advanced_records.extend(player_advanced)
                        salary_records.extend(player_salaries)

                    driver.back()

                except Exception as exc:
                    print(player, "player not working", repr(exc))
                    # The browser may be on either page at this point, so load
                    # the letter page explicitly instead of relying on back().
                    driver.get(letter_url)

        except Exception as exc:
            print(page, "page not working", repr(exc))

        # Always return to the alphabet index by URL; an unconditional back()
        # lands on the wrong page whenever the letter page was never reached
        # or a player page was left open.
        driver.get(url)
        time.sleep(1)

        # Checkpoint everything gathered so far. Explicit column lists keep
        # the merges valid even when one of the record lists is still empty.
        df = pd.DataFrame(season_records, columns=headers + ["Player", "All_star"])
        advanced_df = pd.DataFrame(advanced_records, columns=ADVANCED_HEADERS)
        salary_df = pd.DataFrame(salary_records, columns=SALARY_HEADERS)

        final_df = pd.merge(df, salary_df, how="left", on=["Player", "Season"])
        # reset_index() keeps the pre-reset index as an "index" column.
        final_df = (
            final_df.merge(advanced_df, on=["Player", "Season"])
            .drop_duplicates(subset=["Player", "Season"], keep="first")
            .reset_index()
        )
        with open("final_{}.p".format(page), "wb") as checkpoint_file:
            pickle.dump(final_df, checkpoint_file)

finally:
    # close the browser even when the scrape is interrupted or fails
    driver.quit()


Z
4
