## Web Scraping - All Players

In [27]:
import requests
from bs4 import BeautifulSoup, Comment
import time
import re
from datetime import datetime

# Seconds to sleep after every HTTP request
download_delay = 2

# The players index page links to one listing page per letter of the
# alphabet; every player page is reached by walking those letter pages.
url = "https://www.basketball-reference.com/players/"
domain = "https://www.basketball-reference.com"

# Row accumulators filled while crawling, one list per kind of record
players = []
salaries = []
per_game = []
similar_player = []

# A timeout keeps a stalled connection from hanging the run forever
response = requests.get(url, timeout=30)
print(response.status_code)
# Fail loudly on an HTTP error: without the index page there is nothing to
# crawl, and continuing would silently produce empty results.
response.raise_for_status()
time.sleep(download_delay)
soup = BeautifulSoup(response.text, 'html.parser')
# <a> tags with relative links such as /players/a/, one per letter
player_alphabet_list = soup.select("body > #wrap > #content > ul > li > a")

def _fetch_soup(page_url):
    """Download ``page_url`` and return it parsed with BeautifulSoup.

    Returns None (after printing the reason) when the request raises or the
    server answers with an error status, so that one bad page does not abort
    the whole crawl. Sleeps ``download_delay`` seconds after every attempt.
    """
    try:
        page = requests.get(page_url, timeout=30)
    except requests.RequestException as err:
        print('Request failed:', page_url, err)
        return None
    finally:
        time.sleep(download_delay)
    if not page.ok:
        print('HTTP', page.status_code, 'for', page_url, '- skipped')
        return None
    return BeautifulSoup(page.text, 'html.parser')


def _next_text(tag):
    """Return the text of the node right after ``tag``, or '' if there is none."""
    sibling = tag.next_sibling
    if sibling is None:
        return ''
    # Text nodes are str subclasses; anything else is an element with get_text()
    if isinstance(sibling, str):
        return str(sibling)
    return sibling.get_text()


def _commented_rows(container):
    """Return the ``tbody > tr`` rows found inside the first HTML comment of ``container``.

    The table markup is read out of a comment node, so the comment text is
    parsed as a separate document. Returns [] when there is no comment.
    """
    hidden = container.find_all(string=lambda text: isinstance(text, Comment))
    if not hidden:
        return []
    return BeautifulSoup(str(hidden[0]), 'html.parser').select("tbody > tr")


# Crawl every letter page, then every player page linked from it, filling the
# players / salaries / per_game / similar_player row lists.
count = 0
for a in player_alphabet_list:
    # a is an html tag like <a href="/players/a/">A</a>; its href is relative
    player_alphabet_url = domain + a['href']

    soup = _fetch_soup(player_alphabet_url)
    if soup is None:
        continue
    player_list = soup.select("body > #wrap > #content > #all_players > #div_players > table > tbody > tr > th > a")

    for player in player_list:
        count += 1
        player_url = domain + player['href']
        soup = _fetch_soup(player_url)
        if soup is None:
            continue

        # get player's name; without the header block nothing else can be read
        meta = soup.select("body > #wrap > #info > #meta")
        name_tags = meta[0].select('h1 > span') if meta else []
        if not name_tags:
            print('Count:', count, 'No player header found:', player_url)
            continue
        info = meta[0]
        player_name = name_tags[0].text
        print('Count:', count, 'Name:', player_name)

        # get player's position, shoots, recruiting rank, draft team, years of experience
        position = ''
        shoots = ''
        age = ''
        recruiting_rank = ''
        draft = ''
        yrexp = ''
        for s in info.find_all('strong'):
            label = s.text
            if "Position:" in label:
                # keep only the part before the '▪' separator, letters only
                position_tmp = _next_text(s).split('▪')[0]
                position = re.sub(r'[^a-zA-Z]', '', position_tmp)
            if "Shoots:" in label:
                shoots = re.sub(r'[^a-zA-Z]', '', _next_text(s))
            if "Recruiting Rank: " in label:
                # the rank is the number in parentheses
                match_tmp = re.search(r'\((\d+)\)', s.parent.text)
                if match_tmp:
                    recruiting_rank = match_tmp.group(1)
            if "Draft:" in label:
                # first link on the draft line; left blank when there is none
                draft_link = s.parent.find("a")
                if draft_link is not None:
                    draft = draft_link.text
            if "Experience:" in label or "Career Length:" in label:
                yrexp_match = re.search(r'\b(\d+)\b', _next_text(s))
                if yrexp_match:
                    yrexp = yrexp_match.group(1)

        # get player's height and weight from the paragraph holding the metric values
        height = ''
        weight = ''
        for p in info.find_all('p'):
            p_html = str(p)
            if "kg)" not in p_html:
                continue
            cm_match = re.search(r'(\d+)cm', p_html)
            if cm_match:
                height = cm_match.group(1)
            kg_match = re.search(r'(\d+)kg', p_html)
            if kg_match:
                weight = kg_match.group(1)

        # get player's birth date and derive the age in whole years;
        # age stays blank when the date is missing or malformed
        birth_tag = info.find(id='necro-birth')
        birth_raw = birth_tag.get('data-birth') if birth_tag is not None else None
        if birth_raw:
            try:
                player_birthdate = datetime.strptime(birth_raw, '%Y-%m-%d')
            except ValueError:
                player_birthdate = None
            if player_birthdate is not None:
                current_date = datetime.now()
                # subtract one year if this year's birthday has not happened yet
                age = current_date.year - player_birthdate.year - ((current_date.month, current_date.day) < (player_birthdate.month, player_birthdate.day))

        # get player's performance stats summary (the first matched cell is skipped)
        stats = [cell.text for cell in soup.select("body > #wrap > #info > .stats_pullout > div > div > p:nth-child(3)")[1:]]
        if len(stats) == 8:
            # 8-value layout: insert blanks at the 6th and 8th positions
            # (the FG3% and eFG% columns of the players.csv header)
            stats = stats[:5] + [""] + [stats[5]] + [""] + stats[-2:]
        elif len(stats) != 10:
            # unexpected layout: force exactly 10 cells so the url column stays aligned
            stats = (stats + [""] * 10)[:10]

        # get player's per_game stats table (skipped when the table is absent)
        per_game_table = soup.select("#per_game > tbody")
        per_game_rows = per_game_table[0].select("tr") if per_game_table else []
        for row in per_game_rows:
            season = row.select("th > a")
            if not season:
                continue
            season = season[0].text
            raw = [o.text for o in row.select("td")]
            per_game.append([player_name, player_url, season] + raw)

        # Additional data: get each player's 3 most similar players and the similarity scores
        similarity_div = soup.select("#all_sims")
        if similarity_div:
            # rows 1..3 are used; row 0 is skipped
            for s in _commented_rows(similarity_div[0])[1:4]:
                sim_links = s.select("th > a")
                sim_cells = s.select("td")
                if not sim_links or not sim_cells:
                    continue
                sim_playername = sim_links[0].text
                sim_playerurl = domain + sim_links[0].get('href', '')
                sim_score = sim_cells[0].text
                similar_player.append([player_name, player_url, sim_playername, sim_playerurl, sim_score])

        # get player's salaries: season, team record, league, salary(in USD), current team
        # current team falls back to the draft team when no salary row names a team
        current_team = draft
        salary_div = soup.select("#all_all_salaries")
        if salary_div:
            for s in _commented_rows(salary_div[0]):
                season_cell = s.find("th")
                amount_cells = s.select("td:nth-child(4)")
                if season_cell is None or not amount_cells:
                    continue
                salary_season = season_cell.text
                salary_info = [link.text for link in s.select("td > a")]
                salary_num = amount_cells[0].text.replace("$", "").replace(",", "")
                # exactly two link columns (Team, League) so rows match the csv header
                team_league = (salary_info + ["", ""])[:2]
                # append player's salary record to the "salaries" list
                salaries.append([player_name, salary_season] + team_league + [salary_num, player_url])
                # the most recent row that names a team gives the current team
                if salary_info:
                    current_team = salary_info[0]

        # append player's info to the "players" list
        players.append([player_name, position, shoots, height, weight, current_team, age, recruiting_rank, draft, yrexp] + stats + [player_url])

# write results into separate csv files
import csv

# (file name, header row, collected rows) for each output file
csv_outputs = [
    ('players.csv',
     ["Name", "Position", "Shoots", "HeightCM", "WeightKG", "Current Team", "Age", "Recruiting Rank", "Draft", "Years of Experience", "G", "PTS", "TRB", "AST", "FG%", "FG3%", "FT%", "eFG%", "PER", "WS", "url"],
     players),
    ('salaries.csv',
     ["Name", "Season", "Team", "League", "SalaryUSD", "url"],
     salaries),
    ('per_game_stats.csv',
     ["Name", "url", "Season", "Age", "Tm", "Lg", "Pos", "G", "GS", "MP", "FG", "FGA", "FG%", "3P", "3PA", "3P%", "2P", "2PA", "2P%", "eFG%", "FT", "FTA", "FT%", "ORB", "DRB", "TRB", "AST", "STL", "BLK", "TOV", "PF", "PTS", "Awards"],
     per_game),
    ('similar_players.csv',
     ["Name", "url", "Sim_Name", "Sim_url", "Sim_score"],
     similar_player),
]

for file_name, header, rows in csv_outputs:
    # Explicit UTF-8: player names contain non-ASCII characters, and the
    # platform default encoding may be unable to represent all of them.
    # newline='' lets the csv module control line endings itself.
    with open(file_name, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(header)
        writer.writerows(rows)

200
Count: 1 Name: Alaa Abdelnaby
Count: 2 Name: Zaid Abdul-Aziz
Count: 3 Name: Kareem Abdul-Jabbar
Count: 4 Name: Mahmoud Abdul-Rauf
Count: 5 Name: Tariq Abdul-Wahad
Count: 6 Name: Shareef Abdur-Rahim
Count: 7 Name: Tom Abernethy
Count: 8 Name: Forest Able
Count: 9 Name: John Abramovic
Count: 10 Name: Álex Abrines
Count: 11 Name: Alex Acker
Count: 12 Name: Don Ackerman
Count: 13 Name: Mark Acres
Count: 14 Name: Bud Acton
Count: 15 Name: Quincy Acy
Count: 16 Name: Alvan Adams
Count: 17 Name: Don Adams
Count: 18 Name: George Adams
Count: 19 Name: Hassan Adams
Count: 20 Name: Jaylen Adams
Count: 21 Name: Jordan Adams
Count: 22 Name: Michael Adams
Count: 23 Name: Rafael Addison
Count: 24 Name: Deng Adel
Count: 25 Name: Rick Adelman
Count: 26 Name: Jeff Adrien
Count: 27 Name: Arron Afflalo
Count: 28 Name: Maurice Ager
Count: 29 Name: Mark Aguirre
Count: 30 Name: Blake Ahearn
Count: 31 Name: Danny Ainge
Count: 32 Name: Matt Aitch
Count: 33 Name: Alexis Ajinça
Count: 34 Name: Henry Akin
Coun