# Code to scrape player data from Basketball-reference.com

In [155]:
import time
import urllib.request
from io import StringIO

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

In [124]:
def url_to_soup(url, file_path='', save=0):
    """Download a web page and parse it into a BeautifulSoup object.

    Args:
        url: Address of the page to fetch, as a string.
        file_path: Path (string) the raw HTML is written to when ``save``
            is truthy. It is not used otherwise.
        save: Truthy to also write the downloaded HTML to ``file_path``.
            The default (0) skips writing.

    Returns:
        The page parsed by BeautifulSoup with the ``'html.parser'`` backend.
    """
    # This is the only step that touches the network. The response body is
    # decoded as UTF-8.
    with urllib.request.urlopen(url) as response:
        html = response.read().decode('utf-8')

    if save:
        # Write with the same encoding the page was decoded with. Relying on
        # the platform default can raise UnicodeEncodeError (or store
        # mis-encoded text) on systems whose default is not UTF-8.
        with open(file_path, 'w', encoding='utf-8') as new_file:
            new_file.write(html)

    # From here on everything is a local operation.
    soup = BeautifulSoup(html, 'html.parser')
    return soup

In [21]:
def file_to_df(path):
    """Load the first HTML table of a locally saved page into a DataFrame.

    Args:
        path: Path to an HTML file on disk.

    Returns:
        The first table found by ``pandas.read_html`` in the file.
    """
    # Read with an explicit encoding instead of the platform default.
    # Assumes the saved page is UTF-8 -- TODO confirm for files that were
    # written by other tools.
    with open(path, encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'html.parser')
    # Passing a literal HTML string to read_html is deprecated in recent
    # pandas releases; a file-like wrapper is the supported form.
    df = pd.read_html(StringIO(str(soup)))[0]
    return df

In [147]:
# Grab data for Donovan Mitchell from Basketball-Reference.
don_url = 'https://www.basketball-reference.com/players/m/mitchdo01.html'  # contains stats for each of his seasons

# NOTE: `save` is left at its default (0), so nothing is written to the path
# passed here; pass save=1 as well to keep a local copy of the page.
don_soup = url_to_soup(don_url, 'Data/Donovan-Mitchell-career-stats')
# Wrap the HTML in a file-like object: passing a literal HTML string to
# read_html is deprecated in recent pandas releases.
df_don = pd.read_html(StringIO(str(don_soup)))[0]

# Keep only the per-season rows: drop the 'Career' summary row and everything
# below it. The row is looked up rather than hard-coded because its position
# shifts every time a new season is added to the table.
career_idx = df_don[df_don['Season'] == 'Career'].index[0]
df_don = df_don.drop(range(career_idx, df_don.shape[0]))

df_don['Name'] = 'Donovan Mitchell'
df_don.head()

Unnamed: 0,Season,Age,Tm,Lg,Pos,G,GS,MP,FG,FGA,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Name
0,2017-18,21.0,UTA,NBA,SG,79,71,33.4,7.5,17.2,...,0.7,3.1,3.7,3.7,1.5,0.3,2.7,2.7,20.5,Donovan Mitchell
1,2018-19,22.0,UTA,NBA,SG,77,77,33.7,8.6,19.9,...,0.8,3.3,4.1,4.2,1.4,0.4,2.8,2.7,23.8,Donovan Mitchell
2,2019-20,23.0,UTA,NBA,SG,69,69,34.3,8.7,19.4,...,0.8,3.6,4.4,4.3,1.0,0.2,2.7,2.5,24.0,Donovan Mitchell
3,2020-21,24.0,UTA,NBA,SG,44,44,33.6,8.8,20.0,...,1.0,3.5,4.5,5.5,1.0,0.3,3.0,2.2,25.7,Donovan Mitchell


In [23]:
# TODO: clean up the data

In [171]:
# Experiment with a more generic way to scrape data:
# Player data is stored in an alphabetized table sorted by name.
# Each player has a unique url address with a stats table
# (e.g. 'https://www.basketball-reference.com/players/m/mitchdo01.html').
# The /m is the first letter of the player's last name, followed by his unique url.
# We can loop through the letters of the alphabet and grab each unique url
# from the last-name landing page html.

# Define alphabet list, note: no present or historical players have a last name starting with x, so ignore it.
alphabet = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'y', 'z']
df_players = pd.DataFrame()
url_base = 'https://www.basketball-reference.com'

# Per-player tables are collected here and concatenated once at the end.
# DataFrame.append no longer exists in pandas 2.x, and a single concat also
# avoids re-copying the growing frame on every iteration.
player_frames = []

# Only the first letter is scraped for now. To cover every player, wrap the
# code below in `for letter in alphabet:`.
letter = alphabet[0]
player_prefix = '/players/' + letter + '/'

soup_temp = url_to_soup(url_base + '/players/' + letter)
names = soup_temp.find_all(class_='left')
for count, player in enumerate(names):
    # This is only for development, remove once everything is working:
    # stop after the entries numbered 0 through 20 have been looked at.
    if count > 20:
        break

    # Read the link through the tag API instead of splitting the tag's HTML
    # text, which depends on attribute order and leaves entities escaped.
    link = player.find('a')
    if link is None:
        continue
    url = link.get('href', '')
    # The href is appended to url_base below, so it must be a site-relative
    # player path for the current letter.
    if not url.startswith(player_prefix):
        continue
    name = link.get_text()

    # Turn url into soup object and extract data table
    soup_player = url_to_soup(url_base + url)
    # throw in a delay after every player request to avoid getting blocked
    time.sleep(5)
    # Wrap the HTML in a file-like object: passing a literal HTML string to
    # read_html is deprecated in recent pandas releases.
    df_temp = pd.read_html(StringIO(str(soup_player)))[0]

    # drop career stats (the 'Career' row and everything below it)
    index = df_temp[df_temp['Season'] == 'Career'].index[0]
    df_temp = df_temp.drop(range(index, df_temp.shape[0]))
    if df_temp.empty:
        # No season rows left, so there is no final season to inspect.
        continue

    # add column with player's name
    df_temp['Name'] = name

    # Drop data from before a certain year (1979, when 'all modern' stats were tracked):
    # if a player finished his career before 1979, leave his table out of the master list.
    # Seasons look like '2017-18'; the part before the dash is the starting year.
    final_year = int(df_temp['Season'].iloc[-1].split('-')[0])
    if final_year >= 1979:
        player_frames.append(df_temp)

# Each player's original row index is preserved, as it was with append().
if player_frames:
    df_players = pd.concat(player_frames)

df_players.tail(15)
        

Unnamed: 0,Season,Age,Tm,Lg,Pos,G,GS,MP,FG,FGA,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Name
6,2002-03,26.0,ATL,NBA,PF,81,81.0,38.1,7.0,14.6,...,2.2,6.2,8.4,3.0,1.1,0.5,2.6,3.0,19.9,Shareef Abdur-Rahim
7,2003-04,27.0,TOT,NBA,PF,85,56.0,31.6,5.9,12.4,...,2.2,5.3,7.5,2.0,0.8,0.4,2.2,2.6,16.3,Shareef Abdur-Rahim
8,2003-04,27.0,ATL,NBA,PF,53,53.0,36.9,7.2,14.9,...,2.7,6.7,9.3,2.4,0.8,0.4,2.5,2.8,20.1,Shareef Abdur-Rahim
9,2003-04,27.0,POR,NBA,PF,32,3.0,22.8,3.7,8.3,...,1.5,3.0,4.5,1.5,0.8,0.6,1.7,2.3,10.0,Shareef Abdur-Rahim
10,2004-05,28.0,POR,NBA,PF,54,49.0,34.6,6.2,12.4,...,2.3,5.0,7.3,2.1,0.9,0.5,2.2,2.8,16.8,Shareef Abdur-Rahim
11,2005-06,29.0,SAC,NBA,PF,72,30.0,27.2,4.6,8.8,...,1.5,3.5,5.0,2.1,0.7,0.6,1.5,3.2,12.3,Shareef Abdur-Rahim
12,2006-07,30.0,SAC,NBA,C,80,45.0,25.2,3.9,8.2,...,1.5,3.5,5.0,1.4,0.7,0.5,1.5,3.0,9.9,Shareef Abdur-Rahim
13,2007-08,31.0,SAC,NBA,PF,6,0.0,8.5,0.5,2.3,...,1.0,0.7,1.7,0.7,0.2,0.0,0.2,1.5,1.7,Shareef Abdur-Rahim
0,1976-77,22.0,LAL,NBA,SF,70,,19.7,2.4,5.0,...,1.6,2.5,4.2,1.4,0.7,0.1,,1.7,6.3,Tom Abernethy
1,1977-78,23.0,LAL,NBA,SF,73,,18.0,2.8,5.5,...,1.4,2.2,3.6,1.4,0.8,0.3,0.7,1.7,6.8,Tom Abernethy
