In [1]:
!pip install beautifulsoup4



In [38]:
import os
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
from urllib.parse import urljoin
from urllib.request import urlretrieve
import json

In [3]:
# Salary ranking page for the 2022-2023 season.
url = "https://hoopshype.com/salaries/players/2022-2023/"

response = requests.get(url)

# Keep the raw HTML only when the server answered 200; otherwise report the status.
if response.status_code != 200:
    print(f"Failed to retrieve HTML. Status code: {response.status_code}")
else:
    html_code = response.text

In [4]:
def get_soup(url, timeout=30):
    """Download a page and parse it with BeautifulSoup.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Seconds to wait for the server, passed straight to ``requests.get``.
        Defaults to 30 so that one unresponsive page cannot stall a long
        scraping run indefinitely.

    Returns
    -------
    BeautifulSoup or None
        The parsed document when the server answers with status 200,
        otherwise ``None``.

    Raises
    ------
    requests.exceptions.RequestException
        Propagated unchanged from ``requests.get`` (connection errors,
        timeouts, ...).
    """
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        # Callers test the result for None, so a bad status stays silent here.
        return None

    return BeautifulSoup(response.text, 'html.parser')

In [5]:
# Sanity check: get_soup only returns a parsed page for status 200, so the
# recorded NoneType output means this player URL did not answer with 200.
type(get_soup("https://hoopshype.com/player/vince-williams-jr/salary/"))

NoneType

In [6]:
# Parse the 2022-2023 salary ranking page (None if the page did not return status 200).
soup = get_soup("https://hoopshype.com/salaries/players/2022-2023/")

In [7]:
# First <table> on the page carrying the 'hh-salaries-ranking-table' class.
table = soup.find('table', class_='hh-salaries-ranking-table')

In [8]:
# Every <td> in the table that carries the 'name' class.
td_names = table.find_all('td', class_='name')

In [9]:
# Collect the stripped link text of every name cell that actually contains an <a>.
name_links = (cell.find('a') for cell in td_names)
names = [link.text.strip() for link in name_links if link]

In [10]:
# Every <tr> inside the table body.
td_rows = table.find('tbody').find_all('tr')

In [11]:
# Pull the first dollar amount (e.g. "$1,234,567") out of the cell selected
# with class_="" in each body row.
salary_pattern = re.compile(r'\$\d{1,3}(?:,\d{3})*')
salaries = [
    salary_pattern.search(row.find('td', class_="").text).group()
    for row in td_rows
]

## Retrieving the team a player played for

In [12]:
# Fetch Tim Hardaway Jr.'s salary page and gather its 'player-payroll' sections.
tim_hardaway_jr_soup = get_soup("https://hoopshype.com/player/tim-hardaway-jr/salary/")
tim_hardaway_jr_payrolls = tim_hardaway_jr_soup.find_all('div', class_='player-payroll')

# Keep the body rows of the section titled 'Past Salaries' (stays an empty list if absent).
tim_hardaway_jr_tr = []
for tim_hardaway_jr_payroll in tim_hardaway_jr_payrolls:
    section_title = tim_hardaway_jr_payroll.find('p').find('span').text
    if section_title != 'Past Salaries':
        continue
    tim_hardaway_jr_tr = tim_hardaway_jr_payroll.find('tbody').find_all('tr')

In [13]:
# Print the team of every past-salary row whose season label starts with 2022.
for row in tim_hardaway_jr_tr:
    season_link = row.find('td', class_='table-key').find('a')
    if not season_link:
        continue
    if season_link.text.strip().startswith('2022'):
        print(row.find('td', class_='table-value2').find('a').text.strip())

Dallas Mavericks


In [14]:
def get_team(name, year):
    """Look up the team listed for a player in a given season.

    Parameters
    ----------
    name : str
        Player slug inserted into the hoopshype player URL
        (e.g. ``"tim-hardaway-jr"``).
    year : int or str
        Starting year of the season; it is compared with the first four
        characters of each row's season label.

    Returns
    -------
    str
        Team name from the first matching row of the 'Past Salaries' table.
        The string ``"None"`` is returned whenever no team can be determined:
        the page could not be fetched, the table is missing, or no row matches
        ``year``. Previously the no-match case fell through and returned a
        real ``None``, so callers received two different "missing" markers.
    """
    player_soup = get_soup("https://hoopshype.com/player/" + name + "/salary/")
    if player_soup is None:
        return "None"

    # Find the rows of the payroll section titled 'Past Salaries'.
    player_tr = []
    for player_payroll in player_soup.find_all('div', class_='player-payroll'):
        paragraph = player_payroll.find('p')
        title = paragraph.find('span') if paragraph is not None else None
        if title is None or title.text != 'Past Salaries':
            continue
        tbody = player_payroll.find('tbody')
        if tbody is not None:
            player_tr = tbody.find_all('tr')

    season_prefix = str(year)
    for row in player_tr:
        key_cell = row.find('td', class_='table-key')
        a_element = key_cell.find('a') if key_cell is not None else None
        if a_element is None or a_element.text.strip()[:4] != season_prefix:
            continue

        # A matching season row without a team link is skipped instead of crashing.
        value_cell = row.find('td', class_='table-value2')
        team_link = value_cell.find('a') if value_cell is not None else None
        if team_link is not None:
            return team_link.text.strip()

    return "None"

## Get Headshot Photo 

In [15]:
def get_headshot_photo(url, output_folder):
    """Download the headshot image shown on a player page.

    Parameters
    ----------
    url : str
        Address of the player page.
    output_folder : str
        Directory the image is saved into; created if it does not exist.

    Returns
    -------
    None
        The image is written to ``output_folder`` under the base name of its
        URL. Nothing is written when the page has no
        ``<div class="player-headshot">`` or no ``<img>`` with a ``src``.
    """
    player_soup = get_soup(url)

    if player_soup is None:
        # get_soup does not expose the HTTP response, so report the URL here.
        # The old message read a global `response` left over from another cell.
        print(f"Failed to retrieve the page: {url}")
        return

    headshot_tag = player_soup.find('div', class_='player-headshot')
    img_tag = headshot_tag.find('img') if headshot_tag is not None else None
    img_url = img_tag.get('src') if img_tag is not None else None
    if not img_url:
        return

    # Only create the folder once there is something to save.
    os.makedirs(output_folder, exist_ok=True)

    # The src may be relative, so resolve it against the page URL.
    img_url = urljoin(url, img_url)
    img_filename = os.path.join(output_folder, os.path.basename(img_url))
    urlretrieve(img_url, img_filename)

# Example: save Stephen Curry's headshot into the relative folder "downloaded_images".
webpage_url = "https://hoopshype.com/player/stephen-curry/salary/"
output_folder = "downloaded_images"

get_headshot_photo(webpage_url, output_folder)

* `get_headshot_photo` is not included in the combined function

## Combining all the functions into one

In [16]:
def get_formatted_name(name):
    """Lower-case a player name and replace every space with a hyphen."""
    return "-".join(name.lower().split(" "))

In [17]:
def retrieve(season):
    """Retrieve NBA player salary data for one season.

    Parameters
    ----------
    season : str
        Season formatted as ``{Year}-{Year+1}``, e.g. ``"2022-2023"``.
        The first four characters are used as the year passed to ``get_team``.

    Returns
    -------
    pandas.DataFrame
        One row per player with columns ``Name``, ``Team`` and ``Salary``.
        ``Salary`` holds the matched text such as ``"$1,234,567"``.

    Raises
    ------
    ValueError
        If the page cannot be fetched, the ranking table is missing, or a
        player row contains no dollar amount.
    """
    url = "https://hoopshype.com/salaries/players/" + season
    soup = get_soup(url)
    if soup is None:
        raise ValueError(f"Could not retrieve salary page: {url}")

    table = soup.find('table', class_='hh-salaries-ranking-table')
    tbody = table.find('tbody') if table is not None else None
    if tbody is None:
        raise ValueError(f"No salary ranking table found at: {url}")

    salary_pattern = re.compile(r'\$\d{1,3}(?:,\d{3})*')
    start_year = season[:4]
    records = []

    # Read name and salary from the same row so the columns cannot drift apart.
    for row in tbody.find_all('tr'):
        name_cell = row.find('td', class_='name')
        a_element = name_cell.find('a') if name_cell is not None else None
        if a_element is None:
            # Rows without a player link carry no player; skip them.
            continue
        name = a_element.text.strip()

        salary_cell = row.find('td', class_="")
        salary_match = salary_pattern.search(salary_cell.text) if salary_cell is not None else None
        if salary_match is None:
            raise ValueError(f"No salary found for {name!r} in season {season}")

        # One extra page request per player: this is the slow part of the scrape.
        team = get_team(get_formatted_name(name), start_year)
        records.append({'Name': name, 'Team': team, 'Salary': salary_match.group()})

    return pd.DataFrame(records, columns=['Name', 'Team', 'Salary'])

In [46]:
# NOTE(review): this list is created but never filled in this cell.
player_df_list = []

# Scrape every season from 1990-1991 through 2022-2023 and write one JSON file per season.
for i in range(1990, 2023):
    season = str(i) + "-" + str(i+1)
    print(season)
    player_df = retrieve(season)

    # Raw string: '\$' in a normal literal is an invalid escape sequence that Python warns about.
    player_df['Salary'] = player_df['Salary'].str.replace(r'[\$,]', '', regex=True).astype(int)
    player_df.to_json(season + '.json', orient='records', indent=4)

1990-1991
1991-1992
1992-1993
1993-1994
1994-1995
1995-1996
1996-1997
1997-1998
1998-1999
1999-2000
2000-2001
2001-2002
2002-2003
2003-2004
2004-2005
2005-2006
2006-2007
2007-2008
2008-2009
2009-2010
2010-2011
2011-2012
2012-2013
2013-2014
2014-2015
2015-2016
2016-2017
2017-2018
2018-2019
2019-2020
2020-2021
2021-2022
2022-2023
