In [1]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime, timezone, timedelta

In [2]:
# NBA season we will be analyzing
year = 2021
# URL page we will scraping (see image above)
url = "https://www.basketball-reference.com/leagues/NBA_{}_per_game.html".format(year)
# this is the HTML from the given URL
html = urlopen(url)
soup = BeautifulSoup(html)

In [3]:
url
# url is literlly just the url link

'https://www.basketball-reference.com/leagues/NBA_2021_per_game.html'

In [4]:
html
# this is the httpresponse code we get after opening the url

<http.client.HTTPResponse at 0x1d125744c10>

In [6]:
# soup
# soup is the LITERAL HTML.  ITS HUNDREDS OF LINES LONG, THOUSANDS

# we need to grab just the elements we want.

In [5]:
# use findALL() to get the column headers
# soup.findAll('tr', limit=2)
# use getText()to extract the text we need into a list

# the tr group has th elements which are the headers we want data for.
headers = [th.getText() for th in soup.findAll('tr', limit=2)[0].findAll('th')]

# exclude the first column as we will not need the ranking order from Basketball Reference for the analysis
headers = headers[1:]
headers

['Player',
 'Pos',
 'Age',
 'Tm',
 'G',
 'GS',
 'MP',
 'FG',
 'FGA',
 'FG%',
 '3P',
 '3PA',
 '3P%',
 '2P',
 '2PA',
 '2P%',
 'eFG%',
 'FT',
 'FTA',
 'FT%',
 'ORB',
 'DRB',
 'TRB',
 'AST',
 'STL',
 'BLK',
 'TOV',
 'PF',
 'PTS']

In [6]:
# avoid the first header row
# the td elements have the actual data points we want.
rows = soup.findAll('tr')[1:]
player_stats = [[td.getText() for td in rows[i].findAll('td')]
            for i in range(len(rows))]

In [7]:
# combining the column headers and the data points together
stats = pd.DataFrame(player_stats, columns = headers)
stats['PTS'] = pd.to_numeric(stats['PTS'])

In [8]:
stats.sort_values('PTS', ascending = False).head(10)

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
156,Stephen Curry,PG,32,GSW,63,63,34.2,10.4,21.7,0.482,...,0.916,0.5,5.0,5.5,5.8,1.2,0.1,3.4,1.9,32.0
46,Bradley Beal,SG,27,WAS,60,60,35.8,11.2,23.0,0.485,...,0.889,1.2,3.5,4.7,4.4,1.2,0.4,3.1,2.3,31.3
410,Damian Lillard,PG,30,POR,67,67,35.8,9.0,19.9,0.451,...,0.928,0.5,3.7,4.2,7.5,0.9,0.3,3.0,1.5,28.8
196,Joel Embiid,C,26,PHI,51,51,31.1,9.0,17.6,0.513,...,0.859,2.2,8.4,10.6,2.8,1.0,1.4,3.1,2.4,28.5
17,Giannis Antetokounmpo,PF,26,MIL,61,61,33.0,10.3,18.0,0.569,...,0.685,1.6,9.4,11.0,5.9,1.2,1.2,3.4,2.8,28.1
177,Luka Dončić,PG,21,DAL,66,66,34.3,9.8,20.5,0.479,...,0.73,0.8,7.2,8.0,8.6,1.0,0.5,4.3,2.3,27.7
395,Zach LaVine,SG,25,CHI,58,58,35.1,9.8,19.4,0.507,...,0.849,0.6,4.4,5.0,4.9,0.8,0.5,3.5,2.4,27.4
714,Zion Williamson,PF,20,NOP,61,61,33.2,10.4,17.0,0.611,...,0.698,2.7,4.5,7.2,3.7,0.9,0.6,2.7,2.2,27.0
189,Kevin Durant,PF,32,BRK,35,32,33.1,9.3,17.2,0.537,...,0.882,0.4,6.7,7.1,5.6,0.7,1.3,3.4,2.0,26.9
328,Kyrie Irving,PG,28,BRK,54,54,34.9,10.2,20.1,0.506,...,0.922,1.0,3.8,4.8,6.0,1.4,0.7,2.4,2.6,26.9


In [9]:
# datetimte stuff
today = datetime.now().date()
yesterday = today - timedelta(1)
day = (datetime.now() - timedelta(1)).day
month = (datetime.now() - timedelta(1)).month
year = (datetime.now() - timedelta(1)).year

In [10]:
#### BOX SCORE WEB SCRAPING
# https://www.basketball-reference.com/friv/dailyleaders.fcgi?month=07&day=17&year=2021&type=all

url = "https://www.basketball-reference.com/friv/dailyleaders.fcgi?month={}&day={}&year={}&type=all".format(month, day, year)
html = urlopen(url)
soup = BeautifulSoup(html)

In [11]:
headers = [th.getText() for th in soup.findAll('tr', limit=2)[0].findAll('th')]
headers = headers[1:]
headers[2] = "Location"
headers[4] = "Outcome"
# headers

IndexError: list index out of range

In [41]:
rows = soup.findAll('tr')[1:]
player_stats = [[td.getText() for td in rows[i].findAll('td')]
            for i in range(len(rows))]

In [42]:
df2 = pd.DataFrame(player_stats, columns = headers)

In [52]:
df2[['FG', 'FGA', 'FG%', '3P', '3PA', '3P%', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'GmSc']] = df2[['FG', 'FGA', 'FG%', '3P', '3PA', '3P%', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'GmSc']].apply(pd.to_numeric)

In [55]:
df2.sort_values('PTS', ascending = False)

Unnamed: 0,Player,Tm,Location,Opp,Outcome,MP,FG,FGA,FG%,3P,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,+/-,GmSc
2,Devin Booker,PHO,,MIL,L,41:32,17,33,0.515,2,...,3,4,3,2,1,2,5,40,12,25.7
1,Giannis Antetokounmpo,MIL,@,PHO,W,40:34,14,23,0.609,0,...,4,9,6,0,0,0,2,32,-7,26.8
5,Khris Middleton,MIL,@,PHO,W,44:23,12,23,0.522,3,...,7,7,5,0,0,4,3,29,4,17.7
0,Jrue Holiday,MIL,@,PHO,W,41:57,12,20,0.6,3,...,3,4,13,3,1,2,4,27,14,28.6
3,Chris Paul,PHO,,MIL,L,35:21,9,15,0.6,3,...,2,2,11,0,1,1,5,21,-6,20.1
4,Deandre Ayton,PHO,,MIL,L,44:49,7,12,0.583,0,...,7,10,1,1,2,2,4,20,6,18.1
6,Pat Connaughton,MIL,@,PHO,W,33:23,4,6,0.667,4,...,5,6,0,0,0,0,0,14,10,13.6
7,Mikal Bridges,PHO,,MIL,L,32:44,5,6,0.833,3,...,4,4,1,2,1,2,1,13,5,13.0
8,Jae Crowder,PHO,,MIL,L,40:09,4,7,0.571,2,...,4,5,3,2,0,0,3,10,-7,11.5
9,Bobby Portis,MIL,@,PHO,W,19:20,3,6,0.5,2,...,0,3,0,2,0,0,0,9,7,10.1


In [12]:
#### injury report
url = "https://www.basketball-reference.com/friv/injuries.fcgi"
html = urlopen(url)
soup = BeautifulSoup(html)

In [13]:
headers = [th.getText() for th in soup.findAll('tr', limit=2)[0].findAll('th')]
headers = headers[1:]

In [14]:
rows = soup.findAll('tr')
player_injury_info = [[td.getText() for td in rows[i].findAll('td')]
            for i in range(len(rows))]
player_injury_info = player_injury_info[1:]

player_names = [[th.getText() for th in rows[i].findAll('th')]
            for i in range(len(rows))]
player_names = player_names[1:]
player_names = [i[0] for i in player_names]
# mylist = list(zip(player_injury_info, player_names))

In [17]:
injury_data = pd.DataFrame(player_injury_info, columns = headers)
injury_data['Player'] = player_names

injury_data = injury_data[['Player', 'Team', 'Update', 'Description']] \
    .rename(columns = {"Update": "Date"})
injury_data

Unnamed: 0,Player,Team,Date,Description
0,Onyeka Okongwu,Atlanta Hawks,"Wed, Jul 21, 2021",Out (Shoulder) - The Hawks announced that Okon...
1,Jaylen Brown,Boston Celtics,"Thu, May 13, 2021",Out (Wrist) - The Celtics announced that Brown...
2,Coby White,Chicago Bulls,"Thu, Jun 10, 2021",Out (Shoulder) - The Bulls announced that Whit...
3,Taurean Prince,Cleveland Cavaliers,"Thu, Apr 22, 2021",Out (Ankle) - The Cavaliers announced F Taurea...
4,Jamal Murray,Denver Nuggets,"Thu, Jul 22, 2021",Out (Knee) - Murray is recovering from a torn ...
5,Klay Thompson,Golden State Warriors,"Thu, Jul 22, 2021",Out (Right Achilles) - Thompson is on track to...
6,James Wiseman,Golden State Warriors,"Thu, Jul 22, 2021",Out (Knee) - Wiseman is on track to be ready b...
7,T.J. Warren,Indiana Pacers,"Thu, Mar 25, 2021",Out (Foot) - Warren underwent foot surgery and...
8,Serge Ibaka,Los Angeles Clippers,"Fri, Jun 11, 2021",Out (Back) - The Clippers announced Serge Ibak...
9,Kawhi Leonard,Los Angeles Clippers,"Tue, Jul 13, 2021",Out (Knee) - The Clippers announced Kawhi Leon...


In [18]:
##### transactions
url = "https://www.basketball-reference.com/leagues/NBA_2021_transactions.html"
html = urlopen(url)
soup = BeautifulSoup(html)

In [23]:
rows = soup.findAll('li')
date_info = [[span.getText() for span in rows[i].findAll('span')]
            for i in range(len(rows))]

transaction_info = [[p.getText() for p in rows[i].findAll('p')]
            for i in range(len(rows))]

# transaction_info

In [30]:
# random web scrape i found - might be useful
import csv 
import requests
from bs4 import BeautifulSoup
import csv
import re
url_list = ['https://basketball.realgm.com/player/player/Summary/2',
            'https://basketball.realgm.com/player/player/Summary/1']

for url in url_list:
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')

    player = soup.find_all('div', class_='wrapper clearfix container')[0]

    playerprofile = re.sub(
        r'\n\s*\n', r'\n', player.get_text().strip(), flags=re.M)

    output = playerprofile + "\n"


In [66]:
### SCHEDULE
raw_df = pd.DataFrame()
month_list = ['december', 'january', 'february', 'march', 'april', 'may', 'june', 'july']
url = "https://www.basketball-reference.com/leagues/NBA_2021_games-december.html"
html = urlopen(url)
soup = BeautifulSoup(html)

In [59]:
headers = [th.getText() for th in soup.findAll('tr')[0].findAll('th')]
# headers = headers[1:]
headers[6] = 'boxScoreLink'
headers[7] = 'isOT'
headers = headers[1:]

In [60]:
rows = soup.findAll('tr')[1:]
player_stats = [[td.getText() for td in rows[i].findAll('td')]
            for i in range(len(rows))]

In [61]:
rows = soup.findAll('tr')[1:]
date_info = [[th.getText() for th in rows[i].findAll('th')]
            for i in range(len(rows))]

game_info = [[td.getText() for td in rows[i].findAll('td')]
            for i in range(len(rows))]
date_info = [i[0] for i in date_info] # removes brackets from each element.
# date_info

In [62]:
schedule = pd.DataFrame(game_info, columns = headers)
schedule['Date'] = date_info

In [92]:
# variables in functions are local by default
# variables outside of functions are global by default
# to modify a global variable in local function, we have to explicity label it as a global var.
schedule_df = pd.DataFrame()
def schedule_scraper(month):
    global schedule_df
    url = "https://www.basketball-reference.com/leagues/NBA_2021_games-{}.html".format(month)
    html = urlopen(url)
    soup = BeautifulSoup(html)

    headers = [th.getText() for th in soup.findAll('tr')[0].findAll('th')]

    headers[6] = 'boxScoreLink'
    headers[7] = 'isOT'
    headers = headers[1:]

    rows = soup.findAll('tr')[1:]
    date_info = [[th.getText() for th in rows[i].findAll('th')]
            for i in range(len(rows))]

    game_info = [[td.getText() for td in rows[i].findAll('td')]
            for i in range(len(rows))]
    date_info = [i[0] for i in date_info]

    schedule = pd.DataFrame(game_info, columns = headers)
    schedule['Date'] = date_info
    
    # join_df = join_df.append(schedule)
    schedule_df = schedule_df.append(schedule)
    # return(join_df)

In [93]:
schedule_df = pd.DataFrame()
schedule_scraper('february')

In [86]:
schedule_df = pd.DataFrame()
for month in month_list:
    schedule_scraper(month)

In [91]:
raw_df.tail(5)

Unnamed: 0,Start (ET),Visitor/Neutral,PTS,Home/Neutral,PTS.1,boxScoreLink,isOT,Attend.,Notes,Date
3,9:00p,Milwaukee Bucks,108,Phoenix Suns,118,Box Score,,16583,,"Thu, Jul 8, 2021"
4,8:00p,Phoenix Suns,100,Milwaukee Bucks,120,Box Score,,16637,,"Sun, Jul 11, 2021"
5,9:00p,Phoenix Suns,103,Milwaukee Bucks,109,Box Score,,16911,,"Wed, Jul 14, 2021"
6,9:00p,Milwaukee Bucks,123,Phoenix Suns,119,Box Score,,16562,,"Sat, Jul 17, 2021"
7,9:00p,Phoenix Suns,98,Milwaukee Bucks,105,Box Score,,17397,,"Tue, Jul 20, 2021"
