# Web Scraping Tutorial

* This notebook is a quick reference on how to use beautifulsoup4 to download data from websites- specifically basketball-reference.com

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os

In [2]:
# Set the URL we want to scrape (the hardeja01 game log for the 2019 season page)
url = 'https://www.basketball-reference.com/players/h/hardeja01/gamelog/2019/'
# A timeout keeps this cell from hanging forever if the server never answers
page = requests.get(url, timeout=30)
# Fail right here on a 4xx/5xx reply instead of parsing an error page further down
page.raise_for_status()
# Look for a 200 response - success
page

<Response [200]>

In [37]:
# Parse the downloaded HTML with Python's built-in 'html.parser' backend.
# Tip: print(soup.prettify()) shows the document structure when exploring a new page.
soup = BeautifulSoup(markup=page.content, features='html.parser')

In [4]:
# data-stat attribute values of the game-log cells we want to pull
stats = [
    'game_season', 'date_game', 'age', 'team_id', 'game_location', 'opp_id', 'game_result',
    'gs', 'mp', 'fg', 'fga', 'fg_pct', 'fg3', 'fg3a', 'fg3_pct', 'ft', 'fta', 'ft_pct',
    'orb', 'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts', 'game_score', 'plus_minus',
]

# One list of cell texts per stat, in the same order as `stats`.
# find_all/get_text are the documented bs4 names for the legacy findAll/getText aliases.
# NOTE: the lists are not guaranteed to be equally long - a stat only gets an entry
# for the table rows that actually contain a <td> with that data-stat value.
stats_list = [
    [td.get_text() for td in soup.find_all('td', attrs={'data-stat': stat})]
    for stat in stats
]

In [5]:
#stats_list

In [6]:
# Preview the raw scrape: transpose so each stat becomes a (still unnamed) column
pd.DataFrame(stats_list).transpose().head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,20,21,22,23,24,25,26,27,28
0,1.0,2018-10-17,29-052,HOU,,NOP,L (-19),1,34:43,6,...,7,9,10,3,1,4,2,18,18.9,-23
1,2.0,2018-10-20,29-055,HOU,@,LAL,W (+9),1,37:50,10,...,6,7,5,2,1,6,2,36,27.0,-3
2,3.0,2018-10-21,29-056,HOU,@,LAC,L (-3),1,39:45,11,...,4,4,14,2,0,3,1,31,26.4,5
3,4.0,2018-10-24,29-059,HOU,,UTA,L (-11),1,31:38,10,...,4,5,7,1,0,7,4,29,17.7,-5
4,,2018-10-26,29-061,HOU,,LAC,L (-20),1,33:16,7,...,1,1,7,4,0,8,3,25,15.1,9
5,,2018-10-30,29-065,HOU,,POR,L (-19),1,35:07,7,...,4,4,6,3,1,5,5,28,22.4,6
6,,2018-11-02,29-068,HOU,@,BRK,W (+8),1,37:32,7,...,7,8,5,2,1,5,3,19,10.5,-13
7,5.0,2018-11-03,29-069,HOU,@,CHI,W (+8),1,38:31,7,...,6,6,3,2,2,2,3,25,11.8,-8
8,6.0,2018-11-05,29-071,HOU,@,IND,W (+4),1,37:59,9,...,7,7,9,1,1,9,5,40,28.3,14
9,7.0,2018-11-08,29-074,HOU,@,OKC,L (-18),1,36:49,7,...,4,5,11,3,1,4,3,22,20.9,13


In [7]:
# Scrape every stat once, then split the result: the first seven fields go to
# stats_left and the remaining fields go to stats_right.
# find_all/get_text are the documented bs4 names for the legacy findAll/getText aliases.
scraped = [
    [td.get_text() for td in soup.find_all('td', attrs={'data-stat': stat})]
    for stat in stats
]
stats_left, stats_right = scraped[:7], scraped[7:]

In [8]:
# Label each scraped list with its stat name, then flip the frame so that every
# stat becomes a column and every table row becomes a DataFrame row.
df_left = pd.DataFrame(stats_left, index=stats[:7]).T
df_left.head(5)

Unnamed: 0,game_season,date_game,age,team_id,game_location,opp_id,game_result
0,1.0,2018-10-17,29-052,HOU,,NOP,L (-19)
1,2.0,2018-10-20,29-055,HOU,@,LAL,W (+9)
2,3.0,2018-10-21,29-056,HOU,@,LAC,L (-3)
3,4.0,2018-10-24,29-059,HOU,,UTA,L (-11)
4,,2018-10-26,29-061,HOU,,LAC,L (-20)


In [9]:
# Add blank entries for inactive games so the stats_right columns line up with df_left.
# Rows whose game_season is empty are the ones that need a blank in every column.
inactive_rows = [row for row, game_no in enumerate(df_left['game_season']) if game_no == ""]
for column in stats_right:
    # Skip columns that already span every row, so re-running this cell cannot pad twice
    if len(column) >= len(df_left):
        continue
    # Ascending order matters: each insert shifts the later entries down by one
    for row in inactive_rows:
        column.insert(row, '')
        

In [10]:
# Same construction for the remaining stats: rows labelled by stat name,
# then transposed so each stat ends up as a named column.
df_right = pd.DataFrame(stats_right, index=stats[7:]).T

In [11]:
# Stitch the two frames together side by side (rows are matched on the index)
df = pd.concat((df_left, df_right), axis='columns')

In [12]:
# Show only the first rows; dumping the whole frame bloats the notebook output
df.head(10)

Unnamed: 0,game_season,date_game,age,team_id,game_location,opp_id,game_result,gs,mp,fg,...,drb,trb,ast,stl,blk,tov,pf,pts,game_score,plus_minus
0,1,2018-10-17,29-052,HOU,,NOP,L (-19),1,34:43,6,...,7,9,10,3,1,4,2,18,18.9,-23
1,2,2018-10-20,29-055,HOU,@,LAL,W (+9),1,37:50,10,...,6,7,5,2,1,6,2,36,27.0,-3
2,3,2018-10-21,29-056,HOU,@,LAC,L (-3),1,39:45,11,...,4,4,14,2,0,3,1,31,26.4,+5
3,4,2018-10-24,29-059,HOU,,UTA,L (-11),1,31:38,10,...,4,5,7,1,0,7,4,29,17.7,-5
4,,2018-10-26,29-061,HOU,,LAC,L (-20),,,,...,,,,,,,,,,
5,,2018-10-30,29-065,HOU,,POR,L (-19),,,,...,,,,,,,,,,
6,,2018-11-02,29-068,HOU,@,BRK,W (+8),,,,...,,,,,,,,,,
7,5,2018-11-03,29-069,HOU,@,CHI,W (+8),1,33:16,7,...,1,1,7,4,0,8,3,25,15.1,+9
8,6,2018-11-05,29-071,HOU,@,IND,W (+4),1,35:07,7,...,4,4,6,3,1,5,5,28,22.4,+6
9,7,2018-11-08,29-074,HOU,@,OKC,L (-18),1,37:32,7,...,7,8,5,2,1,5,3,19,10.5,-13


In [23]:
# James Harden's average points for 2018-19.
# Rows with an empty game_season are excluded before converting pts to int.
df.loc[df['game_season'].ne(''), 'pts'].astype(int).mean().round(1)

36.1

In [24]:
# Save the game log under ./game_logs.
# The directory is created when missing, and the working directory is left untouched
# so that re-running this cell (or any relative path used later) keeps working.
out_dir = 'game_logs'
os.makedirs(out_dir, exist_ok=True)
df.to_csv(os.path.join(out_dir, '2018_19_Harden_James.csv'), index=False)