In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
url = 'https://www.basketball-reference.com/leagues/NBA_2018_per_game.html'
r = requests.get(url)
soup = BeautifulSoup(r.text, "lxml")

In [None]:
pretty_soup = soup.prettify()
print(pretty_soup[0:20])

In [4]:
print(soup.title)

<title>2017-18 NBA Player Stats: Per Game | Basketball-Reference.com</title>


In [5]:
stat_names_info = soup.findAll('th')
stat_names = [info.get('data-stat') for info in stat_names_info[1:29]]

# there are 29 stats in the table
stat_names

['player',
 'pos',
 'age',
 'team_id',
 'g',
 'gs',
 'mp_per_g',
 'fg_per_g',
 'fga_per_g',
 'fg_pct',
 'fg3_per_g',
 'fg3a_per_g',
 'fg3_pct',
 'fg2_per_g',
 'fg2a_per_g',
 'fg2_pct',
 'efg_pct',
 'ft_per_g',
 'fta_per_g',
 'ft_pct',
 'orb_per_g',
 'drb_per_g',
 'trb_per_g',
 'ast_per_g',
 'stl_per_g',
 'blk_per_g',
 'tov_per_g',
 'pf_per_g']

In [6]:
# find all rows containing pertinent data
table_rows = soup.find_all('tr', class_ = 'full_table')

In [7]:
data = {}

for table_row in table_rows:
    
    # within each row, find all cells containing data e.g. 'td'
    row_cells = table_row.findAll('td')
    
    for cell in range(0, len(row_cells)):
        # define key and value(value needs to be in a list for this method to work)
        key = row_cells[cell].get('data-stat')
        value = [row_cells[cell].text.strip()]
        
        # for every row cell, get the stat name, and the corresponding value
        if key in data.keys():
            
            # if the key exists, update the values
            data[key] += value
            
        else:
            # else create the key and add value
            data[key] = value


In [8]:
df = pd.DataFrame(data)

In [9]:
print(df.shape)

(540, 29)


In [10]:
df.head(15)

Unnamed: 0,age,ast_per_g,blk_per_g,drb_per_g,efg_pct,fg2_pct,fg2_per_g,fg2a_per_g,fg3_pct,fg3_per_g,...,mp_per_g,orb_per_g,pf_per_g,player,pos,pts_per_g,stl_per_g,team_id,tov_per_g,trb_per_g
0,24,0.4,0.1,1.2,0.54,0.443,0.4,0.9,0.38,1.1,...,15.1,0.3,1.7,Alex Abrines,SG,4.7,0.5,OKC,0.3,1.5
1,27,0.8,0.4,3.1,0.496,0.384,0.4,1.0,0.349,1.5,...,19.4,0.6,2.1,Quincy Acy,PF,5.9,0.5,BRK,0.9,3.7
2,24,1.2,1.0,4.0,0.629,0.631,5.9,9.3,0.0,0.0,...,32.7,5.1,2.8,Steven Adams,C,13.9,1.2,OKC,1.7,9.0
3,20,1.5,0.6,3.8,0.512,0.523,2.5,4.8,0.0,0.0,...,19.8,1.7,2.0,Bam Adebayo,C,6.9,0.5,MIA,1.0,5.5
4,32,0.6,0.2,1.2,0.485,0.413,0.7,1.7,0.386,0.5,...,12.9,0.1,1.1,Arron Afflalo,SG,3.4,0.1,ORL,0.4,1.2
5,29,0.1,0.0,0.6,0.333,0.333,0.2,0.7,,0.0,...,2.3,0.1,0.5,Cole Aldrich,C,0.6,0.1,MIN,0.0,0.7
6,32,2.0,1.2,5.2,0.52,0.526,8.8,16.7,0.293,0.4,...,33.5,3.3,2.2,LaMarcus Aldridge,C,23.1,0.6,SAS,1.5,8.5
7,19,0.7,1.2,3.4,0.596,0.599,3.2,5.3,0.333,0.1,...,20.0,2.0,2.0,Jarrett Allen,C,8.2,0.4,BRK,1.1,5.4
8,25,0.7,0.1,0.4,0.273,0.545,0.3,0.6,0.0,0.0,...,5.9,0.2,0.8,Kadeem Allen,PG,1.1,0.2,BOS,0.5,0.6
9,36,0.4,0.1,1.2,0.505,0.506,1.8,3.6,0.333,0.2,...,12.4,0.9,2.2,Tony Allen,SF,4.7,0.5,NOP,0.9,2.1


In [11]:
df.to_csv('~/Desktop/per_game_2018.csv')