In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [2]:
def stattaboy(year, stattype, timeout=30):
    """Download one league-wide stats page and return its rows as a DataFrame.

    Parameters
    ----------
    year : int or str
        Season identifier interpolated into the page URL.
    stattype : str
        Page suffix interpolated into the URL (this notebook passes
        'totals' and 'advanced').
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    pandas.DataFrame
        One row per `<tr>` after the first; every cell is the raw text of a
        `<td>`. Rows that contain no `<td>` cells come back as all-None rows.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (e.g. 404 or 429).
    ValueError
        If the downloaded page contains no `<tr>` elements at all.
    """
    url = f"https://www.basketball-reference.com/leagues/NBA_{year}_{stattype}.html"
    page = requests.get(url, timeout=timeout)
    # Fail loudly on an error page instead of trying to parse it as a table.
    page.raise_for_status()

    soup = BeautifulSoup(page.content, 'html.parser')
    table_rows = soup.find_all('tr')  # parse the row list once and reuse it
    if not table_rows:
        raise ValueError(f"no table rows found at {url}")

    # The first header cell is skipped because body rows are built from <td>
    # cells only (presumably the leading rank cell is a <th> - confirm).
    headers = [th.get_text() for th in table_rows[0].find_all('th')][1:]
    player_stats = [
        [td.get_text() for td in row.find_all('td')]
        for row in table_rows[1:]
    ]
    return pd.DataFrame(player_stats, columns=headers)

def merge_frames(frame, aframe):
    """Join the columns of `aframe` onto `frame`, skipping shared columns.

    The identifying columns ('Player', 'Pos', 'Age', 'Tm', 'G', 'MP') and any
    blank spacer columns labelled '\\xa0' are removed from `aframe` first so
    they do not collide with the columns already present in `frame`.

    Parameters
    ----------
    frame : pandas.DataFrame
        Left-hand frame; all of its columns are kept.
    aframe : pandas.DataFrame
        Right-hand frame; it is not modified in place.

    Returns
    -------
    pandas.DataFrame
        `frame` joined (on the index) with the remaining columns of `aframe`.
    """
    redundant = ['Player', 'Pos', 'Age', 'Tm', 'G', 'MP', '\xa0']
    # One drop call instead of seven copies; errors='ignore' keeps the merge
    # working when a page has no spacer column (or lacks one of the labels).
    extra_columns = aframe.drop(columns=redundant, errors='ignore')
    return frame.join(extra_columns)

def yearsonyears(years):
    """Scrape totals and advanced stats for each year and stack them.

    For every entry in `years` the 'totals' and 'advanced' pages are fetched
    with `stattaboy`, combined with `merge_frames`, and tagged with a string
    'year' column. All seasons are then concatenated once and every row that
    contains a missing value is dropped.

    Parameters
    ----------
    years : iterable
        Season identifiers passed through to `stattaboy`.

    Returns
    -------
    pandas.DataFrame
        Stacked per-season rows with no missing values. The index holds each
        row's position in the un-filtered concatenation, so it can have gaps.
        An empty DataFrame is returned when `years` is empty.
    """
    season_frames = []
    for year in years:
        totals = stattaboy(year, 'totals')
        advanced = stattaboy(year, 'advanced')
        season = merge_frames(totals, advanced)
        season['year'] = str(year)
        season_frames.append(season)

    if not season_frames:
        return pd.DataFrame()

    # DataFrame.append was removed in pandas 2.0; a single concat also avoids
    # re-copying the accumulated frame on every iteration.
    combined = pd.concat(season_frames, ignore_index=True)
    return combined.dropna()

In [3]:
# Scrape every season from 1976 through 2019 (the stop value 2020 is exclusive).
season_years = np.arange(1976, 2020)
df = yearsonyears(season_years)
df.shape

(21648, 50)

In [4]:
# Every column except these four is converted to numbers in the next cell.
cols = df.columns.to_list()
for text_label in ('Player', 'Pos', 'Tm', 'year'):
    cols.remove(text_label)

In [5]:
# Convert the stat columns to numbers; values that cannot be parsed become NaN.
df[cols] = df[cols].apply(pd.to_numeric, errors='coerce')

In [6]:
# Discard every row that has a NaN in any column (including those produced by
# the numeric coercion above).
df = df.dropna(axis=0, how='any')

In [7]:
# Preview the first five cleaned rows.
df.head(n=5)

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,year
1352,Tiny Archibald*,PG,31,BOS,80,80.0,2864,383,794,0.482,...,17.0,5.9,2.9,8.9,0.148,1.9,-1.9,0.0,1.5,1980
1370,Larry Bird*,PF,23,BOS,82,82.0,2955,693,1463,0.474,...,25.3,5.6,5.6,11.2,0.182,3.6,1.8,5.3,5.4,1980
1403,M.L. Carr,SF,29,BOS,82,7.0,1994,362,763,0.474,...,20.2,1.9,3.2,5.2,0.125,0.6,0.7,1.3,1.7,1980
1408,Don Chaney,SG,33,BOS,60,0.0,523,67,189,0.354,...,18.3,-0.4,0.8,0.4,0.033,-3.2,0.1,-3.1,-0.1,1980
1424,Dave Cowens*,C,31,BOS,66,55.0,2159,422,932,0.453,...,20.1,1.9,3.6,5.4,0.121,0.1,1.2,1.4,1.8,1980


In [8]:
# Write the cleaned table to disk without the index column.
df.to_csv(path_or_buf='player_stats_1976-2019.csv', index=False)