### Data Scraping

In [1]:
import os
import numpy as np
import pandas as pd

from basketball_reference_scraper.teams import get_roster_stats as grs
from basketball_reference_scraper.utils import remove_accents

from tqdm.notebook import tqdm
from constants import DATA_DIR
from constants import SEASONS
from constants import TEAMS

Using the `basketball_reference_scraper` package, player stats can be scraped relatively easily.

In [8]:
def get_roster_stats(team, season):
    """Fetch the roster stats for one team in one season.

    Thin pass-through to ``basketball_reference_scraper.teams.get_roster_stats``
    (imported in this notebook as ``grs``).

    Args:
        team: Team identifier, forwarded unchanged to ``grs``.
        season: Season identifier, forwarded unchanged to ``grs``.

    Returns:
        Whatever ``grs(team, season)`` returns; the rest of this notebook
        treats it as a pandas DataFrame.
    """
    return grs(team, season)

Some players were traded midseason and therefore appear more than once. Let's define a function to merge their rows.

In [9]:
def _to_numeric_where_possible(column):
    """Return ``column`` converted to a numeric dtype, or unchanged if it cannot be parsed."""
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def merge_duplicates(df):
    """Collapse players that appear more than once into a single row each.

    For every value of ``PLAYER`` occurring on more than one row, the rows are
    combined as follows:

    * ``G`` and ``GS`` are summed across the rows.
    * The stats listed in ``averaged_stats`` are averaged, each row weighted by
      its share of the player's total games (``G / total G``).
    * ``FG%``, ``3P%``, ``2P%``, ``eFG%`` and ``FT%`` are recomputed from the
      merged makes/attempts; a percentage with zero attempts is reported as 0.
    * Every other column keeps the value from the player's first row.

    Args:
        df: DataFrame with at least a ``PLAYER`` column; when duplicates exist
            it must also contain ``G``, ``GS`` and every column named in
            ``averaged_stats`` (missing columns raise ``KeyError``).

    Returns:
        ``df`` itself when no player is duplicated. Otherwise a new DataFrame
        holding the rows of non-duplicated players in their original order,
        followed by one merged row per duplicated player (sorted by player
        name). The input frame is not modified.
    """
    averaged_stats = ['MP', 'FG', 'FGA', '3P', '3PA', '2P', '2PA', 'FT', 'FTA',
                      'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS']

    appearances = df.groupby('PLAYER').size()
    traded_players = appearances[appearances > 1].index
    if len(traded_players) == 0:
        return df

    merged_rows = []
    for player in traded_players:
        stints = df.loc[df['PLAYER'] == player].fillna(0)
        stints = stints.apply(_to_numeric_where_possible)

        # Copy so the edits below never write through to a slice of `df`.
        merged = stints.iloc[0].copy()
        total_games = stints['G'].sum()
        merged['G'] = total_games
        merged['GS'] = stints['GS'].sum()

        # Each stint contributes in proportion to the games played in it.
        # With zero total games every weight is 0, so the stats come out as 0
        # instead of NaN.
        if total_games:
            weights = stints['G'] / total_games
        else:
            weights = stints['G'] * 0.0
        for stat in averaged_stats:
            merged[stat] = (stints[stat] * weights).sum()

        # Recalculate the percentage stats from the merged makes/attempts.
        merged['FG%'] = _safe_ratio(merged['FG'], merged['FGA'])
        merged['3P%'] = _safe_ratio(merged['3P'], merged['3PA'])
        merged['2P%'] = _safe_ratio(merged['2P'], merged['2PA'])
        merged['eFG%'] = _safe_ratio(merged['FG'] + 0.5 * merged['3P'], merged['FGA'])
        merged['FT%'] = _safe_ratio(merged['FT'], merged['FTA'])

        merged_rows.append(merged.fillna(0))

    # DataFrame.append was removed in pandas 2.0; build the result with a
    # single concat instead of growing the frame once per player.
    untouched = df[~df['PLAYER'].isin(traded_players)]
    merged_frame = pd.DataFrame(merged_rows).infer_objects()
    return pd.concat([untouched, merged_frame])

Scrape the data from the website.

In [10]:
# Make sure the output directory exists; exist_ok avoids the check-then-create
# race and also creates DATA_DIR itself if it is missing.
raw_dir = os.path.join(DATA_DIR, 'Raw')
os.makedirs(raw_dir, exist_ok=True)

for season in tqdm(SEASONS):
    # Collect every team's roster first and concatenate once: DataFrame.append
    # was removed in pandas 2.0 and growing a frame per team is quadratic.
    team_frames = []
    for team in tqdm(TEAMS):
        team_frames.append(get_roster_stats(team, season))
    df = pd.concat(team_frames)

    df = merge_duplicates(df)

    # Strip accents from player names. The .str methods return a new Series,
    # so the result has to be assigned back to take effect.
    df['PLAYER'] = (
        df['PLAYER']
        .str.normalize('NFKD')
        .str.encode('ascii', errors='ignore')
        .str.decode('utf-8')
    )
    df = df.fillna(0)

    # One CSV per season, named like "2008-09.csv" for season 2009.
    df.to_csv(os.path.join(raw_dir, f'{season-1}-{str(season)[2:]}.csv'), index=False)

HBox(children=(FloatProgress(value=0.0, max=13.0), HTML(value='')))

2009


HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))

2009
2009




KeyboardInterrupt: 