# Extract Stats from Basketball-Reference.com

2017-11

Scrap data from basketball-reference.com, using pd.read_html, BeaultifulSoup, Multiprocessing, & Other python modules.

## Imports, Constants, Utilities

### Imports

In [36]:
%%time

import os
import sys
import datetime
import re

import json
import pickle
import urllib
import pandas as pd
import numpy as np
import google
import random
import time
import multiprocessing
import gspread
import unidecode
import tqdm
import pprint

from collections import OrderedDict
from gspread import WorksheetNotFound
from bs4 import BeautifulSoup
from oauth2client.service_account import ServiceAccountCredentials


pd.set_option("display.max_columns", 100)


DATETIME_STRING_FORMAT = '%Y-%m-%d %H:%M:%S'

# Tables to retrieve for each player, by table html ids
TABLE_IDS = [
  'per_game',
  'totals',
  'per_minute', # per 36 minutes
  'per_poss', # per 100 possessions
  'advanced', # advanced
    
  'playoffs_per_game',
  'playoffs_totals',
  'playoffs_per_minute', # playoffs per 36 minutes
  'playoffs_per_poss', # playoffs per 100 possessions
  'playoffs_advanced', 
    
  'all_star',
  'all_college_stats',
  'all_salaries',
]

print('Current TABLE_IDS length: ', len(TABLE_IDS))

Current TABLE_IDS length:  13
CPU times: user 662 µs, sys: 517 µs, total: 1.18 ms
Wall time: 1.04 ms


In [2]:
%%time

def merge_list_of_list(nested_list):
    flattened_list = [item for lst in nested_list for item in lst]
    return flattened_list

test_list = [['a'], ['b']]
merge_list_of_list(test_list)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 4.77 µs


In [3]:
%%time

# Utility function to merge retrived data tables into 1 dictionary.
def merge_list_of_dict(list_of_dict):
    merged_dict = {}
    for dictionary in list_of_dict:
        merged_dict.update(dictionary)
    # Sort by dictionary key
    ordered_dict = OrderedDict(sorted(merged_dict.items(), key=lambda t: t[0]))
    return ordered_dict

test_list = [
    {'michael jordan': {'tables': {}, 'missing_tables': 'none', 'url': 'diety'}},
    {'kobe bryant': {'tables': {}, 'missing_tables': 'none', 'url': 'godly'}},
]

dic = merge_list_of_dict(test_list)
print(dic)

OrderedDict([('kobe bryant', {'tables': {}, 'missing_tables': 'none', 'url': 'godly'}), ('michael jordan', {'tables': {}, 'missing_tables': 'none', 'url': 'diety'})])
CPU times: user 133 µs, sys: 106 µs, total: 239 µs
Wall time: 211 µs


In [4]:
%%time

def sanitize_string(raw_string):
    sanitized_string = unidecode.unidecode(raw_string)
    sanitized_string = sanitized_string.strip().lower()
    sanitized_string = sanitized_string.replace("'", "")
    sanitized_string = sanitized_string.replace('"', '') 
    sanitized_string = sanitized_string.replace('.', '')
    if "," in sanitized_string:
        lst = sanitized_string.split(",")
        lst.reverse()
        lst = [token.strip() for token in lst]
        sanitized_string = " ".join(lst)
    return sanitized_string

print(sanitize_string("Shaquille O'neal"))
print(sanitize_string("Bryant, Kobe"))
print(sanitize_string(" CarTer, Vince .."))

shaquille oneal
kobe bryant
vince carter
CPU times: user 458 µs, sys: 515 µs, total: 973 µs
Wall time: 674 µs


In [5]:
%%time

def sanitize_list(raw_list):
            
    sanitized_list = [sanitize_string(raw_string) for raw_string in raw_list]
    return sanitized_list

test_list = ["Shaquille O'neal", "J. J. Reddick", "VinCe Carter ", "Bryant, Kobe"]

print(sanitize_list(test_list))

['shaquille oneal', 'j j reddick', 'vince carter', 'kobe bryant']
CPU times: user 90 µs, sys: 32 µs, total: 122 µs
Wall time: 126 µs


In [6]:
%%time

def dedupe_list(lst):
    return list(set(lst))

print(dedupe_list(['a', 'a', 'a', 'a', 'a', 'a', 'b', 'b', 'b', 'b', 'b', ]))

['b', 'a']
CPU times: user 58 µs, sys: 27 µs, total: 85 µs
Wall time: 89.2 µs


### Load Spreadsheets

In [7]:
def load_list_from_worksheet(spreadsheet_name, worksheet_name):
  
    scope = ['https://spreadsheets.google.com/feeds']
    credentials = ServiceAccountCredentials.from_json_keyfile_name('Data-35df9a696bc1.json', scope)
    gc = gspread.authorize(credentials)

    spreadsheet = gc.open(spreadsheet_name)
    worksheet = spreadsheet.worksheet(worksheet_name)

    rows = worksheet.get_all_values()
    
    first_row = rows[0]
    first_cell = first_row[0]
    
    try:
        timestamp = datetime.datetime.strptime(first_cell, DATETIME_STRING_FORMAT)
        rows.remove(first_row)
    except ValueError:
        timestamp = None

    print(
        'LOADED > {num_rows} rows from '
        'spreadsheet: "{spreadsheet_name}" | '
        'worksheet: "{worksheet_name}" | '
        'timestamp: {timestamp}'.format(
            num_rows=len(rows), spreadsheet_name=spreadsheet_name, 
            worksheet_name=worksheet_name, timestamp=timestamp), '\n')

    df = merge_list_of_list(rows)
    
    return df

worksheet = load_list_from_worksheet('test_spreadsheet', 'test')
print(worksheet[:10])

LOADED > 100 rows from spreadsheet: "test_spreadsheet" | worksheet: "test" | timestamp: 2017-11-24 09:29:12 

['michale', 'kobe', '0', '1', '2', '3', '4', '5', '6', '7']


In [8]:
%%time

def save_list_to_worksheet(lst, spreadsheet_name, worksheet_name, add_timestamp=True, overwrite=False):
    scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
    credentials = ServiceAccountCredentials.from_json_keyfile_name('Data-35df9a696bc1.json', scope)
    gc = gspread.authorize(credentials)
    
    spreadsheet = gc.open(spreadsheet_name)
    
    if type(lst) is not list:
        print('ERROR: input item is not a list!')
        return False
    
    # Remove rows with None value
    original_length = len(lst)
    lst = [item for item in lst if item is not None]
    new_length = len(lst)
    
    try:
        worksheet = spreadsheet.worksheet(worksheet_name)
        if overwrite:
            new_worksheet_name = worksheet_name + "_new"
            new_worksheet = spreadsheet.add_worksheet(new_worksheet_name, len(lst), 1)
            spreadsheet.del_worksheet(worksheet)
            new_worksheet.update_title(worksheet_name)
        else:
            print('Worksheet "{worksheet_name}" already exist! Please set overwrite=True to overwrite.')
            return False
    except WorksheetNotFound: 
        new_worksheet = spreadsheet.add_worksheet(worksheet_name, len(lst), 1)
    
    range_notation = 'A1:A{last_row_index}'.format(last_row_index=len(lst))
    
    cells_to_update = new_worksheet.range(range_notation)

    print('Remove {num_row} rows with "None" as their value.'.format(
        num_row=(original_length - new_length)))
    
    for cell, item in zip(cells_to_update, lst):
        cell.value = item
    
    new_worksheet.update_cells(cells_to_update)
    
    #Add a timestamp in the 1st cell
    if add_timestamp:
        timestamp = datetime.datetime.now()
        new_worksheet.insert_row(
            [timestamp], 1)
    
    print(
    'SAVED > {num_rows} rows to '
    'spreadsheet: "{spreadsheet_name}" | '
    'worksheet: "{worksheet_name}" | '
    'timestamp: {timestamp}'.format(
        num_rows=len(lst), spreadsheet_name=spreadsheet_name, 
        worksheet_name=worksheet_name, timestamp=timestamp), '\n')
    
    return True

test_lst = ['michale', 'kobe'] + [i for i in range(98)]
print(len(test_lst))
save_list_to_worksheet(test_lst, 'test_spreadsheet', 'test', add_timestamp=True, overwrite=True)

100
Remove 0 rows with "None" as their value.
SAVED > 100 rows to spreadsheet: "test_spreadsheet" | worksheet: "test" | timestamp: 2017-11-24 09:32:11.257191 

CPU times: user 98.5 ms, sys: 14 ms, total: 112 ms
Wall time: 3.55 s


### Save & Load Pickle

In [9]:
%%time

test = {
    'words': """
        Lorem ipsum dolor sit amet, consectetur adipiscing 
        elit. Mauris adipiscing adipiscing placerat. 
        Vestibulum augue augue, 
        pellentesque quis sollicitudin id, adipiscing.
        """,
    'list': list(range(10000)),
    'dict': dict((str(i),'a') for i in range(10000)),
    'int': 100,
    'float': 100.123456
}

def sizeof_fmt(num, suffix='B'):
    for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']:
        if abs(num) < 1024.0:
            return "%3.1f %s%s" % (num, unit, suffix)
        num /= 1024.0
    return "%.1f %s%s" % (num, 'Yi', suffix)

def get_file_size(filename):
    statinfo = os.stat(filename)
    return sizeof_fmt(statinfo.st_size)

def save_pickle(item, filename):
    with open(filename, 'wb') as file:
        pickle.dump(item, file)
    print(
        '\n'
        'SAVED  > ',
        filename, ' | ', 
        get_file_size(filename), ' | ',
        'length: ', len(item),
        '\n')
    return True;

def load_pickle(filename):
    with open(filename, 'rb') as file:
        obj = pickle.load(file)
        print(
        '\n'
        'LOADED > ',
        filename, ' | ', 
        get_file_size(filename), ' | ',
        'length: ', len(obj),
        '\n')
        return obj

save_pickle(test, 'test.pickle')

len(load_pickle('test.pickle'))


SAVED  >  test.pickle  |  183.8 KiB  |  length:  5 


LOADED >  test.pickle  |  183.8 KiB  |  length:  5 

CPU times: user 10.5 ms, sys: 2 ms, total: 12.5 ms
Wall time: 13.6 ms


## Reshape & create dataframes

In [44]:
%%time

all_players_stats_tables = load_pickle('all_players_stats_tables.pickle')
all_players_attributes = load_pickle('all_players_attributes.pickle')

all_players_attributes_df = pd.DataFrame.from_dict(all_players_attributes, orient='index')
print(all_players_attributes_df.shape)

all_players_dfs = {}
for table_id in TABLE_IDS:
    l_dfs = []
    l_keys = []
    for name, player in all_players_stats_tables.items():
        df = player.get('tables', {}).get(table_id, None)
        if df is not None:
            l_dfs.append(df)
            l_keys.append(name)
    all_players_dfs[table_id] = pd.concat(l_dfs, keys=l_keys)

all_players_dfs['all_players_attributes'] = all_players_attributes_df

save_pickle(all_players_dfs, 'all_players_dfs.pickle')

print('# of Dataframes created: ', len(all_players_dfs))
for key in all_players_dfs.keys():
    print(key, all_players_dfs[key].shape)


LOADED >  all_players_stats_tables.pickle  |  97.6 MiB  |  length:  3997 


LOADED >  all_players_attributes.pickle  |  5.8 MiB  |  length:  3997 

(3997, 19)

SAVED  >  all_players_dfs.pickle  |  51.5 MiB  |  length:  14 

# of Dataframes created:  14
per_game (30982, 29)
totals (29804, 29)
per_minute (29804, 28)
per_poss (24489, 30)
advanced (29804, 28)
playoffs_per_game (12319, 29)
playoffs_totals (12319, 29)
playoffs_per_minute (12319, 28)
playoffs_per_poss (9756, 30)
playoffs_advanced (12319, 26)
all_star (2154, 24)
all_college_stats (14251, 3)
all_salaries (15807, 3)
all_players_attributes (3997, 19)
CPU times: user 1min 12s, sys: 1.24 s, total: 1min 14s
Wall time: 1min 19s


## Get Players

### Get players from a single URL

In [None]:
def get_players(url):
    
    href_pattern = re.compile('^\/players\/.\/[a-z0-9]*.html$')
    href_prefix = 'https://www.basketball-reference.com'
    
    page = urllib.request.urlopen(url)
    html = page.read()

    # Get the player name
    soup = BeautifulSoup(html, 'html.parser')
    
    els = soup.find_all('a', href=href_pattern)
    
    players = {}
    
    for el in els:
        if el.parent.name == 'td':
            player_name = sanitize_string(el.text)
            player_url = ''.join([href_prefix, el['href']])
            players[player_name] = player_url
    
    randomized_sleep_time = 5 + np.random.exponential(1, 1)[0]
    time.sleep(randomized_sleep_time)
    
    print('Scrapped {url} | Players Found: {len}'.format(url=url, len=len(players)))
    sys.stdout.flush()

    return players

players = get_players('https://www.basketball-reference.com/leagues/NBA_1967_totals.html')

### Get players from a list of URLs, Multiprocessing

In [None]:
%%time

def get_players_from_urls(urls, num_processes):
    p = multiprocessing.Pool(processes=num_processes)
    outputs = p.map(get_players, urls)
    p.close()
    final_output = merge_list_of_dict(outputs)
    print(
        'Scrapped {num_url} urls, found {num_player} players.'.format(
            num_url=len(urls), num_player=len(final_output)), '\n')
    return final_output

test_urls = [
    'https://www.basketball-reference.com/leagues/NBA_2015_totals.html',
    'https://www.basketball-reference.com/leagues/NBA_2010_totals.html'
]

players = get_players_from_urls(test_urls, 2)

## GET URL (DEPRECATED, replaced by "Get Players")

### Get URL for a player name

In [None]:
# %%time

# #TODO(jameshu): Add logic to verify the url returned  in fact matches the player name
# # Currently, even gibberish player_name e.g. "James Hu" would have results returned.

# def get_url_title(url):
#     page = urllib.request.urlopen(url)
#     soup = BeautifulSoup(page, "html.parser")
#     return soup.title.text

# def get_url(player_name):       
#     query = (
#         'site:www.basketball-reference.com/players/*/*.html '
#         '{player_name} Overview').format(player_name=player_name)
#     print('query: ', query)

#     results = google.search(query=query, start=0, stop=1)
#     urls = list(results)        
    
#     time.sleep(random.randint(5, 10))
    
#     if urls:
#         return {player_name: urls[0]}
#     else:
#         print('url found: None')
#         return {player_name: None}
        
# # print(get_url('Michael Jordan'))

### Get URLs for a list of player names, MULTIPROCESSING

In [None]:
# %%time

# def get_urls(player_names, num_processes):
#     p = multiprocessing.Pool(processes=num_processes)
#     outputs = p.map(get_url, player_names)
#     p.close()
#     return merge_list_of_dict(outputs)

# # print(get_urls(test_names[0:2], 2))

## Get stats tables

### Get stats table for an url

In [None]:
%matplotlib inline

# Test out the sleep function

# mu, sigma = 0, 1
# s = np.random.normal(mu, sigma, 1000)
# pd.Series(s).hist()

# mu, sigma = 0, 1
# s = np.random.exponential(1, 100000)
# pd.Series(s).hist()


In [None]:
%%time

def get_stats_table(url):
        
    output = {}
    
    page = urllib.request.urlopen(url)
    urlHtml = page.read().decode()
    
    # Uncomment the tables
    uncommentedUrlHtml = urlHtml.replace('-->', '')
    uncommentedUrlHtml = uncommentedUrlHtml.replace('<!--', '')

    soup = BeautifulSoup(uncommentedUrlHtml, 'lxml')
    
    player_name = sanitize_string(soup.find("h1").text)
    output.setdefault(player_name, {}).setdefault('url', url);

    tags = soup.find_all('table')
    
    tables = {}
    missing_table_ids = list(TABLE_IDS) # MAKE A COPY
    
    for tag in tags:
        table_id = tag.get('id')
        if table_id in TABLE_IDS:
            table = pd.read_html(str(tag), header=0, index_col=0)[0]
            tables[table_id] = table
            missing_table_ids.remove(table_id)

    for dataframe in tables.values():
        dataframe.drop([col_name for col_name in dataframe.columns if 'Unnamed' in col_name], axis=1, inplace=True)
       
    output[player_name].setdefault('tables', tables);
    output[player_name].setdefault('missing_tables', missing_table_ids);
    
    randomized_sleep_time = 5 + np.random.exponential(1, 1)[0]
    time.sleep(randomized_sleep_time)
    
    processing_info = (
        '{player_name} | Found: {num_table} | '
        'slept: {randomized_sleep_time}'.format(
            player_name=player_name, 
            num_table=len(output[player_name]['tables']), 
            missing_tables=missing_table_ids,
            randomized_sleep_time=randomized_sleep_time))
    
    print(processing_info)
    sys.stdout.flush()

    return output

table = get_stats_table('https://www.basketball-reference.com/players/b/bellawa01.html')

In [None]:
table['walt bellamy']['tables']['totals']

In [None]:
table['walt bellamy']['missing_tables']

In [None]:
table['walt bellamy']['url']

### Get stats tables for a list of urls, MULTIPROCESSING

In [None]:
%%time

def get_stats_tables(urls, num_processes):
    pool = multiprocessing.Pool(processes=num_processes)
    jobs = pool.imap_unordered(get_stats_table, urls)
    size = len(urls)
    outputs = tqdm.tqdm_notebook(jobs, total=size)
    pool.close()
#     pool.join()
    return merge_list_of_dict(outputs)

test_urls = [
    'https://www.basketball-reference.com/players/b/bellawa01.html',
    'https://www.basketball-reference.com/players/j/jordami01.html'
]

tables = get_stats_tables(test_urls, 2)
print('obj length: ', len(tables))

In [None]:
tables['michael jordan']['tables']

## Get Player Attributes

### Get a player's attributes from an URL

In [None]:
%%time


def get_player_attributes(url):

    output = {}
    
    try:

        page = urllib.request.urlopen(url)
        urlHtml = page.read().decode()

        # Uncomment the tables
        uncommentedUrlHtml = urlHtml.replace('-->', '')
        uncommentedUrlHtml = uncommentedUrlHtml.replace('<!--', '')

        soup = BeautifulSoup(uncommentedUrlHtml, 'lxml')

        player_name = sanitize_string(soup.find("h1").text)
        output.setdefault(player_name, {}).setdefault('url', url)
        output.setdefault(player_name, {}).setdefault('missing_attributes', [])

        # Get all info, for future extraction
        tag = soup.find('div', attrs={'id': 'info'})
        if tag:
            player_info = tag
#             output['player_info_raw'] = tag
        else:
            output[player_name]['missing_attributes'].append('player_info_raw')

        tag = player_info.find('strong',text=re.compile('.*Position:.*'))
        if tag:
            position = tag.parent.contents[2]
            position = position.replace('\n','').replace('▪','').strip()
            position = re.sub('\s{2,}', ' ', position)
            shooting_hand = tag.parent.contents[-1]
            shooting_hand = shooting_hand.replace('\n','').replace('▪','').strip()
            shooting_hand = re.sub('\s{2,}', ' ', shooting_hand)
            output[player_name]['position'] = position
            output[player_name]['shooting_hand'] = shooting_hand
        else:
            output[player_name]['missing_attributes'].extend(['position', 'shooting_hand'])

        tag = player_info.find('strong',text=re.compile('.*High School:.*'))
        if tag:
            school = tag.parent.text.replace('\n', '').split(':')
            school = school[-1].strip()
            school = re.sub('\s{2,}', ' ', school)
            output[player_name]['high_school'] = school
        else:
            output[player_name]['missing_attributes'].append('high_school')

        tag = player_info.find('strong',text=re.compile('.*College:.*'))
        if tag:
            tag = tag.parent.find('a')
            if tag:
                college = tag.text.strip()
                output[player_name]['college'] = college
            else: 
                output[player_name]['missing_attributes'].append('college')
        else:
            output[player_name]['missing_attributes'].append('college')

        tag = player_info.find('strong',text=re.compile('.*Recruiting Rank:.*'))
        if tag:
            recruiting_rank = tag.parent.text.strip()
            recruiting_rank = re.search('\(([0-9]*)\)', recruiting_rank).group(1)
            output[player_name]['recruiting_rank'] = recruiting_rank
        else:
            output[player_name]['missing_attributes'].append('recruiting_rank')

        tag = player_info.find('strong',text=re.compile('.*Draft:.*'))
        if tag:
            draft = tag.parent.text.replace('\n', '').split(':')
            draft = draft[-1].strip()
            draft = re.sub('\s{2,}', ' ', draft)
            output[player_name]['draft'] = draft
        else:
            output[player_name]['missing_attributes'].append('draft')

        tag = player_info.find('strong',text=re.compile('.*Debut:.*'))
        if tag:
            nba_debute = tag.parent.contents[2]
            output[player_name]['nba_debut'] = nba_debute.text.strip()
        else:
            output[player_name]['missing_attributes'].append('nba_debut')            

        href_pattern = re.compile('^https://twitter.com/.*$')
        tag = player_info.find('a', href=href_pattern)
        if tag:
            output[player_name]['twitter'] = tag['href'].strip()
        else:
            output[player_name]['missing_attributes'].append('twitter')  

        tag = player_info.find('span', attrs={'itemprop': 'birthDate'})
        if tag:
            output[player_name]['birth_date'] = tag['data-birth'].strip()
        else:
            output[player_name]['missing_attributes'].append('birth_date')  

        tag = player_info.find('span', attrs={'itemprop': 'birthPlace'})
        if tag:
            tag = tag.find('a')
            if tag:
                output[player_name]['birth_place'] = tag.text.strip()
            else:
                output[player_name]['missing_attributes'].append('birth_place')
        else:
            output[player_name]['missing_attributes'].append('birth_place')  

        tag = player_info.find('span', attrs={'itemprop': 'height'})
        if tag:
            output[player_name]['height'] = tag.text.strip()
        else:
            output[player_name]['missing_attributes'].append('height')  

        tag = player_info.find('span', attrs={'itemprop': 'weight'})
        if tag:
            output[player_name]['weight'] = tag.text.strip()
        else:
            output[player_name]['missing_attributes'].append('weight')  

        tags = player_info.find('ul', attrs={'id': 'bling'})
        if tags:
            tags = tags.find_all('a')
            if tags:
                output[player_name]['honors'] = []
                for tag in tags:
                    output[player_name]['honors'].append(tag.text.strip())
            else:
                output[player_name]['missing_attributes'].append('honors')
        else:
            output[player_name]['missing_attributes'].append('honors')  

        tag = soup.find('p', text=re.compile('.*Chinese:.*'))
        if tag:
            chinese_name = tag.text.split(':')[-1].replace('數據','').strip()
            output[player_name]['chinese_name'] = chinese_name
        else:
            output[player_name]['missing_attributes'].append('chinese_name')  

        tags = soup.find_all('p', attrs={'class': 'transaction '})
        if tags:
            for tag in tags:
                transaction_date = tag.find('strong').text.strip()
                transaction = tag.text.split(':')[-1].strip()
                transaction = re.sub('\s{2,}', ' ', transaction)
                output[player_name].setdefault('transactions', {})[transaction_date] = transaction
        else:
            output[player_name]['missing_attributes'].append('transactions')  

        tag = player_info.find('p', text=re.compile('.*\(.*\).*'))
        if tag:
            nicknames = tag.text.replace('\n','').split(',')
            nicknames = [nickname.replace('(','').replace(')','').strip() for nickname in nicknames]
            output[player_name]['nicknames'] = nicknames
        else:
            output[player_name]['missing_attributes'].append('nicknames')  

        tags = player_info.find_all('svg', attrs={'class': 'jersey'})
        if tags:
            for tag in tags:
                jersey_number = tag.find('text').text.strip()
                team = tag.parent['data-tip'].strip()
                output[player_name].setdefault('numbers', {})[jersey_number] = team
        else:
            output[player_name]['missing_attributes'].append('numbers')  


        randomized_sleep_time = 5 + np.random.exponential(1, 1)[0]
        time.sleep(randomized_sleep_time)

        processing_info = (
            '{player_name} | Missing: {num_missing} | '
            'slept: {randomized_sleep_time}'.format(
                player_name=player_name, 
                num_missing=len(output[player_name]['missing_attributes']), 
                randomized_sleep_time=randomized_sleep_time))

        print(processing_info)
        sys.stdout.flush()
    
    except Exception as e:
        print(url, " FAILED! | ", str(e))
        output[url] = e
                
    return output

# test_url = 'https://www.basketball-reference.com/players/m/mingya01.html'
# test_url = "https://www.basketball-reference.com/players/b/bellawa01.html"
# test_url = 'https://www.basketball-reference.com/players/b/bryanko01.html'
# test_url = 'https://www.basketball-reference.com/players/r/redicjj01.html'
# test_url = 'https://www.basketball-reference.com/players/n/novakst01.html'
# test_url = 'https://www.basketball-reference.com/players/j/jordami01.html'
# test_url = 'https://www.basketball-reference.com/players/h/hairsal01.html'
test_url = 'https://www.basketball-reference.com/players/h/henryal01.html'
    
test_output = get_player_attributes(test_url)
pprint.pprint(test_output)

In [None]:
%%time

def get_players_attributes(urls, num_processes):
    pool = multiprocessing.Pool(processes=num_processes)
    jobs = pool.imap_unordered(get_player_attributes, urls)
    size = len(urls)
    outputs = tqdm.tqdm_notebook(jobs, total=size)
    pool.close()
#     pool.join()
    return merge_list_of_dict(outputs)

test_urls = [
    'https://www.basketball-reference.com/players/b/bellawa01.html',
    'https://www.basketball-reference.com/players/j/jordami01.html',
    'https://www.basketball-reference.com/players/n/novakst01.html',
    'https://www.basketball-reference.com/players/m/mingya01.html',
    'https://www.basketball-reference.com/players/b/bryanko01.html',
    'https://www.basketball-reference.com/players/r/redicjj01.html'
]

attributes = get_players_attributes(test_urls, 4)
print('obj length: ', len(tables))

In [42]:
save_pickle(all_players_dfs, 'all_players_dfs.pickle')


SAVED  >  all_players_dfs.pickle  |  51.5 MiB  |  length:  14 



True

## Burner

In [11]:
sample_dict = {
    "a": {"a_attribute_1": "adfa", "a_attribute_2": {}, "a_attribute_3": ['a', 'b', 'c'], "a_attribute_4": '123'},
    "b": {"b_attribute_1": "erqwerdfa", "b_attribute_2": {}, "b_attribute_3": ['a', 'b', 'c']}
}

pd.DataFrame.from_dict(sample_dict, orient='index')

Unnamed: 0,a_attribute_1,a_attribute_2,a_attribute_3,a_attribute_4,b_attribute_1,b_attribute_2,b_attribute_3
a,adfa,{},"[a, b, c]",123.0,,,
b,,,,,erqwerdfa,{},"[a, b, c]"


In [None]:
# %%time

# all_players = load_pickle('all_players.pickle')
# all_players_attributes = get_players_attributes(list(all_players.values()), 4)
# save_pickle(all_players_attributes, 'all_players_attributes.pickle')

In [None]:
# %%time

# all_players_stats_tables = get_stats_tables(list(all_players.values()), 4)

# save_pickle(all_players_stats_tables, 'all_players_stats_tables.pickle')

# print(all_players_stats_tables['lebron james']['tables'].keys())
# all_players_stats_tables['lebron james']['tables']['per_game']

In [None]:
# %%time

# leads_urls = load_list_from_worksheet('nba_player_names', 'leads_urls')

# all_players = get_players_from_urls(leads_urls, 4)

# save_pickle(all_players, 'all_players.pickle')

In [None]:
# worksheet = load_list_from_worksheet('nba_players_sanitized', 'hof')
# hof_names = sanitize_list(worksheet[0].tolist())
# print(hof_names)

# hof_urls = get_urls(hof_names, 30)

In [None]:
# worksheet = load_list_from_worksheet('nba_players_sanitized', 'retired_all_stars')
# retired_all_stars_names = sanitize_list(worksheet[0].tolist())
# print(retired_all_stars_names)

# retired_all_stars_urls = get_urls(retired_all_stars_names, 30)

In [None]:
# worksheet = load_list_from_worksheet('nba_players_sanitized', 'retired_all_nbas')
# retired_all_nbas_names = sanitize_list(worksheet[0].tolist())
# print(retired_all_nbas_names)

# retired_all_nbas_urls = get_urls(retired_all_nbas_names, 30)

In [None]:
# worksheet = load_list_from_worksheet('nba_players_sanitized', '2015')
# players_2015_names = sanitize_list(worksheet[0].tolist())
# print(players_2015_names)

# players_2015_urls = get_urls(players_2015_names, 30)

In [None]:
# save_pickle(hof_urls, 'hof_urls.pickle')
# save_pickle(retired_all_stars_urls, 'retired_all_stars_urls.pickle')
# save_pickle(retired_all_nbas_urls, 'retired_all_nbas_urls.pickle')
# save_pickle(players_2015_urls, 'players_2015_urls.pickle')

In [None]:
# %%time

# save_list_to_worksheet(list(hof_urls.values()), 'nba_player_urls', 'hof_urls', overwrite=True)
# save_list_to_worksheet(list(retired_all_nbas_urls.values()), 'nba_player_urls', 'retired_all_nbas_urls', overwrite=True)
# save_list_to_worksheet(list(retired_all_stars_urls.values()), 'nba_player_urls', 'retired_all_stars_urls', overwrite=True)
# save_list_to_worksheet(list(players_2015_urls.values()), 'nba_player_urls', 'players_2015_urls', overwrite=True)

## Get Tables

In [None]:
# %%time

# url_list = [url for url in hof_urls.values() if url is not None]
# print(len(url_list))
# hof_tables = get_tables(url_list, 4)
# save_pickle(hof_tables, 'hof_tables.pickle')

In [None]:
# %%time

# url_list = [url for url in retired_all_nbas_urls.values() if url is not None]
# print(len(url_list))
# retired_all_nbas_tables = get_tables(url_list, 4)
# save_pickle(retired_all_nbas_tables, 'retired_all_nbas_tables.pickle')

In [None]:
# %%time

# url_list = [url for url in retired_all_stars_urls.values() if url is not None]
# print(len(url_list))
# retired_all_stars_tables = get_tables(url_list, 4)
# save_pickle(retired_all_stars_tables, 'retired_all_stars_tables.pickle')

In [None]:
# %%time

# url_list = [url for url in players_2015_urls.values() if url is not None]
# print(len(url_list))
# players_2015_tables = get_tables(url_list, 4)
# save_pickle(players_2015_tables, 'players_2015_tables.pickle')