# Extract Stats from Basketball-Reference.com

2017-11

Scrap data from basketball-reference.com, using pd.read_html, BeaultifulSoup, Multiprocessing, & Other python modules.

## Imports, Constants, Utilities

### Imports

In [1]:
%%time

import os
import sys
import datetime
import re

import json
import pickle
import urllib
import pandas as pd
import numpy as np
import google
import random
import time
import multiprocessing
import gspread
import unidecode
import tqdm
import pprint

from collections import OrderedDict
from gspread import WorksheetNotFound
from bs4 import BeautifulSoup
from oauth2client.service_account import ServiceAccountCredentials

DATETIME_STRING_FORMAT = '%Y-%m-%d %H:%M:%S'

CPU times: user 483 ms, sys: 89.7 ms, total: 573 ms
Wall time: 733 ms


In [2]:
%%time

def merge_list_of_list(nested_list):
    flattened_list = [item for lst in nested_list for item in lst]
    return flattened_list

test_list = [['a'], ['b']]
merge_list_of_list(test_list)

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 5.96 µs


In [3]:
%%time

# Utility function to merge retrived data tables into 1 dictionary.
def merge_list_of_dict(list_of_dict):
    merged_dict = {}
    for dictionary in list_of_dict:
        merged_dict.update(dictionary)
    # Sort by dictionary key
    ordered_dict = OrderedDict(sorted(merged_dict.items(), key=lambda t: t[0]))
    return ordered_dict

test_list = [
    {'michael jordan': {'tables': {}, 'missing_tables': 'none', 'url': 'diety'}},
    {'kobe bryant': {'tables': {}, 'missing_tables': 'none', 'url': 'godly'}},
]

dic = merge_list_of_dict(test_list)
print(dic)

OrderedDict([('kobe bryant', {'tables': {}, 'missing_tables': 'none', 'url': 'godly'}), ('michael jordan', {'tables': {}, 'missing_tables': 'none', 'url': 'diety'})])
CPU times: user 271 µs, sys: 334 µs, total: 605 µs
Wall time: 401 µs


In [4]:
%%time

def sanitize_string(raw_string):
    sanitized_string = unidecode.unidecode(raw_string)
    sanitized_string = sanitized_string.strip().lower()
    sanitized_string = sanitized_string.replace("'", "")
    sanitized_string = sanitized_string.replace('"', '') 
    sanitized_string = sanitized_string.replace('.', '')
    if "," in sanitized_string:
        lst = sanitized_string.split(",")
        lst.reverse()
        lst = [token.strip() for token in lst]
        sanitized_string = " ".join(lst)
    return sanitized_string

print(sanitize_string("Shaquille O'neal"))
print(sanitize_string("Bryant, Kobe"))
print(sanitize_string(" CarTer, Vince .."))

shaquille oneal
kobe bryant
vince carter
CPU times: user 248 µs, sys: 410 µs, total: 658 µs
Wall time: 344 µs


In [5]:
%%time

def sanitize_list(raw_list):
            
    sanitized_list = [sanitize_string(raw_string) for raw_string in raw_list]
    return sanitized_list

test_list = ["Shaquille O'neal", "J. J. Reddick", "VinCe Carter ", "Bryant, Kobe"]

print(sanitize_list(test_list))

['shaquille oneal', 'j j reddick', 'vince carter', 'kobe bryant']
CPU times: user 235 µs, sys: 301 µs, total: 536 µs
Wall time: 279 µs


In [6]:
%%time

def dedupe_list(lst):
    return list(set(lst))

print(dedupe_list(['a', 'a', 'a', 'a', 'a', 'a', 'b', 'b', 'b', 'b', 'b', ]))

['b', 'a']
CPU times: user 533 µs, sys: 1.04 ms, total: 1.57 ms
Wall time: 1.29 ms


### Load Spreadsheets

In [7]:
def load_list_from_worksheet(spreadsheet_name, worksheet_name):
  
    scope = ['https://spreadsheets.google.com/feeds']
    credentials = ServiceAccountCredentials.from_json_keyfile_name('Data-35df9a696bc1.json', scope)
    gc = gspread.authorize(credentials)

    spreadsheet = gc.open(spreadsheet_name)
    worksheet = spreadsheet.worksheet(worksheet_name)

    rows = worksheet.get_all_values()
    
    first_row = rows[0]
    first_cell = first_row[0]
    
    try:
        timestamp = datetime.datetime.strptime(first_cell, DATETIME_STRING_FORMAT)
        rows.remove(first_row)
    except ValueError:
        timestamp = None

    print(
        'LOADED > {num_rows} rows from '
        'spreadsheet: "{spreadsheet_name}" | '
        'worksheet: "{worksheet_name}" | '
        'timestamp: {timestamp}'.format(
            num_rows=len(rows), spreadsheet_name=spreadsheet_name, 
            worksheet_name=worksheet_name, timestamp=timestamp), '\n')

    df = merge_list_of_list(rows)
    
    return df

worksheet = load_list_from_worksheet('test_spreadsheet', 'test')
print(worksheet[:10])

LOADED > 100 rows from spreadsheet: "test_spreadsheet" | worksheet: "test" | timestamp: 2017-11-23 18:49:31 

['michale', 'kobe', '0', '1', '2', '3', '4', '5', '6', '7']


In [8]:
%%time

def save_list_to_worksheet(lst, spreadsheet_name, worksheet_name, add_timestamp=True, overwrite=False):
    scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
    credentials = ServiceAccountCredentials.from_json_keyfile_name('Data-35df9a696bc1.json', scope)
    gc = gspread.authorize(credentials)
    
    spreadsheet = gc.open(spreadsheet_name)
    
    if type(lst) is not list:
        print('ERROR: input item is not a list!')
        return False
    
    # Remove rows with None value
    original_length = len(lst)
    lst = [item for item in lst if item is not None]
    new_length = len(lst)
    
    try:
        worksheet = spreadsheet.worksheet(worksheet_name)
        if overwrite:
            new_worksheet_name = worksheet_name + "_new"
            new_worksheet = spreadsheet.add_worksheet(new_worksheet_name, len(lst), 1)
            spreadsheet.del_worksheet(worksheet)
            new_worksheet.update_title(worksheet_name)
        else:
            print('Worksheet "{worksheet_name}" already exist! Please set overwrite=True to overwrite.')
            return False
    except WorksheetNotFound: 
        new_worksheet = spreadsheet.add_worksheet(worksheet_name, len(lst), 1)
    
    range_notation = 'A1:A{last_row_index}'.format(last_row_index=len(lst))
    
    cells_to_update = new_worksheet.range(range_notation)

    print('Remove {num_row} rows with "None" as their value.'.format(
        num_row=(original_length - new_length)))
    
    for cell, item in zip(cells_to_update, lst):
        cell.value = item
    
    new_worksheet.update_cells(cells_to_update)
    
    #Add a timestamp in the 1st cell
    if add_timestamp:
        timestamp = datetime.datetime.now()
        new_worksheet.insert_row(
            [timestamp], 1)
    
    print(
    'SAVED > {num_rows} rows to '
    'spreadsheet: "{spreadsheet_name}" | '
    'worksheet: "{worksheet_name}" | '
    'timestamp: {timestamp}'.format(
        num_rows=len(lst), spreadsheet_name=spreadsheet_name, 
        worksheet_name=worksheet_name, timestamp=timestamp), '\n')
    
    return True

test_lst = ['michale', 'kobe'] + [i for i in range(98)]
print(len(test_lst))
save_list_to_worksheet(test_lst, 'test_spreadsheet', 'test', add_timestamp=True, overwrite=True)

100
Remove 0 rows with "None" as their value.
SAVED > 100 rows to spreadsheet: "test_spreadsheet" | worksheet: "test" | timestamp: 2017-11-23 18:51:35.299483 

CPU times: user 108 ms, sys: 14 ms, total: 122 ms
Wall time: 4.56 s


### Save & Load Pickle

In [9]:
%%time

test = {
    'words': """
        Lorem ipsum dolor sit amet, consectetur adipiscing 
        elit. Mauris adipiscing adipiscing placerat. 
        Vestibulum augue augue, 
        pellentesque quis sollicitudin id, adipiscing.
        """,
    'list': list(range(10000)),
    'dict': dict((str(i),'a') for i in range(10000)),
    'int': 100,
    'float': 100.123456
}

def sizeof_fmt(num, suffix='B'):
    for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']:
        if abs(num) < 1024.0:
            return "%3.1f %s%s" % (num, unit, suffix)
        num /= 1024.0
    return "%.1f %s%s" % (num, 'Yi', suffix)

def get_file_size(filename):
    statinfo = os.stat(filename)
    return sizeof_fmt(statinfo.st_size)

def save_pickle(item, filename):
    with open(filename, 'wb') as file:
        pickle.dump(item, file)
    print(
        '\n'
        'SAVED  > ',
        filename, ' | ', 
        get_file_size(filename), ' | ',
        'length: ', len(item),
        '\n')
    return True;

def load_pickle(filename):
    with open(filename, 'rb') as file:
        obj = pickle.load(file)
        print(
        '\n'
        'LOADED > ',
        filename, ' | ', 
        get_file_size(filename), ' | ',
        'length: ', len(obj),
        '\n')
        return obj

save_pickle(test, 'test.pickle')

len(load_pickle('test.pickle'))


SAVED  >  test.pickle  |  183.8 KiB  |  length:  5 


LOADED >  test.pickle  |  183.8 KiB  |  length:  5 

CPU times: user 10.8 ms, sys: 2.19 ms, total: 13 ms
Wall time: 13 ms


### Define & Load Constants

In [10]:
%%time

# Tables to retrieve for each player, by table html ids
TABLE_IDS = [
  'per_game',
  'totals',
  'per_minute', # per 36 minutes
  'per_poss', # per 100 possessions
  'advanced', # advanced
    
  'playoffs_per_game',
  'playoffs_totals',
  'playoffs_per_minute', # playoffs per 36 minutes
  'playoffs_per_poss', # playoffs per 100 possessions
  'playoffs_advanced', 
    
  'all_star',
  'all_college_stats',
  'all_salaries',
]

print('Current TABLE_IDS length: ', len(TABLE_IDS))

# Load player names
hof_names = sanitize_list(
    load_list_from_worksheet('nba_player_names', 'hof_names'))
retired_all_stars_names = sanitize_list(
    load_list_from_worksheet('nba_player_names', 'retired_all_stars_names'))
retired_all_nbas_names = sanitize_list(
    load_list_from_worksheet('nba_player_names', 'retired_all_nbas_names'))
players_2015_names = sanitize_list(
    load_list_from_worksheet('nba_player_names', 'players_2015_names'))

# Load URLs
hof_urls = load_list_from_worksheet('nba_player_urls', 'hof_urls')
retired_all_stars_urls = load_list_from_worksheet('nba_player_urls', 'retired_all_stars_urls')
retired_all_nbas_urls = load_list_from_worksheet('nba_player_urls', 'retired_all_nbas_urls')
players_2015_urls = load_list_from_worksheet('nba_player_urls', 'players_2015_urls')

Current TABLE_IDS length:  13
LOADED > 181 rows from spreadsheet: "nba_player_names" | worksheet: "hof_names" | timestamp: None 

LOADED > 352 rows from spreadsheet: "nba_player_names" | worksheet: "retired_all_stars_names" | timestamp: None 

LOADED > 191 rows from spreadsheet: "nba_player_names" | worksheet: "retired_all_nbas_names" | timestamp: None 

LOADED > 476 rows from spreadsheet: "nba_player_names" | worksheet: "players_2015_names" | timestamp: None 

LOADED > 131 rows from spreadsheet: "nba_player_urls" | worksheet: "hof_urls" | timestamp: 2017-11-21 13:21:32 

LOADED > 352 rows from spreadsheet: "nba_player_urls" | worksheet: "retired_all_stars_urls" | timestamp: 2017-11-21 13:21:41 

LOADED > 190 rows from spreadsheet: "nba_player_urls" | worksheet: "retired_all_nbas_urls" | timestamp: 2017-11-21 13:21:36 

LOADED > 476 rows from spreadsheet: "nba_player_urls" | worksheet: "players_2015_urls" | timestamp: 2017-11-21 13:21:46 

CPU times: user 526 ms, sys: 70 ms, total: 596

## Get Players

### Get players from a single URL

In [11]:
def get_players(url):
    
    href_pattern = re.compile('^\/players\/.\/[a-z0-9]*.html$')
    href_prefix = 'https://www.basketball-reference.com'
    
    page = urllib.request.urlopen(url)
    html = page.read()

    # Get the player name
    soup = BeautifulSoup(html, 'html.parser')
    
    els = soup.find_all('a', href=href_pattern)
    
    players = {}
    
    for el in els:
        if el.parent.name == 'td':
            player_name = sanitize_string(el.text)
            player_url = ''.join([href_prefix, el['href']])
            players[player_name] = player_url
    
    randomized_sleep_time = 5 + np.random.exponential(1, 1)[0]
    time.sleep(randomized_sleep_time)
    
    print('Scrapped {url} | Players Found: {len}'.format(url=url, len=len(players)))
    sys.stdout.flush()

    return players

players = get_players('https://www.basketball-reference.com/leagues/NBA_1967_totals.html')

Scrapped https://www.basketball-reference.com/leagues/NBA_1967_totals.html | Players Found: 123


### Get players from a list of URLs, Multiprocessing

In [12]:
%%time

def get_players_from_urls(urls, num_processes):
    p = multiprocessing.Pool(processes=num_processes)
    outputs = p.map(get_players, urls)
    p.close()
    final_output = merge_list_of_dict(outputs)
    print(
        'Scrapped {num_url} urls, found {num_player} players.'.format(
            num_url=len(urls), num_player=len(final_output)), '\n')
    return final_output

test_urls = [
    'https://www.basketball-reference.com/leagues/NBA_2015_totals.html',
    'https://www.basketball-reference.com/leagues/NBA_2010_totals.html'
]

players = get_players_from_urls(test_urls, 2)

Scrapped https://www.basketball-reference.com/leagues/NBA_2015_totals.html | Players Found: 492
Scrapped https://www.basketball-reference.com/leagues/NBA_2010_totals.html | Players Found: 442
Scrapped 2 urls, found 716 players. 

CPU times: user 15.3 ms, sys: 15.6 ms, total: 30.9 ms
Wall time: 10 s


## GET URL (DEPRECATED, replaced by "Get Players")

### Get URL for a player name

In [None]:
# %%time

# #TODO(jameshu): Add logic to verify the url returned  in fact matches the player name
# # Currently, even gibberish player_name e.g. "James Hu" would have results returned.

# def get_url_title(url):
#     page = urllib.request.urlopen(url)
#     soup = BeautifulSoup(page, "html.parser")
#     return soup.title.text

# def get_url(player_name):       
#     query = (
#         'site:www.basketball-reference.com/players/*/*.html '
#         '{player_name} Overview').format(player_name=player_name)
#     print('query: ', query)

#     results = google.search(query=query, start=0, stop=1)
#     urls = list(results)        
    
#     time.sleep(random.randint(5, 10))
    
#     if urls:
#         return {player_name: urls[0]}
#     else:
#         print('url found: None')
#         return {player_name: None}
        
# # print(get_url('Michael Jordan'))

### Get URLs for a list of player names, MULTIPROCESSING

In [None]:
# %%time

# def get_urls(player_names, num_processes):
#     p = multiprocessing.Pool(processes=num_processes)
#     outputs = p.map(get_url, player_names)
#     p.close()
#     return merge_list_of_dict(outputs)

# # print(get_urls(test_names[0:2], 2))

## Get stats tables

### Get stats table for an url

In [None]:
%matplotlib inline

# Test out the sleep function

# mu, sigma = 0, 1
# s = np.random.normal(mu, sigma, 1000)
# pd.Series(s).hist()

# mu, sigma = 0, 1
# s = np.random.exponential(1, 100000)
# pd.Series(s).hist()


In [13]:
%%time

def get_stats_table(url):
        
    output = {}
    
    page = urllib.request.urlopen(url)
    urlHtml = page.read().decode()
    
    # Uncomment the tables
    uncommentedUrlHtml = urlHtml.replace('-->', '')
    uncommentedUrlHtml = uncommentedUrlHtml.replace('<!--', '')

    soup = BeautifulSoup(uncommentedUrlHtml, 'lxml')
    
    player_name = sanitize_string(soup.find("h1").text)
    output.setdefault(player_name, {}).setdefault('url', url);

    tags = soup.find_all('table')
    
    tables = {}
    missing_table_ids = list(TABLE_IDS) # MAKE A COPY
    
    for tag in tags:
        table_id = tag.get('id')
        if table_id in TABLE_IDS:
            table = pd.read_html(str(tag), header=0, index_col=0)[0]
            tables[table_id] = table
            missing_table_ids.remove(table_id)

    for dataframe in tables.values():
        dataframe.drop([col_name for col_name in dataframe.columns if 'Unnamed' in col_name], axis=1, inplace=True)
       
    output[player_name].setdefault('tables', tables);
    output[player_name].setdefault('missing_tables', missing_table_ids);
    
    randomized_sleep_time = 5 + np.random.exponential(1, 1)[0]
    time.sleep(randomized_sleep_time)
    
    processing_info = (
        '{player_name} | Found: {num_table} | '
        'slept: {randomized_sleep_time}'.format(
            player_name=player_name, 
            num_table=len(output[player_name]['tables']), 
            missing_tables=missing_table_ids,
            randomized_sleep_time=randomized_sleep_time))
    
    print(processing_info)
    sys.stdout.flush()

    return output

table = get_stats_table('https://www.basketball-reference.com/players/b/bellawa01.html')

walt bellamy | Found: 11 | slept: 6.1447167928978725
CPU times: user 824 ms, sys: 28.5 ms, total: 852 ms
Wall time: 7.7 s


In [None]:
table['walt bellamy']['tables']['totals']

In [None]:
table['walt bellamy']['missing_tables']

In [None]:
table['walt bellamy']['url']

### Get stats tables for a list of urls, MULTIPROCESSING

In [14]:
%%time

def get_stats_tables(urls, num_processes):
    pool = multiprocessing.Pool(processes=num_processes)
    jobs = pool.imap_unordered(get_stats_table, urls)
    size = len(urls)
    outputs = tqdm.tqdm_notebook(jobs, total=size)
    pool.close()
#     pool.join()
    return merge_list_of_dict(outputs)

test_urls = [
    'https://www.basketball-reference.com/players/b/bellawa01.html',
    'https://www.basketball-reference.com/players/j/jordami01.html'
]

tables = get_stats_tables(test_urls, 2)
print('obj length: ', len(tables))

A Jupyter Widget

michael jordan | Found: 13 | slept: 5.024256088425269
walt bellamy | Found: 11 | slept: 5.024256088425269

obj length:  2
CPU times: user 49 ms, sys: 23.7 ms, total: 72.6 ms
Wall time: 7.87 s


In [None]:
tables['michael jordan']['tables']

## Get Player Attributes

### Get a player's attributes from an URL

In [18]:
%%time


def get_player_attributes(url):

    output = {}
    
    try:

        page = urllib.request.urlopen(url)
        urlHtml = page.read().decode()

        # Uncomment the tables
        uncommentedUrlHtml = urlHtml.replace('-->', '')
        uncommentedUrlHtml = uncommentedUrlHtml.replace('<!--', '')

        soup = BeautifulSoup(uncommentedUrlHtml, 'lxml')

        player_name = sanitize_string(soup.find("h1").text)
        output.setdefault(player_name, {}).setdefault('url', url)
        output.setdefault(player_name, {}).setdefault('missing_attributes', [])

        # Get all info, for future extraction
        tag = soup.find('div', attrs={'id': 'info'})
        if tag:
            player_info = tag
#             output['player_info_raw'] = tag
        else:
            output[player_name]['missing_attributes'].append('player_info_raw')

        tag = player_info.find('strong',text=re.compile('.*Position:.*'))
        if tag:
            position = tag.parent.contents[2]
            position = position.replace('\n','').replace('▪','').strip()
            position = re.sub('\s{2,}', ' ', position)
            shooting_hand = tag.parent.contents[-1]
            shooting_hand = shooting_hand.replace('\n','').replace('▪','').strip()
            shooting_hand = re.sub('\s{2,}', ' ', shooting_hand)
            output[player_name]['position'] = position
            output[player_name]['shooting_hand'] = shooting_hand
        else:
            output[player_name]['missing_attributes'].extend(['position', 'shooting_hand'])

        tag = player_info.find('strong',text=re.compile('.*High School:.*'))
        if tag:
            school = tag.parent.text.replace('\n', '').split(':')
            school = school[-1].strip()
            school = re.sub('\s{2,}', ' ', school)
            output[player_name]['high_school'] = school
        else:
            output[player_name]['missing_attributes'].append('high_school')

        tag = player_info.find('strong',text=re.compile('.*College:.*'))
        if tag:
            tag = tag.parent.find('a')
            if tag:
                college = tag.text.strip()
                output[player_name]['college'] = college
            else: 
                output[player_name]['missing_attributes'].append('college')
        else:
            output[player_name]['missing_attributes'].append('college')

        tag = player_info.find('strong',text=re.compile('.*Recruiting Rank:.*'))
        if tag:
            recruiting_rank = tag.parent.text.strip()
            recruiting_rank = re.search('\(([0-9]*)\)', recruiting_rank).group(1)
            output[player_name]['recruiting_rank'] = recruiting_rank
        else:
            output[player_name]['missing_attributes'].append('recruiting_rank')

        tag = player_info.find('strong',text=re.compile('.*Draft:.*'))
        if tag:
            draft = tag.parent.text.replace('\n', '').split(':')
            draft = draft[-1].strip()
            draft = re.sub('\s{2,}', ' ', draft)
            output[player_name]['draft'] = draft
        else:
            output[player_name]['missing_attributes'].append('draft')

        tag = player_info.find('strong',text=re.compile('.*Debut:.*'))
        if tag:
            nba_debute = tag.parent.contents[2]
            output[player_name]['nba_debut'] = nba_debute.text.strip()
        else:
            output[player_name]['missing_attributes'].append('nba_debut')            

        href_pattern = re.compile('^https://twitter.com/.*$')
        tag = player_info.find('a', href=href_pattern)
        if tag:
            output[player_name]['twitter'] = tag['href'].strip()
        else:
            output[player_name]['missing_attributes'].append('twitter')  

        tag = player_info.find('span', attrs={'itemprop': 'birthDate'})
        if tag:
            output[player_name]['birth_date'] = tag['data-birth'].strip()
        else:
            output[player_name]['missing_attributes'].append('birth_date')  

        tag = player_info.find('span', attrs={'itemprop': 'birthPlace'})
        if tag:
            tag = tag.find('a')
            if tag:
                output[player_name]['birth_place'] = tag.text.strip()
            else:
                output[player_name]['missing_attributes'].append('birth_place')
        else:
            output[player_name]['missing_attributes'].append('birth_place')  

        tag = player_info.find('span', attrs={'itemprop': 'height'})
        if tag:
            output[player_name]['height'] = tag.text.strip()
        else:
            output[player_name]['missing_attributes'].append('height')  

        tag = player_info.find('span', attrs={'itemprop': 'weight'})
        if tag:
            output[player_name]['weight'] = tag.text.strip()
        else:
            output[player_name]['missing_attributes'].append('weight')  

        tags = player_info.find('ul', attrs={'id': 'bling'})
        if tags:
            tags = tags.find_all('a')
            if tags:
                output[player_name]['honors'] = []
                for tag in tags:
                    output[player_name]['honors'].append(tag.text.strip())
            else:
                output[player_name]['missing_attributes'].append('honors')
        else:
            output[player_name]['missing_attributes'].append('honors')  

        tag = soup.find('p', text=re.compile('.*Chinese:.*'))
        if tag:
            chinese_name = tag.text.split(':')[-1].replace('數據','').strip()
            output[player_name]['chinese_name'] = chinese_name
        else:
            output[player_name]['missing_attributes'].append('chinese_name')  

        tags = soup.find_all('p', attrs={'class': 'transaction '})
        if tags:
            for tag in tags:
                transaction_date = tag.find('strong').text.strip()
                transaction = tag.text.split(':')[-1].strip()
                transaction = re.sub('\s{2,}', ' ', transaction)
                output[player_name].setdefault('transactions', {})[transaction_date] = transaction
        else:
            output[player_name]['missing_attributes'].append('transactions')  

        tag = player_info.find('p', text=re.compile('.*\(.*\).*'))
        if tag:
            nicknames = tag.text.replace('\n','').split(',')
            nicknames = [nickname.replace('(','').replace(')','').strip() for nickname in nicknames]
            output[player_name]['nicknames'] = nicknames
        else:
            output[player_name]['missing_attributes'].append('nicknames')  

        tags = player_info.find_all('svg', attrs={'class': 'jersey'})
        if tags:
            for tag in tags:
                jersey_number = tag.find('text').text.strip()
                team = tag.parent['data-tip'].strip()
                output[player_name].setdefault('numbers', {})[jersey_number] = team
        else:
            output[player_name]['missing_attributes'].append('numbers')  


        randomized_sleep_time = 5 + np.random.exponential(1, 1)[0]
        time.sleep(randomized_sleep_time)

        processing_info = (
            '{player_name} | Missing: {num_missing} | '
            'slept: {randomized_sleep_time}'.format(
                player_name=player_name, 
                num_missing=len(output[player_name]['missing_attributes']), 
                randomized_sleep_time=randomized_sleep_time))

        print(processing_info)
        sys.stdout.flush()
    
    except Exception as e:
        print(url, " FAILED! | ", str(e))
        output[url] = e
                
    return output

# test_url = 'https://www.basketball-reference.com/players/m/mingya01.html'
# test_url = "https://www.basketball-reference.com/players/b/bellawa01.html"
# test_url = 'https://www.basketball-reference.com/players/b/bryanko01.html'
# test_url = 'https://www.basketball-reference.com/players/r/redicjj01.html'
# test_url = 'https://www.basketball-reference.com/players/n/novakst01.html'
# test_url = 'https://www.basketball-reference.com/players/j/jordami01.html'
# test_url = 'https://www.basketball-reference.com/players/h/hairsal01.html'
test_url = 'https://www.basketball-reference.com/players/h/henryal01.html'
    
test_output = get_player_attributes(test_url)
pprint.pprint(test_output)

al henry | Missing: 4 | slept: 5.896106678169678
{'al henry': {'birth_date': '1949-02-09',
              'chinese_name': '艾尔·亨利',
              'college': 'University of Wisconsin',
              'draft': 'Philadelphia 76ers, 1st round (12th pick, 12th '
                       'overall), 1970 NBA Draft',
              'height': '6-9',
              'high_school': 'Hamilton in Memphis, Tennessee',
              'missing_attributes': ['recruiting_rank',
                                     'twitter',
                                     'birth_place',
                                     'honors'],
              'nba_debut': 'October 16, 1970',
              'nicknames': ['The Tree'],
              'numbers': {'26': 'Philadelphia 76ers, 1971-1972'},
              'position': 'Center',
              'shooting_hand': 'Left',
              'transactions': {'March 23, 1970': 'Drafted by the Philadelphia '
                                                 '76ers in the 1st round (12th '
      

In [19]:
%%time

def get_players_attributes(urls, num_processes):
    pool = multiprocessing.Pool(processes=num_processes)
    jobs = pool.imap_unordered(get_player_attributes, urls)
    size = len(urls)
    outputs = tqdm.tqdm_notebook(jobs, total=size)
    pool.close()
#     pool.join()
    return merge_list_of_dict(outputs)

test_urls = [
    'https://www.basketball-reference.com/players/b/bellawa01.html',
    'https://www.basketball-reference.com/players/j/jordami01.html',
    'https://www.basketball-reference.com/players/n/novakst01.html',
    'https://www.basketball-reference.com/players/m/mingya01.html',
    'https://www.basketball-reference.com/players/b/bryanko01.html',
    'https://www.basketball-reference.com/players/r/redicjj01.html'
]

attributes = get_players_attributes(test_urls, 4)
print('obj length: ', len(tables))

A Jupyter Widget

steve novak | Missing: 3 | slept: 5.781327073906766
walt bellamy | Missing: 2 | slept: 5.781327073906766
yao ming | Missing: 3 | slept: 5.781327073906766
michael jordan | Missing: 2 | slept: 5.781327073906766
kobe bryant | Missing: 2 | slept: 5.529048427194018
jj redick | Missing: 2 | slept: 5.529048427194018

obj length:  2
CPU times: user 60.1 ms, sys: 33.3 ms, total: 93.4 ms
Wall time: 16.6 s


## Run Tasks

### Get Names

In [None]:
# %%time

# leads_urls = load_list_from_worksheet('nba_player_names', 'leads_urls')

# all_players = get_players_from_urls(leads_urls, 4)

# save_pickle(all_players, 'all_players.pickle')

In [None]:
all_players['michael jordan']

In [None]:
len(all_players)

In [None]:
# %%time

# all_players_stats_tables = get_stats_tables(list(all_players.values()), 4)

# save_pickle(all_players_stats_tables, 'all_players_stats_tables.pickle')

# print(all_players_stats_tables['lebron james']['tables'].keys())
# all_players_stats_tables['lebron james']['tables']['per_game']

In [20]:
%%time

all_players = load_pickle('all_players.pickle')
all_players_attributes = get_players_attributes(list(all_players.values()), 4)
save_pickle(all_players_attributes, 'all_players_attributes.pickle')


LOADED >  all_players.pickle  |  363.5 KiB  |  length:  3998 



A Jupyter Widget

aaron gray | Missing: 4 | slept: 5.781327073906766
aaron gordon | Missing: 3 | slept: 5.781327073906766
aaron brooks | Missing: 2 | slept: 5.781327073906766
aaron harrison | Missing: 5 | slept: 5.781327073906766
aaron james | Missing: 3 | slept: 5.529048427194018
aaron mckie | Missing: 3 | slept: 5.529048427194018
aaron miles | Missing: 5 | slept: 5.529048427194018
aaron swinson | Missing: 6 | slept: 5.529048427194018
aaron williams | Missing: 6 | slept: 6.685134730248059
abdel nader | Missing: 5 | slept: 6.685134730248059
abdul jeelani | Missing: 5 | slept: 6.685134730248059
ac green | Missing: 1 | slept: 6.685134730248059
acie earl | Missing: 4 | slept: 6.038738007180326
acie law | Missing: 3 | slept: 6.038738007180326
adam harrington | Missing: 5 | slept: 6.038738007180326
adam keefe | Missing: 4 | slept: 6.038738007180326
adam morrison | Missing: 2 | slept: 6.647937505435011
adonal foyle | Missing: 3 | slept: 6.647937505435011
adonis jordan | Missing: 4 | slept: 6.647937505435011
a

andy toolson | Missing: 6 | slept: 6.398929678638477
andy walker | Missing: 5 | slept: 6.398929678638477
anfernee hardaway | Missing: 1 | slept: 6.398929678638477
antawn jamison | Missing: 3 | slept: 5.3260954475933815
ansu sesay | Missing: 5 | slept: 7.591169465303311
ante zizic | Missing: 7 | slept: 5.3260954475933815
anthony avent | Missing: 3 | slept: 5.3260954475933815
anthony bennett | Missing: 2 | slept: 5.495802783564147
anthony bonner | Missing: 3 | slept: 6.398929678638477
anthony bowie | Missing: 3 | slept: 5.495802783564147
anthony brown | Missing: 2 | slept: 5.495802783564147
anthony carter | Missing: 5 | slept: 5.821150013926104
anthony cook | Missing: 3 | slept: 5.3260954475933815
anthony davis | Missing: 0 | slept: 5.821150013926104
anthony frederick | Missing: 4 | slept: 5.821150013926104
anthony grundy | Missing: 6 | slept: 5.495802783564147
anthony goldwire | Missing: 4 | slept: 9.407564439707706
anthony johnson | Missing: 5 | slept: 9.407564439707706
anthony jones |

bill mlkvy | Missing: 3 | slept: 5.446186339052677
bill roberts | Missing: 8 | slept: 5.213619102079313
bill robinzine | Missing: 5 | slept: 5.188661797283296
bill russell | Missing: 2 | slept: 5.213619102079313
bill sharman | Missing: 2 | slept: 6.317037692112287
bill smith | Missing: 6 | slept: 6.588958838302187
bill stricker | Missing: 7 | slept: 5.213619102079313
bill tosheff | Missing: 5 | slept: 5.188661797283296
bill thieben | Missing: 7 | slept: 6.588958838302187
bill turner | Missing: 7 | slept: 7.236106393796209
bill walton | Missing: 1 | slept: 6.588958838302187
bill wennington | Missing: 2 | slept: 5.213619102079313
bill willoughby | Missing: 6 | slept: 7.236106393796209
bill zopf | Missing: 5 | slept: 6.851178716840319
billy cunningham | Missing: 2 | slept: 7.236106393796209
billy donovan | Missing: 3 | slept: 6.588958838302187
billy hassett | Missing: 6 | slept: 6.851178716840319
billy kenville | Missing: 4 | slept: 5.83571987346168
billy knight | Missing: 2 | slept: 6.85

brandon jennings | Missing: 3 | slept: 5.331505946572118
brandon knight | Missing: 2 | slept: 5.752064350099062
brandon paul | Missing: 5 | slept: 6.108836595070334
brandon roy | Missing: 1 | slept: 5.752064350099062
brandon rush | Missing: 2 | slept: 5.752064350099062
brandon williams | Missing: 6 | slept: 6.108836595070334
brant weidner | Missing: 5 | slept: 5.887799265213815
brendan haywood | Missing: 2 | slept: 6.108836595070334
brendan mccann | Missing: 4 | slept: 6.108836595070334
brent price | Missing: 5 | slept: 5.2802540999841865
brent barry | Missing: 1 | slept: 5.887799265213815
brent scott | Missing: 6 | slept: 5.887799265213815
brett szabo | Missing: 6 | slept: 5.887799265213815
brett vroman | Missing: 5 | slept: 5.1967122478793
brevin knight | Missing: 3 | slept: 5.2802540999841865
brian cardinal | Missing: 1 | slept: 5.2802540999841865
brian cook | Missing: 3 | slept: 5.2802540999841865
brian davis | Missing: 4 | slept: 5.703087400842975
brian evans | Missing: 4 | slept:

charlie bell | Missing: 4 | slept: 5.0513176148748
charlie black | Missing: 4 | slept: 6.005803545524264
charlie davis | Missing: 4 | slept: 5.0513176148748
charlie criss | Missing: 4 | slept: 6.386712807820871
charlie hardnett | Missing: 4 | slept: 6.379109960036996
charlie lowery | Missing: 6 | slept: 5.0513176148748
charlie parsley | Missing: 9 | slept: 6.379109960036996
charlie paulk | Missing: 6 | slept: 5.463399562503877
charlie scott | Missing: 4 | slept: 6.386712807820871
charlie sitton | Missing: 5 | slept: 6.379109960036996
charlie tyra | Missing: 4 | slept: 6.386712807820871
charlie villanueva | Missing: 2 | slept: 5.081511908374334
charlie ward | Missing: 5 | slept: 5.463399562503877
charlie yelverton | Missing: 6 | slept: 6.386712807820871
chase budinger | Missing: 2 | slept: 5.463399562503877
chasson randle | Missing: 4 | slept: 8.160844819201627
chauncey billups | Missing: 1 | slept: 5.081511908374334
cheese johnson | Missing: 5 | slept: 5.463399562503877
cheick diallo |

corey crowder | Missing: 5 | slept: 8.64159005832984
corey maggette | Missing: 3 | slept: 6.570329555653093
corey williams | Missing: 5 | slept: 6.105663634963747
corie blount | Missing: 4 | slept: 5.4628511742854435
corky devlin | Missing: 6 | slept: 5.4628511742854435
corky calhoun | Missing: 3 | slept: 6.570329555653093
corliss williamson | Missing: 3 | slept: 6.3264567730747
cornelius cash | Missing: 5 | slept: 6.105663634963747
cornell warner | Missing: 5 | slept: 6.105663634963747
corny thompson | Missing: 5 | slept: 5.4628511742854435
corsley edwards | Missing: 4 | slept: 6.421431777639999
cory alexander | Missing: 4 | slept: 6.3264567730747
cory blackwell | Missing: 4 | slept: 6.3264567730747
cory carr | Missing: 4 | slept: 6.105663634963747
cory higgins | Missing: 5 | slept: 5.699150158694052
cory jefferson | Missing: 3 | slept: 6.421431777639999
cory joseph | Missing: 1 | slept: 6.421431777639999
cotton nash | Missing: 4 | slept: 6.3264567730747
coty clarke | Missing: 6 | sle

dave lattin | Missing: 3 | slept: 7.234768288587352
dave magley | Missing: 5 | slept: 5.192621444888275
dave minor | Missing: 4 | slept: 5.244852527761571
dave meyers | Missing: 4 | slept: 7.234768288587352
dave newmark | Missing: 4 | slept: 5.192621444888275
dave piontek | Missing: 6 | slept: 6.505373853112337
dave popson | Missing: 5 | slept: 5.512095336190506
dave robisch | Missing: 4 | slept: 5.192621444888275
dave schellhase | Missing: 5 | slept: 6.505373853112337
dave scholz | Missing: 6 | slept: 7.842765265158295
dave sorenson | Missing: 5 | slept: 7.234768288587352
dave stallworth | Missing: 3 | slept: 6.505373853112337
dave twardzik | Missing: 3 | slept: 7.842765265158295
dave wohl | Missing: 5 | slept: 5.488180021913075
dave zeller | Missing: 6 | slept: 5.192621444888275
david andersen | Missing: 6 | slept: 7.842765265158295
david benoit | Missing: 5 | slept: 5.488180021913075
david cooke | Missing: 6 | slept: 6.505373853112337
david burns | Missing: 5 | slept: 8.530979151623

dick okeefe | Missing: 7 | slept: 5.144489508770925
dick ricketts | Missing: 4 | slept: 5.202470901833106
dick schnittker | Missing: 4 | slept: 5.144489508770925
dick rosenthal | Missing: 6 | slept: 7.428189701103838
dick schulz | Missing: 6 | slept: 6.253261459651713
dick snyder | Missing: 4 | slept: 5.144489508770925
dick triptow | Missing: 5 | slept: 5.201307925223483
dick surhoff | Missing: 7 | slept: 6.253261459651713
dick van arsdale | Missing: 3 | slept: 5.221117268174344
dickey simpkins | Missing: 4 | slept: 6.253261459651713
dj mbenga | Missing: 6 | slept: 5.202470901833106
dijon thompson | Missing: 4 | slept: 5.221117268174344
dike eddleman | Missing: 3 | slept: 6.544138034847369
dikembe mutombo | Missing: 2 | slept: 5.221117268174344
dillard crocker | Missing: 6 | slept: 5.144489508770925
dillon brooks | Missing: 4 | slept: 6.544138034847369
dino radja | Missing: 5 | slept: 5.338318845419897
dion glover | Missing: 5 | slept: 6.544138034847369
dion waiters | Missing: 1 | slep

ed leede | Missing: 5 | slept: 5.281071013781859
ed horton | Missing: 4 | slept: 8.575420842361805
ed kalafat | Missing: 5 | slept: 8.744353920599304
ed macauley | Missing: 2 | slept: 8.575420842361805
ed nealy | Missing: 5 | slept: 5.001957101908609
ed manning | Missing: 6 | slept: 8.575420842361805
ed mikan | Missing: 5 | slept: 8.376989336227053
ed obannon | Missing: 5 | slept: 8.376989336227053
ed peterson | Missing: 7 | slept: 5.1294873876411895
ed pinckney | Missing: 4 | slept: 8.376989336227053
ed rains | Missing: 5 | slept: 8.702495298606939
ed sadowski | Missing: 5 | slept: 5.281071013781859
ed ratleff | Missing: 5 | slept: 8.702495298606939
ed sherod | Missing: 6 | slept: 7.142979395321261
ed searcy | Missing: 5 | slept: 8.702495298606939
ed smith | Missing: 6 | slept: 8.575420842361805
ed stanczak | Missing: 7 | slept: 7.142979395321261
ed stokes | Missing: 5 | slept: 5.559927561764572
eddie basden | Missing: 4 | slept: 7.142979395321261
eddie griffin | Missing: 2 | slept: 5

fred hetzel | Missing: 3 | slept: 5.125407266101213
fred hilton | Missing: 5 | slept: 5.070413314323228
fred jones | Missing: 3 | slept: 5.192999135234874
fred hoiberg | Missing: 2 | slept: 6.955424931319627
fred lacour | Missing: 5 | slept: 5.192999135234874
fred roberts | Missing: 5 | slept: 5.418207974661059
fred saunders | Missing: 5 | slept: 5.070413314323228
fred scolari | Missing: 4 | slept: 5.070413314323228
fred schaus | Missing: 5 | slept: 7.014626368003597
fred taylor | Missing: 5 | slept: 5.27226987816012
fred vanvleet | Missing: 5 | slept: 5.418207974661059
fred vinson | Missing: 6 | slept: 5.418207974661059
freddie boyd | Missing: 4 | slept: 5.125407266101213
freddie crawford | Missing: 3 | slept: 6.178226519469356
freddie lewis | Missing: 2 | slept: 5.27226987816012
freeman williams | Missing: 5 | slept: 5.27226987816012
furkan aldemir | Missing: 5 | slept: 5.192999135234874
furkan korkmaz | Missing: 7 | slept: 6.244158037835799
gabe pruitt | Missing: 2 | slept: 6.178226

greg griffin | Missing: 5 | slept: 5.131110773799372
greg howard | Missing: 3 | slept: 5.475727328990702
greg hyder | Missing: 6 | slept: 5.131110773799372
greg jackson | Missing: 5 | slept: 5.524472744411643
greg kelser | Missing: 2 | slept: 7.213885979240592
greg kite | Missing: 3 | slept: 5.131110773799372
greg lee | Missing: 4 | slept: 7.213885979240592
greg minor | Missing: 4 | slept: 7.527286712651714
greg monroe | Missing: 0 | slept: 5.790279038287266
greg oden | Missing: 2 | slept: 7.213885979240592
greg ostertag | Missing: 4 | slept: 5.790279038287266
greg smith | Missing: 3 | slept: 6.120872688454957
greg stiemsma | Missing: 4 | slept: 5.486061151219661
greg stokes | Missing: 5 | slept: 5.790279038287266
greg sutton | Missing: 5 | slept: 5.486061151219661
greivis vasquez | Missing: 2 | slept: 5.475727328990702
guerschon yabusele | Missing: 7 | slept: 5.354147954644238
guillermo diaz | Missing: 4 | slept: 5.486061151219661
gundars vetra | Missing: 8 | slept: 5.354147954644238


jack kiley | Missing: 6 | slept: 8.867607600469759
jack mccloskey | Missing: 8 | slept: 6.246827167475007
jack marin | Missing: 4 | slept: 8.867607600469759
jack mcmahon | Missing: 4 | slept: 5.298707134819513
jack molinas | Missing: 4 | slept: 5.296928668347216
jack parkinson | Missing: 6 | slept: 5.296928668347216
jack nichols | Missing: 6 | slept: 7.097977600996411
jack phelan | Missing: 8 | slept: 5.116558148875446
jack parr | Missing: 5 | slept: 8.867607600469759
jack sikma | Missing: 3 | slept: 5.116558148875446
jack smiley | Missing: 5 | slept: 5.864552063917925
jack stephens | Missing: 5 | slept: 5.41553817826356
jack toomay | Missing: 7 | slept: 5.296928668347216
jack turner | Missing: 5 | slept: 5.41553817826356
jack twyman | Missing: 4 | slept: 6.17969320839876
jackie butler | Missing: 6 | slept: 5.308971014038851
jackie dinkins | Missing: 6 | slept: 5.116558148875446
jackie moore | Missing: 5 | slept: 5.308971014038851
jackie moreland | Missing: 4 | slept: 5.298707134819513

jeff martin | Missing: 5 | slept: 5.106260784137536
jeff malone | Missing: 4 | slept: 5.420187738553408
jeff mcinnis | Missing: 5 | slept: 6.224766396082012
jeff mullins | Missing: 2 | slept: 5.420187738553408
jeff nordgaard | Missing: 5 | slept: 5.364521857543785
jeff ruland | Missing: 3 | slept: 6.823538094363578
jeff sanders | Missing: 5 | slept: 5.481646712051888
jeff slade | Missing: 7 | slept: 6.823538094363578
jeff teague | Missing: 2 | slept: 5.5621681975176225
jeff trepagnier | Missing: 5 | slept: 5.420187738553408
jeff taylor | Missing: 5 | slept: 8.013636089500407
jeff turner | Missing: 5 | slept: 5.5621681975176225
jeff webb | Missing: 7 | slept: 5.771507802817174
jeff wilkins | Missing: 4 | slept: 5.077878588051317
jeff webster | Missing: 5 | slept: 6.823538094363578
jeff withey | Missing: 3 | slept: 5.771507802817174
jeff taylor | Missing: 3 | slept: 6.325269926891478
jeffrey sheppard | Missing: 5 | slept: 6.224766396082012
jelani mccoy | Missing: 5 | slept: 5.56216819751

joe binion | Missing: 4 | slept: 6.3513290715500865
joe bryant | Missing: 3 | slept: 5.25947147220832
joe buckhalter | Missing: 4 | slept: 5.8944976820098285
joe cooke | Missing: 5 | slept: 5.8944976820098285
joe caldwell | Missing: 2 | slept: 6.504567301414167
joe cooper | Missing: 4 | slept: 6.3513290715500865
joe courtney | Missing: 6 | slept: 5.052213425045948
joe crawford | Missing: 3 | slept: 5.052213425045948
joe crispin | Missing: 6 | slept: 5.342847150230261
joe dolhon | Missing: 8 | slept: 5.8944976820098285
joe dumars | Missing: 2 | slept: 6.659334537391045
joe ellis | Missing: 5 | slept: 6.659334537391045
joe fulks | Missing: 4 | slept: 5.194096399552945
joe graboski | Missing: 7 | slept: 5.052213425045948
joe harris | Missing: 5 | slept: 6.81111655252896
joe holland | Missing: 6 | slept: 5.25947147220832
joe hassett | Missing: 3 | slept: 6.81111655252896
joe holup | Missing: 5 | slept: 6.659334537391045
joe hutton | Missing: 5 | slept: 5.9213215636350665
joe ingles | Missi

jonathan bender | Missing: 3 | slept: 5.020842843574739
jonathan gibson | Missing: 6 | slept: 5.225094852593242
jonathan isaac | Missing: 3 | slept: 6.67256382951446
jonathan kerner | Missing: 6 | slept: 6.67256382951446
jonathon simmons | Missing: 4 | slept: 5.388560758891085
jonny flynn | Missing: 3 | slept: 5.898186798096607
jordan adams | Missing: 2 | slept: 5.020842843574739
jordan bell | Missing: 2 | slept: 5.020842843574739
jordan clarkson | Missing: 3 | slept: 5.321708969568655
jordan crawford | Missing: 3 | slept: 5.803806802296463
jordan farmar | Missing: 2 | slept: 5.388560758891085
jordan hamilton | Missing: 3 | slept: 5.388560758891085
jordan hill | Missing: 4 | slept: 5.91954100200665
jordan mickey | Missing: 2 | slept: 5.321708969568655
jordan mcrae | Missing: 1 | slept: 6.901142989799492
jordan williams | Missing: 4 | slept: 5.321708969568655
jorge garbajosa | Missing: 5 | slept: 5.422982438355288
jorge gutierrez | Missing: 6 | slept: 5.91954100200665
jose calderon | Mi

kentavious caldwell-pope | Missing: 2 | slept: 5.414529661876507
kenton edelin | Missing: 4 | slept: 6.155595758999992
kenyon martin | Missing: 1 | slept: 5.815657896397649
keon clark | Missing: 4 | slept: 5.414529661876507
kermit washington | Missing: 3 | slept: 6.937726792763006
kerry kittles | Missing: 3 | slept: 5.406433290654998
kevin brooks | Missing: 4 | slept: 5.414529661876507
kevin burleson | Missing: 5 | slept: 6.937726792763006
kevin duckworth | Missing: 2 | slept: 5.418359437818363
kevin durant | Missing: 0 | slept: 5.025317905367549
kevin edwards | Missing: 3 | slept: 6.937726792763006
kevin gamble | Missing: 4 | slept: 5.418359437818363
kevin grevey | Missing: 2 | slept: 5.044009617402851
kevin garnett | Missing: 3 | slept: 5.8578822728020565
kevin henderson | Missing: 4 | slept: 5.418359437818363
kevin johnson | Missing: 1 | slept: 5.8578822728020565
kevin jones | Missing: 5 | slept: 5.18197413676242
kevin kunnert | Missing: 4 | slept: 5.239639820800317
kevin loder | Mi

lee shaffer | Missing: 4 | slept: 5.269081532088623
lee winfield | Missing: 5 | slept: 7.145346038344018
len elmore | Missing: 4 | slept: 5.269081532088623
len chappell | Missing: 2 | slept: 6.3378445905615886
len kosmalski | Missing: 5 | slept: 7.145346038344018
lennie rosenbluth | Missing: 5 | slept: 5.653100101862188
leo barnhorst | Missing: 2 | slept: 5.880169163142271
lenny wilkens | Missing: 3 | slept: 7.145346038344018
leo katkaveck | Missing: 7 | slept: 5.653100101862188
leo klier | Missing: 5 | slept: 5.521820558209917
leo kubiak | Missing: 6 | slept: 5.610600996183412
leo mogus | Missing: 6 | slept: 5.653100101862188
leo rautins | Missing: 4 | slept: 5.521820558209917
leon benbow | Missing: 4 | slept: 6.062707281373605
leon blevins | Missing: 6 | slept: 5.533336597672717
leon douglas | Missing: 4 | slept: 5.521820558209917
leon powe | Missing: 2 | slept: 6.062707281373605
leon smith | Missing: 5 | slept: 5.754531202656765
leon wood | Missing: 4 | slept: 5.269081532088623
leon

mark bradtke | Missing: 5 | slept: 7.521316462291242
mark bryant | Missing: 4 | slept: 5.916486708727804
mark crow | Missing: 4 | slept: 5.64174767027229
mark davis | Missing: 5 | slept: 5.603485789665638
mark eaton | Missing: 3 | slept: 5.64174767027229
mark hendrickson | Missing: 4 | slept: 5.2284839010677135
mark jackson | Missing: 2 | slept: 5.603485789665638
mark jones | Missing: 5 | slept: 6.081383482787592
mark landsberger | Missing: 3 | slept: 5.603485789665638
mark macon | Missing: 4 | slept: 5.1326660276201315
mark madsen | Missing: 2 | slept: 6.081383482787592
mark mcnamara | Missing: 2 | slept: 5.310903478379252
mark minor | Missing: 6 | slept: 6.081383482787592
mark olberding | Missing: 4 | slept: 6.521167420522009
mark pope | Missing: 5 | slept: 5.310903478379252
mark price | Missing: 3 | slept: 5.106940439369167
mark radford | Missing: 5 | slept: 5.310903478379252
mark sibley | Missing: 6 | slept: 5.106940439369167
mark randall | Missing: 5 | slept: 7.521316462291242
mar

mike brittain | Missing: 4 | slept: 8.268834673430469
mike brown | Missing: 4 | slept: 7.593258098344267
mike champion | Missing: 6 | slept: 7.593258098344267
mike conley | Missing: 0 | slept: 6.395559414989585
mike dantoni | Missing: 4 | slept: 6.625239133745529
mike davis | Missing: 5 | slept: 6.395559414989585
mike dunleavy | Missing: 2 | slept: 6.395559414989585
mike evans | Missing: 5 | slept: 5.71172446765561
mike farmer | Missing: 4 | slept: 5.424245684002732
mike flynn | Missing: 4 | slept: 5.71172446765561
mike gale | Missing: 2 | slept: 5.71172446765561
mike gibson | Missing: 4 | slept: 6.79474027703551
mike glenn | Missing: 3 | slept: 5.000562387795437
mike gminski | Missing: 3 | slept: 6.79474027703551
mike green | Missing: 2 | slept: 6.79474027703551
mike hall | Missing: 6 | slept: 6.28757373829579
mike harper | Missing: 3 | slept: 5.413355884972832
mike harris | Missing: 5 | slept: 6.28757373829579
mike higgins | Missing: 5 | slept: 6.28757373829579
mike holton | Missing:

nolan smith | Missing: 3 | slept: 5.829303259965809
norm grekin | Missing: 5 | slept: 5.829303259965809
norm cook | Missing: 4 | slept: 7.394731832595001
norm mager | Missing: 5 | slept: 5.829303259965809
norm nixon | Missing: 3 | slept: 7.062676255837119
norm stewart | Missing: 5 | slept: 5.624920672381217
norm richardson | Missing: 6 | slept: 7.062676255837119
norm swanson | Missing: 5 | slept: 7.062676255837119
norm van lier | Missing: 3 | slept: 5.513296695332592
norman black | Missing: 6 | slept: 5.423127767860546
norman powell | Missing: 2 | slept: 5.513296695332592
normie glick | Missing: 7 | slept: 5.513296695332592
norris cole | Missing: 2 | slept: 5.653930815061915
norris coleman | Missing: 4 | slept: 5.772862101393705
norton barnhill | Missing: 5 | slept: 5.653930815061915
obinna ekezie | Missing: 4 | slept: 5.653930815061915
odie spears | Missing: 6 | slept: 6.0340850211571375
odis allison | Missing: 5 | slept: 6.444790603883151
og anunoby | Missing: 4 | slept: 6.0340850211

price brookfield | Missing: 6 | slept: 5.394851025598374
predrag savovic | Missing: 7 | slept: 6.762467477731781
priest lauderdale | Missing: 4 | slept: 5.1488589100257505
primoz brezec | Missing: 6 | slept: 5.1488589100257505
purvis short | Missing: 5 | slept: 5.434105204427983
quentin richardson | Missing: 2 | slept: 5.1488589100257505
quincy acy | Missing: 4 | slept: 5.423095075970913
quincy douby | Missing: 5 | slept: 5.423095075970913
quincy lewis | Missing: 5 | slept: 5.042925906839624
quincy miller | Missing: 3 | slept: 5.423095075970913
quincy pondexter | Missing: 2 | slept: 5.23143555831728
quinn buckner | Missing: 3 | slept: 5.23143555831728
quintin dailey | Missing: 2 | slept: 5.23143555831728
quinn cook | Missing: 5 | slept: 7.550246243571907
quinton ross | Missing: 6 | slept: 6.493087471947048
qyntel woods | Missing: 5 | slept: 6.493087471947048
radisav curcic | Missing: 7 | slept: 6.493087471947048
rafael addison | Missing: 4 | slept: 5.492574036740712
raef lafrentz | Mis

ricky marsh | Missing: 5 | slept: 5.883345312770349
ricky rubio | Missing: 4 | slept: 5.395396118830252
ricky sobers | Missing: 4 | slept: 5.395396118830252
ricky pierce | Missing: 3 | slept: 6.9423139963499345
ricky wilson | Missing: 5 | slept: 5.174716335889689
rik smits | Missing: 2 | slept: 5.883345312770349
rj hunter | Missing: 5 | slept: 5.883345312770349
rob kurz | Missing: 6 | slept: 5.544981207256525
rob lock | Missing: 5 | slept: 5.039569270792962
rob rose | Missing: 6 | slept: 5.174716335889689
rob williams | Missing: 5 | slept: 5.174716335889689
robbie hummel | Missing: 3 | slept: 8.256901689567929
robert churchwell | Missing: 6 | slept: 5.039569270792962
robert archibald | Missing: 3 | slept: 8.782965446755309
robert covington | Missing: 3 | slept: 5.039569270792962
robert hahn | Missing: 8 | slept: 5.055706022961085
robert hite | Missing: 4 | slept: 5.689673788081755
robert hawkins | Missing: 4 | slept: 8.782965446755309
robert pack | Missing: 6 | slept: 5.278905819051592

sam ranzino | Missing: 5 | slept: 5.13433462115769
sam perkins | Missing: 3 | slept: 6.521960518332118
sam sibert | Missing: 7 | slept: 5.403132471145613
sam smith | Missing: 4 | slept: 5.403132471145613
sam stith | Missing: 5 | slept: 5.849977951650274
sam vincent | Missing: 3 | slept: 5.763948432565699
sam williams | Missing: 5 | slept: 5.13433462115769
sam worthen | Missing: 5 | slept: 5.13433462115769
sam young | Missing: 3 | slept: 5.61167849975787
samaki walker | Missing: 4 | slept: 5.252163830452239
samardo samuels | Missing: 5 | slept: 5.849977951650274
samuel dalembert | Missing: 2 | slept: 5.849977951650274
sarunas jasikevicius | Missing: 5 | slept: 5.146902690313563
sarunas marciulionis | Missing: 5 | slept: 5.9065205993379815
sasha danilovic | Missing: 5 | slept: 5.61167849975787
sasha kaun | Missing: 4 | slept: 5.61167849975787
sasha pavlovic | Missing: 6 | slept: 6.604761237873992
sasha vujacic | Missing: 3 | slept: 5.473840023713507
scooter mccray | Missing: 5 | slept: 5

stephon marbury | Missing: 1 | slept: 5.164445240919846
sterling brown | Missing: 5 | slept: 5.363401565512632
steve alford | Missing: 4 | slept: 6.534614618306476
steve bardo | Missing: 5 | slept: 6.65930543473832
steve blake | Missing: 2 | slept: 6.663447926389396
steve bracey | Missing: 3 | slept: 6.534614618306476
steve bucknall | Missing: 5 | slept: 5.164445240919846
steve burtt | Missing: 4 | slept: 5.031052075664175
steve courtin | Missing: 5 | slept: 5.164445240919846
steve colter | Missing: 4 | slept: 5.8166150933945575
steve downing | Missing: 3 | slept: 6.663447926389396
steve francis | Missing: 2 | slept: 5.031713938366653
steve green | Missing: 4 | slept: 5.446842558514153
steve goodrich | Missing: 5 | slept: 6.663447926389396
steve hamer | Missing: 4 | slept: 5.8166150933945575
steve hamilton | Missing: 4 | slept: 8.088595331600839
steve harris | Missing: 3 | slept: 5.3668827059425235
steve hawes | Missing: 4 | slept: 5.8166150933945575
steve hayes | Missing: 4 | slept: 5

tom chambers | Missing: 2 | slept: 6.9439394730693795
tom copa | Missing: 5 | slept: 5.169521067829491
tom gola | Missing: 2 | slept: 5.169521067829491
tom garrick | Missing: 3 | slept: 7.267959901826692
tom gugliotta | Missing: 2 | slept: 5.0931775181920855
tom hammonds | Missing: 3 | slept: 5.122705676642433
tom hawkins | Missing: 3 | slept: 5.122705676642433
tom heinsohn | Missing: 2 | slept: 5.059986717258957
tom henderson | Missing: 3 | slept: 5.2484141469465575
tom hoover | Missing: 4 | slept: 7.267959901826692
tom ingelsby | Missing: 4 | slept: 5.214137472838375
tom hovasse | Missing: 5 | slept: 7.267959901826692
tom kozelko | Missing: 4 | slept: 5.325043783715535
tom kropp | Missing: 4 | slept: 5.059986717258957
tom marshall | Missing: 6 | slept: 5.059986717258957
tom lagarde | Missing: 4 | slept: 6.153132120136094
tom mcmillen | Missing: 4 | slept: 5.760673749403304
tom meschery | Missing: 3 | slept: 5.214137472838375
tom owens | Missing: 5 | slept: 5.214137472838375
tom payne

vincenzo esposito | Missing: 7 | slept: 6.169138726609357
vinnie johnson | Missing: 2 | slept: 5.021351112936582
vinny del negro | Missing: 3 | slept: 5.698991890905317
vitaly potapenko | Missing: 4 | slept: 6.169138726609357
vitor faverani | Missing: 7 | slept: 5.021351112936582
vlade divac | Missing: 4 | slept: 5.565971255951742
vladimir radmanovic | Missing: 5 | slept: 5.123247701217376
vladimir stepania | Missing: 7 | slept: 5.021351112936582
voise winters | Missing: 5 | slept: 5.565971255951742
von wafer | Missing: 3 | slept: 7.133544571666526
vonteego cummings | Missing: 4 | slept: 6.700972395285042
voshon lenard | Missing: 4 | slept: 5.565971255951742
wade baldwin | Missing: 4 | slept: 7.133544571666526
wah wah jones | Missing: 5 | slept: 5.379191776559471
wali jones | Missing: 2 | slept: 6.367203021243851
walker russell | Missing: 6 | slept: 7.133544571666526
wallace bryant | Missing: 4 | slept: 5.379191776559471
wally anderzunas | Missing: 5 | slept: 6.963743550969385
wally os

### Get Urls

In [None]:
# worksheet = load_list_from_worksheet('nba_players_sanitized', 'hof')
# hof_names = sanitize_list(worksheet[0].tolist())
# print(hof_names)

# hof_urls = get_urls(hof_names, 30)

In [None]:
# worksheet = load_list_from_worksheet('nba_players_sanitized', 'retired_all_stars')
# retired_all_stars_names = sanitize_list(worksheet[0].tolist())
# print(retired_all_stars_names)

# retired_all_stars_urls = get_urls(retired_all_stars_names, 30)

In [None]:
# worksheet = load_list_from_worksheet('nba_players_sanitized', 'retired_all_nbas')
# retired_all_nbas_names = sanitize_list(worksheet[0].tolist())
# print(retired_all_nbas_names)

# retired_all_nbas_urls = get_urls(retired_all_nbas_names, 30)

In [None]:
# worksheet = load_list_from_worksheet('nba_players_sanitized', '2015')
# players_2015_names = sanitize_list(worksheet[0].tolist())
# print(players_2015_names)

# players_2015_urls = get_urls(players_2015_names, 30)

In [None]:
# save_pickle(hof_urls, 'hof_urls.pickle')
# save_pickle(retired_all_stars_urls, 'retired_all_stars_urls.pickle')
# save_pickle(retired_all_nbas_urls, 'retired_all_nbas_urls.pickle')
# save_pickle(players_2015_urls, 'players_2015_urls.pickle')

In [None]:
# %%time

# save_list_to_worksheet(list(hof_urls.values()), 'nba_player_urls', 'hof_urls', overwrite=True)
# save_list_to_worksheet(list(retired_all_nbas_urls.values()), 'nba_player_urls', 'retired_all_nbas_urls', overwrite=True)
# save_list_to_worksheet(list(retired_all_stars_urls.values()), 'nba_player_urls', 'retired_all_stars_urls', overwrite=True)
# save_list_to_worksheet(list(players_2015_urls.values()), 'nba_player_urls', 'players_2015_urls', overwrite=True)

## Get Tables

In [None]:
# %%time

# url_list = [url for url in hof_urls.values() if url is not None]
# print(len(url_list))
# hof_tables = get_tables(url_list, 4)
# save_pickle(hof_tables, 'hof_tables.pickle')

In [None]:
# %%time

# url_list = [url for url in retired_all_nbas_urls.values() if url is not None]
# print(len(url_list))
# retired_all_nbas_tables = get_tables(url_list, 4)
# save_pickle(retired_all_nbas_tables, 'retired_all_nbas_tables.pickle')

In [None]:
# %%time

# url_list = [url for url in retired_all_stars_urls.values() if url is not None]
# print(len(url_list))
# retired_all_stars_tables = get_tables(url_list, 4)
# save_pickle(retired_all_stars_tables, 'retired_all_stars_tables.pickle')

In [None]:
# %%time

# url_list = [url for url in players_2015_urls.values() if url is not None]
# print(len(url_list))
# players_2015_tables = get_tables(url_list, 4)
# save_pickle(players_2015_tables, 'players_2015_tables.pickle')