# Extract Stats from Basketball-Reference.com

2017-11

Scrap data from basketball-reference.com, using pd.read_html, BeaultifulSoup, Multiprocessing, & Other python modules.

## Imports, Constants, Utilities

### Imports

In [1]:
%%time

import os
import sys
import datetime
import re

import json
import pickle
import urllib
import pandas as pd
import numpy as np
import google
import random
import time
import multiprocessing
import gspread
import unidecode
import tqdm
import pprint

from collections import OrderedDict
from gspread import WorksheetNotFound
from bs4 import BeautifulSoup
from oauth2client.service_account import ServiceAccountCredentials


pd.set_option("display.max_columns", 100)


DATETIME_STRING_FORMAT = '%Y-%m-%d %H:%M:%S'

# Tables to retrieve for each player, by table html ids
TABLE_IDS = [
  'per_game',
  'totals',
  'per_minute', # per 36 minutes
  'per_poss', # per 100 possessions
  'advanced', # advanced
    
  'playoffs_per_game',
  'playoffs_totals',
  'playoffs_per_minute', # playoffs per 36 minutes
  'playoffs_per_poss', # playoffs per 100 possessions
  'playoffs_advanced', 
    
  'all_star',
  'all_college_stats',
  'all_salaries',
]

print('Current TABLE_IDS length: ', len(TABLE_IDS))

Current TABLE_IDS length:  13
CPU times: user 514 ms, sys: 232 ms, total: 746 ms
Wall time: 972 ms


In [2]:
%%time

def merge_list_of_list(nested_list):
    flattened_list = [item for lst in nested_list for item in lst]
    return flattened_list

test_list = [['a'], ['b']]
merge_list_of_list(test_list)

CPU times: user 4 µs, sys: 1 µs, total: 5 µs
Wall time: 8.11 µs


In [3]:
%%time

# Utility function to merge retrived data tables into 1 dictionary.
def merge_list_of_dict(list_of_dict):
    merged_dict = {}
    for dictionary in list_of_dict:
        merged_dict.update(dictionary)
    # Sort by dictionary key
    ordered_dict = OrderedDict(sorted(merged_dict.items(), key=lambda t: t[0]))
    return ordered_dict

test_list = [
    {'michael jordan': {'tables': {}, 'missing_tables': 'none', 'url': 'diety'}},
    {'kobe bryant': {'tables': {}, 'missing_tables': 'none', 'url': 'godly'}},
]

dic = merge_list_of_dict(test_list)
print(dic)

OrderedDict([('kobe bryant', {'tables': {}, 'missing_tables': 'none', 'url': 'godly'}), ('michael jordan', {'tables': {}, 'missing_tables': 'none', 'url': 'diety'})])
CPU times: user 228 µs, sys: 573 µs, total: 801 µs
Wall time: 426 µs


In [4]:
%%time

def sanitize_string(raw_string):
    sanitized_string = unidecode.unidecode(raw_string)
    sanitized_string = sanitized_string.strip().lower()
    sanitized_string = sanitized_string.replace("'", "")
    sanitized_string = sanitized_string.replace('"', '') 
    sanitized_string = sanitized_string.replace('.', '')
    if "," in sanitized_string:
        lst = sanitized_string.split(",")
        lst.reverse()
        lst = [token.strip() for token in lst]
        sanitized_string = " ".join(lst)
    return sanitized_string

print(sanitize_string("Shaquille O'neal"))
print(sanitize_string("Bryant, Kobe"))
print(sanitize_string(" CarTer, Vince .."))

shaquille oneal
kobe bryant
vince carter
CPU times: user 487 µs, sys: 1.19 ms, total: 1.68 ms
Wall time: 1.27 ms


In [5]:
%%time

def sanitize_list(raw_list):
            
    sanitized_list = [sanitize_string(raw_string) for raw_string in raw_list]
    return sanitized_list

test_list = ["Shaquille O'neal", "J. J. Reddick", "VinCe Carter ", "Bryant, Kobe"]

print(sanitize_list(test_list))

['shaquille oneal', 'j j reddick', 'vince carter', 'kobe bryant']
CPU times: user 82 µs, sys: 35 µs, total: 117 µs
Wall time: 121 µs


In [6]:
%%time

def dedupe_list(lst):
    return list(set(lst))

print(dedupe_list(['a', 'a', 'a', 'a', 'a', 'a', 'b', 'b', 'b', 'b', 'b', ]))

['b', 'a']
CPU times: user 192 µs, sys: 165 µs, total: 357 µs
Wall time: 286 µs


### Load Spreadsheets

In [7]:
def load_list_from_worksheet(spreadsheet_name, worksheet_name):
  
    scope = ['https://spreadsheets.google.com/feeds']
    credentials = ServiceAccountCredentials.from_json_keyfile_name('Data-35df9a696bc1.json', scope)
    gc = gspread.authorize(credentials)

    spreadsheet = gc.open(spreadsheet_name)
    worksheet = spreadsheet.worksheet(worksheet_name)

    rows = worksheet.get_all_values()
    
    first_row = rows[0]
    first_cell = first_row[0]
    
    try:
        timestamp = datetime.datetime.strptime(first_cell, DATETIME_STRING_FORMAT)
        rows.remove(first_row)
    except ValueError:
        timestamp = None

    print(
        'LOADED > {num_rows} rows from '
        'spreadsheet: "{spreadsheet_name}" | '
        'worksheet: "{worksheet_name}" | '
        'timestamp: {timestamp}'.format(
            num_rows=len(rows), spreadsheet_name=spreadsheet_name, 
            worksheet_name=worksheet_name, timestamp=timestamp), '\n')

    df = merge_list_of_list(rows)
    
    return df

worksheet = load_list_from_worksheet('test_spreadsheet', 'test')
print(worksheet[:10])

LOADED > 100 rows from spreadsheet: "test_spreadsheet" | worksheet: "test" | timestamp: 2017-11-24 09:32:11 

['michale', 'kobe', '0', '1', '2', '3', '4', '5', '6', '7']


In [8]:
%%time

def save_list_to_worksheet(lst, spreadsheet_name, worksheet_name, add_timestamp=True, overwrite=False):
    scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
    credentials = ServiceAccountCredentials.from_json_keyfile_name('Data-35df9a696bc1.json', scope)
    gc = gspread.authorize(credentials)
    
    spreadsheet = gc.open(spreadsheet_name)
    
    if type(lst) is not list:
        print('ERROR: input item is not a list!')
        return False
    
    # Remove rows with None value
    original_length = len(lst)
    lst = [item for item in lst if item is not None]
    new_length = len(lst)
    
    try:
        worksheet = spreadsheet.worksheet(worksheet_name)
        if overwrite:
            new_worksheet_name = worksheet_name + "_new"
            new_worksheet = spreadsheet.add_worksheet(new_worksheet_name, len(lst), 1)
            spreadsheet.del_worksheet(worksheet)
            new_worksheet.update_title(worksheet_name)
        else:
            print('Worksheet "{worksheet_name}" already exist! Please set overwrite=True to overwrite.')
            return False
    except WorksheetNotFound: 
        new_worksheet = spreadsheet.add_worksheet(worksheet_name, len(lst), 1)
    
    range_notation = 'A1:A{last_row_index}'.format(last_row_index=len(lst))
    
    cells_to_update = new_worksheet.range(range_notation)

    print('Remove {num_row} rows with "None" as their value.'.format(
        num_row=(original_length - new_length)))
    
    for cell, item in zip(cells_to_update, lst):
        cell.value = item
    
    new_worksheet.update_cells(cells_to_update)
    
    #Add a timestamp in the 1st cell
    if add_timestamp:
        timestamp = datetime.datetime.now()
        new_worksheet.insert_row(
            [timestamp], 1)
    
    print(
    'SAVED > {num_rows} rows to '
    'spreadsheet: "{spreadsheet_name}" | '
    'worksheet: "{worksheet_name}" | '
    'timestamp: {timestamp}'.format(
        num_rows=len(lst), spreadsheet_name=spreadsheet_name, 
        worksheet_name=worksheet_name, timestamp=timestamp), '\n')
    
    return True

test_lst = ['michale', 'kobe'] + [i for i in range(98)]
print(len(test_lst))
save_list_to_worksheet(test_lst, 'test_spreadsheet', 'test', add_timestamp=True, overwrite=True)

100
Remove 0 rows with "None" as their value.
SAVED > 100 rows to spreadsheet: "test_spreadsheet" | worksheet: "test" | timestamp: 2017-11-25 12:45:20.404712 

CPU times: user 97.7 ms, sys: 13.6 ms, total: 111 ms
Wall time: 3.89 s


### Save & Load Pickle

In [9]:
%%time

test = {
    'words': """
        Lorem ipsum dolor sit amet, consectetur adipiscing 
        elit. Mauris adipiscing adipiscing placerat. 
        Vestibulum augue augue, 
        pellentesque quis sollicitudin id, adipiscing.
        """,
    'list': list(range(10000)),
    'dict': dict((str(i),'a') for i in range(10000)),
    'int': 100,
    'float': 100.123456
}

def sizeof_fmt(num, suffix='B'):
    for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']:
        if abs(num) < 1024.0:
            return "%3.1f %s%s" % (num, unit, suffix)
        num /= 1024.0
    return "%.1f %s%s" % (num, 'Yi', suffix)

def get_file_size(filename):
    statinfo = os.stat(filename)
    return sizeof_fmt(statinfo.st_size)

def save_pickle(item, filename):
    with open(filename, 'wb') as file:
        pickle.dump(item, file)
    print(
        '\n'
        'SAVED  > ',
        filename, ' | ', 
        get_file_size(filename), ' | ',
        'length: ', len(item),
        '\n')
    return True;

def load_pickle(filename):
    with open(filename, 'rb') as file:
        obj = pickle.load(file)
        print(
        '\n'
        'LOADED > ',
        filename, ' | ', 
        get_file_size(filename), ' | ',
        'length: ', len(obj),
        '\n')
        return obj

save_pickle(test, 'test.pickle')

len(load_pickle('test.pickle'))


SAVED  >  test.pickle  |  183.8 KiB  |  length:  5 


LOADED >  test.pickle  |  183.8 KiB  |  length:  5 

CPU times: user 10.6 ms, sys: 4.75 ms, total: 15.3 ms
Wall time: 15.7 ms


## Reshape & create dataframes

In [44]:
%%time

all_players_stats_tables = load_pickle('all_players_stats_tables.pickle')
all_players_attributes = load_pickle('all_players_attributes.pickle')

all_players_attributes_df = pd.DataFrame.from_dict(all_players_attributes, orient='index')
print(all_players_attributes_df.shape)

all_players_dfs = {}
for table_id in TABLE_IDS:
    l_dfs = []
    l_keys = []
    for name, player in all_players_stats_tables.items():
        df = player.get('tables', {}).get(table_id, None)
        if df is not None:
            l_dfs.append(df)
            l_keys.append(name)
    all_players_dfs[table_id] = pd.concat(l_dfs, keys=l_keys)

all_players_dfs['all_players_attributes'] = all_players_attributes_df

save_pickle(all_players_dfs, 'all_players_dfs.pickle')

print('# of Dataframes created: ', len(all_players_dfs))
for key in all_players_dfs.keys():
    print(key, all_players_dfs[key].shape)


LOADED >  all_players_stats_tables.pickle  |  97.6 MiB  |  length:  3997 


LOADED >  all_players_attributes.pickle  |  5.8 MiB  |  length:  3997 

(3997, 19)

SAVED  >  all_players_dfs.pickle  |  51.5 MiB  |  length:  14 

# of Dataframes created:  14
per_game (30982, 29)
totals (29804, 29)
per_minute (29804, 28)
per_poss (24489, 30)
advanced (29804, 28)
playoffs_per_game (12319, 29)
playoffs_totals (12319, 29)
playoffs_per_minute (12319, 28)
playoffs_per_poss (9756, 30)
playoffs_advanced (12319, 26)
all_star (2154, 24)
all_college_stats (14251, 3)
all_salaries (15807, 3)
all_players_attributes (3997, 19)
CPU times: user 1min 12s, sys: 1.24 s, total: 1min 14s
Wall time: 1min 19s


## Get Players

### Get players from a single URL

In [None]:
def get_players(url):
    
    href_pattern = re.compile('^\/players\/.\/[a-z0-9]*.html$')
    href_prefix = 'https://www.basketball-reference.com'
    
    page = urllib.request.urlopen(url)
    html = page.read()

    # Get the player name
    soup = BeautifulSoup(html, 'html.parser')
    
    els = soup.find_all('a', href=href_pattern)
    
    players = {}
    
    for el in els:
        if el.parent.name == 'td':
            player_name = sanitize_string(el.text)
            player_url = ''.join([href_prefix, el['href']])
            players[player_name] = player_url
    
    randomized_sleep_time = 5 + np.random.exponential(1, 1)[0]
    time.sleep(randomized_sleep_time)
    
    print('Scrapped {url} | Players Found: {len}'.format(url=url, len=len(players)))
    sys.stdout.flush()

    return players

players = get_players('https://www.basketball-reference.com/leagues/NBA_1967_totals.html')

### Get players from a list of URLs, Multiprocessing

In [None]:
%%time

def get_players_from_urls(urls, num_processes):
    p = multiprocessing.Pool(processes=num_processes)
    outputs = p.map(get_players, urls)
    p.close()
    final_output = merge_list_of_dict(outputs)
    print(
        'Scrapped {num_url} urls, found {num_player} players.'.format(
            num_url=len(urls), num_player=len(final_output)), '\n')
    return final_output

test_urls = [
    'https://www.basketball-reference.com/leagues/NBA_2015_totals.html',
    'https://www.basketball-reference.com/leagues/NBA_2010_totals.html'
]

players = get_players_from_urls(test_urls, 2)

## GET URL (DEPRECATED, replaced by "Get Players")

### Get URL for a player name

In [None]:
# %%time

# #TODO(jameshu): Add logic to verify the url returned  in fact matches the player name
# # Currently, even gibberish player_name e.g. "James Hu" would have results returned.

# def get_url_title(url):
#     page = urllib.request.urlopen(url)
#     soup = BeautifulSoup(page, "html.parser")
#     return soup.title.text

# def get_url(player_name):       
#     query = (
#         'site:www.basketball-reference.com/players/*/*.html '
#         '{player_name} Overview').format(player_name=player_name)
#     print('query: ', query)

#     results = google.search(query=query, start=0, stop=1)
#     urls = list(results)        
    
#     time.sleep(random.randint(5, 10))
    
#     if urls:
#         return {player_name: urls[0]}
#     else:
#         print('url found: None')
#         return {player_name: None}
        
# # print(get_url('Michael Jordan'))

### Get URLs for a list of player names, MULTIPROCESSING

In [None]:
# %%time

# def get_urls(player_names, num_processes):
#     p = multiprocessing.Pool(processes=num_processes)
#     outputs = p.map(get_url, player_names)
#     p.close()
#     return merge_list_of_dict(outputs)

# # print(get_urls(test_names[0:2], 2))

## Get stats tables

### Get stats table for an url

In [None]:
%matplotlib inline

# Test out the sleep function

# mu, sigma = 0, 1
# s = np.random.normal(mu, sigma, 1000)
# pd.Series(s).hist()

# mu, sigma = 0, 1
# s = np.random.exponential(1, 100000)
# pd.Series(s).hist()


In [None]:
%%time

def get_stats_table(url):
        
    output = {}
    
    page = urllib.request.urlopen(url)
    urlHtml = page.read().decode()
    
    # Uncomment the tables
    uncommentedUrlHtml = urlHtml.replace('-->', '')
    uncommentedUrlHtml = uncommentedUrlHtml.replace('<!--', '')

    soup = BeautifulSoup(uncommentedUrlHtml, 'lxml')
    
    player_name = sanitize_string(soup.find("h1").text)
    output.setdefault(player_name, {}).setdefault('url', url);

    tags = soup.find_all('table')
    
    tables = {}
    missing_table_ids = list(TABLE_IDS) # MAKE A COPY
    
    for tag in tags:
        table_id = tag.get('id')
        if table_id in TABLE_IDS:
            table = pd.read_html(str(tag), header=0, index_col=0)[0]
            tables[table_id] = table
            missing_table_ids.remove(table_id)

    for dataframe in tables.values():
        dataframe.drop([col_name for col_name in dataframe.columns if 'Unnamed' in col_name], axis=1, inplace=True)
       
    output[player_name].setdefault('tables', tables);
    output[player_name].setdefault('missing_tables', missing_table_ids);
    
    randomized_sleep_time = 5 + np.random.exponential(1, 1)[0]
    time.sleep(randomized_sleep_time)
    
    processing_info = (
        '{player_name} | Found: {num_table} | '
        'slept: {randomized_sleep_time}'.format(
            player_name=player_name, 
            num_table=len(output[player_name]['tables']), 
            missing_tables=missing_table_ids,
            randomized_sleep_time=randomized_sleep_time))
    
    print(processing_info)
    sys.stdout.flush()

    return output

table = get_stats_table('https://www.basketball-reference.com/players/b/bellawa01.html')

In [None]:
table['walt bellamy']['tables']['totals']

In [None]:
table['walt bellamy']['missing_tables']

In [None]:
table['walt bellamy']['url']

### Get stats tables for a list of urls, MULTIPROCESSING

In [None]:
%%time

def get_stats_tables(urls, num_processes):
    pool = multiprocessing.Pool(processes=num_processes)
    jobs = pool.imap_unordered(get_stats_table, urls)
    size = len(urls)
    outputs = tqdm.tqdm_notebook(jobs, total=size)
    pool.close()
#     pool.join()
    return merge_list_of_dict(outputs)

test_urls = [
    'https://www.basketball-reference.com/players/b/bellawa01.html',
    'https://www.basketball-reference.com/players/j/jordami01.html'
]

tables = get_stats_tables(test_urls, 2)
print('obj length: ', len(tables))

In [None]:
tables['michael jordan']['tables']

## Get Player Attributes

### Get a player's attributes from an URL

In [61]:
%%time


def get_player_attributes(url):

    output = {}
    
    try:

        page = urllib.request.urlopen(url)
        urlHtml = page.read().decode()

        # Uncomment the tables
        uncommentedUrlHtml = urlHtml.replace('-->', '')
        uncommentedUrlHtml = uncommentedUrlHtml.replace('<!--', '')

        soup = BeautifulSoup(uncommentedUrlHtml, 'lxml')

        player_name = sanitize_string(soup.find("h1").text)
        output.setdefault(player_name, {}).setdefault('url', url)
        output.setdefault(player_name, {}).setdefault('missing_attributes', [])

        # Get all info, for future extraction
        tag = soup.find('div', attrs={'id': 'info'})
        if tag:
            player_info = tag
#             output['player_info_raw'] = tag
        else:
            output[player_name]['missing_attributes'].append('player_info_raw')

        tag = player_info.find('strong',text=re.compile('.*Position.*'))
        if tag:
            position = tag.parent.contents[2]
            position = position.replace('\n','').replace('▪','').strip()
            position = re.sub('\s{2,}', ' ', position)
            shooting_hand = tag.parent.contents[-1]
            shooting_hand = shooting_hand.replace('\n','').replace('▪','').strip()
            shooting_hand = re.sub('\s{2,}', ' ', shooting_hand)
            output[player_name]['position'] = position
            output[player_name]['shooting_hand'] = shooting_hand
        else:
            output[player_name]['missing_attributes'].extend(['position', 'shooting_hand'])

        tag = player_info.find('strong',text=re.compile('.*High School.*'))
        if tag:
            school = tag.parent.text.replace('\n', '').split(':')
            school = school[-1].strip()
            school = re.sub('\s{2,}', ' ', school)
            output[player_name]['high_school'] = school
        else:
            output[player_name]['missing_attributes'].append('high_school')
            
        tag = player_info.find('strong',text=re.compile('.*Relatives.*'))
        if tag:
            tags = tag.parent.find_all('a')
            if tags:
                relatives = []
                for tag in tags:
                    relatives.append(tag.text.strip().replace('\xa0', ' '))
                output[player_name]['relatives'] = relatives
            else:
                output[player_name]['missing_attributes'].append('relatives')
        else:
            output[player_name]['missing_attributes'].append('relatives')

        tag = player_info.find('strong',text=re.compile('.*College.*'))
        if tag:
            tag = tag.parent.find('a')
            if tag:
                college = tag.text.strip()
                output[player_name]['college'] = college
            else: 
                output[player_name]['missing_attributes'].append('college')
        else:
            output[player_name]['missing_attributes'].append('college')

        tag = player_info.find('strong',text=re.compile('.*Recruiting Rank.*'))
        if tag:
            recruiting_rank = tag.parent.text.strip()
            recruiting_rank = re.search('\(([0-9]*)\)', recruiting_rank).group(1)
            output[player_name]['recruiting_rank'] = recruiting_rank
        else:
            output[player_name]['missing_attributes'].append('recruiting_rank')

        tag = player_info.find('strong',text=re.compile('.*Draft.*'))
        if tag:
            draft = tag.parent.text.replace('\n', '').split(':')
            draft = draft[-1].strip()
            draft = re.sub('\s{2,}', ' ', draft)
            output[player_name]['draft'] = draft
        else:
            output[player_name]['missing_attributes'].append('draft')

        tag = player_info.find('strong',text=re.compile('.*Debut.*'))
        if tag:
            nba_debute = tag.parent.contents[2]
            output[player_name]['nba_debut'] = nba_debute.text.strip()
        else:
            output[player_name]['missing_attributes'].append('nba_debut')            

        href_pattern = re.compile('^https://twitter.com/.*$')
        tag = player_info.find('a', href=href_pattern)
        if tag:
            output[player_name]['twitter'] = tag['href'].strip()
        else:
            output[player_name]['missing_attributes'].append('twitter')  

        tag = player_info.find('span', attrs={'itemprop': 'birthDate'})
        if tag:
            output[player_name]['birth_date'] = tag['data-birth'].strip()
        else:
            output[player_name]['missing_attributes'].append('birth_date')  

        tag = player_info.find('span', attrs={'itemprop': 'birthPlace'})
        if tag:
            tag = tag.find('a')
            if tag:
                state = tag.text.strip()
                birth_place = state
                tag = tag.previous_element
                if tag:
                    city = tag.strip()
                    if city not in ['in', '']:
                        city = re.search('in\s(.*),', city).group(1)
                        birth_place = ', '.join([city, state])
                output[player_name]['birth_place'] = birth_place
            else:
                output[player_name]['missing_attributes'].append('birth_place')
        else:
            output[player_name]['missing_attributes'].append('birth_place')  

        tag = player_info.find('span', attrs={'itemprop': 'height'})
        if tag:
            output[player_name]['height'] = tag.text.strip()
        else:
            output[player_name]['missing_attributes'].append('height')  

        tag = player_info.find('span', attrs={'itemprop': 'weight'})
        if tag:
            output[player_name]['weight'] = tag.text.strip()
        else:
            output[player_name]['missing_attributes'].append('weight')  

        tags = player_info.find('ul', attrs={'id': 'bling'})
        if tags:
            tags = tags.find_all('a')
            if tags:
                output[player_name]['honors'] = []
                for tag in tags:
                    output[player_name]['honors'].append(tag.text.strip())
            else:
                output[player_name]['missing_attributes'].append('honors')
        else:
            output[player_name]['missing_attributes'].append('honors')  

        tag = soup.find('p', text=re.compile('.*Chinese:.*'))
        if tag:
            chinese_name = tag.text.split(':')[-1].replace('數據','').strip()
            output[player_name]['chinese_name'] = chinese_name
        else:
            output[player_name]['missing_attributes'].append('chinese_name')  

        tags = soup.find_all('p', attrs={'class': 'transaction '})
        if tags:
            for tag in tags:
                transaction_date = tag.find('strong').text.strip()
                transaction = tag.text.split(':')[-1].strip()
                transaction = re.sub('\s{2,}', ' ', transaction)
                output[player_name].setdefault('transactions', {})[transaction_date] = transaction
        else:
            output[player_name]['missing_attributes'].append('transactions')  

        tag = player_info.find('p', text=re.compile('.*\(.*\).*'))
        if tag:
            nicknames = tag.text.replace('\n','').split(',')
            nicknames = [nickname.replace('(','').replace(')','').strip() for nickname in nicknames]
            output[player_name]['nicknames'] = nicknames
        else:
            output[player_name]['missing_attributes'].append('nicknames')  

        tags = player_info.find_all('svg', attrs={'class': 'jersey'})
        if tags:
            for tag in tags:
                jersey_number = tag.find('text').text.strip()
                team = tag.parent['data-tip'].strip()
                output[player_name].setdefault('numbers', {})[jersey_number] = team
        else:
            output[player_name]['missing_attributes'].append('numbers')  


        randomized_sleep_time = 5 + np.random.exponential(1, 1)[0]
        time.sleep(randomized_sleep_time)

        processing_info = (
            '{player_name} | Missing: {num_missing} | '
            'slept: {randomized_sleep_time}'.format(
                player_name=player_name, 
                num_missing=len(output[player_name]['missing_attributes']), 
                randomized_sleep_time=randomized_sleep_time))

        print(processing_info)
        sys.stdout.flush()
    
    except Exception as e:
        print(url, " FAILED! | ", str(e))
        output[url] = e
                
    return output

# test_url = 'https://www.basketball-reference.com/players/m/mingya01.html'
# test_url = "https://www.basketball-reference.com/players/b/bellawa01.html"
# test_url = 'https://www.basketball-reference.com/players/r/redicjj01.html'
# test_url = 'https://www.basketball-reference.com/players/n/novakst01.html'
# test_url = 'https://www.basketball-reference.com/players/j/jordami01.html'
# test_url = 'https://www.basketball-reference.com/players/h/hairsal01.html'
# test_url = 'https://www.basketball-reference.com/players/h/henryal01.html' 
# test_url = 'https://www.basketball-reference.com/players/w/waltobi01.html'
# test_url = 'https://www.basketball-reference.com/players/s/stepavl01.html' # Test birth_place no city
# test_url = 'https://www.basketball-reference.com/players/b/barrybr01.html' # Test Relatives
    
test_output = get_player_attributes(test_url)
pprint.pprint(test_output)

steve novak | Missing: 4 | slept: 5.119533671390849
{'steve novak': {'birth_date': '1983-06-13',
                 'birth_place': 'Libertyville, Illinois',
                 'chinese_name': '史蒂夫·诺瓦克',
                 'college': 'Marquette University',
                 'draft': 'Houston Rockets, 2nd round (2nd pick, 32nd '
                          'overall), 2006 NBA Draft',
                 'height': '6-10',
                 'high_school': 'Brown Deer in Brown Deer, Wisconsin',
                 'missing_attributes': ['relatives',
                                        'twitter',
                                        'honors',
                                        'nicknames'],
                 'nba_debut': 'November 1, 2006',
                 'numbers': {'16': 'Utah Jazz, 2015',
                             '20': 'Los Angeles Clippers, 2009-2010',
                             '21': 'Dallas Mavericks, 2011',
                             '23': 'San Antonio Spurs, 2011',
            

In [53]:
# html = u'''
# <span itemprop="birthPlace">
#     in&nbsp;Brooklyn,&nbsp;<a href="/friv/birthplaces.cgi?country=US&amp;state=NY">New York</a></span>
# '''

# html = u'''
#     <span itemprop="birthPlace">
#         in&nbsp;<a href="/friv/birthplaces.cgi?country=GE&amp;state=">Georgia</a></span>
# '''

# html = u'''
#     <span itemprop="birthPlace"></span>
# '''

html = '''
<p><strong>Relatives</strong>: Brothers&nbsp;<a itemprop="relatedTo" href="/players/b/barrydr01.html">Drew&nbsp;Barry</a>, <a itemprop="relatedTo" href="/players/b/barryjo01.html">Jon&nbsp;Barry</a>; Father&nbsp;<a itemprop="relatedTo" href="/players/b/barryri01.html">Rick&nbsp;Barry</a>; Grandfather&nbsp;<a itemprop="relatedTo" href="/players/h/halebr01.html">Bruce&nbsp;Hale</a> </p>
'''

soup = BeautifulSoup(html, 'lxml')

tag = soup.find('strong',text=re.compile('.*Relatives.*'))
if tag:
    tags = tag.parent.find_all('a')
    if tags:
        relatives = []
        for tag in tags:
            relatives.append(tag.text.strip().replace('\xa0', ' '))
    else:
        print('not found')
    output = relatives
else:
    print('not found')
    
print(output)    

['Drew Barry', 'Jon Barry', 'Rick Barry', 'Bruce Hale']


In [62]:
%%time

def get_players_attributes(urls, num_processes):
    pool = multiprocessing.Pool(processes=num_processes)
    jobs = pool.imap_unordered(get_player_attributes, urls)
    size = len(urls)
    outputs = tqdm.tqdm_notebook(jobs, total=size)
    pool.close()
#     pool.join()
    return merge_list_of_dict(outputs)

test_urls = [
    'https://www.basketball-reference.com/players/b/bellawa01.html',
    'https://www.basketball-reference.com/players/j/jordami01.html',
    'https://www.basketball-reference.com/players/n/novakst01.html',
    'https://www.basketball-reference.com/players/m/mingya01.html',
    'https://www.basketball-reference.com/players/b/bryanko01.html',
    'https://www.basketball-reference.com/players/r/redicjj01.html'
]

attributes = get_players_attributes(test_urls, 4)
print('obj length: ', len(attributes))

A Jupyter Widget

walt bellamy | Missing: 3 | slept: 5.718610768524977
yao ming | Missing: 4 | slept: 5.718610768524977
steve novak | Missing: 4 | slept: 5.718610768524977
michael jordan | Missing: 3 | slept: 5.718610768524977
kobe bryant | Missing: 2 | slept: 8.956263373233707
jj redick | Missing: 3 | slept: 8.956263373233707



NameError: name 'tables' is not defined

In [42]:
save_pickle(all_players_dfs, 'all_players_dfs.pickle')


SAVED  >  all_players_dfs.pickle  |  51.5 MiB  |  length:  14 



True

## Burner

In [11]:
sample_dict = {
    "a": {"a_attribute_1": "adfa", "a_attribute_2": {}, "a_attribute_3": ['a', 'b', 'c'], "a_attribute_4": '123'},
    "b": {"b_attribute_1": "erqwerdfa", "b_attribute_2": {}, "b_attribute_3": ['a', 'b', 'c']}
}

pd.DataFrame.from_dict(sample_dict, orient='index')

Unnamed: 0,a_attribute_1,a_attribute_2,a_attribute_3,a_attribute_4,b_attribute_1,b_attribute_2,b_attribute_3
a,adfa,{},"[a, b, c]",123.0,,,
b,,,,,erqwerdfa,{},"[a, b, c]"


In [63]:
%%time

all_players = load_pickle('all_players.pickle')
all_players_attributes = get_players_attributes(list(all_players.values()), 4)
save_pickle(all_players_attributes, 'all_players_attributes.pickle')


LOADED >  all_players.pickle  |  363.5 KiB  |  length:  3998 



A Jupyter Widget

aaron gray | Missing: 5 | slept: 5.718610768524977
aaron brooks | Missing: 3 | slept: 5.718610768524977
aaron harrison | Missing: 5 | slept: 5.718610768524977
aaron gordon | Missing: 3 | slept: 5.718610768524977
aaron james | Missing: 4 | slept: 8.956263373233707
aaron mckie | Missing: 4 | slept: 8.956263373233707
aaron miles | Missing: 6 | slept: 8.956263373233707
aaron swinson | Missing: 7 | slept: 8.956263373233707
aaron williams | Missing: 7 | slept: 5.710200505894018
abdel nader | Missing: 6 | slept: 5.710200505894018
abdul jeelani | Missing: 6 | slept: 5.710200505894018
ac green | Missing: 2 | slept: 5.710200505894018
acie earl | Missing: 5 | slept: 5.132143799474683
acie law | Missing: 4 | slept: 5.132143799474683
adam harrington | Missing: 6 | slept: 5.132143799474683
adam keefe | Missing: 5 | slept: 5.132143799474683
adam morrison | Missing: 3 | slept: 5.545732810827167
adonal foyle | Missing: 4 | slept: 5.545732810827167
adonis jordan | Missing: 5 | slept: 5.545732810827167
a

andy toolson | Missing: 7 | slept: 5.426220361913156
andy walker | Missing: 6 | slept: 5.023337622086227
anfernee hardaway | Missing: 2 | slept: 5.426220361913156
ansu sesay | Missing: 6 | slept: 5.426220361913156
antawn jamison | Missing: 4 | slept: 5.073064800324845
ante zizic | Missing: 8 | slept: 5.426220361913156
anthony avent | Missing: 4 | slept: 5.073064800324845
anthony bennett | Missing: 3 | slept: 5.073064800324845
anthony bonner | Missing: 4 | slept: 5.204339788763715
anthony bowie | Missing: 4 | slept: 5.073064800324845
anthony brown | Missing: 3 | slept: 5.204339788763715
anthony carter | Missing: 6 | slept: 5.204339788763715
anthony cook | Missing: 4 | slept: 5.379184362561668
anthony davis | Missing: 1 | slept: 5.204339788763715
anthony frederick | Missing: 5 | slept: 5.379184362561668
anthony goldwire | Missing: 5 | slept: 5.379184362561668
anthony grundy | Missing: 7 | slept: 5.941627122758637
anthony johnson | Missing: 6 | slept: 5.379184362561668
anthony jones | Mis

bill mlkvy | Missing: 4 | slept: 5.7053120601239655
bill roberts | Missing: 9 | slept: 5.122554296776338
bill robinzine | Missing: 6 | slept: 5.600817274791857
bill russell | Missing: 3 | slept: 5.600817274791857
bill sharman | Missing: 3 | slept: 5.600817274791857
bill smith | Missing: 7 | slept: 5.235428574510796
bill stricker | Missing: 8 | slept: 5.125955172777373
bill thieben | Missing: 8 | slept: 5.125955172777373
bill tosheff | Missing: 6 | slept: 5.125955172777373
bill turner | Missing: 8 | slept: 5.7053120601239655
bill walton | Missing: 1 | slept: 5.418461257161338
bill wennington | Missing: 3 | slept: 5.418461257161338
bill willoughby | Missing: 7 | slept: 5.418461257161338
bill zopf | Missing: 6 | slept: 5.600817274791857
billy cunningham | Missing: 3 | slept: 7.978240043694083
billy donovan | Missing: 4 | slept: 7.978240043694083
billy kenville | Missing: 5 | slept: 5.125955172777373
billy hassett | Missing: 7 | slept: 7.978240043694083
billy owens | Missing: 5 | slept: 5.

brandon jennings | Missing: 4 | slept: 6.410074072975017
brandon knight | Missing: 3 | slept: 6.501705024271881
brandon paul | Missing: 6 | slept: 5.318355661219153
brandon roy | Missing: 2 | slept: 5.318355661219153
brandon williams | Missing: 7 | slept: 5.318355661219153
brandon rush | Missing: 2 | slept: 6.361195968350913
brant weidner | Missing: 6 | slept: 5.322618123820028
brendan haywood | Missing: 3 | slept: 5.322618123820028
brendan mccann | Missing: 5 | slept: 5.322618123820028
brent barry | Missing: 1 | slept: 6.501705024271881
brent price | Missing: 5 | slept: 7.48952204130436
brent scott | Missing: 7 | slept: 7.48952204130436
brett szabo | Missing: 7 | slept: 7.48952204130436
brett vroman | Missing: 5 | slept: 5.318355661219153
brevin knight | Missing: 3 | slept: 5.307241167999105
brian cardinal | Missing: 2 | slept: 5.307241167999105
brian davis | Missing: 5 | slept: 5.322618123820028
brian cook | Missing: 3 | slept: 5.307241167999105
brian evans | Missing: 5 | slept: 5.49

charlie bell | Missing: 5 | slept: 5.895729124235184
charlie black | Missing: 5 | slept: 6.8491043235355376
charlie davis | Missing: 5 | slept: 5.268247288465215
charlie criss | Missing: 5 | slept: 6.797422002002266
charlie hardnett | Missing: 5 | slept: 6.8491043235355376
charlie lowery | Missing: 7 | slept: 5.268247288465215
charlie parsley | Missing: 10 | slept: 5.562467070428111
charlie paulk | Missing: 7 | slept: 5.281837976451104
charlie scott | Missing: 4 | slept: 5.268247288465215
charlie sitton | Missing: 6 | slept: 5.562467070428111
charlie tyra | Missing: 5 | slept: 6.270574180920691
charlie villanueva | Missing: 3 | slept: 5.895729124235184
charlie ward | Missing: 6 | slept: 5.562467070428111
charlie yelverton | Missing: 7 | slept: 6.270574180920691
chasson randle | Missing: 5 | slept: 6.8491043235355376
chase budinger | Missing: 3 | slept: 8.191108123697628
chauncey billups | Missing: 2 | slept: 6.270574180920691
cheese johnson | Missing: 6 | slept: 8.191108123697628
cheic

corey maggette | Missing: 4 | slept: 5.229005468271124
corey gaines | Missing: 5 | slept: 7.601742405391615
corey williams | Missing: 6 | slept: 6.143085379554792
corky calhoun | Missing: 4 | slept: 5.613624502586431
corky devlin | Missing: 7 | slept: 6.143085379554792
corie blount | Missing: 5 | slept: 7.601742405391615
corliss williamson | Missing: 4 | slept: 5.365872226681091
cornell warner | Missing: 6 | slept: 5.365872226681091
cornelius cash | Missing: 6 | slept: 8.052989644931682
corny thompson | Missing: 6 | slept: 6.143085379554792
corsley edwards | Missing: 5 | slept: 5.553731947912975
cory alexander | Missing: 5 | slept: 5.553731947912975
cory carr | Missing: 4 | slept: 5.365872226681091
cory blackwell | Missing: 5 | slept: 7.601742405391615
cory higgins | Missing: 5 | slept: 7.314622038035543
cory jefferson | Missing: 4 | slept: 7.314622038035543
cory joseph | Missing: 2 | slept: 5.553731947912975
cotton nash | Missing: 5 | slept: 6.143085379554792
coty clarke | Missing: 7 

dave johnson | Missing: 5 | slept: 7.715327991134051
dave lattin | Missing: 4 | slept: 8.50415652456842
dave magley | Missing: 6 | slept: 5.413946310711032
dave meyers | Missing: 5 | slept: 6.538547673243761
dave minor | Missing: 5 | slept: 6.538547673243761
dave newmark | Missing: 5 | slept: 7.715327991134051
dave piontek | Missing: 7 | slept: 8.50415652456842
dave popson | Missing: 6 | slept: 9.682278799871845
dave schellhase | Missing: 6 | slept: 6.538547673243761
dave robisch | Missing: 5 | slept: 9.682278799871845
dave scholz | Missing: 7 | slept: 7.715327991134051
dave sorenson | Missing: 6 | slept: 5.596741065651338
dave twardzik | Missing: 4 | slept: 5.596741065651338
dave zeller | Missing: 7 | slept: 5.1574845777293215
dave wohl | Missing: 6 | slept: 6.538547673243761
dave stallworth | Missing: 4 | slept: 9.682278799871845
david andersen | Missing: 7 | slept: 5.1574845777293215
david benoit | Missing: 6 | slept: 5.510132687755352
david cooke | Missing: 7 | slept: 5.59674106565

dick okeefe | Missing: 8 | slept: 5.117692707664256
dick miller | Missing: 6 | slept: 6.878007684974844
dick ricketts | Missing: 5 | slept: 5.117692707664256
dick rosenthal | Missing: 7 | slept: 5.071170595814649
dick schnittker | Missing: 5 | slept: 6.666011166788513
dick schulz | Missing: 7 | slept: 6.270840660874598
dick snyder | Missing: 5 | slept: 6.666011166788513
dick surhoff | Missing: 8 | slept: 6.878007684974844
dick triptow | Missing: 6 | slept: 5.014667509505398
dick van arsdale | Missing: 3 | slept: 5.117692707664256
dickey simpkins | Missing: 5 | slept: 5.014667509505398
dijon thompson | Missing: 5 | slept: 5.957493104863484
dj mbenga | Missing: 7 | slept: 6.270840660874598
dike eddleman | Missing: 4 | slept: 6.666011166788513
dikembe mutombo | Missing: 3 | slept: 5.957493104863484
dillard crocker | Missing: 7 | slept: 5.607879898470997
dillon brooks | Missing: 5 | slept: 5.117692707664256
dino radja | Missing: 6 | slept: 5.014667509505398
dion glover | Missing: 6 | slept

ed gray | Missing: 5 | slept: 5.291621281965137
ed horton | Missing: 5 | slept: 5.389701570185757
ed kalafat | Missing: 6 | slept: 6.148703587389809
ed leede | Missing: 6 | slept: 5.573105607687524
ed macauley | Missing: 3 | slept: 5.389701570185757
ed manning | Missing: 6 | slept: 5.0074185735786445
ed mikan | Missing: 5 | slept: 5.291621281965137
ed obannon | Missing: 5 | slept: 5.0074185735786445
ed nealy | Missing: 6 | slept: 6.148703587389809
ed peterson | Missing: 8 | slept: 5.350527273485405
ed pinckney | Missing: 5 | slept: 5.389701570185757
ed rains | Missing: 6 | slept: 5.350527273485405
ed ratleff | Missing: 6 | slept: 5.291621281965137
ed sadowski | Missing: 6 | slept: 6.487104447636624
ed searcy | Missing: 6 | slept: 5.0074185735786445
ed smith | Missing: 7 | slept: 5.389701570185757
ed sherod | Missing: 7 | slept: 6.487104447636624
ed stanczak | Missing: 8 | slept: 5.523945164107883
ed stokes | Missing: 6 | slept: 5.350527273485405
eddie basden | Missing: 5 | slept: 5.007

fred hetzel | Missing: 4 | slept: 5.047391226974887
fred foster | Missing: 5 | slept: 7.323876341376785
fred hilton | Missing: 6 | slept: 5.606589621368313
fred hoiberg | Missing: 3 | slept: 6.732047843175766
fred lacour | Missing: 6 | slept: 5.210370557662327
fred jones | Missing: 4 | slept: 7.323876341376785
fred roberts | Missing: 6 | slept: 6.732047843175766
fred schaus | Missing: 6 | slept: 5.606589621368313
fred saunders | Missing: 6 | slept: 7.43725199316149
fred scolari | Missing: 5 | slept: 5.210370557662327
fred taylor | Missing: 6 | slept: 7.43725199316149
fred vanvleet | Missing: 6 | slept: 6.732047843175766
freddie boyd | Missing: 5 | slept: 5.606589621368313
fred vinson | Missing: 7 | slept: 8.31779528965492
freddie crawford | Missing: 4 | slept: 8.31779528965492
furkan aldemir | Missing: 6 | slept: 5.462516954799719
freeman williams | Missing: 6 | slept: 6.732047843175766
freddie lewis | Missing: 3 | slept: 7.43725199316149
furkan korkmaz | Missing: 8 | slept: 5.46251695

greg grant | Missing: 4 | slept: 5.571312783608938
greg griffin | Missing: 6 | slept: 5.751459894371455
greg howard | Missing: 4 | slept: 5.751459894371455
greg hyder | Missing: 7 | slept: 5.310607187815741
greg jackson | Missing: 6 | slept: 5.763275865395458
greg kelser | Missing: 3 | slept: 5.056763166178243
greg kite | Missing: 4 | slept: 5.056763166178243
greg lee | Missing: 5 | slept: 6.974432536924224
greg minor | Missing: 5 | slept: 5.310607187815741
greg monroe | Missing: 1 | slept: 5.513397206672603
greg oden | Missing: 3 | slept: 5.513397206672603
greg ostertag | Missing: 5 | slept: 5.751459894371455
greg stiemsma | Missing: 5 | slept: 5.479341575328746
greg smith | Missing: 4 | slept: 6.974432536924224
greg stokes | Missing: 6 | slept: 5.479341575328746
greg sutton | Missing: 6 | slept: 5.056763166178243
greivis vasquez | Missing: 3 | slept: 5.207568632674977
guerschon yabusele | Missing: 8 | slept: 5.751459894371455
guillermo diaz | Missing: 5 | slept: 5.207568632674977
gun

jack kerris | Missing: 5 | slept: 6.000410955599437
jack haley | Missing: 5 | slept: 8.033999013390744
jack kiley | Missing: 7 | slept: 6.137824057869903
jack marin | Missing: 5 | slept: 5.857327981645998
jack mccloskey | Missing: 9 | slept: 5.857327981645998
jack mcmahon | Missing: 5 | slept: 7.003974004820165
jack nichols | Missing: 7 | slept: 5.126473416108855
jack molinas | Missing: 5 | slept: 8.033999013390744
jack parkinson | Missing: 7 | slept: 5.126473416108855
jack parr | Missing: 6 | slept: 6.000410955599437
jack phelan | Missing: 9 | slept: 6.586650078082163
jack smiley | Missing: 6 | slept: 6.586650078082163
jack sikma | Missing: 3 | slept: 7.003974004820165
jack stephens | Missing: 6 | slept: 5.857327981645998
jack toomay | Missing: 8 | slept: 6.162789314154283
jack turner | Missing: 6 | slept: 6.162789314154283
jack twyman | Missing: 5 | slept: 6.000410955599437
jackie butler | Missing: 7 | slept: 5.126473416108855
jackie dinkins | Missing: 7 | slept: 6.644493333305341
ja

jeff lebo | Missing: 7 | slept: 5.37245672578126
jeff lamp | Missing: 5 | slept: 6.912069701565744
jeff malone | Missing: 5 | slept: 6.177290475541847
jeff martin | Missing: 6 | slept: 6.01690494438135
jeff mcinnis | Missing: 6 | slept: 5.540422680753089
jeff mullins | Missing: 3 | slept: 6.01690494438135
jeff nordgaard | Missing: 6 | slept: 5.37245672578126
jeff sanders | Missing: 6 | slept: 5.029427702050848
jeff ruland | Missing: 4 | slept: 6.843512464050752
jeff slade | Missing: 8 | slept: 6.843512464050752
jeff taylor | Missing: 5 | slept: 5.540422680753089
jeff teague | Missing: 2 | slept: 6.912069701565744
jeff trepagnier | Missing: 6 | slept: 7.272679945895146
jeff webb | Missing: 8 | slept: 5.029427702050848
jeff turner | Missing: 6 | slept: 7.272679945895146
jeff webster | Missing: 6 | slept: 6.01690494438135
jeff wilkins | Missing: 5 | slept: 7.007718660641734
jeff withey | Missing: 4 | slept: 6.912069701565744
jeff taylor | Missing: 3 | slept: 7.007718660641734
jeffrey shep

joe barry carroll | Missing: 3 | slept: 5.250411516075664
joe binion | Missing: 5 | slept: 5.250411516075664
joe bryant | Missing: 3 | slept: 5.250620053721121
joe bradley | Missing: 6 | slept: 7.597922401533074
joe buckhalter | Missing: 5 | slept: 5.935128086486384
joe caldwell | Missing: 3 | slept: 5.935128086486384
joe cooke | Missing: 6 | slept: 5.738413174366845
joe cooper | Missing: 5 | slept: 5.070045749573543
joe courtney | Missing: 7 | slept: 5.1999089972282055
joe crawford | Missing: 3 | slept: 5.1999089972282055
joe dolhon | Missing: 9 | slept: 5.250411516075664
joe crispin | Missing: 7 | slept: 7.597922401533074
joe dumars | Missing: 3 | slept: 5.489443579461273
joe ellis | Missing: 6 | slept: 5.489443579461273
joe fulks | Missing: 5 | slept: 5.935128086486384
joe graboski | Missing: 8 | slept: 5.070045749573543
joe harris | Missing: 6 | slept: 5.002714738476001
joe hassett | Missing: 4 | slept: 5.002714738476001
joe holland | Missing: 7 | slept: 5.1999089972282055
joe holu

jonas valanciunas | Missing: 5 | slept: 5.009997152274785
jon sundvold | Missing: 5 | slept: 8.4383412843566
jonathan gibson | Missing: 7 | slept: 5.660625272340179
jonathan bender | Missing: 4 | slept: 10.886558703566848
jonathan kerner | Missing: 7 | slept: 5.686988581289729
jonathan isaac | Missing: 4 | slept: 8.4383412843566
jonny flynn | Missing: 4 | slept: 5.009997152274785
jordan adams | Missing: 3 | slept: 5.051195582014077
jordan bell | Missing: 3 | slept: 5.686988581289729
jonathon simmons | Missing: 5 | slept: 10.886558703566848
jordan farmar | Missing: 3 | slept: 5.051195582014077
jordan crawford | Missing: 3 | slept: 6.7956334303627495
jordan clarkson | Missing: 4 | slept: 8.4383412843566
jordan hamilton | Missing: 3 | slept: 5.009997152274785
jordan mcrae | Missing: 2 | slept: 6.12828554814098
jordan hill | Missing: 4 | slept: 6.7956334303627495
jordan mickey | Missing: 3 | slept: 5.686988581289729
jordan williams | Missing: 5 | slept: 8.4383412843566
jorge garbajosa | Mi

kent benson | Missing: 4 | slept: 5.402341376016096
kent bazemore | Missing: 7 | slept: 7.43998795180115
kentavious caldwell-pope | Missing: 3 | slept: 6.033415941229542
kenton edelin | Missing: 5 | slept: 5.402341376016096
kenyon martin | Missing: 2 | slept: 5.027123797298228
keon clark | Missing: 5 | slept: 6.599637636309335
kerry kittles | Missing: 4 | slept: 5.027123797298228
kermit washington | Missing: 4 | slept: 7.43998795180115
kevin brooks | Missing: 5 | slept: 5.382719635302719
kevin burleson | Missing: 6 | slept: 5.198688196191548
kevin duckworth | Missing: 3 | slept: 5.382719635302719
kevin durant | Missing: 1 | slept: 6.599637636309335
kevin edwards | Missing: 4 | slept: 6.033415941229542
kevin gamble | Missing: 5 | slept: 6.2957982904805645
kevin garnett | Missing: 3 | slept: 6.033415941229542
kevin grevey | Missing: 3 | slept: 5.198688196191548
kevin henderson | Missing: 5 | slept: 7.43998795180115
kevin johnson | Missing: 2 | slept: 7.036906841480752
kevin jones | Missi

lee nailon | Missing: 6 | slept: 5.030520657714739
lee shaffer | Missing: 5 | slept: 5.0205511213639795
lee winfield | Missing: 6 | slept: 5.174187757523728
len elmore | Missing: 4 | slept: 5.095609945392894
len kosmalski | Missing: 6 | slept: 5.174187757523728
len chappell | Missing: 3 | slept: 6.969914176127963
lennie rosenbluth | Missing: 6 | slept: 5.030520657714739
leo barnhorst | Missing: 3 | slept: 5.030520657714739
lenny wilkens | Missing: 4 | slept: 6.075517784694203
leo katkaveck | Missing: 8 | slept: 5.505788441230993
leo klier | Missing: 6 | slept: 5.095609945392894
leo kubiak | Missing: 7 | slept: 5.095609945392894
leo rautins | Missing: 4 | slept: 5.5021759559976315
leo mogus | Missing: 7 | slept: 6.969914176127963
leon benbow | Missing: 5 | slept: 6.075517784694203
leon blevins | Missing: 7 | slept: 6.075517784694203
leon douglas | Missing: 4 | slept: 5.9139353603464855
leon powe | Missing: 3 | slept: 5.505788441230993
leon smith | Missing: 6 | slept: 6.969914176127963
l

mark bradtke | Missing: 6 | slept: 5.943149657179598
mark blount | Missing: 4 | slept: 6.7751082156009
mark crow | Missing: 5 | slept: 5.050002366100347
mark davis | Missing: 6 | slept: 5.050002366100347
mark bryant | Missing: 5 | slept: 9.457904300816002
mark eaton | Missing: 4 | slept: 5.420549846559703
mark hendrickson | Missing: 5 | slept: 9.457904300816002
mark jones | Missing: 5 | slept: 6.7751082156009
mark landsberger | Missing: 4 | slept: 5.844658383580519
mark jackson | Missing: 3 | slept: 9.457904300816002
mark macon | Missing: 5 | slept: 6.7751082156009
mark madsen | Missing: 3 | slept: 5.420549846559703
mark mcnamara | Missing: 3 | slept: 5.314611125525001
mark minor | Missing: 7 | slept: 6.7751082156009
mark pope | Missing: 6 | slept: 5.420549846559703
mark olberding | Missing: 5 | slept: 5.844658383580519
mark radford | Missing: 6 | slept: 5.420549846559703
mark price | Missing: 3 | slept: 6.085615012539627
mark randall | Missing: 6 | slept: 5.844658383580519
mark sibley

mike bratz | Missing: 5 | slept: 5.563814864153702
mike brittain | Missing: 5 | slept: 5.403963707948836
mike brown | Missing: 5 | slept: 6.213875271774544
mike champion | Missing: 7 | slept: 5.662303071502656
mike conley | Missing: 1 | slept: 6.213875271774544
mike dantoni | Missing: 5 | slept: 5.662303071502656
mike davis | Missing: 6 | slept: 5.403963707948836
mike dunleavy | Missing: 2 | slept: 5.356505049588403
mike evans | Missing: 6 | slept: 5.403963707948836
mike farmer | Missing: 5 | slept: 5.356505049588403
mike flynn | Missing: 5 | slept: 5.662303071502656
mike gale | Missing: 3 | slept: 5.678645522636009
mike gibson | Missing: 5 | slept: 5.662303071502656
mike glenn | Missing: 4 | slept: 5.678645522636009
mike gminski | Missing: 4 | slept: 5.356505049588403
mike green | Missing: 3 | slept: 5.194451988368752
mike hall | Missing: 7 | slept: 5.356505049588403
mike harper | Missing: 4 | slept: 5.194451988368752
mike harris | Missing: 6 | slept: 5.678645522636009
mike holton | M

noah vonleh | Missing: 4 | slept: 5.94674633546925
noble jorgensen | Missing: 6 | slept: 5.613889341116505
noel felix | Missing: 7 | slept: 5.94674633546925
nolan smith | Missing: 3 | slept: 5.613889341116505
norm cook | Missing: 4 | slept: 5.613889341116505
norm grekin | Missing: 6 | slept: 5.118224548702731
norm mager | Missing: 6 | slept: 5.613889341116505
norm nixon | Missing: 4 | slept: 5.118224548702731
norm richardson | Missing: 7 | slept: 5.118224548702731
norm stewart | Missing: 6 | slept: 5.014832616295308
norm swanson | Missing: 6 | slept: 5.118224548702731
norm van lier | Missing: 4 | slept: 5.014832616295308
norman black | Missing: 7 | slept: 5.014832616295308
norman powell | Missing: 3 | slept: 5.309782102537337
normie glick | Missing: 8 | slept: 5.014832616295308
norris cole | Missing: 3 | slept: 5.309782102537337
norris coleman | Missing: 5 | slept: 5.309782102537337
norton barnhill | Missing: 6 | slept: 5.991877361455767
obinna ekezie | Missing: 5 | slept: 5.3097821025

pops mensah-bonsu | Missing: 6 | slept: 6.046966852238774
porter meriwether | Missing: 7 | slept: 6.046966852238774
predrag drobnjak | Missing: 6 | slept: 5.8734205408919085
predrag savovic | Missing: 8 | slept: 6.046966852238774
price brookfield | Missing: 7 | slept: 5.8734205408919085
priest lauderdale | Missing: 5 | slept: 5.8734205408919085
primoz brezec | Missing: 7 | slept: 5.06622154023633
purvis short | Missing: 5 | slept: 5.8734205408919085
quentin richardson | Missing: 3 | slept: 5.06622154023633
quincy acy | Missing: 5 | slept: 5.06622154023633
quincy douby | Missing: 6 | slept: 5.803831948914997
quincy lewis | Missing: 6 | slept: 5.06622154023633
quincy miller | Missing: 4 | slept: 5.803831948914997
quincy pondexter | Missing: 2 | slept: 5.803831948914997
quinn buckner | Missing: 4 | slept: 6.000176918653565
quinn cook | Missing: 6 | slept: 5.803831948914997
quintin dailey | Missing: 3 | slept: 6.000176918653565
quinton ross | Missing: 7 | slept: 6.000176918653565
qyntel wo

ricky blanton | Missing: 5 | slept: 7.716588041927561
ricky grace | Missing: 4 | slept: 5.508758943927893
ricky ledo | Missing: 5 | slept: 5.508758943927893
ricky davis | Missing: 4 | slept: 7.716588041927561
ricky marsh | Missing: 6 | slept: 5.508758943927893
ricky pierce | Missing: 4 | slept: 5.2570303766089435
ricky rubio | Missing: 5 | slept: 5.2570303766089435
ricky sobers | Missing: 5 | slept: 5.508758943927893
ricky wilson | Missing: 6 | slept: 5.2570303766089435
rik smits | Missing: 3 | slept: 6.552658551637815
rob kurz | Missing: 7 | slept: 5.2570303766089435
rj hunter | Missing: 6 | slept: 6.552658551637815
rob lock | Missing: 6 | slept: 6.552658551637815
rob rose | Missing: 7 | slept: 5.8212564478075
robbie hummel | Missing: 4 | slept: 5.8212564478075
rob williams | Missing: 6 | slept: 6.552658551637815
robert archibald | Missing: 4 | slept: 5.8212564478075
robert churchwell | Missing: 7 | slept: 5.007897912997483
robert covington | Missing: 4 | slept: 5.007897912997483
robe

sam mack | Missing: 7 | slept: 5.128367499061401
sam mitchell | Missing: 5 | slept: 5.128367499061401
sam pellom | Missing: 7 | slept: 5.128367499061401
sam ranzino | Missing: 6 | slept: 5.2630935907149565
sam perkins | Missing: 4 | slept: 5.079508757439087
sam sibert | Missing: 8 | slept: 5.2630935907149565
sam smith | Missing: 5 | slept: 5.2630935907149565
sam stith | Missing: 5 | slept: 5.079508757439087
sam vincent | Missing: 3 | slept: 5.91284723597497
sam williams | Missing: 6 | slept: 5.079508757439087
sam worthen | Missing: 6 | slept: 5.079508757439087
sam young | Missing: 4 | slept: 5.91284723597497
samaki walker | Missing: 5 | slept: 5.393595577741377
samardo samuels | Missing: 6 | slept: 5.91284723597497
samuel dalembert | Missing: 3 | slept: 5.91284723597497
sarunas jasikevicius | Missing: 6 | slept: 5.393595577741377
sasha danilovic | Missing: 6 | slept: 5.393595577741377
sarunas marciulionis | Missing: 6 | slept: 6.386219837617933
sasha kaun | Missing: 5 | slept: 5.393595

stephen jackson | Missing: 2 | slept: 5.355024474574243
stephen zimmerman | Missing: 5 | slept: 5.355024474574243
stephen thompson | Missing: 7 | slept: 7.609663919765042
stephon marbury | Missing: 1 | slept: 7.609663919765042
sterling brown | Missing: 5 | slept: 7.609663919765042
steve alford | Missing: 4 | slept: 7.609663919765042
steve bardo | Missing: 6 | slept: 7.1391085409062764
steve blake | Missing: 3 | slept: 7.1391085409062764
steve bracey | Missing: 4 | slept: 7.1391085409062764
steve burtt | Missing: 5 | slept: 5.090097512196379
steve bucknall | Missing: 6 | slept: 7.1391085409062764
steve colter | Missing: 5 | slept: 5.090097512196379
steve courtin | Missing: 6 | slept: 5.090097512196379
steve downing | Missing: 4 | slept: 6.029194569293691
steve francis | Missing: 3 | slept: 5.090097512196379
steve goodrich | Missing: 6 | slept: 6.029194569293691
steve green | Missing: 5 | slept: 6.029194569293691
steve hamer | Missing: 5 | slept: 5.600276964914019
steve hamilton | Missin

tom boswell | Missing: 4 | slept: 6.229892890610246
tom brennan | Missing: 6 | slept: 6.229892890610246
tom burleson | Missing: 5 | slept: 5.81535466232314
tom chambers | Missing: 3 | slept: 5.4657088795348905
tom copa | Missing: 6 | slept: 5.81535466232314
tom garrick | Missing: 4 | slept: 5.81535466232314
tom gola | Missing: 3 | slept: 5.4657088795348905
tom gugliotta | Missing: 3 | slept: 6.110642213200701
tom hammonds | Missing: 4 | slept: 5.4657088795348905
tom hawkins | Missing: 4 | slept: 5.4657088795348905
tom heinsohn | Missing: 3 | slept: 6.110642213200701
tom henderson | Missing: 4 | slept: 5.511011469365613
tom hoover | Missing: 5 | slept: 6.110642213200701
tom hovasse | Missing: 6 | slept: 6.110642213200701
tom ingelsby | Missing: 5 | slept: 5.511011469365613
tom kozelko | Missing: 5 | slept: 5.079497628206895
tom kropp | Missing: 5 | slept: 5.511011469365613
tom lagarde | Missing: 5 | slept: 5.511011469365613
tom marshall | Missing: 7 | slept: 5.079497628206895
tom mcmill

vince carter | Missing: 1 | slept: 5.0953386055921746
vincent askew | Missing: 6 | slept: 5.853388809159487
vincent yarbrough | Missing: 5 | slept: 5.53945375074686
vincenzo esposito | Missing: 8 | slept: 5.077987184443477
vinnie johnson | Missing: 2 | slept: 5.077987184443477
vinny del negro | Missing: 4 | slept: 5.53945375074686
vitaly potapenko | Missing: 5 | slept: 5.272425388586903
vitor faverani | Missing: 8 | slept: 5.853388809159487
vlade divac | Missing: 5 | slept: 5.853388809159487
vladimir radmanovic | Missing: 6 | slept: 5.272425388586903
voise winters | Missing: 6 | slept: 5.53945375074686
vladimir stepania | Missing: 8 | slept: 6.662801079467167
von wafer | Missing: 4 | slept: 5.53945375074686
vonteego cummings | Missing: 5 | slept: 6.662801079467167
voshon lenard | Missing: 5 | slept: 5.272425388586903
wade baldwin | Missing: 5 | slept: 5.659144254214263
wah wah jones | Missing: 6 | slept: 5.272425388586903
wali jones | Missing: 2 | slept: 5.659144254214263
walker russel

In [66]:
all_players_dfs = load_pickle('all_players_dfs.pickle')
all_players_dfs['attributes'] = all_players_attributes
all_players_dfs.keys()
save_pickle(all_players_dfs, 'all_players_dfs.pickle')


LOADED >  all_players_dfs.pickle  |  51.5 MiB  |  length:  14 



In [None]:
# %%time

# all_players_stats_tables = get_stats_tables(list(all_players.values()), 4)

# save_pickle(all_players_stats_tables, 'all_players_stats_tables.pickle')

# print(all_players_stats_tables['lebron james']['tables'].keys())
# all_players_stats_tables['lebron james']['tables']['per_game']

In [None]:
# %%time

# leads_urls = load_list_from_worksheet('nba_player_names', 'leads_urls')

# all_players = get_players_from_urls(leads_urls, 4)

# save_pickle(all_players, 'all_players.pickle')

In [None]:
# worksheet = load_list_from_worksheet('nba_players_sanitized', 'hof')
# hof_names = sanitize_list(worksheet[0].tolist())
# print(hof_names)

# hof_urls = get_urls(hof_names, 30)

In [None]:
# worksheet = load_list_from_worksheet('nba_players_sanitized', 'retired_all_stars')
# retired_all_stars_names = sanitize_list(worksheet[0].tolist())
# print(retired_all_stars_names)

# retired_all_stars_urls = get_urls(retired_all_stars_names, 30)

In [None]:
# worksheet = load_list_from_worksheet('nba_players_sanitized', 'retired_all_nbas')
# retired_all_nbas_names = sanitize_list(worksheet[0].tolist())
# print(retired_all_nbas_names)

# retired_all_nbas_urls = get_urls(retired_all_nbas_names, 30)

In [None]:
# worksheet = load_list_from_worksheet('nba_players_sanitized', '2015')
# players_2015_names = sanitize_list(worksheet[0].tolist())
# print(players_2015_names)

# players_2015_urls = get_urls(players_2015_names, 30)

In [None]:
# save_pickle(hof_urls, 'hof_urls.pickle')
# save_pickle(retired_all_stars_urls, 'retired_all_stars_urls.pickle')
# save_pickle(retired_all_nbas_urls, 'retired_all_nbas_urls.pickle')
# save_pickle(players_2015_urls, 'players_2015_urls.pickle')

In [None]:
# %%time

# save_list_to_worksheet(list(hof_urls.values()), 'nba_player_urls', 'hof_urls', overwrite=True)
# save_list_to_worksheet(list(retired_all_nbas_urls.values()), 'nba_player_urls', 'retired_all_nbas_urls', overwrite=True)
# save_list_to_worksheet(list(retired_all_stars_urls.values()), 'nba_player_urls', 'retired_all_stars_urls', overwrite=True)
# save_list_to_worksheet(list(players_2015_urls.values()), 'nba_player_urls', 'players_2015_urls', overwrite=True)

## Get Tables

In [None]:
# %%time

# url_list = [url for url in hof_urls.values() if url is not None]
# print(len(url_list))
# hof_tables = get_tables(url_list, 4)
# save_pickle(hof_tables, 'hof_tables.pickle')

In [None]:
# %%time

# url_list = [url for url in retired_all_nbas_urls.values() if url is not None]
# print(len(url_list))
# retired_all_nbas_tables = get_tables(url_list, 4)
# save_pickle(retired_all_nbas_tables, 'retired_all_nbas_tables.pickle')

In [None]:
# %%time

# url_list = [url for url in retired_all_stars_urls.values() if url is not None]
# print(len(url_list))
# retired_all_stars_tables = get_tables(url_list, 4)
# save_pickle(retired_all_stars_tables, 'retired_all_stars_tables.pickle')

In [None]:
# %%time

# url_list = [url for url in players_2015_urls.values() if url is not None]
# print(len(url_list))
# players_2015_tables = get_tables(url_list, 4)
# save_pickle(players_2015_tables, 'players_2015_tables.pickle')