# Extract Stats from Basketball-Reference.com

## Imports, Constants, Utilities

### Imports

In [340]:
%%time

import os
import sys
import datetime

import json
import pickle
import urllib
import pandas as pd
import google
import random
import time
import multiprocessing
import gspread
import unidecode


from collections import OrderedDict
from gspread import WorksheetNotFound
from bs4 import BeautifulSoup
from oauth2client.service_account import ServiceAccountCredentials

DATETIME_STRING_FORMAT = '%Y-%m-%d %H:%M:%S'

CPU times: user 40 µs, sys: 171 µs, total: 211 µs
Wall time: 215 µs


In [341]:
%%time

def merge_list_of_list(nested_list):
    flattened_list = [item for lst in nested_list for item in lst]
    return flattened_list

test_list = [['a'], ['b']]
merge_list_of_list(test_list)

CPU times: user 11 µs, sys: 9 µs, total: 20 µs
Wall time: 22.2 µs


In [342]:
%%time

# Utility function to merge retrived data tables into 1 dictionary.
def merge_list_of_dict(list_of_dict):
    merged_dict = {}
    for dictionary in list_of_dict:
        merged_dict.update(dictionary)
    # Sort by dictionary key
    ordered_dict = OrderedDict(sorted(merged_dict.items(), key=lambda t: t[0]))
    return ordered_dict

test_list = [
    {'michael jordan': {'tables': {}, 'missing_tables': 'none', 'url': 'diety'}},
    {'kobe bryant': {'tables': {}, 'missing_tables': 'none', 'url': 'godly'}},
]

dic = merge_list_of_dict(test_list)
print(dic)

OrderedDict([('kobe bryant', {'tables': {}, 'missing_tables': 'none', 'url': 'godly'}), ('michael jordan', {'tables': {}, 'missing_tables': 'none', 'url': 'diety'})])
CPU times: user 146 µs, sys: 83 µs, total: 229 µs
Wall time: 184 µs


In [424]:
%%time

def sanitize_string(raw_string):
    sanitized_string = unidecode.unidecode(raw_string)
    sanitized_string = sanitized_string.strip().lower()
    sanitized_string = sanitized_string.replace("'", "")
    sanitized_string = sanitized_string.replace('"', '') 
    sanitized_string = sanitized_string.replace('.', '')
    if "," in sanitized_string:
        lst = sanitized_string.split(",")
        lst.reverse()
        lst = [token.strip() for token in lst]
        sanitized_string = " ".join(lst)
    return sanitized_string

print(sanitize_string("Shaquille O'neal"))
print(sanitize_string("Bryant, Kobe"))
print(sanitize_string(" CarTer, Vince .."))

shaquille oneal
kobe bryant
vince carter


In [425]:
%%time

def sanitize_list(raw_list):
            
    sanitized_list = [sanitize_string(raw_string) for raw_string in raw_list]
    return sanitized_list

test_list = ["Shaquille O'neal", "J. J. Reddick", "VinCe Carter ", "Bryant, Kobe"]

print(sanitize_list(test_list))

['shaquille oneal', 'j j reddick', 'vince carter', 'kobe bryant']
CPU times: user 262 µs, sys: 254 µs, total: 516 µs
Wall time: 373 µs


In [344]:
%%time

def dedupe_list(lst):
    return list(set(lst))

print(dedupe_list(['a', 'a', 'a', 'a', 'a', 'a', 'b', 'b', 'b', 'b', 'b', ]))

['b', 'a']
CPU times: user 604 µs, sys: 763 µs, total: 1.37 ms
Wall time: 1.27 ms


### Load Spreadsheets

In [345]:
def load_list_from_worksheet(spreadsheet_name, worksheet_name):
  
    scope = ['https://spreadsheets.google.com/feeds']
    credentials = ServiceAccountCredentials.from_json_keyfile_name('Data-35df9a696bc1.json', scope)
    gc = gspread.authorize(credentials)

    spreadsheet = gc.open(spreadsheet_name)
    worksheet = spreadsheet.worksheet(worksheet_name)

    rows = worksheet.get_all_values()
    
    first_row = rows[0]
    first_cell = first_row[0]
    
    try:
        timestamp = datetime.datetime.strptime(first_cell, DATETIME_STRING_FORMAT)
        rows.remove(first_row)
    except ValueError:
        timestamp = None

    print(
        'LOADED > {num_rows} rows from '
        'spreadsheet: "{spreadsheet_name}" | '
        'worksheet: "{worksheet_name}" | '
        'timestamp: {timestamp}'.format(
            num_rows=len(rows), spreadsheet_name=spreadsheet_name, 
            worksheet_name=worksheet_name, timestamp=timestamp), '\n')

    df = merge_list_of_list(rows)
    
    return df

worksheet = load_list_from_worksheet('test_spreadsheet', 'test')
print(worksheet[:10])

LOADED > 100 rows from spreadsheet: "test_spreadsheet" | worksheet: "test" | timestamp: 2017-11-21 13:51:46 

['michale', 'kobe', '0', '1', '2', '3', '4', '5', '6', '7']


In [346]:
%%time

def save_list_to_worksheet(lst, spreadsheet_name, worksheet_name, add_timestamp=True, overwrite=False):
    scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']
    credentials = ServiceAccountCredentials.from_json_keyfile_name('Data-35df9a696bc1.json', scope)
    gc = gspread.authorize(credentials)
    
    spreadsheet = gc.open(spreadsheet_name)
    
    if type(lst) is not list:
        print('ERROR: input item is not a list!')
        return False
    
    # Remove rows with None value
    original_length = len(lst)
    lst = [item for item in lst if item is not None]
    new_length = len(lst)
    
    try:
        worksheet = spreadsheet.worksheet(worksheet_name)
        if overwrite:
            new_worksheet_name = worksheet_name + "_new"
            new_worksheet = spreadsheet.add_worksheet(new_worksheet_name, len(lst), 1)
            spreadsheet.del_worksheet(worksheet)
            new_worksheet.update_title(worksheet_name)
        else:
            print('Worksheet "{worksheet_name}" already exist! Please set overwrite=True to overwrite.')
            return False
    except WorksheetNotFound: 
        new_worksheet = spreadsheet.add_worksheet(worksheet_name, len(lst), 1)
    
    range_notation = 'A1:A{last_row_index}'.format(last_row_index=len(lst))
    
    cells_to_update = new_worksheet.range(range_notation)

    print('Remove {num_row} rows with "None" as their value.'.format(
        num_row=(original_length - new_length)))
    
    for cell, item in zip(cells_to_update, lst):
        cell.value = item
    
    new_worksheet.update_cells(cells_to_update)
    
    #Add a timestamp in the 1st cell
    if add_timestamp:
        timestamp = datetime.datetime.now()
        new_worksheet.insert_row(
            [timestamp], 1)
    
    print(
    'SAVED > {num_rows} rows to '
    'spreadsheet: "{spreadsheet_name}" | '
    'worksheet: "{worksheet_name}" | '
    'timestamp: {timestamp}'.format(
        num_rows=len(lst), spreadsheet_name=spreadsheet_name, 
        worksheet_name=worksheet_name, timestamp=timestamp), '\n')
    
    return True

test_lst = ['michale', 'kobe'] + [i for i in range(98)]
print(len(test_lst))
save_list_to_worksheet(test_lst, 'test_spreadsheet', 'test', add_timestamp=True, overwrite=True)

100
Remove 0 rows with "None" as their value.
SAVED > 100 rows to spreadsheet: "test_spreadsheet" | worksheet: "test" | timestamp: 2017-11-21 14:38:02.827563 

CPU times: user 151 ms, sys: 18.4 ms, total: 169 ms
Wall time: 4.08 s


### Save & Load Pickle

In [347]:
%%time

test = {
    'words': """
        Lorem ipsum dolor sit amet, consectetur adipiscing 
        elit. Mauris adipiscing adipiscing placerat. 
        Vestibulum augue augue, 
        pellentesque quis sollicitudin id, adipiscing.
        """,
    'list': list(range(10000)),
    'dict': dict((str(i),'a') for i in range(10000)),
    'int': 100,
    'float': 100.123456
}

def sizeof_fmt(num, suffix='B'):
    for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']:
        if abs(num) < 1024.0:
            return "%3.1f %s%s" % (num, unit, suffix)
        num /= 1024.0
    return "%.1f %s%s" % (num, 'Yi', suffix)

def get_file_size(filename):
    statinfo = os.stat(filename)
    return sizeof_fmt(statinfo.st_size)

def save_pickle(dictionary, filename):
    with open(filename, 'wb') as file:
        pickle.dump(dictionary, file)
    print(
        '\n'
        'SAVED  > ',
        filename, ' | ', 
        get_file_size(filename), ' | ',
        'length: ', len(dictionary),
        '\n')
    return True;

def load_pickle(filename):
    with open(filename, 'rb') as file:
        obj = pickle.load(file)
        print(
        '\n'
        'LOADED > ',
        filename, ' | ', 
        get_file_size(filename), ' | ',
        'length: ', len(obj),
        '\n')
        return obj

save_pickle(test, 'test.pickle')

len(load_pickle('test.pickle'))


SAVED  >  test.pickle  |  183.8 KiB  |  length:  5 


LOADED >  test.pickle  |  183.8 KiB  |  length:  5 

CPU times: user 7.76 ms, sys: 2.16 ms, total: 9.92 ms
Wall time: 9.13 ms


### Define & Load Constants

In [348]:
%%time

# Tables to retrieve for each player, by table html ids
table_ids = [
  'per_game',
  'totals',
  'per_minute', # per 36 minutes
  'per_poss', # per 100 possessions
  'advanced', # advanced
    
  'playoffs_per_game',
  'playoffs_totals',
  'playoffs_per_minute', # playoffs per 36 minutes
  'playoffs_per_poss', # playoffs per 100 possessions
  'playoffs_advanced', 
    
  'all_star',
  'all_college_stats',
  'all_salaries',
]

# Load player names
hof_names = sanitize_list(
    load_list_from_worksheet('nba_player_names', 'hof_names'))
retired_all_stars_names = sanitize_list(
    load_list_from_worksheet('nba_player_names', 'retired_all_stars_names'))
retired_all_nbas_names = sanitize_list(
    load_list_from_worksheet('nba_player_names', 'retired_all_nbas_names'))
players_2015_names = sanitize_list(
    load_list_from_worksheet('nba_player_names', 'players_2015_names'))

# Load URLs
hof_urls = load_list_from_worksheet('nba_player_urls', 'hof_urls')
retired_all_stars_urls = load_list_from_worksheet('nba_player_urls', 'retired_all_stars_urls')
retired_all_nbas_urls = load_list_from_worksheet('nba_player_urls', 'retired_all_nbas_urls')
players_2015_urls = load_list_from_worksheet('nba_player_urls', 'players_2015_urls')

LOADED > 181 rows from spreadsheet: "nba_player_names" | worksheet: "hof_names" | timestamp: None 

LOADED > 352 rows from spreadsheet: "nba_player_names" | worksheet: "retired_all_stars_names" | timestamp: None 

LOADED > 191 rows from spreadsheet: "nba_player_names" | worksheet: "retired_all_nbas_names" | timestamp: None 

LOADED > 476 rows from spreadsheet: "nba_player_names" | worksheet: "players_2015_names" | timestamp: None 

LOADED > 131 rows from spreadsheet: "nba_player_urls" | worksheet: "hof_urls" | timestamp: 2017-11-21 13:21:32 

LOADED > 352 rows from spreadsheet: "nba_player_urls" | worksheet: "retired_all_stars_urls" | timestamp: 2017-11-21 13:21:41 

LOADED > 190 rows from spreadsheet: "nba_player_urls" | worksheet: "retired_all_nbas_urls" | timestamp: 2017-11-21 13:21:36 

LOADED > 476 rows from spreadsheet: "nba_player_urls" | worksheet: "players_2015_urls" | timestamp: 2017-11-21 13:21:46 

CPU times: user 583 ms, sys: 86.2 ms, total: 669 ms
Wall time: 22.4 s


## Get Names

### Get names from a single URL

In [427]:
def get_players(url):
    
    href_pattern = re.compile('^\/players\/.\/[a-z0-9]*.html$')
    href_prefix = 'https://www.basketball-reference.com'
    
    page = urllib.request.urlopen(url)
    html = page.read()

    # Get the player name
    soup = BeautifulSoup(html, 'html.parser')
    
    els = soup.find_all('a', href=href_pattern)
    
    players = {}
    
    for el in els:
        if el.parent.name == 'td':
            player_name = sanitize_string(el.text)
            player_url = ''.join([href_prefix, el['href']])
            players[player_name] = player_url
    
    print('Scrapped {url} | Players Found: {len}'.format(url=url, len=len(players)))
    
    return players

players = get_players('https://www.basketball-reference.com/leagues/NBA_1967_totals.html')

Scrapped https://www.basketball-reference.com/leagues/NBA_1967_totals.html | Players Found: 123


In [428]:
players

{'adrian smith': 'https://www.basketball-reference.com/players/s/smithad01.html',
 'al attles': 'https://www.basketball-reference.com/players/a/attleal01.html',
 'archie clark': 'https://www.basketball-reference.com/players/c/clarkar01.html',
 'bailey howell': 'https://www.basketball-reference.com/players/h/howelba01.html',
 'barry clemens': 'https://www.basketball-reference.com/players/c/clemeba01.html',
 'ben warley': 'https://www.basketball-reference.com/players/w/warlebe01.html',
 'bill bridges': 'https://www.basketball-reference.com/players/b/bridgbi01.html',
 'bill melchionni': 'https://www.basketball-reference.com/players/m/melchbi01.html',
 'bill russell': 'https://www.basketball-reference.com/players/r/russebi01.html',
 'billy cunningham': 'https://www.basketball-reference.com/players/c/cunnibi01.html',
 'bob boozer': 'https://www.basketball-reference.com/players/b/boozebo01.html',
 'bob ferry': 'https://www.basketball-reference.com/players/f/ferrybo01.html',
 'bob hogsett': '

In [429]:
%%time

def get_players_from_urls(urls, num_processes):
    p = multiprocessing.Pool(processes=num_processes)
    outputs = p.map(get_players, urls)
    p.close()
    final_output = merge_list_of_dict(outputs)
    print(
        'Scrapped {num_url} urls, found {num_player} players.'.format(
            num_url=len(urls), num_player=len(final_output)), '\n')
    return final_output

test_urls = [
    'https://www.basketball-reference.com/leagues/NBA_2015_totals.html',
    'https://www.basketball-reference.com/leagues/NBA_2010_totals.html'
]

players = get_players_from_urls(test_urls, 2)

Scrapped https://www.basketball-reference.com/leagues/NBA_2010_totals.html | Players Found: 442
Scrapped https://www.basketball-reference.com/leagues/NBA_2015_totals.html | Players Found: 492
Scrapped 2 urls, found 716 players. 

CPU times: user 6.55 ms, sys: 13.2 ms, total: 19.7 ms
Wall time: 2.88 s


In [416]:
players['Aaron Brooks']

'https://www.basketball-reference.com/players/b/brookaa01.html'

In [349]:
# def get_names(url):
#     page = urllib.request.urlopen(url)
#     html = page.read()

#     # Get the player name
#     soup = BeautifulSoup(html, 'html.parser')
#     player_names = soup.find_all(attrs={'data-stat': 'player'})
#     player_names = [player_name.text for player_name in player_names]
    
#     # Remove duplicates & unneeded entries
#     player_names = list(set(player_names))
#     player_names.remove('Player')
    
#     player_names.sort()
    
#     print('Scrapped {url} | Players Found: {len}'.format(url=url, len=len(player_names)))
    
#     return sanitize_list(player_names)

# len(get_names('https://www.basketball-reference.com/leagues/NBA_2015_totals.html'))

Scrapped https://www.basketball-reference.com/leagues/NBA_2015_totals.html | Players Found: 492


492

### Get names from a list of URLs

In [350]:
# %%time

# def get_names_from_urls(urls, num_processes):
#     p = multiprocessing.Pool(processes=num_processes)
#     outputs = p.map(get_names, urls)
#     p.close()
#     final_output = dedupe_list(merge_list_of_list(outputs))
#     print(
#         'Scrapped {num_url} urls, found {num_player} players.'.format(
#             num_url=len(urls), num_player=len(final_output)), '\n')
#     return final_output

# test_urls = [
#     'https://www.basketball-reference.com/leagues/NBA_2015_totals.html',
#     'https://www.basketball-reference.com/leagues/NBA_2010_totals.html'
# ]

# names = get_names_from_urls(test_urls, 2)

Scrapped https://www.basketball-reference.com/leagues/NBA_2010_totals.html | Players Found: 442
Scrapped https://www.basketball-reference.com/leagues/NBA_2015_totals.html | Players Found: 492
Scrapped 2 urls, found 716 players. 

CPU times: user 7.03 ms, sys: 14.9 ms, total: 22 ms
Wall time: 3.19 s


## GET URL

### Get URL for a player name

In [351]:
%%time

#TODO(jameshu): Add logic to verify the url returned  in fact matches the player name
# Currently, even gibberish player_name e.g. "James Hu" would have results returned.

def get_url_title(url):
    page = urllib.request.urlopen(url)
    soup = BeautifulSoup(page, "html.parser")
    return soup.title.text

def get_url(player_name):       
    query = (
        'site:www.basketball-reference.com/players/*/*.html '
        '{player_name} Overview').format(player_name=player_name)
    print('query: ', query)

    results = google.search(query=query, start=0, stop=1)
    urls = list(results)        
    
    time.sleep(random.randint(5, 10))
    
    if urls:
        return {player_name: urls[0]}
    else:
        print('url found: None')
        return {player_name: None}
        
# print(get_url('Michael Jordan'))

CPU times: user 18 µs, sys: 42 µs, total: 60 µs
Wall time: 68.9 µs


### Get URLs for a list of player names, MULTIPROCESSING

In [352]:
%%time

def get_urls(player_names, num_processes):
    p = multiprocessing.Pool(processes=num_processes)
    outputs = p.map(get_url, player_names)
    p.close()
    return merge_list_of_dict(outputs)

# print(get_urls(test_names[0:2], 2))

CPU times: user 30 µs, sys: 559 µs, total: 589 µs
Wall time: 600 µs


## GET TABLES

### Get stats table for an url

In [353]:
%%time

def get_table(url):
    
    output = {}
    
    page = urllib.request.urlopen(url)
    urlHtml = page.read().decode()

    # Get the player name
    soup = BeautifulSoup(urlHtml, "html.parser")
    player_name = soup.find("h1").text

    # Set the url
    output.setdefault(player_name, {}).setdefault('url', url);

    # Uncomment the tables
    uncommentedUrlHtml = urlHtml.replace('-->', '')
    uncommentedUrlHtml = uncommentedUrlHtml.replace('<!--', '')

    for table_id in table_ids:
        list_of_df = []
        try:
            list_of_df = pd.read_html(
                uncommentedUrlHtml, 
                header=0, 
                attrs={'id': table_id})
        except ValueError as err:
            # Set missing_tables
            output.setdefault(player_name, {}).setdefault('missing_tables', []).append(table_id)
            continue;

        # Drop 'Unnamed' columns
        for df in list_of_df:
          df.drop([col_name for col_name in df.columns if 'Unnamed' in col_name], axis=1, inplace=True)

        # Set table
        output.setdefault(player_name, {}).setdefault('tables', {}).update({table_id: list_of_df[0]})

    # Print processing info
    print(player_name, ' | ', url)
    print('Tables Found: ', len(output[player_name].get('tables', {})), 
          ' | missing_tables: ', output[player_name].get('missing_tables', []))
    print()

    return output

# table = get_table('https://www.basketball-reference.com/players/b/bellawa01.html')
# print('obj length: ', len(table))

CPU times: user 11 µs, sys: 87 µs, total: 98 µs
Wall time: 101 µs


### Get stats tables for a list of urls, MULTIPROCESSING

In [354]:
%%time

from multiprocessing import Pool
from functools import partial

def get_tables(urls, num_processes):
    pool = Pool(processes=num_processes)
    outputs = pool.map(get_table, urls)
    pool.close()
    pool.join()
    return merge_list_of_dict(outputs)

# tables = get_tables(test_urls, 2)
# print('obj length: ', len(tables))

CPU times: user 16 µs, sys: 74 µs, total: 90 µs
Wall time: 93 µs


## Run Tasks

### Get Names

In [430]:
%%time

leads_urls = load_list_from_worksheet('nba_player_names', 'leads_urls')

all_players = get_players_from_urls(leads_urls, 4)

LOADED > 69 rows from spreadsheet: "nba_player_names" | worksheet: "leads_urls" | timestamp: None 

Scrapped https://www.basketball-reference.com/leagues/NBA_1965_totals.html | Players Found: 114
Scrapped https://www.basketball-reference.com/leagues/NBA_1960_totals.html | Players Found: 99
Scrapped https://www.basketball-reference.com/leagues/NBA_1955_totals.html | Players Found: 105
Scrapped https://www.basketball-reference.com/leagues/NBA_1950_totals.html | Players Found: 223
Scrapped https://www.basketball-reference.com/leagues/NBA_1966_totals.html | Players Found: 111
Scrapped https://www.basketball-reference.com/leagues/NBA_1961_totals.html | Players Found: 93
Scrapped https://www.basketball-reference.com/leagues/NBA_1956_totals.html | Players Found: 92
Scrapped https://www.basketball-reference.com/leagues/NBA_1951_totals.html | Players Found: 135
Scrapped https://www.basketball-reference.com/leagues/NBA_1967_totals.html | Players Found: 123
Scrapped https://www.basketball-referen

In [431]:
all_players['michael jordan']

'https://www.basketball-reference.com/players/j/jordami01.html'

In [432]:
len(all_players)

3998

### Get Urls

In [12]:
# worksheet = load_list_from_worksheet('nba_players_sanitized', 'hof')
# hof_names = sanitize_list(worksheet[0].tolist())
# print(hof_names)

# hof_urls = get_urls(hof_names, 30)

In [13]:
# worksheet = load_list_from_worksheet('nba_players_sanitized', 'retired_all_stars')
# retired_all_stars_names = sanitize_list(worksheet[0].tolist())
# print(retired_all_stars_names)

# retired_all_stars_urls = get_urls(retired_all_stars_names, 30)

In [14]:
# worksheet = load_list_from_worksheet('nba_players_sanitized', 'retired_all_nbas')
# retired_all_nbas_names = sanitize_list(worksheet[0].tolist())
# print(retired_all_nbas_names)

# retired_all_nbas_urls = get_urls(retired_all_nbas_names, 30)

In [15]:
# worksheet = load_list_from_worksheet('nba_players_sanitized', '2015')
# players_2015_names = sanitize_list(worksheet[0].tolist())
# print(players_2015_names)

# players_2015_urls = get_urls(players_2015_names, 30)

In [16]:
# save_pickle(hof_urls, 'hof_urls.pickle')
# save_pickle(retired_all_stars_urls, 'retired_all_stars_urls.pickle')
# save_pickle(retired_all_nbas_urls, 'retired_all_nbas_urls.pickle')
# save_pickle(players_2015_urls, 'players_2015_urls.pickle')

In [238]:
# %%time

# save_list_to_worksheet(list(hof_urls.values()), 'nba_player_urls', 'hof_urls', overwrite=True)
# save_list_to_worksheet(list(retired_all_nbas_urls.values()), 'nba_player_urls', 'retired_all_nbas_urls', overwrite=True)
# save_list_to_worksheet(list(retired_all_stars_urls.values()), 'nba_player_urls', 'retired_all_stars_urls', overwrite=True)
# save_list_to_worksheet(list(players_2015_urls.values()), 'nba_player_urls', 'players_2015_urls', overwrite=True)

Remove 50 rows with "None" as their value.
SAVED > 131 rows to spreadsheet: "nba_player_urls" | worksheet: "hof_urls" | timestamp: 2017-11-21 13:21:31.733691 

Remove 0 rows with "None" as their value.
SAVED > 190 rows to spreadsheet: "nba_player_urls" | worksheet: "retired_all_nbas_urls" | timestamp: 2017-11-21 13:21:35.894669 

Remove 0 rows with "None" as their value.
SAVED > 352 rows to spreadsheet: "nba_player_urls" | worksheet: "retired_all_stars_urls" | timestamp: 2017-11-21 13:21:40.841957 

Remove 0 rows with "None" as their value.
SAVED > 476 rows to spreadsheet: "nba_player_urls" | worksheet: "players_2015_urls" | timestamp: 2017-11-21 13:21:46.229754 

CPU times: user 864 ms, sys: 89.2 ms, total: 953 ms
Wall time: 19 s


## Get Tables

In [17]:
# %%time

# url_list = [url for url in hof_urls.values() if url is not None]
# print(len(url_list))
# hof_tables = get_tables(url_list, 4)
# save_pickle(hof_tables, 'hof_tables.pickle')

In [18]:
# %%time

# url_list = [url for url in retired_all_nbas_urls.values() if url is not None]
# print(len(url_list))
# retired_all_nbas_tables = get_tables(url_list, 4)
# save_pickle(retired_all_nbas_tables, 'retired_all_nbas_tables.pickle')

In [19]:
# %%time

# url_list = [url for url in retired_all_stars_urls.values() if url is not None]
# print(len(url_list))
# retired_all_stars_tables = get_tables(url_list, 4)
# save_pickle(retired_all_stars_tables, 'retired_all_stars_tables.pickle')

In [20]:
# %%time

# url_list = [url for url in players_2015_urls.values() if url is not None]
# print(len(url_list))
# players_2015_tables = get_tables(url_list, 4)
# save_pickle(players_2015_tables, 'players_2015_tables.pickle')