# Extract Stats from Basketball-Reference.com

## Imports, Constants, Utilities

### Imports

In [40]:
%%time

import io
import json
import multiprocessing
import os
import pickle
import random
import sys
import time
import urllib
import urllib.request

import google
import gspread
import pandas as pd
import unidecode
from bs4 import BeautifulSoup
from oauth2client.client import GoogleCredentials
from oauth2client.service_account import ServiceAccountCredentials


CPU times: user 35 µs, sys: 1 µs, total: 36 µs
Wall time: 38.1 µs


### Load Spreadsheets

In [8]:
def load_worksheet(spreadsheet_name, worksheet_name,
                   keyfile_name='Data-35df9a696bc1.json'):
    """Load every cell of a Google Sheets worksheet into a DataFrame.

    Prints the number of rows loaded and the first five rows of the frame.

    Args:
        spreadsheet_name: Title of the spreadsheet, passed to ``gc.open``.
        worksheet_name: Title of the worksheet (tab) inside that spreadsheet.
        keyfile_name: Path of the service-account JSON key file used to
            authorize. Defaults to the key file this notebook was written
            against, so existing two-argument calls behave as before.

    Returns:
        A pandas DataFrame built from the worksheet rows with
        ``DataFrame.from_records``. No header row is designated, so the
        columns are positional (0, 1, ...).
    """
    scope = ['https://spreadsheets.google.com/feeds']
    credentials = ServiceAccountCredentials.from_json_keyfile_name(
        keyfile_name, scope)
    gc = gspread.authorize(credentials)

    worksheet = gc.open(spreadsheet_name).worksheet(worksheet_name)
    rows = worksheet.get_all_values()
    print('{num_rows} rows loaded.'.format(num_rows=len(rows)))

    df = pd.DataFrame.from_records(rows)
    print(df.head(5))
    return df
        
worksheet = load_worksheet('nba_players_sanitized', 'hof')

181 rows loaded.
               0
0    chuck hyatt
1  hank luisetti
2   george mikan
3  john schommer
4     vic hanson


In [9]:
def sanitize_list(raw_list):
    """Normalize a list of names into a plain, lower-case form.

    Each string is processed in this order:
      1. transliterated with ``unidecode.unidecode``;
      2. stripped of surrounding whitespace and lower-cased;
      3. stripped of single quotes, double quotes and dots;
      4. if it contains a comma, split on commas, the pieces reversed,
         each piece stripped, and re-joined with single spaces
         (e.g. "bryant, kobe" becomes "kobe bryant").

    Args:
        raw_list: Iterable of strings.

    Returns:
        A new list with one sanitized string per input string.
    """
    def _sanitize(raw_name):
        name = unidecode.unidecode(raw_name)
        name = name.strip().lower()
        for unwanted in ("'", '"', '.'):
            name = name.replace(unwanted, '')
        if "," not in name:
            return name
        # "last, first" style: flip the comma-separated pieces around.
        pieces = [piece.strip() for piece in reversed(name.split(","))]
        return " ".join(pieces)

    return [_sanitize(raw_name) for raw_name in raw_list]

sample_list = ["Shaquille O'neal", "J. J. Reddick", "VinCe Carter ", "Bryant, Kobe"]

print(sanitize_list(sample_list))

['shaquille oneal', 'j j reddick', 'vince carter', 'kobe bryant']


### Save & Load Pickled Dictionaries

In [34]:
%%time

test = {
    'words': """
        Lorem ipsum dolor sit amet, consectetur adipiscing 
        elit. Mauris adipiscing adipiscing placerat. 
        Vestibulum augue augue, 
        pellentesque quis sollicitudin id, adipiscing.
        """,
    'list': list(range(10000)),
    'dict': dict((str(i),'a') for i in range(10000)),
    'int': 100,
    'float': 100.123456
}

def sizeof_fmt(num, suffix='B'):
    """Format a number as a human-readable size using binary prefixes.

    The value is divided by 1024 until its magnitude drops below 1024,
    stepping through the prefixes '', Ki, Mi, ... Zi. Anything still too
    large after Zi is reported in Yi.

    Args:
        num: The quantity to format (e.g. a size in bytes).
        suffix: Unit suffix appended after the prefix. Defaults to 'B'.

    Returns:
        A string such as "183.8 KiB", with one digit after the decimal point.
    """
    prefixes = ('', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi')
    index = 0
    while index < len(prefixes):
        if abs(num) < 1024.0:
            return "%3.1f %s%s" % (num, prefixes[index], suffix)
        num = num / 1024.0
        index += 1
    # Ran out of prefixes: report whatever is left in Yi.
    return "%.1f %s%s" % (num, 'Yi', suffix)

def get_file_size(filename):
    """Return the size of the file at ``filename`` as a formatted string.

    Args:
        filename: Path of an existing file.

    Returns:
        The file size in bytes rendered by ``sizeof_fmt``.
    """
    size_in_bytes = os.path.getsize(filename)
    return sizeof_fmt(size_in_bytes)

def save_dict(dictionary, filename):
    """Pickle ``dictionary`` to ``filename`` and print a short summary.

    The summary shows the file name, its size on disk and
    ``len(dictionary)``.

    Args:
        dictionary: Object to pickle; it must support ``len()``.
        filename: Path of the file to write (opened in binary write mode).

    Returns:
        True.
    """
    with open(filename, 'wb') as handle:
        pickle.dump(dictionary, handle)

    # The size is read after the file has been closed.
    summary = (
        '\nSAVED  > ',
        filename, ' | ',
        get_file_size(filename), ' | ',
        'length: ', len(dictionary),
        '\n',
    )
    print(*summary)
    return True

def load_dict(filename):
    """Unpickle the object stored in ``filename`` and print a short summary.

    The summary shows the file name, its size on disk and the length of
    the loaded object.

    Args:
        filename: Path of a pickle file (opened in binary read mode).

    Returns:
        The unpickled object.
    """
    with open(filename, 'rb') as handle:
        # NOTE: unpickling can execute arbitrary code; only load files
        # that this notebook (or another trusted source) produced.
        loaded = pickle.load(handle)
        summary = (
            '\nLOADED > ',
            filename, ' | ',
            get_file_size(filename), ' | ',
            'length: ', len(loaded),
            '\n',
        )
        print(*summary)
    return loaded

save_dict(test, 'test.pickle')

len(load_dict('test.pickle'))


SAVED  >  test.pickle  |  183.8 KiB  |  length:  5 


LOADED >  test.pickle  |  183.8 KiB  |  length:  5 

CPU times: user 12.2 ms, sys: 3.36 ms, total: 15.5 ms
Wall time: 13.1 ms


### Define & Load Constants

In [11]:
%%time

# Tables to retrieve for each player, by table html ids
# HTML id attributes of the tables that get_table() looks for on each page.
table_ids = [
  # Presumably the regular-season tables (ids without the 'playoffs_' prefix).
  'per_game',
  'totals',
  'per_minute', # per 36 minutes
  'per_poss', # per 100 possessions
  'advanced', # advanced
    
  # Playoff counterparts of the five tables above.
  'playoffs_per_game',
  'playoffs_totals',
  'playoffs_per_minute', # playoffs per 36 minutes
  'playoffs_per_poss', # playoffs per 100 possessions
  'playoffs_advanced', 
    
  # Remaining tables requested for each player.
  'all_star',
  'all_college_stats',
  'all_salaries',
]

# Sample player names used to try out get_urls().
test_names = ['Michael Jordan', 'Vince Carter', 'Yao Ming']
# Sample player pages used to try out get_tables().
test_urls = [
    "https://www.basketball-reference.com/players/m/moncrsi01.html",
    "https://www.basketball-reference.com/players/b/bellawa01.html",
]

# Load list of player (names) 
# hall_of_famers = load_dict('hall_of_famers.pickle')
# retired_all_stars = load_dict('retired_all_stars.pickle')
# retired_all_nbas = load_dict('retired_all_nbas.pickle')
# players_2015_2016 = load_dict('players_2015_2016.pickle')

CPU times: user 10 µs, sys: 1 µs, total: 11 µs
Wall time: 16 µs


## GET URL

### Get URL for a player name

In [12]:
%%time

# TODO(jameshu): Add logic to verify that the returned URL actually matches the player name.
# Currently, even a gibberish player_name (e.g. "James Hu") gets a result back.

def get_url_title(url):
    """Fetch ``url`` and return the text of its HTML ``<title>`` element.

    Args:
        url: Address of the page to download.

    Returns:
        The text of the page's ``<title>`` tag.

    Raises:
        AttributeError: If the parsed page has no ``<title>`` element
            (``soup.title`` is then ``None``).
    """
    # The context manager closes the HTTP response even if parsing fails;
    # BeautifulSoup reads the whole response while it is still open.
    with urllib.request.urlopen(url) as page:
        soup = BeautifulSoup(page, "html.parser")
    return soup.title.text

def get_url(player_name):
    """Search Google for a player's page on basketball-reference.com.

    The query is restricted to
    ``www.basketball-reference.com/players/*/*.html`` and the first URL
    returned by the search is used. The query is printed, and after the
    search the function sleeps for a random 5 to 10 seconds.

    Args:
        player_name: Name to search for.

    Returns:
        A single-entry dict ``{player_name: url}``; ``url`` is ``None``
        when the search returned nothing.
    """
    query_template = (
        'site:www.basketball-reference.com/players/*/*.html '
        '{player_name} Overview')
    query = query_template.format(player_name=player_name)
    print('query: ', query)

    urls = list(google.search(query=query, start=0, stop=1))

    # Pause between searches - presumably to avoid being rate limited.
    time.sleep(random.randint(5, 10))

    if not urls:
        print('url found: None')
        return {player_name: None}
    return {player_name: urls[0]}
        
print(get_url('Michael Jordan'))

query:  site:www.basketball-reference.com/players/*/*.html Michael Jordan Overview
{'Michael Jordan': 'https://www.basketball-reference.com/players/j/jordami01.html'}
CPU times: user 72.2 ms, sys: 8.27 ms, total: 80.4 ms
Wall time: 12.2 s


### Get URLs for a list of player names, MULTIPROCESSING

In [13]:
%%time

def get_urls(player_names, num_processes):
    """Look up the URL of every player name using a pool of processes.

    Args:
        player_names: List of names; each one is passed to ``get_url``.
        num_processes: Number of worker processes in the pool.

    Returns:
        A list with one ``get_url`` result (a single-entry
        ``{player_name: url}`` dict) per input name, in input order.
    """
    # The context manager terminates the workers if map() raises, so a
    # failed lookup no longer leaves worker processes behind.
    with multiprocessing.Pool(processes=num_processes) as pool:
        outputs = pool.map(get_url, player_names)
        # Normal path: shut down gracefully, the same way get_tables does.
        pool.close()
        pool.join()
    return outputs

print(get_urls(test_names[0:2], 2))

query:  site:www.basketball-reference.com/players/*/*.html Michael Jordan Overview
query:  site:www.basketball-reference.com/players/*/*.html Vince Carter Overview
[{'Michael Jordan': 'https://www.basketball-reference.com/players/j/jordami01.html'}, {'Vince Carter': 'https://www.basketball-reference.com/players/c/cartevi01.html'}]
CPU times: user 16.8 ms, sys: 14.7 ms, total: 31.5 ms
Wall time: 12.1 s


## GET TABLES

### Get stats tables for a URL

In [64]:
%%time

def get_table(url):
    """Download a player page and extract the tables listed in ``table_ids``.

    The page's ``<h1>`` text is used as the player name. HTML comment
    markers are stripped from the page before the tables are parsed, and
    columns whose label contains 'Unnamed' are dropped from each table.
    A short summary is printed for the player.

    Args:
        url: Address of the player page.

    Returns:
        A single-entry dict of the form::

            {player_name: {'url': url,
                           'tables': {table_id: DataFrame, ...},
                           'missing_tables': [table_id, ...]}}

        The 'tables' key is present only when at least one table was
        found, and 'missing_tables' only when at least one was not.

    Raises:
        AttributeError: If the page has no ``<h1>`` element.
    """
    # Close the HTTP response as soon as the body has been read.
    with urllib.request.urlopen(url) as page:
        url_html = page.read().decode()

    # Get the player name
    soup = BeautifulSoup(url_html, "html.parser")
    player_name = soup.find("h1").text

    player = {'url': url}
    output = {player_name: player}

    # Strip the comment markers so that tables wrapped in HTML comments
    # become visible to the parser.
    uncommented_html = url_html.replace('-->', '').replace('<!--', '')

    for table_id in table_ids:
        try:
            # Wrap the markup in StringIO: passing a literal HTML string to
            # read_html is deprecated in recent pandas releases. A fresh
            # buffer is needed for every call because each one is consumed.
            list_of_df = pd.read_html(
                io.StringIO(uncommented_html),
                header=0,
                attrs={'id': table_id})
        except ValueError:
            # A ValueError is treated as "no table with this id on the page".
            player.setdefault('missing_tables', []).append(table_id)
            continue

        # Only the first match is kept. str() guards against non-string
        # column labels, which would break the substring test.
        table_df = list_of_df[0]
        unnamed_columns = [
            col_name for col_name in table_df.columns
            if 'Unnamed' in str(col_name)]
        player.setdefault('tables', {})[table_id] = table_df.drop(
            columns=unnamed_columns)

    # Print processing info
    print(player_name, ' | ', url)
    print('Tables Found: ', len(player.get('tables', {})),
          ' | missing_tables: ', player.get('missing_tables', []))
    print()

    return output

table = get_table('https://www.basketball-reference.com/players/b/bellawa01.html')
print('obj length: ', len(table))

Walt Bellamy  |  https://www.basketball-reference.com/players/b/bellawa01.html
Tables Found:  11  | missing_tables:  ['playoffs_per_poss', 'all_salaries']

obj length:  1
CPU times: user 13.2 s, sys: 57.9 ms, total: 13.2 s
Wall time: 13.8 s


### Get stats tables for a list of urls, MULTIPROCESSING

In [65]:
%%time

# Utility function to merge the retrieved data tables into one dictionary.
def merge_dict(list_of_dict):
    """Combine a list of dictionaries into one new dictionary.

    When the same key appears in several dictionaries, the value from the
    later dictionary wins. The input dictionaries are not modified.

    Args:
        list_of_dict: Iterable of dictionaries.

    Returns:
        A new dict holding every key/value pair from the inputs.
    """
    return {
        key: value
        for mapping in list_of_dict
        for key, value in mapping.items()
    }

test_list = [
    {'michael jordan': {'tables': {}, 'missing_tables': 'none', 'url': 'diety'}},
    {'kobe bryant': {'tables': {}, 'missing_tables': 'none', 'url': 'godly'}},
]

print(merge_dict(test_list).keys())

dict_keys(['kobe bryant', 'michael jordan'])
CPU times: user 627 µs, sys: 628 µs, total: 1.26 ms
Wall time: 684 µs


In [55]:
%%time

from multiprocessing import Pool
from functools import partial

def get_tables(urls, num_processes):
    """Scrape the stats tables of every URL using a pool of processes.

    Args:
        urls: List of player-page URLs; each one is passed to ``get_table``.
        num_processes: Number of worker processes in the pool.

    Returns:
        One dict combining all ``get_table`` results, built with
        ``merge_dict`` (keyed by player name).
    """
    # The context manager terminates the workers if map() raises, so a
    # failed download no longer leaves worker processes behind.
    with Pool(processes=num_processes) as pool:
        outputs = pool.map(get_table, urls)
        # Normal path: let the workers exit gracefully.
        pool.close()
        pool.join()
    return merge_dict(outputs)

tables = get_tables(test_urls, 2)
print('obj length: ', len(tables))

Walt Bellamy  |  https://www.basketball-reference.com/players/b/bellawa01.html
Tables Found:  11  | Failures:  ['playoffs_per_poss', 'all_salaries']

Sidney Moncrief  |  https://www.basketball-reference.com/players/m/moncrsi01.html
Tables Found:  13  | Failures:  []

obj length:  2
CPU times: user 28.7 ms, sys: 25.2 ms, total: 53.9 ms
Wall time: 17.5 s


## Run Tasks

### Get Urls

In [None]:
# Read the 'hof' worksheet and sanitize the names in its first column (column 0).
worksheet = load_worksheet('nba_players_sanitized', 'hof')
hof_names = sanitize_list(worksheet[0].tolist())
print(hof_names)

# Look up a URL for every name with 30 worker processes.
# get_urls returns a list of single-entry {name: url} dicts.
hof_urls = get_urls(hof_names, 30)

In [None]:
# Read the 'retired_all_stars' worksheet and sanitize the names in its first column (column 0).
worksheet = load_worksheet('nba_players_sanitized', 'retired_all_stars')
retired_all_stars_names = sanitize_list(worksheet[0].tolist())
print(retired_all_stars_names)

# Look up a URL for every name with 30 worker processes.
# get_urls returns a list of single-entry {name: url} dicts.
retired_all_stars_urls = get_urls(retired_all_stars_names, 30)

In [None]:
# Read the 'retired_all_nbas' worksheet and sanitize the names in its first column (column 0).
worksheet = load_worksheet('nba_players_sanitized', 'retired_all_nbas')
retired_all_nbas_names = sanitize_list(worksheet[0].tolist())
print(retired_all_nbas_names)

# Look up a URL for every name with 30 worker processes.
# get_urls returns a list of single-entry {name: url} dicts.
retired_all_nbas_urls = get_urls(retired_all_nbas_names, 30)

In [None]:
# Read the '2015' worksheet and sanitize the names in its first column (column 0).
worksheet = load_worksheet('nba_players_sanitized', '2015')
players_2015_names = sanitize_list(worksheet[0].tolist())
print(players_2015_names)

# Look up a URL for every name with 30 worker processes.
# get_urls returns a list of single-entry {name: url} dicts.
players_2015_urls = get_urls(players_2015_names, 30)

In [None]:
# get_urls returns a list of single-entry {name: url} dicts, but the
# "Get Tables" cells call .values() on the objects loaded back from these
# pickles. Merge each list into one {name: url} dict before saving so the
# notebook works when run top to bottom.
save_dict(merge_dict(hof_urls), 'hof_urls.pickle')
save_dict(merge_dict(retired_all_stars_urls), 'retired_all_stars_urls.pickle')
save_dict(merge_dict(retired_all_nbas_urls), 'retired_all_nbas_urls.pickle')
save_dict(merge_dict(players_2015_urls), 'players_2015_urls.pickle')

## Get Tables

In [67]:
%%time

# Reload the cached player URLs from disk so the searches need not be repeated.
# NOTE(review): the cells below call .values() on these objects, so each
# pickle is expected to hold a {name: url} dict - confirm for existing files.
hof_urls = load_dict('hof_urls.pickle')
retired_all_stars_urls = load_dict('retired_all_stars_urls.pickle')
retired_all_nbas_urls = load_dict('retired_all_nbas_urls.pickle')
players_2015_urls = load_dict('players_2015_urls.pickle')


LOADED >  hof_urls.pickle  |  12.4 KiB  |  length:  181 


LOADED >  retired_all_stars_urls.pickle  |  31.3 KiB  |  length:  352 


LOADED >  retired_all_nbas_urls.pickle  |  16.6 KiB  |  length:  190 


LOADED >  players_2015_urls.pickle  |  42.9 KiB  |  length:  476 

CPU times: user 5.18 ms, sys: 4.27 ms, total: 9.45 ms
Wall time: 6.75 ms


In [59]:
%%time

# Keep only the players for which a URL was found (get_url stores None otherwise).
url_list = [url for url in hof_urls.values() if url is not None]
print(len(url_list))
# Scrape the tables of every URL with 4 worker processes, then cache the result on disk.
hof_tables = get_tables(url_list, 4)
save_dict(hof_tables, 'hof_tables.pickle')

131
Arnie Risen  |  https://www.basketball-reference.com/players/r/risenar01.html
Tables Found:  9  | Failures:  ['per_poss', 'playoffs_per_poss', 'all_college_stats', 'all_salaries']

Drazen Petrovic  |  https://www.basketball-reference.com/players/p/petrodr01.html
Tables Found:  11  | Failures:  ['all_star', 'all_college_stats']

Spencer Haywood  |  https://www.basketball-reference.com/players/h/haywosp01.html
Tables Found:  12  | Failures:  ['all_salaries']

Marquez Haynes  |  https://www.basketball-reference.com/players/h/haynema01.html
Tables Found:  0  | Failures:  ['per_game', 'totals', 'per_minute', 'per_poss', 'advanced', 'playoffs_per_game', 'playoffs_totals', 'playoffs_per_minute', 'playoffs_per_poss', 'playoffs_advanced', 'all_star', 'all_college_stats', 'all_salaries']

Bob McAdoo  |  https://www.basketball-reference.com/players/m/mcadobo01.html
Tables Found:  13  | Failures:  []

Tiny Archibald  |  https://www.basketball-reference.com/players/a/architi01.html
Tables Found

In [68]:
%%time

# Keep only the players for which a URL was found (get_url stores None otherwise).
url_list = [url for url in retired_all_nbas_urls.values() if url is not None]
print(len(url_list))
# Scrape the tables of every URL with 4 worker processes, then cache the result on disk.
retired_all_nbas_tables = get_tables(url_list, 4)
save_dict(retired_all_nbas_tables, 'retired_all_nbas_tables.pickle')

190
Al Cervi  |  https://www.basketball-reference.com/players/c/cervial01.html
Tables Found:  8  | missing_tables:  ['per_poss', 'playoffs_per_poss', 'all_star', 'all_college_stats', 'all_salaries']

Mitch Richmond  |  https://www.basketball-reference.com/players/r/richmmi01.html
Tables Found:  13  | missing_tables:  []

Bob McAdoo  |  https://www.basketball-reference.com/players/m/mcadobo01.html
Tables Found:  13  | missing_tables:  []

Gilbert Arenas  |  https://www.basketball-reference.com/players/a/arenagi01.html
Tables Found:  13  | missing_tables:  []

Moses Malone  |  https://www.basketball-reference.com/players/m/malonmo01.html
Tables Found:  12  | missing_tables:  ['all_college_stats']

Jack Twyman  |  https://www.basketball-reference.com/players/t/twymaja01.html
Tables Found:  10  | missing_tables:  ['per_poss', 'playoffs_per_poss', 'all_salaries']

Connie Hawkins  |  https://www.basketball-reference.com/players/h/hawkico01.html
Tables Found:  11  | missing_tables:  ['all_col

In [69]:
%%time

# Keep only the players for which a URL was found (get_url stores None otherwise).
url_list = [url for url in retired_all_stars_urls.values() if url is not None]
print(len(url_list))
# Scrape the tables of every URL with 4 worker processes, then cache the result on disk.
retired_all_stars_tables = get_tables(url_list, 4)
save_dict(retired_all_stars_tables, 'retired_all_stars_tables.pickle')

352
Don Sunderlage  |  https://www.basketball-reference.com/players/s/sundedo01.html
Tables Found:  5  | missing_tables:  ['per_poss', 'playoffs_per_game', 'playoffs_totals', 'playoffs_per_minute', 'playoffs_per_poss', 'playoffs_advanced', 'all_college_stats', 'all_salaries']

Jack George  |  https://www.basketball-reference.com/players/g/georgja01.html
Tables Found:  9  | missing_tables:  ['per_poss', 'playoffs_per_poss', 'all_college_stats', 'all_salaries']

Sidney Wicks  |  https://www.basketball-reference.com/players/w/wickssi01.html
Tables Found:  12  | missing_tables:  ['all_salaries']

Dan Roundfield  |  https://www.basketball-reference.com/players/r/roundda01.html
Tables Found:  13  | missing_tables:  []

Bob Pettit  |  https://www.basketball-reference.com/players/p/pettibo01.html
Tables Found:  10  | missing_tables:  ['per_poss', 'playoffs_per_poss', 'all_salaries']

Dick Van Arsdale  |  https://www.basketball-reference.com/players/v/vanardi01.html
Tables Found:  11  | missing

In [70]:
%%time

# Keep only the players for which a URL was found (get_url stores None otherwise).
url_list = [url for url in players_2015_urls.values() if url is not None]
print(len(url_list))
# Scrape the tables of every URL with 4 worker processes, then cache the result on disk.
players_2015_tables = get_tables(url_list, 4)
save_dict(players_2015_tables, 'players_2015_tables.pickle')

476
Rakeem Christmas  |  https://www.basketball-reference.com/players/c/chrisra01.html
Tables Found:  7  | missing_tables:  ['playoffs_per_game', 'playoffs_totals', 'playoffs_per_minute', 'playoffs_per_poss', 'playoffs_advanced', 'all_star']

Anthony Brown  |  https://www.basketball-reference.com/players/b/brownan02.html
Tables Found:  7  | missing_tables:  ['playoffs_per_game', 'playoffs_totals', 'playoffs_per_minute', 'playoffs_per_poss', 'playoffs_advanced', 'all_star']

Kyle Lowry  |  https://www.basketball-reference.com/players/l/lowryky01.html
Tables Found:  13  | missing_tables:  []

Pablo Prigioni  |  https://www.basketball-reference.com/players/p/prigipa01.html
Tables Found:  11  | missing_tables:  ['all_star', 'all_college_stats']

Thabo Sefolosha  |  https://www.basketball-reference.com/players/s/sefolth01.html
Tables Found:  11  | missing_tables:  ['all_star', 'all_college_stats']

Kelly Oubre Jr.  |  https://www.basketball-reference.com/players/o/oubreke01.html
Tables Foun

In [71]:
# NOTE(review): this repeats the save_dict call that ends the previous cell.
save_dict(players_2015_tables, 'players_2015_tables.pickle')


SAVED  >  players_2015_tables.pickle  |  15.7 MiB  |  length:  476 



True