# Extract Stats from Basketball-Reference.com

## Imports, Constants, Utilities

### Imports

In [None]:
%%time

import os
import sys
import urllib
import pickle
import json
import pandas as pd
import google
import random
import time

from bs4 import BeautifulSoup
from multiprocessing import Pool
from oauth2client.service_account import ServiceAccountCredentials

import gspread
from oauth2client.client import GoogleCredentials

import unidecode


### Load Spreadsheets

In [None]:
def load_worksheet(spreadsheet_name, worksheet_name,
                   keyfile_name='Data-35df9a696bc1.json'):
    """Load every cell of a Google Sheets worksheet into a DataFrame.

    Args:
        spreadsheet_name: Title of the spreadsheet, passed to ``gc.open``.
        worksheet_name: Title of the worksheet (tab) within that spreadsheet.
        keyfile_name: Path of the service-account JSON key file used to
            authorize gspread. Defaults to the key file this notebook was
            originally written against.

    Returns:
        A ``pandas.DataFrame`` built from the rows returned by
        ``get_all_values()``. No row is promoted to a header, so columns
        carry pandas' default integer labels.
    """
    scope = ['https://spreadsheets.google.com/feeds']
    credentials = ServiceAccountCredentials.from_json_keyfile_name(
        keyfile_name, scope)
    gc = gspread.authorize(credentials)

    sheet = gc.open(spreadsheet_name).worksheet(worksheet_name)
    rows = sheet.get_all_values()
    print('{num_rows} rows loaded.'.format(num_rows=len(rows)))

    df = pd.DataFrame.from_records(rows)
    # Show a short preview so the caller can eyeball what was loaded.
    print(df.head(5))

    return df
        
# Try the loader on the 'hof' worksheet of the 'nba_players_sanitized' spreadsheet.
worksheet = load_worksheet('nba_players_sanitized', 'hof')

In [None]:
def sanitize_list(raw_list):
    """Normalize a list of player names.

    Each name is transliterated to ASCII with ``unidecode``, stripped of
    surrounding whitespace and lower-cased, and has its single quotes,
    double quotes and dots removed. A name that still contains a comma is
    split on the commas, the pieces are reversed and trimmed, and then
    re-joined with single spaces (so "last, first" becomes "first last").

    Args:
        raw_list: Iterable of name strings.

    Returns:
        A new list with one normalized string per input name.
    """
    cleaned_names = []
    for raw_name in raw_list:
        name = unidecode.unidecode(raw_name).strip().lower()

        # Drop quotes and dots.
        for unwanted in ("'", '"', '.'):
            name = name.replace(unwanted, "")

        # Re-order comma-separated names.
        if "," in name:
            pieces = [piece.strip() for piece in reversed(name.split(","))]
            name = " ".join(pieces)

        cleaned_names.append(name)

    return cleaned_names

# Sample names exercising each rule: an apostrophe, dots, mixed case with a
# trailing space, and a comma-separated "Last, First" name.
sample_list = ["Shaquille O'neal", "J. J. Reddick", "VinCe Carter ", "Bryant, Kobe"]

print(sanitize_list(sample_list))

### Save & Load Pickled Dictionaries

In [None]:
%%time

# Fixture mixing several value types (text, list, dict, int, float); it is
# pickled and re-loaded below to try out save_dict() / load_dict().
test = {
    'words': """
        Lorem ipsum dolor sit amet, consectetur adipiscing 
        elit. Mauris adipiscing adipiscing placerat. 
        Vestibulum augue augue, 
        pellentesque quis sollicitudin id, adipiscing.
        """,
    'list': list(range(10000)),
    'dict': dict((str(i),'a') for i in range(10000)),
    'int': 100,
    'float': 100.123456
}

def sizeof_fmt(num, suffix='B'):
    """Format a size as a human-readable string with binary prefixes.

    The value is divided by 1024 until its magnitude drops below 1024 or
    the prefixes up to 'Zi' are exhausted, in which case 'Yi' is used.

    Args:
        num: Size to format (e.g. a byte count).
        suffix: Unit symbol appended after the prefix.

    Returns:
        A string such as "1.5 KiB", with one digit after the decimal point.
    """
    binary_prefixes = ('', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi')
    scaled = num
    step = 0
    while step < len(binary_prefixes):
        if abs(scaled) < 1024.0:
            return "%3.1f %s%s" % (scaled, binary_prefixes[step], suffix)
        scaled = scaled / 1024.0
        step += 1
    # Anything still >= 1024 after 'Zi' is reported in 'Yi'.
    return "%.1f %s%s" % (scaled, 'Yi', suffix)

def get_file_size(filename):
    """Return the size of the file at ``filename`` as a human-readable string.

    Args:
        filename: Path of an existing file.

    Returns:
        The file's size in bytes, formatted by ``sizeof_fmt``.
    """
    size_in_bytes = os.stat(filename).st_size
    return sizeof_fmt(size_in_bytes)

def save_dict(dictionary, filename):
    """Pickle ``dictionary`` to ``filename`` and print the resulting file size.

    Args:
        dictionary: Object to serialize with ``pickle.dump``.
        filename: Path of the file to (over)write in binary mode.

    Returns:
        Always ``True``.
    """
    with open(filename, 'wb') as pickle_file:
        pickle.dump(dictionary, pickle_file)

    # Report the size only after the file has been closed and flushed.
    size_text = get_file_size(filename)
    print('\n', 'file size: ', size_text)
    return True

def load_dict(filename):
    """Print the size of a pickle file, then load and return its contents.

    NOTE: ``pickle.load`` can execute arbitrary code while unpickling; only
    use this on files from a trusted source.

    Args:
        filename: Path of the pickle file to read.

    Returns:
        The object stored in the file.
    """
    size_text = get_file_size(filename)
    print('\n', 'file size: ', size_text)

    with open(filename, 'rb') as pickle_file:
        loaded = pickle.load(pickle_file)
    return loaded

# Round trip: pickle the fixture, then load it back and show how many
# top-level keys it has.
save_dict(test, 'test.pickle')

len(load_dict('test.pickle'))

### Define & Load Constants

In [None]:
%%time

# Tables to retrieve for each player, by table html ids.
# get_table() passes each id to pd.read_html as attrs={'id': table_id}.
table_ids = [
  'per_game',
  'totals',
  'per_minute', # per 36 minutes
  'per_poss', # per 100 possessions
  'advanced', # advanced stats
    
  'playoffs_per_game',
  'playoffs_totals',
  'playoffs_per_minute', # playoffs per 36 minutes
  'playoffs_per_poss', # playoffs per 100 possessions
  'playoffs_advanced', 
    
  'all_star',
  'all_college_stats',
  'all_salaries',
]

# Sample player names used below to try out get_urls().
test_names = ['Michael Jordan', 'Vince Carter', 'Yao Ming']
# Sample player pages used below to try out get_table() and get_tables().
test_urls = [
    "https://www.basketball-reference.com/players/m/moncrsi01.html",
    "https://www.basketball-reference.com/players/b/bellawa01.html",
]

# Load the lists of player names
# hall_of_famers = load_dict('hall_of_famers.pickle')
# retired_all_stars = load_dict('retired_all_stars.pickle')
# retired_all_nbas = load_dict('retired_all_nbas.pickle')
# players_2015_2016 = load_dict('players_2015_2016.pickle')

## GET URL

### Get URL for a player name

In [None]:
%%time

# TODO(jameshu): Add logic to verify that the returned url in fact matches the player name.
# Currently, even a gibberish player_name, e.g. "James Hu", gets results returned.

def get_url_title(url):
    """Return the text of the ``<title>`` element of the page at ``url``.

    Args:
        url: Address of the page to download.

    Returns:
        The title text as parsed by BeautifulSoup's "html.parser".

    Raises:
        AttributeError: If the page has no ``<title>`` element.
    """
    # A bare ``import urllib`` does not guarantee that the ``request``
    # submodule is loaded, so import it explicitly where it is used.
    import urllib.request

    # Close the HTTP response as soon as the document has been parsed.
    with urllib.request.urlopen(url) as page:
        soup = BeautifulSoup(page, "html.parser")
    return soup.title.text

def get_url(player_name):
    """Search Google for a player's basketball-reference.com page.

    The query is restricted to player pages on www.basketball-reference.com
    and asks for a single result. After the search the function sleeps for a
    random 5-10 seconds (presumably to throttle requests -- confirm).

    Args:
        player_name: Name to search for.

    Returns:
        A one-entry dict ``{player_name: url}`` where ``url`` is the first
        search result, or ``None`` when the search returned nothing.
    """
    query_template = (
        'site:www.basketball-reference.com/players/*/*.html '
        '{player_name} Overview')
    query = query_template.format(player_name=player_name)
    print('query: ', query)

    # Materialize the results before sleeping.
    urls = list(google.search(query=query, start=0, stop=1))

    time.sleep(random.randint(5, 10))

    if not urls:
        print('url found: None')
        return {player_name: None}
    return {player_name: urls[0]}
        
# Try a single lookup.
print(get_url('Michael Jordan'))

### Get URLs for a list of player names, MULTIPROCESSING

In [None]:
%%time

def get_urls(player_names, num_processes):
    """Look up the URL of every player name using a pool of worker processes.

    Args:
        player_names: Iterable of names; each one is passed to ``get_url``.
        num_processes: Number of worker processes in the pool.

    Returns:
        A list with the ``get_url`` result for each name (a one-entry dict
        ``{player_name: url_or_None}``), in the same order as the input.
    """
    # The context manager tears the workers down even when ``map`` raises,
    # so a failed lookup no longer leaves processes behind.
    with Pool(processes=num_processes) as pool:
        return pool.map(get_url, player_names)

# Try the parallel lookup on the first two sample names with two workers.
print(get_urls(test_names[0:2], 2))

## GET TABLES

### Get stats tables for a URL

In [None]:
%%time

def get_table(url):
    """Download a player page and parse the stats tables listed in ``table_ids``.

    Args:
        url: Address of the player page to download.

    Returns:
        A one-entry dict keyed by the text of the page's first ``<h1>``::

            {player_name: {'url': url,
                           'tables': {table_id: DataFrame, ...},
                           'failures': {table_id: error message, ...}}}

        'tables' is present only if at least one table was parsed, and
        'failures' only if at least one table could not be parsed.

    Raises:
        AttributeError: If the page has no ``<h1>`` element.
    """
    # A bare ``import urllib`` does not guarantee that the ``request``
    # submodule is loaded, so import it explicitly where it is used.
    import urllib.request
    from io import StringIO

    # Close the HTTP response as soon as the body has been read.
    with urllib.request.urlopen(url) as page:
        url_html = page.read().decode()

    # Get the player name
    soup = BeautifulSoup(url_html, "html.parser")
    player_name = soup.find("h1").text

    record = {'url': url}
    output = {player_name: record}

    # Uncomment the tables: strip the HTML comment delimiters so that tables
    # wrapped in comments become visible to the parser.
    uncommented_html = url_html.replace('-->', '').replace('<!--', '')

    for table_id in table_ids:
        try:
            # Wrap the markup in StringIO: passing literal HTML strings to
            # read_html is deprecated in newer pandas releases.
            list_of_df = pd.read_html(
                StringIO(uncommented_html),
                header=0,
                attrs={'id': table_id})
        except ValueError as err:
            # Record why this table could not be read and move on.
            record.setdefault('failures', {})[table_id] = str(err)
            continue

        # Only the first matching table is kept.
        table = list_of_df[0]

        # Drop 'Unnamed' columns; str() keeps the check safe for
        # non-string column labels.
        unnamed_columns = [
            col_name for col_name in table.columns
            if 'Unnamed' in str(col_name)
        ]
        table.drop(unnamed_columns, axis=1, inplace=True)

        record.setdefault('tables', {})[table_id] = table

    # Print processing info
    print(player_name)
    print('Tables Found: ', len(record.get('tables', {})),
          ' | Failures: ', record.get('failures', {}).keys())
    print(' ')

    return output

# Try the scraper on the first sample page.
print(get_table(test_urls[0]))

### Get stats tables for a list of urls, MULTIPROCESSING

In [None]:
%%time

from multiprocessing import Pool
from functools import partial

def get_tables(urls, num_processes):
    """Scrape the stats tables of every URL using a pool of worker processes.

    Args:
        urls: Iterable of player page addresses; each is passed to
            ``get_table``.
        num_processes: Number of worker processes in the pool.

    Returns:
        A list with the ``get_table`` result for each URL, in the same order
        as the input.
    """
    # The context manager tears the workers down even when ``map`` raises,
    # so a failed download no longer leaves processes behind.
    with Pool(processes=num_processes) as pool:
        return pool.map(get_table, urls)

# Try the parallel scraper on the sample pages with two workers.
len(get_tables(test_urls, 2))


In [None]:
%%time

# Utility function to merge retrieved data tables into one dictionary.
def merge_dict(list_of_dict):
    """Combine a sequence of dictionaries into one new dictionary.

    When the same key appears in several dictionaries, the value from the
    last one wins. The input dictionaries are not modified.

    Args:
        list_of_dict: Iterable of dictionaries.

    Returns:
        A new dict holding every key/value pair of the inputs.
    """
    return {
        key: value
        for mapping in list_of_dict
        for key, value in mapping.items()
    }

# print(merge_dict(test_list).keys())

## Run Tasks

### Get Urls

In [None]:
# Load the 'hof' worksheet and normalize the names found in column 0.
worksheet = load_worksheet('nba_players_sanitized', 'hof')
hof_names = sanitize_list(worksheet[0].tolist())
print(hof_names)

# Look up a URL for every name using 30 worker processes.
hof_urls = get_urls(hof_names, 30)

In [None]:
# Load the 'retired_all_stars' worksheet and normalize the names found in column 0.
worksheet = load_worksheet('nba_players_sanitized', 'retired_all_stars')
retired_all_stars_names = sanitize_list(worksheet[0].tolist())
print(retired_all_stars_names)

# Look up a URL for every name using 30 worker processes.
retired_all_stars_urls = get_urls(retired_all_stars_names, 30)

In [None]:
# Load the 'retired_all_nbas' worksheet and normalize the names found in column 0.
worksheet = load_worksheet('nba_players_sanitized', 'retired_all_nbas')
retired_all_nbas_names = sanitize_list(worksheet[0].tolist())
print(retired_all_nbas_names)

# Look up a URL for every name using 30 worker processes.
retired_all_nbas_urls = get_urls(retired_all_nbas_names, 30)

In [None]:
# Load the '2015' worksheet and normalize the names found in column 0.
worksheet = load_worksheet('nba_players_sanitized', '2015')
players_2015_names = sanitize_list(worksheet[0].tolist())
print(players_2015_names)

# Look up a URL for every name using 30 worker processes.
players_2015_urls = get_urls(players_2015_names, 30)

In [None]:
# Persist each list of lookup results (one {name: url} dict per player) to
# its own pickle file.
save_dict(hof_urls, 'hof_urls.pickle')
save_dict(retired_all_stars_urls, 'retired_all_stars_urls.pickle')
save_dict(retired_all_nbas_urls, 'retired_all_nbas_urls.pickle')
save_dict(players_2015_urls, 'players_2015_urls.pickle')