# Using Beautiful Soup

In [None]:
# from urllib.request import urlopen
# from bs4 import BeautifulSoup
# import pandas as pd

# url = "http://history.basketballmonster.com/Season?seasonId=51&cats=9"
# page = urlopen(url)
# soup = BeautifulSoup(page, "html.parser")

# print(soup.prettify())

# table = soup.find("table", attrs={"class":"seasonDetailsT"})

# table.prettify()

# data = []
# rows = table.find_all('tr')
# for row in rows:
#     cols = row.find_all('td') or row.find_all('label') 
#     cols = [ele.text.strip() for ele in cols]
#     data.append([ele for ele in cols if ele])
# df = pd.DataFrame(data)
# df.head()

# Using Pandas

In [None]:
# import pandas as pd
# from urllib
# from bs4 import BeautifulSoup

# leads = [
#   ("http://history.basketballmonster.com/Season?seasonId=18&cats=9",
#    "2009-2010"),
#   ("http://history.basketballmonster.com/Season?seasonId=27&cats=9",
#    "2010-2011"),
#   ("http://history.basketballmonster.com/Season?seasonId=32&cats=9",
#    "2011-2012"),
#   ("http://history.basketballmonster.com/Season?seasonId=36&cats=9",
#    "2012-2013"),
#   ("http://history.basketballmonster.com/Season?seasonId=44&cats=9",
#    "2013-2014"),
#   ("http://history.basketballmonster.com/Season?seasonId=50&cats=9",
#    "2014-2015"),
#   ("http://history.basketballmonster.com/Season?seasonId=51&cats=9",
#    "2015-2016")
# ]

# results = pd.DataFrame()

# for lead in leads:
    
#   url = lead[0]
#   season = lead[1]

#   # Read all tables on url, use row 0 as the headers 
#   df = pd.read_html(url, header=0) 

#   # Drop table 0, unneeded
#   df = df[1]

#   # Lower-case all column names and strip surrounding whitespace
#   df.columns = map(str.lower, df.columns)
#   df.columns = map(str.strip, df.columns)
  
#   # Drop repeated header rows, 2nd method preferred
#   # df = df.drop_duplicates(keep=False)
#   # df = df[df.player != 'player']
  
#   # Add a new column to record the season
#   df['season'] = pd.Series()
#   df['season'] = season
  
#   # Print general info
#   print(df.shape)
    
#   if results.empty:
#     results = df
#   else:
#     results = results.append(df)

# print(results.shape)
# print(results.columns)
# results.head(10)

# Extract Stats from Basketball-Reference.com

## Definitions & Setup

### Imports

In [None]:
%%time

import io
import json
import os
import pickle
import sys
import urllib
import urllib.request
from multiprocessing import Pool

import pandas as pd
from bs4 import BeautifulSoup
from google import search

### Save/load dictionaries

In [None]:
%%time

# Sample payload mixing a multi-line string, a 10000-item list, a 10000-key
# dict and two scalars; handy for trying out pickling round-trips.
test = {
    'words': """
        Lorem ipsum dolor sit amet, consectetur adipiscing 
        elit. Mauris adipiscing adipiscing placerat. 
        Vestibulum augue augue, 
        pellentesque quis sollicitudin id, adipiscing.
        """,
    'list': [*range(10000)],
    'dict': {str(index): 'a' for index in range(10000)},
    'int': 100,
    'float': 100.123456,
}

def sizeof_fmt(num, suffix='B'):
    """Format a size as a human-readable string using binary prefixes.

    The value is divided by 1024 until its magnitude is below 1024 and is
    then rendered with one decimal place, e.g. ``1536`` -> ``'1.5 KiB'``.
    Anything still at or above 1024 after the ``Zi`` step is reported in
    ``Yi`` units.

    Args:
        num: Size to format (int or float; may be negative).
        suffix: Unit symbol appended after the prefix, ``'B'`` by default.

    Returns:
        The formatted size as a string.
    """
    prefixes = ('', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi')
    scaled = num
    position = 0
    while position < len(prefixes):
        if abs(scaled) < 1024.0:
            return "%3.1f %s%s" % (scaled, prefixes[position], suffix)
        scaled = scaled / 1024.0
        position += 1
    # Ran out of prefixes: whatever is left is expressed in yobi-units.
    return "%.1f %s%s" % (scaled, 'Yi', suffix)

def get_file_size(filename):
    """Return the size of the file at ``filename`` as a human-readable string.

    Args:
        filename: Path of an existing file.

    Returns:
        The file size formatted by ``sizeof_fmt`` (e.g. ``'1.5 KiB'``).
    """
    size_in_bytes = os.path.getsize(filename)
    return sizeof_fmt(size_in_bytes)

def save_dict(dictionary, filename):
    """Pickle ``dictionary`` into ``filename`` and print the resulting file size.

    Args:
        dictionary: Object to serialize with ``pickle.dump``.
        filename: Destination path; the file is opened in ``'wb'`` mode, so
            existing content is replaced.

    Returns:
        Always ``True``.
    """
    with open(filename, 'wb') as handle:
        pickle.dump(dictionary, handle)
    # Report the size only after the file has been closed and flushed.
    size_text = get_file_size(filename)
    print('\n', 'file size: ', size_text)
    return True

def load_dict(filename):
    """Print the size of the pickle file ``filename`` and return its content.

    Args:
        filename: Path of a file previously written with ``pickle``.

    Returns:
        The unpickled object.
    """
    print('\n', 'file size: ', get_file_size(filename))
    # NOTE: unpickling can execute arbitrary code; only load files that were
    # produced by a trusted source.
    with open(filename, 'rb') as handle:
        restored = pickle.load(handle)
    return restored

# Round-trip check: pickle the sample dict to disk, then load it back; the
# cell displays the length of the restored object.
save_dict(test, 'test.pickle')

len(load_dict('test.pickle'))

### Define & Load Constants

In [None]:
%%time

# HTML ids of the tables to retrieve for each player.
table_ids = [
    # Core stat tables
    'per_game',
    'totals',
    'per_minute',           # per 36 minutes
    'per_poss',             # per 100 possessions
    'advanced',

    # Playoff versions of the same tables
    'playoffs_per_game',
    'playoffs_totals',
    'playoffs_per_minute',  # playoffs per 36 minutes
    'playoffs_per_poss',    # playoffs per 100 possessions
    'playoffs_advanced',

    # Remaining tables
    'all_star',
    'all_college_stats',
    'all_salaries',
]

# Sample inputs used by the quick checks that follow each function definition.
test_names = ['Michael Jordan', 'Vince Carter', 'Yao Ming']
test_urls = [
    'https://www.basketball-reference.com/players/m/moncrsi01.html',
    'https://www.basketball-reference.com/players/b/bellawa01.html',
]

# Take the first three entries of a stored dict and wrap each one as a
# single-key dict (the pickle file must already exist on disk).
test_data = load_dict('data_outputs_temp.pickle')
test_keys = list(test_data.keys())[:3]
test_list = [{name: test_data[name]} for name in test_keys]


# save_dict(hall_of_famers, 'hall_of_famers.pickle')
# save_dict(retired_all_stars, 'retired_all_stars.pickle')
# save_dict(retired_all_nbas, 'retired_all_nbas.pickle')

# Load the pickled lists of player names
hall_of_famers = load_dict('hall_of_famers.pickle')
retired_all_stars = load_dict('retired_all_stars.pickle')
retired_all_nbas = load_dict('retired_all_nbas.pickle')
players_2015_2016 = load_dict('players_2015_2016.pickle')

### Get stats URL for a player name

In [None]:
%%time

def get_url_title(url):
    """Download ``url`` and return the text of its HTML ``<title>`` element.

    Args:
        url: Address of the page to fetch.

    Returns:
        The title text as a string.

    Raises:
        AttributeError: If the parsed document has no ``<title>`` element.
        Errors raised by ``urllib.request.urlopen`` propagate unchanged.
    """
    # Use the response as a context manager so the connection is closed as
    # soon as the markup has been parsed instead of being left open.
    with urllib.request.urlopen(url) as page:
        soup = BeautifulSoup(page, "html.parser")
    return soup.title.text

def get_url(player_name):
    """Search for a player's stats page on www.basketball-reference.com.

    The query is restricted to ``/players/*/*.html`` pages of that site and
    is sent through the ``search`` helper imported from the ``google``
    package.

    Args:
        player_name: Name of the player to look up.

    Returns:
        ``{player_name: url}`` for the first hit, or ``None`` when the search
        returns nothing.
    """
    query_template = (
        'site:www.basketball-reference.com/players/*/*.html '
        '{player_name} Stats')
    query = query_template.format(player_name=player_name)
    print('query: ', query)

    hits = list(search(query=query, start=0, stop=1))

    if not hits:
        print('url found: None')
        return None
    return {player_name: hits[0]}
        
        
# Quick check: look up the stats-page URL for the first sample name.
get_url(test_names[0])

### Get URLs for a list of player names, MULTIPROCESSING

In [None]:
%%time

def get_urls(player_names, num_processes):
    """Resolve the stats-page URL of several players in parallel.

    Args:
        player_names: Iterable of player names; each one is passed to
            ``get_url`` in a worker process.
        num_processes: Number of worker processes to start.

    Returns:
        A list with one ``get_url`` result per name, in input order. An
        entry is ``None`` when no URL was found for that name.
    """
    # The context manager terminates the workers on exit, so the pool is
    # cleaned up even when ``map`` raises.
    with Pool(processes=num_processes) as pool:
        return pool.map(get_url, player_names)

# Quick check: resolve the first two sample names using 2 worker processes.
print(get_urls(test_names[0:2], 2))

### Get stats tables for a URL

In [None]:
%%time

def get_table(url):
    """Download a player page and extract the tables listed in ``table_ids``.

    Args:
        url: Address of the player's stats page.

    Returns:
        A one-entry dict ``{player_name: info}`` where ``player_name`` is the
        text of the page's first ``<h1>`` element and ``info`` contains:

        * ``'url'``: the ``url`` argument;
        * ``'tables'``: ``{table_id: DataFrame}`` for every id that could be
          parsed (key absent when nothing was parsed);
        * ``'failures'``: ``{table_id: error message}`` for every id for
          which ``pandas.read_html`` raised ``ValueError`` (key absent when
          there were none).

    Raises:
        AttributeError: If the page has no ``<h1>`` element.
        Errors raised by ``urllib.request.urlopen`` propagate unchanged.
    """
    # Close the HTTP response as soon as the body has been read.
    with urllib.request.urlopen(url) as page:
        url_html = page.read().decode()

    # Get the player name
    soup = BeautifulSoup(url_html, "html.parser")
    player_name = soup.find("h1").text

    player_info = {'url': url}
    output = {player_name: player_info}

    # Uncomment the tables: remove the HTML comment delimiters so tables
    # wrapped in <!-- ... --> become visible to the parser.
    uncommented_html = url_html.replace('-->', '').replace('<!--', '')

    for table_id in table_ids:
        try:
            # Recent pandas deprecates passing literal HTML to read_html;
            # a StringIO wrapper is accepted by old and new versions alike.
            frames = pd.read_html(
                io.StringIO(uncommented_html),
                header=0,
                attrs={'id': table_id})
        except ValueError as err:
            # Record why this table could not be read and move on.
            player_info.setdefault('failures', {})[table_id] = str(err)
            continue

        # Only the first matching table is kept. Drop its 'Unnamed' columns;
        # str() guards against non-string column labels.
        table = frames[0]
        unnamed_columns = [
            col_name for col_name in table.columns
            if 'Unnamed' in str(col_name)]
        player_info.setdefault('tables', {})[table_id] = table.drop(
            unnamed_columns, axis=1)

    # Print processing info
    print(player_name)
    print('Tables Found: ', len(player_info.get('tables', {})),
          ' | Failures: ', player_info.get('failures', {}).keys())
    print(' ')

    return output

# Quick check: scrape the first sample URL and print the resulting dict.
print(get_table(test_urls[0]))

### Get stats tables for a list of URLs, MULTIPROCESSING

In [None]:
%%time

from multiprocessing import Pool
from functools import partial

def get_tables(urls, num_processes):
    """Scrape the stats tables of several player pages in parallel.

    Args:
        urls: Iterable of page addresses; each one is passed to
            ``get_table`` in a worker process.
        num_processes: Number of worker processes to start.

    Returns:
        A list with one ``get_table`` result per URL, in input order.
    """
    # The context manager terminates the workers on exit, so the pool is
    # cleaned up even when ``map`` raises.
    with Pool(processes=num_processes) as pool:
        return pool.map(get_table, urls)

# Quick check: scrape the sample URLs using 2 worker processes; the cell
# displays how many results came back.
len(get_tables(test_urls, 2))


In [None]:
%%time

# Utility function to merge retrieved data tables into 1 dictionary.
def merge_dict(list_of_dict):
    """Merge an iterable of dictionaries into one new dictionary.

    When a key occurs in more than one dictionary, the value from the later
    dictionary wins. ``None`` entries are skipped, so the result lists of the
    parallel helpers can be merged directly even when ``get_url`` found
    nothing for some names.

    Args:
        list_of_dict: Iterable of dictionaries (entries may be ``None``).

    Returns:
        A new dictionary holding all key/value pairs; the inputs are not
        modified.
    """
    merged_dict = {}
    for dictionary in list_of_dict:
        if dictionary is None:
            # Nothing was retrieved for this entry.
            continue
        merged_dict.update(dictionary)
    return merged_dict

# Quick check: merge the sample single-key dicts and print the merged keys.
print(merge_dict(test_list).keys())

In [None]:
# players_2015_2016_data = merge_dict(outputs)
# len(players_2015_2016_data)
# players_2015_2016_data['LeBron James']['tables']['all_star']
# save_dict(players_2015_2016_data, 'players_2015_2016_data.pickle')

## Run Tasks

### Get URLs

### Get Data