# Using Beautiful Soup

In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd

In [None]:
# Season page to scrape.
url = "http://history.basketballmonster.com/Season?seasonId=51&cats=9"

# Parse inside a `with` block so the HTTP response is closed once the
# document has been read; `soup` stays usable afterwards.
with urlopen(url) as page:
    soup = BeautifulSoup(page, "html.parser")

In [None]:
# Dump the whole parsed document, re-indented, to inspect its structure.
print(soup.prettify())

In [None]:
# Pick the first <table> with class "seasonDetailsT"; find() gives None when nothing matches.
table = soup.find("table", attrs={"class":"seasonDetailsT"})

In [None]:
# Show the selected table's markup as the cell result.
table.prettify()

In [None]:
# Turn every table row into a list of its non-empty cell texts.
rows = table.find_all('tr')
data = []
for row in rows:
    # Rows without <td> cells fall back to their <label> elements
    # (presumably the header row — verify against the page markup).
    cells = row.find_all('td')
    if not cells:
        cells = row.find_all('label')
    stripped = (cell.text.strip() for cell in cells)
    data.append([text for text in stripped if text])

df = pd.DataFrame(data)
df.head()

# Using Pandas

In [None]:
import pandas as pd

In [None]:
# (season page URL, season label) pairs to scrape.
leads = [
  ("http://history.basketballmonster.com/Season?seasonId=18&cats=9",
   "2009-2010"),
  ("http://history.basketballmonster.com/Season?seasonId=27&cats=9",
   "2010-2011"),
  ("http://history.basketballmonster.com/Season?seasonId=32&cats=9",
   "2011-2012"),
  ("http://history.basketballmonster.com/Season?seasonId=36&cats=9",
   "2012-2013"),
  ("http://history.basketballmonster.com/Season?seasonId=44&cats=9",
   "2013-2014"),
  ("http://history.basketballmonster.com/Season?seasonId=50&cats=9",
   "2014-2015"),
  ("http://history.basketballmonster.com/Season?seasonId=51&cats=9",
   "2015-2016")
]

# Collect one frame per season and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0, and appending inside the loop
# re-copies every accumulated row on each iteration.
season_frames = []

for url, season in leads:

  # Read all tables on url, use row 0 as the headers
  tables = pd.read_html(url, header=0)

  # Table 0 is not needed; keep table 1
  df = tables[1]

  # Normalise column names: lower-case, surrounding whitespace removed
  df.columns = [str(col).lower().strip() for col in df.columns]

  # Drop repeated header rows, 2nd method preferred
  # df = df.drop_duplicates(keep=False)
  # df = df[df.player != 'player']

  # Record the season on every row (a scalar is broadcast to the column)
  df['season'] = season

  # Print general info
  print(df.shape)

  season_frames.append(df)

# An empty `leads` list still yields an empty DataFrame.
results = pd.concat(season_frames) if season_frames else pd.DataFrame()

print(results.shape)
print(results.columns)
results.head(10)

# Extract Stats from Basketball-Reference.com

In [1]:
import urllib
from bs4 import BeautifulSoup
import pandas as pd
import sys

# HTML ids looked up on each player page by parse_urls_into_tables.
# The five stat tables are listed once for the regular season (no prefix)
# and once with the 'playoffs_' prefix, followed by three extra ids.
table_ids = [
  *(
    prefix + stat
    for prefix in ('', 'playoffs_')
    for stat in (
      'per_game',
      'totals',
      'per_minute',  # per 36 minutes
      'per_poss',    # per 100 possessions
      'advanced',
    )
  ),
  'all_star',
  'all_college_stats',
  'all_salaries',
]

# Player names used as search terms by extract_stats_urls
# (presumably Hall of Fame inductees, going by the variable name — verify the source).
hall_of_famers = [
    'Chuck Hyatt',
    'Hank Luisetti',
    'George Mikan',
    'John Schommer',
    'Vic Hanson',
    'Ed Macauley',
    'Branch McCracken',
    'Charles Murphy',
    'John Wooden',
    'Bennie Borgmann',
    'Forrest DeBernardi',
    'Bob Kurland',
    'Andy Phillip',
    'John Roosma',
    'Chris Steinmetz',
    'Ed Wachter',
    'Jack McCracken',
    'Pat Page',
    'Barney Sedran',
    'John Thompson',
    'Robert Gruenig',
    'Bud Foster',
    'Nat Holman',
    'John Russell',
    'Joe Lapchick',
    'Dutch Dehnert',
    'Bob Davies',
    'Bob Cousy',
    'Bob Pettit',
    'Paul Endacott',
    'Marty Friedman',
    'John Beckman',
    'Dolph Schayes',
    'Ernest Schmidt',
    'Joe Brennan',
    'Bill Russell',
    'Robert Vandivier',
    'Tom Gola',
    'Ed Krause',
    'Bill Sharman',
    'Elgin Baylor',
    'Lauren Gale',
    'William Johnson',
    'Paul Arizin',
    'Joe Fulks',
    'Cliff Hagan',
    'Jim Pollard',
    'Wilt Chamberlain',
    'Jerry Lucas',
    'Oscar Robertson',
    'Jerry West',
    'Tom Barlow',
    'Hal Greer',
    'Slater Martin',
    'Frank Ramsey',
    'Willis Reed',
    'Bill Bradley',
    'Dave DeBusschere',
    'Jack Twyman',
    'John Havlicek',
    'Sam Jones',
    'Al Cervi',
    'Nate Thurmond',
    'Billy Cunningham',
    'Tom Heinsohn',
    'Rick Barry',
    'Walt Frazier',
    'Bob Houbregs',
    'Pete Maravich',
    'Bobby Wanzer',
    'Clyde Lovellette',
    'Bobby McDermott',
    'Wes Unseld',
    'William Gates',
    'K.C. Jones',
    'Lenny Wilkens',
    'Dave Bing',
    'Elvin Hayes',
    'Neil Johnston',
    'Earl Monroe',
    'Nate Archibald',
    'Dave Cowens',
    'Harry Gallatin',
    'Sergei Belov',
    'Lusia Harris-Stewart',
    'Connie Hawkins',
    'Bob Lanier',
    'Nera White',
    'Walt Bellamy',
    'Julius Erving',
    'Dan Issel',
    'Dick McGuire',
    'Ann Meyers',
    'Calvin Murphy',
    'Uļjana Semjonova',
    'Bill Walton',
    'Carol Blazejowski',
    'Buddy Jeannette',
    'Kareem Abdul-Jabbar',
    'Anne Donovan',
    'Vern Mikkelsen',
    'Cheryl Miller',
    'Krešimir Ćosić',
    'George Gervin',
    'Gail Goodrich',
    'Nancy Lieberman',
    'David Thompson',
    'George Yardley',
    'Joan Crawford',
    'Denise Curry',
    'Alex English',
    'Bailey Howell',
    'Larry Bird',
    'Marques Haynes',
    'Arnie Risen',
    'Kevin McHale',
    'Bob McAdoo',
    'Isiah Thomas',
    'Moses Malone',
    'Magic Johnson',
    'Dražen Petrović',
    'Dino Meneghin',
    'Robert Parish',
    'James Worthy',
    'Dražen Dalipagić',
    'Clyde Drexler',
    'Maurice Stokes',
    'Lynette Woodard',
    'Hortencia de Fatima Marcari',
    'Charles Barkley',
    'Joe Dumars',
    'Dominique Wilkins',
    'Adrian Dantley',
    'Patrick Ewing',
    'Hakeem Olajuwon',
    'Michael Jordan',
    'David Robinson',
    'John Stockton',
    'Cynthia Cooper-Dyke',
    'Dennis Johnson',
    'Gus Johnson',
    'Karl Malone',
    'Ubiratan Pereira Maciel',
    'Scottie Pippen',
    'Dennis Rodman',
    'Chris Mullin',
    'Arvydas Sabonis',
    'Artis Gilmore',
    'Teresa Edwards',
    'Goose Tatum',
    'Mel Daniels',
    'Katrina McClain',
    'Reggie Miller',
    'Ralph Sampson',
    'Chet Walker',
    'Jamaal Wilkes',
    'Roger Brown',
    'Bernard King',
    'Gary Payton',
    'Richie Guerin',
    'Dawn Staley',
    'Oscar Schmidt',
    'Šarūnas Marčiulionis',
    'Alonzo Mourning',
    'Mitch Richmond',
    'Guy Rodgers',
    'Louie Dampier',
    'Spencer Haywood',
    'John Isaacs',
    'Lisa Leslie',
    'Dikembe Mutombo',
    'Jo Jo White',
    'Yao Ming',
    'Cumberland Posey',
    'Sheryl Swoopes',
    'Zelmo Beaty',
    'Shaquille O\'Neal',
    'Allen Iverson',
    'Tracy McGrady',
    'Nikos Galis',
    'George McGinnis']

# Player names used as search terms by extract_stats_urls
# (presumably retired NBA All-Stars, going by the variable name — verify the source).
# Stray "§" footnote markers were removed from six names (Ben Wallace,
# Gilbert Arenas, Baron Davis, Brad Miller, Mehmet Okur, Michael Redd) so
# they no longer end up inside the search query.
retired_all_stars = [
    'Kareem Abdul-Jabbar',
    'Kobe Bryant',
    'Tim Duncan',
    'Kevin Garnett',
    'Shaquille O\'Neal',
    'Michael Jordan',
    'Karl Malone',
    'Jerry West',
    'Wilt Chamberlain',
    'Bob Cousy',
    'John Havlicek',
    'Larry Bird',
    'Elvin Hayes',
    'Magic Johnson',
    'Moses Malone',
    'Hakeem Olajuwon',
    'Oscar Robertson',
    'Bill Russell',
    'Dolph Schayes',
    'Isiah Thomas',
    'Charles Barkley',
    'Elgin Baylor',
    'Julius Erving',
    'Patrick Ewing',
    'Allen Iverson',
    'Bob Pettit',
    'Ray Allen',
    'Paul Arizin',
    'Clyde Drexler',
    'Hal Greer',
    'Jason Kidd',
    'Paul Pierce',
    'David Robinson',
    'John Stockton',
    'George Gervin',
    'Robert Parish',
    'Gary Payton',
    'Lenny Wilkens',
    'Dominique Wilkins',
    'Rick Barry',
    'Dave Cowens',
    'Dave DeBusschere',
    'Alex English',
    'Larry Foust',
    'Bob Lanier',
    'Dikembe Mutombo',
    'Steve Nash',
    'Bill Sharman',
    'Yao Ming',
    'Dave Bing',
    'Walt Frazier',
    'Harry Gallatin',
    'Grant Hill',
    'Jerry Lucas',
    'Ed Macauley',
    'Slater Martin',
    'Tracy McGrady',
    'Dick McGuire',
    'Kevin McHale',
    'Alonzo Mourning',
    'Scottie Pippen',
    'Willis Reed',
    'Jack Sikma',
    'Nate Thurmond',
    'Chet Walker',
    'Jo Jo White',
    'James Worthy',
    'Nate Archibald',
    'Larry Costello',
    'Adrian Dantley',
    'Walter Davis',
    'Joe Dumars',
    'Artis Gilmore',
    'Richie Guerin',
    'Tom Heinsohn',
    'Bailey Howell',
    'Lou Hudson',
    'Neil Johnston',
    'Shawn Kemp',
    'Vern Mikkelsen',
    'Jermaine O\'Neal',
    'Mitch Richmond',
    'Amar\'e Stoudemire',
    'Jack Twyman',
    'George Yardley',
    'Chauncey Billups',
    'Carl Braun',
    'Brad Daugherty',
    'Wayne Embry',
    'Tom Gola',
    'Gail Goodrich',
    'Cliff Hagan',
    'Tim Hardaway',
    'Dennis Johnson',
    'Gus Johnson',
    'Marques Johnson',
    'Sam Jones',
    'Rudy LaRusso',
    'Pete Maravich',
    'Bob McAdoo',
    'Reggie Miller',
    'Sidney Moncrief',
    'Chris Mullin',
    'Don Ohl',
    'Andy Phillip',
    'Gene Shue',
    'Rudy Tomjanovich',
    'Wes Unseld',
    'Bobby Wanzer',
    'Chris Webber',
    'Paul Westphal',
    'Vin Baker',
    'Walt Bellamy',
    'Otis Birdsong',
    'Rolando Blackman',
    'Tom Chambers',
    'Maurice Cheeks',
    'Doug Collins',
    'Billy Cunningham',
    'Bob Dandridge',
    'Bob Davies',
    'Dick Garmaker',
    'Johnny Green',
    'Penny Hardaway',
    'Connie Hawkins',
    'Spencer Haywood',
    'Mel Hutchins',
    'Bobby Jones',
    'Bernard King',
    'Bill Laimbeer',
    'Clyde Lovellette',
    'Maurice Lucas',
    'Shawn Marion',
    'George Mikan',
    'Earl Monroe',
    'Willie Naulls',
    'Jim Pollard',
    'Mark Price',
    'Micheal Ray Richardson',
    'Arnie Risen',
    'Alvin Robertson',
    'Guy Rodgers',
    'Ralph Sampson',
    'Latrell Sprewell',
    'David Thompson',
    'Ben Wallace',
    'Rasheed Wallace',
    'Sidney Wicks',
    'Mark Aguirre',
    'Gilbert Arenas',
    'Bill Bridges',
    'Phil Chenier',
    'Terry Dischinger',
    'Steve Francis',
    'Richard Hamilton',
    'Kevin Johnson',
    'Eddie Jones',
    'Bob Kauffman',
    'Johnny Kerr',
    'Bob Love',
    'Dan Majerle',
    'George McGinnis',
    'Jeff Mullins',
    'Larry Nance',
    'Glen Rice',
    'Dan Roundfield',
    'Brandon Roy',
    'Detlef Schrempf',
    'Charlie Scott',
    'Paul Seymour',
    'Peja Stojaković',
    'Maurice Stokes',
    'Dick Van Arsdale',
    'Tom Van Arsdale',
    'Norm Van Lier',
    'Antoine Walker',
    'Jamaal Wilkes',
    'Buck Williams',
    'Leo Barnhorst',
    'Zelmo Beaty',
    'Carlos Boozer',
    'Elton Brand',
    'Terrell Brandon',
    'Frank Brian',
    'Caron Butler',
    'Joe Caldwell',
    'Archie Clark',
    'Terry Cummings',
    'Baron Davis',
    'John Drew',
    'Kevin Duckworth',
    'Walter Dukes',
    'Dike Eddleman',
    'Sean Elliott',
    'Michael Finley',
    'Joe Fulks',
    'Jack George',
    'Allan Houston',
    'Rod Hundley',
    'Žydrūnas Ilgauskas',
    'Antawn Jamison',
    'Eddie Johnson',
    'John Johnson',
    'Larry Johnson',
    'Larry Kenon',
    'Don Kojis',
    'Fat Lever',
    'Rashard Lewis',
    'Jeff Malone',
    'Danny Manning',
    'Stephon Marbury',
    'Jack Marin',
    'Brad Miller',
    'Norm Nixon',
    'Jim Paxson',
    'Geoff Petrie',
    'Terry Porter',
    'Glenn Robinson',
    'Truck Robinson',
    'Red Rocha',
    'Dennis Rodman',
    'Jeff Ruland',
    'Fred Scolari',
    'Ken Sears',
    'Frank Selvy',
    'Paul Silas',
    'Jerry Sloan',
    'Phil Smith',
    'Randy Smith',
    'Jerry Stackhouse',
    'Reggie Theus',
    'Andrew Toney',
    'Kelly Tripucka',
    'Kiki Vandeweghe',
    'Jimmy Walker',
    'Bill Walton',
    'Scott Wedman',
    'Gus Williams',
    'Brian Winters',
    'Shareef Abdur-Rahim',
    'Alvan Adams',
    'Michael Adams',
    'Danny Ainge',
    'Kenny Anderson',
    'B. J. Armstrong',
    'Don Barksdale',
    'Dick Barnett',
    'Dana Barros',
    'Butch Beard',
    'Ralph Beard',
    'Mookie Blaylock',
    'John Block',
    'Bob Boozer',
    'Vince Boryla',
    'Bill Bradley',
    'Fred Brown',
    'Don Buse',
    'Andrew Bynum',
    'Austin Carr',
    'Joe Barry Carroll',
    'Bill Cartwright',
    'Sam Cassell',
    'Cedric Ceballos',
    'Len Chappell',
    'Nathaniel Clifton',
    'Derrick Coleman',
    'Jack Coleman',
    'Antonio Davis',
    'Dale Davis',
    'Vlade Divac',
    'James Donaldson',
    'Mark Eaton',
    'Dale Ellis',
    'Ray Felix',
    'Sleepy Floyd',
    'World B. Free',
    'Billy Gabor',
    'Chris Gatling',
    'Danny Granger',
    'Horace Grant',
    'A. C. Green',
    'Rickey Green',
    'Alex Groza',
    'Tom Gugliotta',
    'Bob Harrison',
    'Hersey Hawkins',
    'Walt Hazzard',
    'Tyrone Hill',
    'Lionel Hollins',
    'Jeff Hornacek',
    'Josh Howard',
    'Juwan Howard',
    'Darrall Imhoff',
    'Dan Issel',
    'Lucious Jackson',
    'Mark Jackson',
    'Steve Johnson',
    'Chris Kaman',
    'Jim King',
    'Andrei Kirilenko',
    'Billy Knight',
    'Sam Lacey',
    'Christian Laettner',
    'Clyde Lee',
    'Reggie Lewis',
    'Jamaal Magloire',
    'Kenyon Martin',
    'Jamal Mashburn',
    'Anthony Mason',
    'Xavier McDaniel',
    'Antonio McDyess',
    'Jon McGlocklin',
    'Tom Meschery',
    'Eddie Miles',
    'Mike Mitchell',
    'Steve Mix',
    'Calvin Murphy',
    'Calvin Natt',
    'Chuck Noble',
    'Charles Oakley',
    'Mehmet Okur',
    'Ricky Pierce',
    'Jim Price',
    'Theo Ratliff',
    'Michael Redd',
    'Richie Regan',
    'Doc Rivers',
    'Clifford Robinson',
    'Flynn Robinson',
    'Curtis Rowe',
    'Bob Rule',
    'Campy Russell',
    'Cazzie Russell',
    'Woody Sauldsberry',
    'Fred Schaus',
    'Lee Shaffer',
    'Lonnie Shelton',
    'Adrian Smith',
    'Steve Smith',
    'Rik Smits',
    'John Starks',
    'Don Sunderlage',
    'Wally Szczerbiak',
    'Otis Thorpe',
    'Nick Van Exel',
    'Gerald Wallace',
    'Paul Walther',
    'Kermit Washington',
    'Jayson Williams',
    'Mo Williams',
    'Kevin Willis',
    'Max Zaslofsky',
]

# Player names used as search terms by extract_stats_urls
# (presumably retired All-NBA selections, going by the variable name — verify the source).
# NOTE(review): some entries look like alternate spellings of one player
# (e.g. 'Amar\'e Stoudemire' / 'Amare Stoudemire') — confirm this is intended.
retired_all_nbas = [
    'Adrian Dantley',
    'Al Cervi',
    'Alex English',
    'Alex Groza',
    'Allen Iverson',
    'Alonzo Mourning',
    'Alvin Robertson',
    'Amar\'e Stoudemire',
    'Amare Stoudemire',
    'Andrew Bynum',
    'Andy Phillip',
    'Anthony Mason',
    'Antonio McDyess',
    'Archie Clark',
    'Arnie Risen',
    'Bailey Howell',
    'Baron Davis',
    'Ben Wallace',
    'Bernard King',
    'Bill Russell',
    'Bill Sharman',
    'Bill Walton',
    'Billy Cunningham',
    'Bob Cousy',
    'Bob Dandridge',
    'Bob Davies',
    'Bob Feerick',
    'Bob Love',
    'Bob McAdoo',
    'Bob Pettit',
    'Bobby Wanzer',
    'Bones McKinney',
    'Brad Daugherty',
    'Brandon Roy',
    'Buck Williams',
    'Buddy Jeannette',
    'Carl Braun',
    'Charles Barkley',
    'Chauncey Billups',
    'Chick Halbert',
    'Chris Bosh',
    'Chris Mullin',
    'Chris Webber',
    'Cliff Hagan',
    'Clyde Drexler',
    'Clyde Lovellette',
    'Connie Hawkins',
    'Dale Ellis',
    'Dan Roundfield',
    'Dave Bing',
    'Dave Cowens',
    'Dave DeBusschere',
    'David Robinson',
    'David Thompson',
    'Dennis Johnson',
    'Dennis Rodman',
    'Derrick Coleman',
    'Detlef Schrempf',
    'Dick Garmaker',
    'Dick McGuire',
    'Dikembe Mutombo',
    'Dolph Schayes',
    'Dominique Wilkins',
    'Dražen Petrović',
    'Earl Monroe',
    'Ed Macauley',
    'Ed Sadowski',
    'Eddie Jones',
    'Elgin Baylor',
    'Elton Brand',
    'Elvin Hayes',
    'Ernie Calverley',
    'Frank Baumholtz',
    'Frank Brian',
    'Fred Schaus',
    'Fred Scolari',
    'Gail Goodrich',
    'Gary Payton',
    'Gene Shue',
    'George Gervin',
    'George McGinnis',
    'George Mikan',
    'George Yardley',
    'Gilbert Arenas',
    'Glen Rice',
    'Grant Hill',
    'Gus Johnson',
    'Gus Williams',
    'Hakeem Olajuwon',
    'Hal Greer',
    'Harry Gallatin',
    'Howie Dallmar',
    'Isiah Thomas',
    'Jack George',
    'Jack Twyman',
    'Jamal Mashburn',
    'James Worthy',
    'Jason Kidd',
    'Jermaine O\'Neal',
    'Jerry Lucas',
    'Jerry West',
    'Jim Paxson',
    'Jim Pollard',
    'Jo Jo White',
    'Joe Dumars',
    'Joe Fulks',
    'John Havlicek',
    'John Stockton',
    'Johnny Logan',
    'Julius Erving',
    'Juwan Howard',
    'Kareem Abdul-Jabbar',
    'Karl Malone',
    'Ken Sailors',
    'Kevin Garnett',
    'Kevin Johnson',
    'Kevin McHale',
    'Kevin Willis',
    'Kobe Bryant',
    'Lafayette Lever',
    'Larry Bird',
    'Larry Costello',
    'Larry Foust',
    'Larry Johnson',
    'Latrell Sprewell',
    'Lew Alcindor',
    'Lloyd Free',
    'Lou Hudson',
    'Magic Johnson',
    'Mark Price',
    'Marques Johnson',
    'Maurice Lucas',
    'Maurice Stokes',
    'Max Zaslofsky',
    'Michael Jordan',
    'Michael Redd',
    'Mitch Richmond',
    'Moses Malone',
    'Nate Archibald',
    'Neil Johnston',
    'Norm Van Lier',
    'Oscar Robertson',
    'Otis Birdsong',
    'Patrick Ewing',
    'Paul Arizin',
    'Paul Pierce',
    'Paul Seymour',
    'Paul Westphal',
    'Peja Stojaković',
    'Penny Hardaway',
    'Pete Maravich',
    'Phil Chenier',
    'Phil Ford',
    'Phil Smith',
    'Ralph Beard',
    'Ralph Sampson',
    'Randy Smith',
    'Ray Allen',
    'Reggie Miller',
    'Richie Guerin',
    'Rick Barry',
    'Robert Parish',
    'Rod Strickland',
    'Sam Cassell',
    'Sam Jones',
    'Scottie Pippen',
    'Shaquille O\'Neal',
    'Shawn Kemp',
    'Shawn Marion',
    'Sidney Moncrief',
    'Slater Martin',
    'Spencer Haywood',
    'Stan Miasek',
    'Stephon Marbury',
    'Steve Nash',
    'Terry Cummings',
    'Tim Duncan',
    'Tim Hardaway',
    'Tom Chambers',
    'Tom Gola',
    'Tom Heinsohn',
    'Tracy McGrady',
    'Truck Robinson',
    'Vern Mikkelsen',
    'Vin Baker',
    'Walt Frazier',
    'Walter Davis',
    'Wes Unseld',
    'Willis Reed',
    'Wilt Chamberlain',
    'Yao Ming',
]

# Report how many names each list contains.
print('hall_of_famers length: ', len(hall_of_famers))
print('retired_all_stars length: ', len(retired_all_stars))
print('retired_all_nbas length: ', len(retired_all_nbas))

hall_of_famers length:  181
retired_all_stars length:  352
retired_all_nbas length:  191


### Get stats table from a list of (player stats) urls

In [2]:
import urllib

def parse_urls_into_tables(urls, outputs, failures):
    """Download player pages and collect their stats tables as DataFrames.

    For every URL the page is fetched, the player name is read from the
    first <h1>, HTML comment markers are stripped (so tables wrapped in
    comments become parseable) and each id in ``table_ids`` is looked up
    with ``pd.read_html``.

    Args:
        urls: iterable of page URLs.
        outputs: dict updated in place; maps player name to a dict of
            ``{table_id: DataFrame, ..., 'url': page_url}``. A player only
            gets an entry once at least one table has been found.
        failures: dict updated in place; maps player name to a list of
            messages, one per table id that could not be parsed.

    Every fifth URL the current ``outputs`` is checkpointed to
    'outputs_temp.pickle' through ``save_dict``, which is defined in a later
    cell of this notebook and must exist before the fifth URL is reached.

    Network errors raised by ``urlopen`` are not caught.
    """
    # Imported here so the function does not rely on another cell having
    # loaded the submodule: a bare `import urllib` does not import
    # urllib.request.
    import io
    import urllib.request

    pd.set_option('display.max_columns', None)

    for i, url in enumerate(urls, start=1):

        # Close the HTTP response as soon as the body has been read.
        with urllib.request.urlopen(url) as page:
            urlHtml = page.read().decode()

        # Get the player name; fall back to the URL when the page has no <h1>
        # so the remaining URLs are still processed.
        soup = BeautifulSoup(urlHtml, "html.parser")
        heading = soup.find("h1")
        player_name = heading.text if heading is not None else url

        # uncomment the tables
        uncommentedUrlHtml = urlHtml.replace('-->', '').replace('<!--', '')

        for table_id in table_ids:
            try:
                # Wrap the markup in StringIO: passing literal HTML to
                # read_html is deprecated in recent pandas versions.
                list_of_df = pd.read_html(
                    io.StringIO(uncommentedUrlHtml),
                    header=0,
                    attrs={'id': table_id})
            except ValueError as err:
                err_msg = str(err) + ' for table: ' + table_id
                failures.setdefault(player_name, []).append(err_msg)
                continue

            # Drop 'Unnamed' columns; str() keeps this safe for
            # non-string column labels.
            stats = list_of_df[0]
            unnamed = [col_name for col_name in stats.columns
                       if 'Unnamed' in str(col_name)]
            stats = stats.drop(unnamed, axis=1)

            # Update outputs dictionary
            if player_name in outputs:
                outputs[player_name].update({table_id: stats})
            else:
                outputs[player_name] = {table_id: stats, 'url': url}

        # Periodic checkpoint so a long run can be resumed after a failure.
        if i % 5 == 0:
            save_dict(outputs, 'outputs_temp.pickle')

        print(str(i))

    print('------------ COMPLETED! ------------')
    print('# of players parsed: ', len(outputs))
    print('failures: ', failures)

# Accumulators that parse_urls_into_tables fills in place.
players = {}
failures = {}

# Two player pages used as a small trial run.
urls = [
    "https://www.basketball-reference.com/players/m/moncrsi01.html",
    "https://www.basketball-reference.com/players/b/bellawa01.html",
]

parse_urls_into_tables(urls, players, failures)

1
2
------------ COMPLETED! ------------
# of players parsed:  2
failures:  {'Walt Bellamy': ['No tables found for table: playoffs_per_poss', 'No tables found for table: all_salaries']}


In [None]:
import pickle

# Persist the scraped tables and the per-player failure messages.
for pickle_path, payload in (('players.pickle', players),
                             ('failures.pickle', failures)):
    with open(pickle_path, 'wb') as file:
        pickle.dump(payload, file)

### Get urls from a list of player names

In [3]:
from google import search
import urllib
from bs4 import BeautifulSoup

def get_url_title(url):
    """Return the text of the page's <title>, or None if it has none.

    Args:
        url: address of the page to download.
    """
    # A bare `import urllib` does not load the request submodule.
    import urllib.request

    # Parse inside the `with` block so the response is closed afterwards.
    with urllib.request.urlopen(url) as page:
        soup = BeautifulSoup(page, "html.parser")

    title = soup.title
    return title.text if title is not None else None

def extract_stats_urls(player_names, output, max_retries=5, retry_delay=2.0):
    """Search for each player's stats page and append the first hit to ``output``.

    Args:
        player_names: iterable of names; a single string is treated as one
            name instead of being iterated character by character.
        output: list extended in place with the first URL found per name.
            Names with no result add nothing.
        max_retries: number of search attempts per name before giving up.
        retry_delay: base pause in seconds between attempts; it grows
            linearly with the attempt number.

    A failing search used to be retried forever with no pause and without
    showing the error. It is now retried a bounded number of times with a
    delay, and a name whose attempts all fail is reported as not found.
    """
    import time

    pd.set_option('display.max_columns', None)

    if isinstance(player_names, str):
        player_names = [player_names]

    for i, player_name in enumerate(player_names, start=1):
        query = 'site:www.basketball-reference.com/players/*/*.html ' + str(player_name) + ' Stats'
        print(str(i))
        print('query: ' + '"' + query + '"')

        urls = []
        for attempt in range(1, max_retries + 1):
            try:
                urls = list(search(query=query, start=0, stop=1))
                break
            except Exception as e:
                print('search failed: ' + repr(e))
                if attempt < max_retries:
                    print('retrying...')
                    time.sleep(retry_delay * attempt)

        if len(urls) < 1:
            print('url found: None')
        else:
            print('url found: ' + str(urls[0]))
            output.append(urls[0])

        print('Output Length: ' + str(len(output)))
        print(' ')

# Slice rather than index so that a one-element list (not a bare string) is
# passed; the recorded output below shows single-letter queries, which looks
# like an earlier run that passed a string.
player_names = hall_of_famers[0:1]
player_urls = []

extract_stats_urls(player_names, player_urls)

1
query: "site:www.basketball-reference.com/players/*/*.html C Stats"
url found: https://www.basketball-reference.com/players/c/capelca01.html
Output Length: 1
 
2
query: "site:www.basketball-reference.com/players/*/*.html h Stats"
url found: https://www.basketball-reference.com/players/h/howardw01.html
Output Length: 2
 
3
query: "site:www.basketball-reference.com/players/*/*.html u Stats"
url found: https://www.basketball-reference.com/players/u/udrihbe01.html
Output Length: 3
 
4
query: "site:www.basketball-reference.com/players/*/*.html c Stats"
url found: https://www.basketball-reference.com/players/c/capelca01.html
Output Length: 4
 
5
query: "site:www.basketball-reference.com/players/*/*.html k Stats"


KeyboardInterrupt: 

### Save/load dictionaries

In [4]:
import pickle
import os

# Small mixed-type payload used to try out save_dict / load_dict.
test = {
    'words': """
        Lorem ipsum dolor sit amet, consectetur adipiscing 
        elit. Mauris adipiscing adipiscing placerat. 
        Vestibulum augue augue, 
        pellentesque quis sollicitudin id, adipiscing.
        """,
    'list': [*range(10000)],
    'dict': {str(i): 'a' for i in range(10000)},
    'int': 100,
    'float': 100.123456,
}

def sizeof_fmt(num, suffix='B'):
    """Format a byte count with binary (1024-based) unit prefixes.

    The value is divided by 1024 until its magnitude drops below 1024 and is
    then rendered with one decimal, e.g. 1536 -> '1.5 KiB'. Anything still
    too large after the 'Zi' step is reported in 'Yi'.
    """
    prefixes = ('', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi')
    remaining = num
    position = 0
    while position < len(prefixes):
        if abs(remaining) < 1024.0:
            return "%3.1f %s%s" % (remaining, prefixes[position], suffix)
        remaining = remaining / 1024.0
        position += 1
    return "%.1f %s%s" % (remaining, 'Yi', suffix)

def get_file_size(filename):
    """Return the size of ``filename`` as a human-readable string."""
    return sizeof_fmt(os.path.getsize(filename))

def save_dict(dictionary, filename):
    """Pickle ``dictionary`` to ``filename`` and print the resulting file size.

    The data is first written to ``<filename>.tmp`` and then moved into
    place with ``os.replace``. This function is used for periodic
    checkpoints, so an interrupted write must not leave a truncated file
    where the previous good checkpoint was.

    Args:
        dictionary: any picklable object (the notebook also passes lists).
        filename: destination path.

    Returns:
        True once the file has been written.
    """
    tmp_filename = os.fspath(filename) + '.tmp'
    try:
        with open(tmp_filename, 'wb') as file:
            pickle.dump(dictionary, file)
        os.replace(tmp_filename, filename)
    except BaseException:
        # Do not leave a partial temporary file behind.
        if os.path.exists(tmp_filename):
            os.remove(tmp_filename)
        raise
    print('file size: ', get_file_size(filename))
    return True

def load_dict(filename):
    """Print the size of ``filename`` and return the object unpickled from it.

    Unpickling can execute arbitrary code, so only use this on files that
    this notebook wrote itself.
    """
    size_text = get_file_size(filename)
    print('file size: ', size_text)
    with open(filename, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded

# Round-trip the sample payload. Despite the .json extension, the file holds pickle data.
save_dict(test, 'test.json')

# Number of top-level keys read back from the file.
len(load_dict('test.json'))

file size:  183.8 KiB
file size:  183.8 KiB


5

### Get URLs for the Different Player Lists

In [None]:
hall_of_famers_urls = []

extract_stats_urls(hall_of_famers, hall_of_famers_urls)

# Swap the thomais02 page for thomais01 (presumably the search returns the
# wrong player page for one of the names — verify). Search results can
# change between runs, so both steps are guarded: an unconditional
# list.remove would raise ValueError when the URL is absent, and an
# unconditional append could add a duplicate.
unwanted_url = 'https://www.basketball-reference.com/players/t/thomais02.html'
wanted_url = 'https://www.basketball-reference.com/players/t/thomais01.html'

if unwanted_url in hall_of_famers_urls:
    hall_of_famers_urls.remove(unwanted_url)
if wanted_url not in hall_of_famers_urls:
    hall_of_famers_urls.append(wanted_url)

In [None]:
# Filled in place with the first search hit for each name in retired_all_stars.
retired_all_stars_urls = []

extract_stats_urls(retired_all_stars, retired_all_stars_urls)

In [None]:
# Filled in place with the first search hit for each name in retired_all_nbas.
retired_all_nbas_urls = []

extract_stats_urls(retired_all_nbas, retired_all_nbas_urls)

### Get Stats for the URL Lists

In [None]:
# Accumulators that parse_urls_into_tables fills in place.
hall_of_famers_stats = {}
hall_of_famers_failures = {}

# Pass the hall-of-famers URL list and the two dicts created above. The
# previous call passed `url` (a single string, which would be iterated one
# character at a time) together with the unrelated `players` / `failures`.
parse_urls_into_tables(hall_of_famers_urls, hall_of_famers_stats, hall_of_famers_failures)

In [None]:
# Number of URLs collected for each list, one count per line.
print(
    len(hall_of_famers_urls),
    len(retired_all_stars_urls),
    len(retired_all_nbas_urls),
    sep='\n',
)

### Get Stats for All

In [6]:
import pickle

# Start from an empty list, then replace it with the URL list stored in the pickle file.
retired_stars_urls = []

# NOTE: pickle.load can execute arbitrary code; only load files this notebook produced.
with open('retired_stars_urls.pickle', 'rb') as file:
    retired_stars_urls = pickle.load(file)
    
len(retired_stars_urls)

388

In [7]:
# The commented-out line below shows how the combined URL list can be rebuilt
# from the three per-list results; this run uses the list loaded from the pickle instead.
# retired_stars_urls = list(set(hall_of_famers_urls).union(set(retired_all_stars_urls)).union(set(retired_all_nbas_urls)))
# Accumulators that parse_urls_into_tables fills in place.
retired_stars_stats = {}
retired_stars_failures = {}

parse_urls_into_tables(retired_stars_urls, retired_stars_stats, retired_stars_failures)

1
2
3
4
file size:  171.8 KiB
5
6
7
8
9
file size:  415.8 KiB
10
11
12
13
14
file size:  645.6 KiB
15
16
17
18
19
file size:  799.5 KiB
20
21
22
23
24
file size:  1.0 MiB
25
26
27
28
29
file size:  1.2 MiB
30
31
32
33
34
file size:  1.4 MiB
35
36
37
38
39
file size:  1.7 MiB
40
41
42
43
44
file size:  1.9 MiB
45
46
47
48
49
file size:  2.1 MiB
50
51
52
53
54
file size:  2.3 MiB
55
56
57
58
59
file size:  2.5 MiB
60
61
62
63
64
file size:  2.7 MiB
65
66
67
68
69
file size:  2.9 MiB
70
71
72
73
74
file size:  3.1 MiB
75
76
77
78
79
file size:  3.3 MiB
80
81
82
83
84
file size:  3.5 MiB
85
86
87
88
89
file size:  3.7 MiB
90
91
92
93
94
file size:  3.9 MiB
95
96
97
98
99
file size:  4.0 MiB
100
101
102
103
104
file size:  4.2 MiB
105
106
107
108
109
file size:  4.4 MiB
110
111
112
113
114
file size:  4.6 MiB
115
116
117
118
119
file size:  4.9 MiB
120
121
122
123
124
file size:  5.1 MiB
125
126
127
128
129
file size:  5.3 MiB
130
131
132
133
134
file size:  5.5 MiB
135
136
137
138
139
file

In [8]:
# Number of players collected; entries are keyed by the <h1> text of each page.
len(retired_stars_stats)

387

In [11]:
# retired_stars_stats['Walt Bellamy']['all_salaries']

# Keys are the names as printed on the page (its <h1> text), which can differ
# from the spelling in the name lists (e.g. no diacritics here).
retired_stars_stats['Zydrunas Ilgauskas']['per_game']

Unnamed: 0,Season,Age,Tm,Lg,Pos,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,1997-98,22.0,CLE,NBA,C,82.0,81.0,29.0,5.5,10.7,0.518,0.0,0.0,0.25,5.5,10.6,0.519,0.519,2.8,3.7,0.762,3.4,5.4,8.8,0.9,0.6,1.6,1.8,3.5,13.9
1,1998-99,23.0,CLE,NBA,C,5.0,5.0,34.2,5.8,11.4,0.509,0.0,0.0,,5.8,11.4,0.509,0.509,3.6,6.0,0.6,3.4,5.4,8.8,0.8,0.8,1.4,1.8,4.8,15.2
2,1999-00,24.0,Did Not Play (injury—foot),,,,,,,,,,,,,,,,,,,,,,,,,,,
3,2000-01,25.0,CLE,NBA,C,24.0,24.0,25.7,4.8,9.8,0.487,0.0,0.1,0.0,4.8,9.7,0.491,0.487,2.2,3.3,0.679,2.7,4.0,6.7,0.8,0.6,1.5,2.5,3.3,11.7
4,2001-02,26.0,CLE,NBA,C,62.0,23.0,21.4,3.9,9.1,0.425,0.0,0.1,0.0,3.9,9.1,0.429,0.425,3.4,4.5,0.754,2.2,3.2,5.4,1.1,0.3,1.4,1.5,3.0,11.1
5,2002-03,27.0,CLE,NBA,C,81.0,81.0,30.0,6.1,13.9,0.441,0.0,0.1,0.0,6.1,13.8,0.443,0.441,4.9,6.3,0.781,3.0,4.6,7.5,1.6,0.7,1.9,2.6,3.4,17.2
6,2003-04,28.0,CLE,NBA,C,81.0,81.0,31.3,5.8,11.9,0.483,0.0,0.1,0.286,5.7,11.8,0.484,0.484,3.7,5.0,0.746,3.4,4.6,8.1,1.3,0.5,2.5,2.0,3.4,15.3
7,2004-05,29.0,CLE,NBA,C,78.0,78.0,33.5,5.9,12.6,0.468,0.0,0.1,0.286,5.8,12.5,0.469,0.469,5.2,6.4,0.799,3.8,4.8,8.6,1.3,0.7,2.1,2.4,4.0,16.9
8,2005-06,30.0,CLE,NBA,C,78.0,78.0,29.3,5.8,11.4,0.506,0.0,0.1,0.0,5.8,11.3,0.509,0.506,4.1,4.9,0.834,3.1,4.4,7.6,1.2,0.5,1.7,2.0,3.6,15.6
9,2006-07,31.0,CLE,NBA,C,78.0,78.0,27.3,4.9,10.2,0.485,0.0,0.0,0.0,4.9,10.2,0.486,0.485,2.0,2.5,0.807,3.1,4.6,7.7,1.6,0.6,1.3,1.8,3.3,11.9


In [10]:
# Persist all collected tables; save_dict prints the resulting file size.
save_dict(retired_stars_stats, 'retired_stars_stats.pickle')

file size:  15.9 MiB


True

In [None]:
# Persist the combined URL list (save_dict pickles any object, not only dicts).
save_dict(retired_stars_urls, 'retired_stars_urls.pickle')

In [5]:
# Reload the combined URL list saved earlier and show how many URLs it holds.
retired_stars_urls = load_dict('retired_stars_urls.pickle')
len(retired_stars_urls)

file size:  26.1 KiB


388