In [1]:
import urllib.request
import re
import pickle
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Year that parameterizes the dataset directory, the Basketball-Reference
# links and the salaries snapshot requested further down in this notebook.
YEAR=2016

In [3]:
### FUNCTIONS

def gets_html(url, headers={'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'}):
    """Fetch ``url`` and return the response body decoded as UTF-8 text.

    Args:
        url: Address to request.
        headers: HTTP headers sent with the request. The default supplies a
            browser-like ``User-Agent``. The dict is only read here, never
            mutated, so sharing the default instance between calls is safe.

    Returns:
        The response body as a ``str``.

    Raises:
        UnicodeDecodeError: If the body is not valid UTF-8.
        Any exception raised by ``urllib.request.urlopen`` propagates unchanged.
    """
    req = urllib.request.Request(url, headers=headers)
    # Use the response as a context manager so the underlying connection/file
    # is closed as soon as the body has been read instead of lingering until
    # garbage collection.
    with urllib.request.urlopen(req) as response:
        return response.read().decode("utf-8")

def scrap_table(soup, table_class):
    """Extract the text of every ``<td>`` cell, row by row.

    The first element of ``soup`` whose class matches ``table_class`` is
    located; its ``<tr>`` rows are read from its ``<tbody>`` when there is
    one, otherwise from the element itself.

    Args:
        soup: Parsed document exposing ``find``.
        table_class: CSS class of the element holding the table.

    Returns:
        A list with one entry per ``<tr>``; each entry is the list of cell
        texts for that row (an empty list for rows without ``<td>`` cells).
    """
    container = soup.find(class_=table_class)
    body = container.find('tbody')
    # Without a <tbody>, search for rows in the matched element itself.
    row_source = container if body is None else body
    return [
        [td.get_text() for td in tr.find_all('td')]
        for tr in row_source.find_all('tr')
    ]

def create_basketball_reference_df(link, columns):
    """Download a page and load its ``table_container`` rows into a DataFrame.

    Args:
        link: URL of the page to download.
        columns: Column labels for the resulting frame, passed straight to
            ``pd.DataFrame.from_records``.

    Returns:
        A ``pd.DataFrame`` built from the rows returned by ``scrap_table``,
        labelled with ``columns``.
    """
    # Download and parse the page. The parser is named explicitly so the
    # result does not depend on which optional parsers happen to be installed
    # (and bs4 does not warn about guessing one).
    html = gets_html(link)
    soup = BeautifulSoup(html, "html.parser")
    # One record per <tr> of the element with class "table_container".
    records_list = scrap_table(soup, table_class='table_container')
    return pd.DataFrame.from_records(records_list, columns=columns)

def create_salaries_df(year):
    """Scrape the player-contracts table from a Wayback Machine capture.

    The capture requested is the one stamped ``{year}0501000000`` of
    ``basketball-reference.com/contracts/players.html``.

    Args:
        year: Year inserted at the front of the capture timestamp in the URL.

    Returns:
        A ``pd.DataFrame`` whose column labels are the texts of the page's
        ``<th>`` cells with class ``tooltip`` or ``poptip`` (minus the first
        one), with any column labelled ``''`` removed.
    """
    html = gets_html(f"https://web.archive.org/web/{year}0501000000/http://www.basketball-reference.com/contracts/players.html")
    # Name the parser explicitly so parsing does not vary with the optional
    # parsers installed in the environment.
    soup = BeautifulSoup(html, "html.parser")
    header_cells = soup.find_all('th', class_=['tooltip','poptip'])
    columns = [cell.get_text() for cell in header_cells]
    records_list = scrap_table(soup, table_class='table_container')
    # The first matched header is skipped so that the label count lines up
    # with the <td> cells of each row.
    # NOTE(review): this depends on the header layout of the capture served;
    # verify it when changing `year`.
    salaries_df = pd.DataFrame.from_records(records_list, columns=columns[1:])
    if '' in salaries_df.columns:
        # Drop the unlabeled column(s) without mutating the frame in place.
        salaries_df = salaries_df.drop(columns='')
    return salaries_df

def create_free_agents_wayback_df(year):
    """Scrape the free-agents table from a Wayback Machine capture.

    The capture requested is the one stamped ``{year}0630000000`` of
    ``basketball-reference.com/friv/free_agents.cgi``.

    Args:
        year: Year inserted at the front of the capture timestamp in the URL.

    Returns:
        A ``pd.DataFrame`` whose column labels are ``'Rk'`` followed by the
        texts of the page's ``<th>`` cells with class ``tooltip`` or
        ``poptip``.
    """
    html = gets_html(f"https://web.archive.org/web/{year}0630000000/https://www.basketball-reference.com/friv/free_agents.cgi")
    # Name the parser explicitly so parsing does not vary with the optional
    # parsers installed in the environment.
    soup = BeautifulSoup(html, "html.parser")
    header_cells = soup.find_all('th', class_=['tooltip','poptip'])
    # 'Rk' is prepended because it is not among the matched header cells.
    # NOTE(review): presumably the rank header lacks those classes — confirm
    # against the page markup.
    columns = ['Rk'] + [cell.get_text() for cell in header_cells]
    records_list = scrap_table(soup, table_class='table_container')
    free_agents_df = pd.DataFrame.from_records(records_list, columns=columns)
    return free_agents_df



In [8]:
# Build the free-agents table from the Wayback capture requested for 2013-06-30.
# NOTE(review): this uses a hard-coded 2013 rather than YEAR — confirm that is intentional.
a = create_free_agents_wayback_df(2013)

In [7]:
# Display the scraped free-agents frame.
a

Unnamed: 0,Rk,Player,Age,Type,OTm,NTm,Notes,Summary
0,1,Arron Afflalo,26-258,RFA,DEN,DEN,,"6.2 WS, .128 WS/48"
1,2,Alexis Ajinca,24-054,UFA,TOR,,,"0.4 WS, .052 WS/48"
2,3,Malik Allen,34-002,UFA,ORL,,,"0.2 WS, .047 WS/48"
3,4,Ray Allen,36-345,UFA-P,BOS,BOS,exercised player option in June,"10.0 WS, .166 WS/48"
4,5,Louis Amundson,29-205,UFA-P,GSW,GSW,exercised player option in June,"0.6 WS, .044 WS/48"
...,...,...,...,...,...,...,...,...
156,150,Brandan Wright,24-268,RFA,NJN,DAL,,"0.8 WS, .099 WS/48"
157,151,Julian Wright,25-040,RFA,TOR,,,"0.6 WS, .039 WS/48"
158,152,Nick Young,27-028,RFA,WAS,WAS,,"2.8 WS, .065 WS/48"
159,153,Sam Young,27-028,RFA-T,MEM,MEM,team exercised option in June,"2.9 WS, .088 WS/48"


In [7]:
### ARGUMENTS

# Output directory for this year's datasets.
DIRECTORY = f'datasets/{YEAR}/'

# Columns holding text; every other column is converted to numbers later on.
# This must be a real tuple: testing `col not in "<one big string>"` performs
# a substring check, which only excludes the right columns by accident.
NON_NUMERIC_COLUMNS = ("player", "position", "team_id")

FREE_AGENTS_LINK = f"https://www.basketball-reference.com/friv/free_agents.fcgi?year={YEAR}"
FREE_AGENTS_COLUMNS = ["player", "position", "age", "type", "old_team",
                       "stats", "ws", "new_team", "terms", "notes"]

STATS_LINK = f"https://www.basketball-reference.com/leagues/NBA_{YEAR}_per_game.html"
STATS_COLUMNS = ["player", "position", "age","team_id","g","gs","mp_per_g","fg_per_g",
"fga_per_g","fg_pct","fg3_per_g","fg3a_per_g","fg3_pct","fg2_per_g","fg2a_per_g",
"fg2_pct","efg_pct","ft_per_g","fta_per_g","ft_pct","orb_per_g","drb_per_g","trb_per_g",
"ast_per_g","stl_per_g","blk_per_g","tov_per_g","pf_per_g","pts_per_g"]
NUM_STATS_COLUMNS = [col for col in STATS_COLUMNS if col not in NON_NUMERIC_COLUMNS]


ADVANCED_STATS_LINK = f"https://www.basketball-reference.com/leagues/NBA_{YEAR}_advanced.html"
ADVANCED_STATS_COLUMNS =["player","position","age","team_id","g","mp","per","ts_pct","fg3a_per_fga_pct",
"fta_per_fga_pct","orb_pct","drb_pct","trb_pct","ast_pct","stl_pct","blk_pct","tov_pct","usg_pct",
"ws-dum","ows","dws","ws","ws_per_48","bpm-dum","obpm","dbpm","bpm","vorp"]
NUM_ADVANCED_STATS_COLUMNS = [col for col in ADVANCED_STATS_COLUMNS if col not in NON_NUMERIC_COLUMNS]


### Salaries 2opt

In [8]:
# Each scraped frame is filtered to rows that actually have a player value
# (presumably the others are header/separator rows without <td> cells —
# verify against the pages). `.copy()` makes the filtered frame independent,
# so the column assignments below do not trigger SettingWithCopyWarning.

# Free agents
free_agents = create_basketball_reference_df(FREE_AGENTS_LINK, FREE_AGENTS_COLUMNS)
free_agents = free_agents[free_agents.player.notnull()].copy()

# Stats per game
stats = create_basketball_reference_df(STATS_LINK, STATS_COLUMNS)
stats = stats[stats.player.notnull()].copy()
stats[NUM_STATS_COLUMNS] = stats[NUM_STATS_COLUMNS].apply(pd.to_numeric)

# Advanced stats
advanced_stats = create_basketball_reference_df(ADVANCED_STATS_LINK, ADVANCED_STATS_COLUMNS)
advanced_stats = advanced_stats[advanced_stats.player.notnull()].copy()
advanced_stats[NUM_ADVANCED_STATS_COLUMNS] = advanced_stats[NUM_ADVANCED_STATS_COLUMNS].apply(pd.to_numeric)

# Salaries
salaries = create_salaries_df(YEAR)
salaries = salaries[salaries.Player.notnull()].copy()
NUM_SALARIES_COLUMNS = [c for c in salaries.columns if c not in ('Player', 'Tm', 'Signed Using')]
# Strip "$" and "," before numeric conversion. `regex=True` is passed
# explicitly because recent pandas treats the pattern as a literal string by
# default, and a raw string avoids the invalid "\$" escape sequence.
salaries[NUM_SALARIES_COLUMNS] = (
    salaries[NUM_SALARIES_COLUMNS]
    .apply(lambda col: col.str.replace(r'[$,]', '', regex=True))
    .apply(pd.to_numeric)
)

AssertionError: 0 columns passed, passed data had 11 columns

In [66]:
# Exploratory fetch of the contracts page, requesting the Wayback capture
# stamped 20190101000000.
# NOTE(review): the season columns shown in the outputs below start at 2015-16,
# so the capture actually served appears to be older than the one requested — confirm.
html = gets_html("https://web.archive.org/web/20190101000000/http://www.basketball-reference.com/contracts/players.html")
# Name the parser explicitly so parsing does not vary with the optional
# parsers installed in the environment.
soup = BeautifulSoup(html, "html.parser")

In [67]:
# Same steps as create_salaries_df, run against the `soup` parsed above:
# header texts come from the <th> cells with class "tooltip"/"poptip", the
# first one is skipped, and any column labelled '' is removed.
columns = [th.get_text() for th in soup.find_all('th', class_=['tooltip','poptip'])]
records_list = scrap_table(soup, table_class='table_container')
salaries_df = pd.DataFrame.from_records(records_list, columns=columns[1:])
if '' in salaries_df.columns:
    salaries_df = salaries_df.drop(columns='')

In [65]:
# Display the salaries frame built in the previous cell.
salaries_df

Unnamed: 0,Player,Tm,2015-16,2016-17,2017-18,2018-19,2019-20,2020-21,Signed Using,Guaranteed
0,Kobe Bryant,LAL,"$25,000,000",,,,,,Bird Rights,"$25,000,000"
1,Joe Johnson,BRK,"$24,894,863",,,,,,Bird Rights,"$24,894,863"
2,LeBron James,CLE,"$22,971,000","$24,004,000",,,,,Cap Space,"$22,971,000"
3,Carmelo Anthony,NYK,"$22,875,000","$24,559,380","$26,243,760","$27,928,140",,,Bird Rights,"$73,678,140"
4,Dwight Howard,HOU,"$22,359,364","$23,282,457",,,,,Cap Space,"$22,359,364"
5,Chris Bosh,MIA,"$22,192,730","$23,741,060","$25,289,390","$26,837,720",,,Bird Rights,"$98,060,900"
6,Chris Paul,LAC,"$21,468,696","$22,868,828","$24,268,960",,,,Bird Rights,"$44,337,524"
7,Kevin Durant,OKC,"$20,158,622",,,,,,Bird Rights,"$20,158,622"
8,Derrick Rose,CHI,"$20,093,063","$21,323,250",,,,,Bird Rights,"$41,416,313"
9,Dwyane Wade,MIA,"$20,000,000",,,,,,Bird Rights,"$20,000,000"


In [53]:
# Inspect the <th> cells carrying the "tooltip" class, i.e. the cells the
# salaries column labels are taken from.
soup.find_all('th', class_='tooltip')

[<th align="CENTER" class="tooltip over_header" colspan="3" data-stat=""></th>,
 <th align="CENTER" class="tooltip over_header" colspan="2" data-stat=""></th>,
 <th align="left" class="tooltip sort_default_asc" data-stat="player">Player</th>,
 <th align="left" class="tooltip sort_default_asc" data-stat="team_id" tip="Team">Tm</th>,
 <th align="CENTER" class="tooltip" data-stat="y1">2015-16</th>,
 <th align="CENTER" class="tooltip" data-stat="y2">2016-17</th>,
 <th align="CENTER" class="tooltip" data-stat="y3">2017-18</th>,
 <th align="CENTER" class="tooltip" data-stat="y4">2018-19</th>,
 <th align="CENTER" class="tooltip" data-stat="y5">2019-20</th>,
 <th align="CENTER" class="tooltip" data-stat="y6">2020-21</th>,
 <th align="left" class="tooltip sort_default_asc" data-stat="signed_using">Signed Using</th>,
 <th align="right" class="tooltip" data-stat="remain_gtd" tip="The amount of a player's remaining salary that is guaranteed.">Guaranteed</th>]

In [26]:
# Dump the raw markup of the <table id="contracts"> element(s) for inspection.
soup.find_all('table', id='contracts')

d>
 <td align="right"></td>
 <td align="right"></td>
 <td align="right"></td>
 <td align="right"></td>
 <td align="right"></td>
 <td align="left"></td>
 <td align="right" csk="100000">$100,000</td>
 </tr>
 <tr class="">
 <td align="right" csk="480">480</td>
 <td align="left"><a href="/web/20160115232712/http://www.basketball-reference.com/players/h/holmejo01.html">Jonathan Holmes</a></td>
 <td align="left"><a href="/web/20160115232712/http://www.basketball-reference.com/contracts/LAL.html">LAL</a></td>
 <td align="right" csk="100000">$100,000</td>
 <td align="right"></td>
 <td align="right"></td>
 <td align="right"></td>
 <td align="right"></td>
 <td align="right"></td>
 <td align="left"></td>
 <td align="right" csk="100000">$100,000</td>
 </tr>
 <tr class="no_ranker thead over_header">
 <th align="CENTER" class="over_header" colspan="3" data-stat=""></th>
 <th align="center" class="bold_text over_header" colspan="6" data-stat="header_salary">Salary</th>
 <th align="CENTER" class="over