In [1]:
import pandas as pd
import requests
import time
import re
import numpy as np
from bs4 import BeautifulSoup
from sklearn.linear_model import LinearRegression


In [2]:
data = pd.DataFrame(columns= ['Team', 'Year','Wins','Champion','MVP','Scoring Leader', 
                              'Rebound Leader', 'Assist Leader', 'WS Leader','DPOY',
                              'MIP', '6MOY', 'Coach of Year', 'All-Stars', 'All-Defensive',
                              'All-NBA','FG%','3P Attempts', '3P%', '2P Attempts', 
                              '2P%','FT%','RPG','APG','STL',  'BLK' , 'TOVPG','PPG', 
                             ])
data



Unnamed: 0,Team,Year,Wins,Champion,MVP,Scoring Leader,Rebound Leader,Assist Leader,WS Leader,DPOY,...,3P%,2P Attempts,2P%,FT%,RPG,APG,STL,BLK,TOVPG,PPG


In [21]:
def check_roster(team_soup, season_awards, allst_roster, all_def, all_nba_roster):
    awards = [0,0,0,0,0,0,0,0,0,0,0,0]
    players = team_soup.find_all('td', {'class':'left', 'data-stat':'player'})
    roster = [player.text for player in players]
    coach = roster.append(team_soup.find(text=re.compile('Coach:')).parent.parent.find('a').text)
    
    star_count = 0
    all_def_count = 0
    for allst in allst_roster:
        if allst in roster:
            star_count += 1
 
    awards[9] = star_count
    for i in range(9):
        if season_awards[i] in roster:
            awards[i] = 1
            
    for player in roster:
        if player in all_def:
            all_def_count +=1 
    awards[10] = all_def_count


    for i in range(3):
        for player in all_nba_roster[i]:
            if player in roster:
                awards[11] += 3-i
    
    return awards

In [22]:
def get_team_row(all_def, all_nba,season_awards, allst_roster, name, team_url, year, wins):
    row = []
    row.append(name.replace('*',''))
    row.append(year)
    row.append(wins)
    
    roster = []
    
    url = 'http://www.basketball-reference.com' + team_url
    req = requests.get(url)
    time.sleep(0.1)
    team_soup = BeautifulSoup(req.text, 'html.parser')
    try:
        row.append(team_soup.find('br').text[:4] == 'Won ')
    except:
        row.append(False)
        
    row += check_roster(team_soup, season_awards, allst_roster,all_def, all_nba)
    row += get_statline(req.text)    
    return row

In [23]:
leaders = ['Most Valuable Player', 'PPG Leader', 'RPG Leader',
           'APG Leader', 'WS Leader']

awards = ['Defensive Player of the Year', 'Most Improved Player', 
          'Sixth Man of the Year', 'Coach of the Year']

def get_award_winners(season, year):
    winners = []

    if year == 2017:
        return ['Russell Westbrook', 'Russell Westbrook', 'Hassan Whiteside','James Harden',
                'James Harden', 'Draymond Green', 'Giannis Antetokounmpo','Andrew Iguodala',
                "Mike D'Antoni"]
    for leader in leaders:
        winners.append(season.body.find(text=leader).parent.parent.find('a').text)

    #Using a different site that's more convenient to scrape other awards.
    year_string = '%s' % str(year)
    next_year = '%s' % str(year+1)
    years = '%s-%s' % (year_string, next_year[2:4])

    for award in awards:
        aw_url = 'http://www.nba.com/history/%s-award/' %(award.lower().replace(' ', '-'))
        time.sleep(0.1)
        aw_req = requests.get(aw_url)
        aw_soup = BeautifulSoup(aw_req.text, 'html.parser')

        table = aw_soup.find('table')    
        try:
            #If there is a link, you need 3 parents. if no link, 2
            if ((award == 'Defensive Player of the Year' and year > 2000)
                or (award == 'Most Improved Player' and year > 2004)
                or (award == 'Sixth Man of the Year' and year > 2001)
                or (award == 'Coach of the Year' and year > 2002)):
                winners.append(table.find(text=re.compile(years)).parent.parent.parent.find_all('td')[1].text.strip())
            else:
                winners.append(table.find(text=re.compile(years)).parent.parent.find_all('td')[1].text.strip())

        except:
            winners.append('')
    
    return winners 

In [24]:
# Append all_stars rosters to list
def get_allstars(year):

    url = "http://www.basketball-reference.com/allstar/NBA_%s.html/" % str(year)
    req = requests.get(url)
    soup = BeautifulSoup(req.text, 'html.parser')
    all_stars = []
    
    #No all star game in 1999 due to league lockout
    if year != 1999:
        for row in soup.find("div", {"id" : "all_East"}).find_all("th", {"class" : "left", "scope" : "row", "csk" : True}):
            all_stars.append(row.find("a").text)
        for row in soup.find("div", {"id" : "all_West"}).find_all("th", {"class" : "left", "scope" : "row", "csk" : True}):
            all_stars.append(row.find("a").text)
        
    return all_stars

In [25]:
## Because we need only 1 request to get the whole defensive page, each five players from list is from the First team of a season.
def get_all_def():
    url = "http://www.nba.com/history/all-defensive-teams/"
    req = requests.get(url)
    defensive = BeautifulSoup(req.text, 'html.parser')
    all_defense = []
    
    for i in defensive.find_all("td" , {"class" : "cnnIEColTxtL", "style" : "text-align: center"}):
        if i.text == "First Team":
            continue
        all_defense.append(i.text)
        
    for i in defensive.find_all("td" , {"class" : "cnnIEColTxtR", "style" : "text-align: center"}):
        if i.text == "Second Team":
            continue
        all_defense.append(i.text)
    return all_defense

In [26]:
def get_all_nba():
    req = requests.get("http://www.basketball-reference.com/awards/all_league.html")
    soup = BeautifulSoup(req.text, "html.parser")
    j = 0
    i = 0
    data_stat = 1
    prev_year = "17"
    dict_allnba = {

    }
    players = []
    for row in soup.find_all("tr", {"class" : False}):
        content = row.text
        year = content[:7]
        year = year[:2] + year[-2:]
        league = content[7:10]
        if league != "NBA":
            j+=1 
            continue
        if ((i == 2) & (year != prev_year)):
            dict_allnba[prev_year].append(players)
            data_stat = 1
            i = 0
            j += 1
        if (i == 3):
            i = 0
            data_stat = 1
        if (soup.find_all("td", {"class" : "left", "data-stat" : "%s" % str(1)})[j].find("a") == None):
            j += 1
        # Create a list of players to hold 5 players
        players = []   
        if year not in dict_allnba:
            dict_allnba[year] = []

        for stat in range(data_stat, data_stat + 5):
            try:
                if (stat > 5) & (j != 0): 
                    new_j = int(j / 2)
                    players.append(soup.find_all("td", {"class" : "left", "data-stat" : "%s" % str(stat)})[new_j].find("a").text)
                else:
                    players.append(soup.find_all("td", {"class" : "left", "data-stat" : "%s" % str(stat)})[j].find("a").text)
            except:
                pass           
            if (stat == 5) | (stat == 10) | (stat == 15):
                dict_allnba[year].append(players)
                players = []
        data_stat += 5
        i += 1
        prev_year = year

        if (data_stat == 16):
            data_stat = 1
            j += 1
    dict_allnba['2000'] = dict_allnba.pop('1900')
    return dict_allnba

In [27]:
def get_statline(page_text):
    stats = [0,0,0,0,0,0,0,0,0,0,0,0]
    stat_search = ['fg_pct', 'fg3a', 'fg3_pct', 'fg2a', 'fg2_pct',
                   'ft_pct','trb_per_g','ast_per_g','stl_per_g',
                   'blk_per_g','tov_per_g','pts_per_g']
    
    for i in range(len(stat_search)):
        indexer = page_text.find('data-stat="%s" >' % (stat_search[i]))
        try:
            offset = len(stat_search[i]) + 14
            stat = re.sub("<", '',page_text[indexer+offset: indexer+offset+6]).rstrip('/td<>')
            stat = float(stat)
        except:
            stat = 0
        stats[i] = stat
    
    return stats

In [28]:
row_num = 0
#all_def = get_all_def()
#all_nba = get_all_nba()
for i in range(1960,2018):
    print(i)
    double_count = 0
    url = ('http://www.basketball-reference.com/leagues/NBA_%s.html' % str(i))
    time.sleep(0.1)
    req = requests.get(url)
    season = BeautifulSoup(req.text, 'html.parser')
    teams = season.find_all('th', {'class': 'left', 'data-stat':'team_name'})
   
    season_awards = get_award_winners(season, i)
    allst_roster = get_allstars(i)
    all_nba_team = all_nba[str(i)]
    
    for team in teams:
        double_count += 1
        if(double_count > 32):
            break
        if(team.text == 'Team' or 'Conference' in team.text):
            continue
        wins = int(team.parent.find('td', {'data-stat':'wins'}).text)
     
        team_name = repr(team.text).replace('\\x','*').split('*')[0].strip("'")
        for a in team.find_all('a',href=True):
            new_row = get_team_row(all_def,all_nba_team,season_awards,allst_roster,team_name,a['href'], i, wins)
            data.loc[row_num] = new_row
            row_num += 1


1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017


In [31]:
name_changes = [['Rockets', 'Houston Rockets'], ['Lakers', 'Los Angeles Lakers'], ['Jazz', 'Utah Jazz'],
                ['Hawks', 'Atlanta Hawks'], ['Clippers', 'Los Angeles Clippers'], ['Grizzlies', 'Memphis Grizzlies'],
                ['SuperSonics', 'Oklahoma City Thunder'], ['Kings', 'Sacramento Kings'], 
                ['Royals', 'Sacramento Kings'], ['Zephyrs', 'Washington Wizards'],
                ['Bobcats', 'Charlotte Hornets'], ['Warriors', 'Golden State Warriors'],
                ['New Orleans', 'New Orleans Pelicans'], ['Syracuse', 'Philadelphia 76ers'],
                ['Nets','Brooklyn Nets'], ['Bullets', 'Washington Wizards'], 
                ['Buffalo', 'Los Angeles Clippers'], ['Packers', 'Washington Wizards'] ]
for name_change in name_changes:
    data.ix[data.Team.str.contains(name_change[0]), 'Team'] = name_change[1]



In [3]:
data = pd.read_csv('./nba_data.csv')

In [4]:
data['Accolades'] = (data['MVP'] + data['Scoring Leader'] + data['Rebound Leader'] + data['Assist Leader']
                     + data['WS Leader'] + data['DPOY'] + data['MIP'] + data['6MOY'] + data['Coach of Year']
                     + data['All-Stars'] + data['All-Defensive'] + data['All-NBA'])
data['Franchise ID'] = data['Team'].str.split().str[-1]

In [5]:
data

Unnamed: 0.1,Unnamed: 0,Team,Year,Wins,Champion,MVP,Scoring Leader,Rebound Leader,Assist Leader,WS Leader,...,2P%,FT%,RPG,APG,STL,BLK,TOVPG,PPG,Accolades,Franchise ID
0,0,Boston Celtics,1960.0,59.0,True,0.0,0.0,0.0,1.0,0.0,...,0.417,0.734,80.2,24.7,0.0,0.0,0.0,124.5,13.0,Celtics
1,1,Golden State Warriors,1960.0,49.0,False,1.0,1.0,1.0,0.0,1.0,...,0.409,0.669,78.9,23.9,0.0,0.0,0.0,118.6,9.0,Warriors
2,2,Philadelphia 76ers,1960.0,45.0,False,0.0,0.0,0.0,0.0,0.0,...,0.414,0.791,72.1,22.3,0.0,0.0,0.0,118.9,5.0,76ers
3,3,New York Knicks,1960.0,27.0,False,0.0,0.0,0.0,0.0,0.0,...,0.421,0.765,70.0,22.2,0.0,0.0,0.0,117.3,5.0,Knicks
4,4,Atlanta Hawks,1960.0,46.0,False,0.0,0.0,0.0,0.0,0.0,...,0.419,0.745,71.2,25.1,0.0,0.0,0.0,113.4,10.0,Hawks
5,5,Detroit Pistons,1960.0,30.0,False,0.0,0.0,0.0,0.0,0.0,...,0.397,0.729,73.2,19.6,0.0,0.0,0.0,111.6,3.0,Pistons
6,6,Los Angeles Lakers,1960.0,25.0,False,0.0,0.0,0.0,0.0,0.0,...,0.386,0.730,72.4,19.3,0.0,0.0,0.0,107.3,7.0,Lakers
7,7,Sacramento Kings,1960.0,19.0,False,0.0,0.0,0.0,0.0,0.0,...,0.412,0.716,70.0,23.3,0.0,0.0,0.0,111.1,1.0,Kings
8,8,Boston Celtics,1961.0,57.0,True,1.0,0.0,0.0,0.0,0.0,...,0.398,0.735,77.6,23.7,0.0,0.0,0.0,119.7,12.0,Celtics
9,9,Golden State Warriors,1961.0,46.0,False,0.0,1.0,1.0,0.0,1.0,...,0.424,0.651,75.2,24.8,0.0,0.0,0.0,121.0,10.0,Warriors


In [6]:
data.to_csv('nba_data.csv')