## Scraping Basketball Reference and NBA.com
- We also eventually found a CSV file of player data
- Another available resource is a dataset of team Elo ratings, a team-strength rating published by FiveThirtyEight


In [None]:
import pandas as pd
import requests
import time
import re
import numpy as np
from bs4 import BeautifulSoup
from sklearn.linear_model import LinearRegression


In [None]:
# Empty frame that gets filled one row at a time (one team, one season per row).
# The columns are listed in three groups purely for readability; their order is unchanged.
id_columns = ['Team', 'Year', 'Wins', 'Champion']
award_columns = ['MVP', 'Scoring Leader', 'Rebound Leader', 'Assist Leader', 'WS Leader',
                 'DPOY', 'MIP', '6MOY', 'Coach of Year', 'All-Stars', 'All-Defensive',
                 'All-NBA']
stat_columns = ['FG%', '3P Attempts', '3P%', '2P Attempts', '2P%', 'FT%',
                'RPG', 'APG', 'STL', 'BLK', 'TOVPG', 'PPG']
data = pd.DataFrame(columns=id_columns + award_columns + stat_columns)
data



### Functions

In [None]:
# Check whether each member of the given team (players plus head coach) is a part of
# the lists of award winners.
# Returns the list of award values that becomes a part of the team's row in the dataframe.
def check_roster(team_soup, season_awards, allst_roster, all_def, all_nba_roster):
    """Tally the awards and honours held by one team's roster.

    Parameters:
        team_soup: parsed team page; player names are read from the
            ``td.left[data-stat=player]`` cells and the coach from the link
            next to the "Coach:" label.
        season_awards: winner names, one per single-winner award (at most the
            first nine entries are used).
        allst_roster: names of that season's All-Stars.
        all_def: names of All-Defensive selections.
        all_nba_roster: list of All-NBA teams (first, second, third), each a
            list of names. Fewer than three teams is accepted.

    Returns:
        A list of 12 ints: indices 0-8 are 0/1 flags for the entries of
        ``season_awards``, index 9 is the All-Star count, index 10 the
        All-Defensive count, and index 11 the All-NBA score (3 points per
        first-team player, 2 per second-team, 1 per third-team).
    """
    roster = [cell.text
              for cell in team_soup.find_all('td', {'class': 'left', 'data-stat': 'player'})]

    # The coach is added to the roster so that the coaching award in
    # season_awards can match. A page without a "Coach:" label (or without a
    # linked name next to it) simply contributes no coach instead of crashing.
    coach_label = team_soup.find(text=re.compile('Coach:'))
    try:
        roster.append(coach_label.parent.parent.find('a').text)
    except AttributeError:
        pass

    roster_names = set(roster)
    awards = [0] * 12

    # Single-winner awards: flag the ones won by somebody on this team.
    for slot, winner in zip(range(9), season_awards):
        if winner in roster_names:
            awards[slot] = 1

    awards[9] = sum(1 for name in allst_roster if name in roster_names)

    # NOTE(review): all_def is not filtered by season here; if the caller passes
    # every All-Defensive selection in history, this counts players who made the
    # team in any year - confirm that is intended.
    awards[10] = sum(1 for name in roster if name in all_def)

    # zip() stops at the shorter input, so seasons with fewer than three
    # All-NBA teams no longer raise IndexError.
    for points, honoured in zip((3, 2, 1), all_nba_roster):
        awards[11] += points * sum(1 for name in honoured if name in roster_names)

    return awards

In [None]:
# Collects all the data to be turned into a row for the dataframe. One season for one team.
def get_team_row(all_def, all_nba, season_awards, allst_roster, name, team_url, year, wins):
    """Download one team-season page and build its dataframe row.

    Parameters:
        all_def, all_nba, season_awards, allst_roster: award data passed
            straight through to check_roster.
        name: team name; any '*' characters are removed.
        team_url: site-relative link to the team's season page.
        year, wins: stored in the row as given.

    Returns:
        A list: [name, year, wins, champion flag], followed by the 12 values
        from check_roster and the 12 values from get_statline.
    """
    row = [name.replace('*', ''), year, wins]

    url = 'http://www.basketball-reference.com' + team_url
    req = requests.get(url)
    time.sleep(0.1)  # short pause between page downloads
    team_soup = BeautifulSoup(req.text, 'html.parser')

    # Champion flag: True when the text of the first <br> tag starts with 'Won '.
    # A page without any <br> yields False. This is an explicit None check rather
    # than a bare except, so unrelated errors (and Ctrl-C) are no longer swallowed.
    # NOTE(review): confirm this test actually fires on champion pages.
    first_break = team_soup.find('br')
    row.append(first_break is not None and first_break.text[:4] == 'Won ')

    row += check_roster(team_soup, season_awards, allst_roster, all_def, all_nba)
    row += get_statline(req.text)
    return row

In [None]:
# Gets every leader/award winner for a single season.
leaders = ['Most Valuable Player', 'PPG Leader', 'RPG Leader',
           'APG Leader', 'WS Leader']

awards = ['Defensive Player of the Year', 'Most Improved Player',
          'Sixth Man of the Year', 'Coach of the Year']

# For each award, rows for years after this cutoff hold the season label inside
# a link, so the table row is three parents up from the matched text instead of two.
_AWARD_LINK_CUTOFF = {
    'Defensive Player of the Year': 2000,
    'Most Improved Player': 2004,
    'Sixth Man of the Year': 2001,
    'Coach of the Year': 2002,
}


def get_award_winners(season, year):
    """Return nine names: the five statistical leaders, then the four awards.

    Parameters:
        season: parsed league-season page; the entries of ``leaders`` are
            looked up in its body.
        year: season year as an int.

    Returns:
        [MVP, PPG leader, RPG leader, APG leader, WS leader, DPOY, MIP,
        Sixth Man, Coach of the Year]. An award that cannot be located on its
        history page is returned as ''.
    """
    if year == 2017:
        # 2017 is hard-coded rather than scraped. The Sixth Man entry used to be
        # 'Andrew Iguodala', a misspelling that could never match a roster name;
        # the 2016-17 Sixth Man of the Year was Eric Gordon.
        return ['Russell Westbrook', 'Russell Westbrook', 'Hassan Whiteside', 'James Harden',
                'James Harden', 'Draymond Green', 'Giannis Antetokounmpo', 'Eric Gordon',
                "Mike D'Antoni"]

    winners = [season.body.find(text=leader).parent.parent.find('a').text
               for leader in leaders]

    # Using a different site that's more convenient to scrape other awards.
    # NOTE(review): the label is built as '<year>-<year+1>' (e.g. 2016 -> '2016-17').
    # If `year` is the year the season ends in, this looks up the following
    # season's winner - confirm against the award pages before changing it,
    # because the link cutoffs above were chosen with this labelling.
    years = '%s-%s' % (year, str(year + 1)[2:4])
    season_label = re.compile(years)

    for award in awards:
        aw_url = 'http://www.nba.com/history/%s-award/' % (award.lower().replace(' ', '-'))
        time.sleep(0.1)
        aw_req = requests.get(aw_url)
        aw_soup = BeautifulSoup(aw_req.text, 'html.parser')

        table = aw_soup.find('table')
        try:
            # If there is a link, you need 3 parents. if no link, 2
            award_row = table.find(text=season_label).parent.parent
            cutoff = _AWARD_LINK_CUTOFF.get(award)
            if cutoff is not None and year > cutoff:
                award_row = award_row.parent
            winners.append(award_row.find_all('td')[1].text.strip())
        except (AttributeError, IndexError):
            # Missing table, missing season row, or a row with fewer than two
            # cells: record the award as unknown rather than abort the scrape.
            winners.append('')

    return winners

In [None]:
# Get a list of all stars for the year.
def get_allstars(year):
    """Return the names of the All-Stars for ``year``.

    Names are read from the linked row headers of the ``all_East`` and
    ``all_West`` sections of the All-Star page, East first. Returns an empty
    list for 1999 without contacting the site.
    """
    # No all star game in 1999 due to league lockout, so there is nothing to
    # download or parse for that year.
    if year == 1999:
        return []

    url = "http://www.basketball-reference.com/allstar/NBA_%s.html/" % str(year)
    req = requests.get(url)
    soup = BeautifulSoup(req.text, 'html.parser')

    # Row-header cells that carry a sort key ("csk") are the player rows.
    player_cells = {"class": "left", "scope": "row", "csk": True}
    all_stars = []
    for section_id in ("all_East", "all_West"):
        for row in soup.find("div", {"id": section_id}).find_all("th", player_cells):
            all_stars.append(row.find("a").text)

    return all_stars

In [None]:
# List of all defensive players in NBA history.
def get_all_def():
    """Scrape the All-Defensive teams page and return the cell texts as one flat list.

    Centered cells of class ``cnnIEColTxtL`` come first, then those of class
    ``cnnIEColTxtR``; the "First Team" / "Second Team" heading cells are left out.
    """
    page = requests.get("http://www.nba.com/history/all-defensive-teams/")
    defensive = BeautifulSoup(page.text, 'html.parser')

    # (cell class, heading text to skip) for each of the two columns, in output order
    column_specs = (("cnnIEColTxtL", "First Team"), ("cnnIEColTxtR", "Second Team"))

    all_defense = []
    for cell_class, heading in column_specs:
        cells = defensive.find_all("td", {"class": cell_class, "style": "text-align: center"})
        all_defense.extend(cell.text for cell in cells if cell.text != heading)
    return all_defense

In [None]:
# Returns a dictionary. The key is the year, the item is a list of lists: first, second and third team. 
def get_all_nba():
    """Scrape the All-League awards page into ``{year_string: [team, team, ...]}``.

    Each key is a four-character string built from the first two and last two
    characters of a row's leading seven-character season label; each value is a
    list of player-name lists. Only rows whose league field is "NBA" are used.
    """
    req = requests.get("http://www.basketball-reference.com/awards/all_league.html")
    soup = BeautifulSoup(req.text, "html.parser")
    # j indexes into the page-wide list of matching <td> cells (see find_all(...)[j] below).
    j = 0
    # i counts processed NBA rows; it is reset at 3, or at 2 when the year changes.
    i = 0
    # data_stat is the first numeric data-stat attribute ("1".."15") read for the current row.
    data_stat = 1
    prev_year = "17"
    dict_allnba = {

    }
    players = []
    # Only <tr> elements without a class attribute are visited.
    for row in soup.find_all("tr", {"class" : False}):
        content = row.text
        # First 7 characters are treated as the season label
        # (presumably of the form '1999-00' - TODO confirm).
        year = content[:7]
        # Keep the first two and last two characters, e.g. '1998-99' -> '1999'.
        year = year[:2] + year[-2:]
        league = content[7:10]
        # Rows for any league other than "NBA" are skipped, but still advance j.
        if league != "NBA":
            j+=1 
            continue
        # `&` here is the bitwise operator applied to two bools (same result as `and`).
        # When the year changes after only two rows, the pending player list is
        # flushed to the previous year and the counters restart.
        if ((i == 2) & (year != prev_year)):
            dict_allnba[prev_year].append(players)
            data_stat = 1
            i = 0
            j += 1
        if (i == 3):
            i = 0
            data_stat = 1
        # Skip one position when the j-th data-stat="1" cell holds no link.
        # Note that find_all re-scans the whole page each time it is called.
        if (soup.find_all("td", {"class" : "left", "data-stat" : "%s" % str(1)})[j].find("a") == None):
            j += 1
        # Create a list of players to hold 5 players
        players = []   
        if year not in dict_allnba:
            dict_allnba[year] = []

        # Read five consecutive data-stat columns for this row.
        for stat in range(data_stat, data_stat + 5):
            try:
                # Columns above 5 are looked up at half the index used for columns 1-5.
                # NOTE(review): presumably because fewer rows carry those columns - confirm.
                if (stat > 5) & (j != 0): 
                    new_j = int(j / 2)
                    players.append(soup.find_all("td", {"class" : "left", "data-stat" : "%s" % str(stat)})[new_j].find("a").text)
                else:
                    players.append(soup.find_all("td", {"class" : "left", "data-stat" : "%s" % str(stat)})[j].find("a").text)
            except:
                # Any failure (missing cell, missing link, ...) is silently skipped.
                pass           
            # A group is complete at columns 5, 10 and 15: store it and start a new list.
            if (stat == 5) | (stat == 10) | (stat == 15):
                dict_allnba[year].append(players)
                players = []
        data_stat += 5
        i += 1
        prev_year = year

        # After columns 11-15 have been read, wrap back to column 1 and advance j.
        if (data_stat == 16):
            data_stat = 1
            j += 1
    # A label ending in '00' (e.g. '1999-00') produces the key '1900'; store it as '2000'.
    dict_allnba['2000'] = dict_allnba.pop('1900')
    return dict_allnba

In [None]:
# Gets the average statline for the team's season
def get_statline(page_text):
    """Extract twelve team statistics from the raw HTML of a team page.

    For each statistic, the first occurrence of ``data-stat="<name>" >`` is
    located and the text between it and the next ``<`` is parsed as a float.

    Parameters:
        page_text: the page's HTML as a string.

    Returns:
        A list of 12 numbers in the order FG%, 3P attempts, 3P%, 2P attempts,
        2P%, FT%, rebounds, assists, steals, blocks, turnovers, points.
        A statistic that is missing or not numeric is returned as 0.
    """
    stat_search = ['fg_pct', 'fg3a', 'fg3_pct', 'fg2a', 'fg2_pct',
                   'ft_pct', 'trb_per_g', 'ast_per_g', 'stl_per_g',
                   'blk_per_g', 'tov_per_g', 'pts_per_g']

    stats = []
    for stat_name in stat_search:
        marker = 'data-stat="%s" >' % stat_name
        value = 0
        position = page_text.find(marker)
        # str.find returns -1 when the marker is absent; that must be treated as
        # "no value" rather than used as an index, or unrelated text near the
        # start of the page gets sliced and may parse as a number.
        if position != -1:
            start = position + len(marker)
            end = page_text.find('<', start)
            cell_text = page_text[start:] if end == -1 else page_text[start:end]
            try:
                value = float(cell_text)
            except ValueError:
                value = 0
        stats.append(value)

    return stats

## Main cell that calls functions and assembles the dataframe. 

In [None]:
# Scrape every season from 1960 through 2017 and append one row per team-season to `data`.
row_num = 0  # next row label to write in `data`
# Fetched once, before the season loop, and reused for every season.
all_def = get_all_def()
all_nba = get_all_nba()
for i in range(1960,2018):
    print(i)  # progress indicator: the season currently being scraped
    double_count = 0
    url = ('http://www.basketball-reference.com/leagues/NBA_%s.html' % str(i))
    time.sleep(0.1)
    req = requests.get(url)
    season = BeautifulSoup(req.text, 'html.parser')
    # Every row-header cell that holds a team name on the league page.
    teams = season.find_all('th', {'class': 'left', 'data-stat':'team_name'})

    # Per-season award data shared by every team of that season.
    season_awards = get_award_winners(season, i)
    allst_roster = get_allstars(i)
    all_nba_team = all_nba[str(i)]  # get_all_nba keys are year strings

    for team in teams:
        double_count += 1
        # Stop after 32 team-name cells.
        # NOTE(review): presumably guards against teams listed again in a later table - confirm.
        if(double_count > 32):
            break
        # Skip header cells ('Team') and conference heading rows.
        if(team.text == 'Team' or 'Conference' in team.text):
            continue
        wins = int(team.parent.find('td', {'data-stat':'wins'}).text)

        # repr() renders non-printable characters (e.g. a non-breaking space) as '\x..' escapes.
        # Turning '\x' into '*' and cutting at the first '*' keeps only the text before the
        # first such escape or literal asterisk; strip("'") removes repr's surrounding quote.
        team_name = repr(team.text).replace('\\x','*').split('*')[0].strip("'")
        # One row per link inside the team cell; the link's href is the team's season page.
        for a in team.find_all('a',href=True):
            new_row = get_team_row(all_def,all_nba_team,season_awards,allst_roster,team_name,a['href'], i, wins)
            data.loc[row_num] = new_row
            row_num += 1


In [None]:
# Some franchises have changed names. They still inherit the history of the prior team
# Each pair is [text to look for in the scraped team name, current franchise name].
# The pairs are applied in order, and the order matters: a name rewritten by an earlier
# pair (e.g. anything containing 'Jazz' -> 'Utah Jazz') no longer matches later pairs
# such as 'New Orleans'.
name_changes = [['Rockets', 'Houston Rockets'], ['Lakers', 'Los Angeles Lakers'], ['Jazz', 'Utah Jazz'],
                ['Hawks', 'Atlanta Hawks'], ['Clippers', 'Los Angeles Clippers'], ['Grizzlies', 'Memphis Grizzlies'],
                ['SuperSonics', 'Oklahoma City Thunder'], ['Kings', 'Sacramento Kings'],
                ['Royals', 'Sacramento Kings'], ['Zephyrs', 'Washington Wizards'],
                ['Bobcats', 'Charlotte Hornets'], ['Warriors', 'Golden State Warriors'],
                ['New Orleans', 'New Orleans Pelicans'], ['Syracuse', 'Philadelphia 76ers'],
                ['Nets','Brooklyn Nets'], ['Bullets', 'Washington Wizards'],
                ['Buffalo', 'Los Angeles Clippers'], ['Packers', 'Washington Wizards'] ]
for old_fragment, current_name in name_changes:
    # .loc with a boolean mask replaces the .ix indexer, which was removed from pandas.
    data.loc[data.Team.str.contains(old_fragment), 'Team'] = current_name



In [None]:
# Column to collect all the accolades a team has.
# The twelve award columns are added left to right, in the order listed.
accolade_columns = ['MVP', 'Scoring Leader', 'Rebound Leader', 'Assist Leader',
                    'WS Leader', 'DPOY', 'MIP', '6MOY', 'Coach of Year',
                    'All-Stars', 'All-Defensive', 'All-NBA']
accolade_total = data[accolade_columns[0]]
for column_name in accolade_columns[1:]:
    accolade_total = accolade_total + data[column_name]
data['Accolades'] = accolade_total

# Franchise ID is the last whitespace-separated word of the team name.
team_words = data['Team'].str.split()
data['Franchise ID'] = team_words.str[-1]

In [None]:
# Bare expression: shows the assembled dataframe as the notebook cell's output.
data

In [None]:
# Save the scraped team data. to_csv's default settings are used, so the row index is written too.
data.to_csv('nba_data.csv')

### Clean player data file and add full team names

In [None]:
# Load the raw player file, drop rows whose Player field is the string '0',
# then replace every remaining missing value with 0.
player_data = pd.read_csv('player_data.csv')
has_player_name = player_data.Player != '0'
player_data = player_data[has_player_name].fillna(0)

In [None]:
# Each pair is [team abbreviation used in the player file, current full franchise name].
# Abbreviations of relocated or renamed teams map to the franchise's current name.
# 'POR' maps to 'Portland Trail Blazers' (previously misspelled 'Portland Trailblazers').
shorts = [['CHI', 'Chicago Bulls'], ['NOJ', 'Utah Jazz'], ['NYK', 'New York Knicks'],
          ['NJN', 'Brooklyn Nets'], ['DEN', 'Denver Nuggets'], ['LAL', 'Los Angeles Lakers'],
          ['CLE', 'Cleveland Cavaliers'], ['SAS', 'San Antonio Spurs'], ['UTA', 'Utah Jazz'],
          ['TOR', 'Toronto Raptors'], ['HOU' ,'Houston Rockets'], ['LAC', 'Los Angeles Clippers'],
          ['DAL', 'Dallas Mavericks'], ['BUF', 'Los Angeles Clippers'], ['BOS' ,'Boston Celtics'],
          ['NOP', 'New Orleans Pelicans'], ['MIN', 'Minnesota Timberwolves'], ['ATL', 'Atlanta Hawks'],
          ['KCK', 'Sacramento Kings'], ['SAC', 'Sacramento Kings'], ['GSW', 'Golden State Warriors'],
          ['DET', 'Detroit Pistons'], ['MIA', 'Miami Heat'], ['OKC', 'Oklahoma City Thunder'],
          ['MEM', 'Memphis Grizzlies'], ['IND', 'Indiana Pacers'], ['PHI', 'Philadelphia 76ers'],
          ['ORL', 'Orlando Magic'], ['CHA', 'Charlotte Hornets'], ['WSB', 'Washington Wizards'],
          ['SEA', 'Oklahoma City Thunder'], ['CHH', 'Charlotte Hornets'], ['VAN', 'Memphis Grizzlies'],
          ['WAS', 'Washington Wizards'], ['BRK', 'Brooklyn Nets'], ['MIL', 'Milwaukee Bucks'],
          ['NOH', 'New Orleans Pelicans'], ['NOK', 'New Orleans Pelicans'], ['SDC', 'Los Angeles Clippers'],
          ['CHO', 'Charlotte Hornets'], ['POR', 'Portland Trail Blazers'], ['PHO', 'Phoenix Suns']
    ]

for abbreviation, full_name in shorts:
    # .loc with a boolean mask replaces the .ix indexer, which was removed from pandas.
    player_data.loc[player_data.Team.str.contains(abbreviation), 'Team'] = full_name

In [None]:
# List the distinct Team values, e.g. to spot abbreviations that were not mapped to a full name.
player_data.Team.unique()

In [None]:
# Bare expression: shows the cleaned player table as the notebook cell's output.
player_data

In [None]:
# Save the cleaned player data. to_csv's default settings are used, so the row index is written too.
player_data.to_csv('player_data_clean.csv')