In [3]:
import pandas as pd
import requests
import time
import re
import numpy as np
from bs4 import BeautifulSoup
from sklearn.linear_model import LinearRegression

#Get the comment straight from request. Then extract the comment ("<!"). Use BeautifulSoup on this comment.

In [3]:
data = pd.DataFrame(columns= ['Name', 'Year','Wins','Champion','MVP','Scoring Leader', 
                              'Rebound Leader', 'Assist Leader', 'WS Leader','DPOY',
                              'MIP', '6MOY', 'Coach of Year', 'All-Stars', 'All-Defensive',
                              #'All-NBA', 
                              'FG%','3P Attempts', '3P%', 
                              '2P%', '2P Attempts' 'FT%', 'PPG', 'TOVPG', 'APG','RPG', 'STL', 'BLK', 'FG2A', 'FG3A' 
                             ])
data

Unnamed: 0,Name,Year,Wins,Champion,MVP,Scoring Leader,Rebound Leader,Assist Leader,WS Leader,DPOY,...,2P%,2P AttemptsFT%,PPG,TOVPG,APG,RPG,STL,BLK,FG2A,FG3A


In [4]:
def check_roster(team_soup, season_awards, allst_roster, all_def):
    awards = [0,0,0,0,0,0,0,0,0,0,0]
    players = team_soup.find_all('td', {'class':'left', 'data-stat':'player'})
    roster = [player.text for player in players]
    coach = roster.append(team_soup.find(text=re.compile('Coach:')).parent.parent.find('a').text)
    
    star_count = 0
    all_def_count = 0
    for allst in allst_roster:
        if allst in roster:
            star_count += 1
 
    awards[9] = star_count
    for i in range(9):
        if season_awards[i] in roster:
            awards[i] = 1
            
    for player in roster:
        if player in all_def:
            all_def_count +=1 
    awards[10] = all_def_count
    
    return awards

In [5]:
## Get Team Row
def get_team_row(all_def, season_awards, allst_roster, name, team_url, year, wins):
    row = []
    row.append(name.replace('*',''))
    row.append(year)
    row.append(wins)
    
    roster = []
    
    url = 'http://www.basketball-reference.com' + team_url
    req = requests.get(url)
    time.sleep(0.1)
    team_soup = BeautifulSoup(req.text, 'html.parser')
    try:
        row.append(team_soup.find('br').text[:4] == 'Won ')
    except:
        row.append(False)
    row += check_roster(team_soup, season_awards, allst_roster,all_def)
    
    row += get_statline(req.text)
    print(row)
    
    return row

In [6]:
leaders = ['Most Valuable Player', 'PPG Leader', 'RPG Leader',
           'APG Leader', 'WS Leader']

awards = ['Defensive Player of the Year', 'Most Improved Player', 
          'Sixth Man of the Year', 'Coach of the Year']
winners = []

def get_award_winners(season, year):
    for leader in leaders:
        winners.append(season.body.find(text=leader).parent.parent.find('a').text)

    #Using a different site that's more convenient to scrape other awards.
    year_string = '%s' % str(year)
    next_year = '%s' % str(year+1)
    year = '%s-%s' % (year_string, next_year[2:4])
    for award in awards:
        url = 'http://www.nba.com/history/%s-award/' %(award.lower().replace(' ', '-'))
        req = requests.get(url)
        soup = BeautifulSoup(req.text, 'html.parser')

        table = soup.find('table')    
        try:
            winners.append(table.find(text=re.compile(year)).parent.parent.find_all('td')[1].text.strip())
        except:
            winners.append('')
    
    return winners

In [15]:
# Append all_stars rosters to list
def get_allstars(year):
    url = "http://www.basketball-reference.com/allstar/NBA_%s.html/" % str(year)
    req = requests.get(url)
    soup = BeautifulSoup(req.text, 'html.parser')
    all_stars = []
    
    for row in soup.find("div", {"id" : "all_East"}).find_all("th", {"class" : "left", "scope" : "row", "csk" : True}):
        all_stars.append(row.find("a").text)
        
        
    for row in soup.find("div", {"id" : "all_West"}).find_all("th", {"class" : "left", "scope" : "row", "csk" : True}):
        all_stars.append(row.find("a").text)
        
    return all_stars

In [11]:
# Because we need only 1 request to get the whole defensive page, each five players from list is from the First team of a season.
def get_all_def():
    url = "http://www.nba.com/history/all-defensive-teams/"
    req = requests.get(url)
    defensive = BeautifulSoup(req.text, 'html.parser')
    all_defense = []
    
    for i in defensive.find_all("td" , {"class" : "cnnIEColTxtL", "style" : "text-align: center"}):
        if i.text == "First Team":
            continue
        all_defense.append(i.text)
        
    for i in defensive.find_all("td" , {"class" : "cnnIEColTxtR", "style" : "text-align: center"}):
        if i.text == "Second Team":
            continue
        all_defense.append(i.text)
    return all_defense

In [12]:
def get_statline(page_text):
    stats = [0,0,0,0,0,0,0,0,0,0,0,0,0]
    stat_search = ['fg_pct', 'fg3a', 'fg3_pct', 'fg2a', 'fg2_pct',
                   'ft_pct','trb_per_g','ast_per_g','stl_per_g',
                   'blk_per_g','tov_per_g','pts_per_g']
    
    for i in range(len(stat_search)):
        indexer = page_text.find('data-stat="%s" >' % (stat_search[i]))
        try:
            offset = len(stat_search[i]) + 14
            stat = re.sub("<", '',page_text[indexer+offset: indexer+offset+4])    
            stat = float(stat)
        except:
            stat = 0
        stats[i] = stat
    return stats

In [17]:
row_num = 0
all_def = get_all_def()

for i in range(1975, 1976):
    url = ('http://www.basketball-reference.com/leagues/NBA_%s.html' % str(i))
    time.sleep(0.1)
    req = requests.get(url)
    season = BeautifulSoup(req.text, 'html.parser')
    teams = season.find_all('th', {'class': 'left', 'data-stat':'team_name'})
   
    season_awards = get_award_winners(season, i-1)
    
    allst_roster = get_allstars(i)
    
    for team in teams:
        if(team.text == 'Team' or 'Conference' in team.text):
            continue
        wins = int(team.parent.find('td', {'data-stat':'wins'}).text)
     
        team_name = repr(team.text).replace('\\x','*').split('*')[0].lstrip("'")
        for a in team.find_all('a',href=True):
            new_row = get_team_row(all_def,season_awards,allst_roster,team_name,a['href'], i, wins)
            #print(len(new_row))
            data.loc[row_num] = new_row
            row_num += 1

['Boston Celtics', 1975, 60, False, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 0.458, 0, 0, 7825.0, 0.458, 0.791, 52.0, 26.3, 8.1, 3.5, 19.8, 106.0, 0]
['Buffalo Braves', 1975, 49, False, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0.476, 0, 0, 7469.0, 0.476, 0.78, 46.9, 25.2, 8.8, 5.6, 20.9, 107.0, 0]
['New York Knicks', 1975, 40, False, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 0.45, 0, 0, 7464.0, 0.45, 0.772, 44.3, 20.4, 8.0, 3.7, 16.8, 100.0, 0]
['Philadelphia 76ers', 1975, 34, False, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0.445, 0, 0, 7476.0, 0.445, 0.749, 47.6, 20.8, 7.0, 3.2, 19.4, 99.8, 0]
['Washington Bullets', 1975, 60, False, 0, 0, 1, 1, 0, 0, 0, 0, 0, 3, 2, 0.462, 0, 0, 7697.0, 0.462, 0.752, 47.5, 24.5, 11.3, 5.0, 19.4, 104.0, 0]
['Houston Rockets', 1975, 41, False, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0.477, 0, 0, 7231.0, 0.477, 0.799, 44.8, 26.3, 9.1, 4.3, 21.5, 103.0, 0]
['Cleveland Cavaliers', 1975, 40, False, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0.462, 0, 0, 7371.0, 0.462, 0.742, 43.4, 23.2, 7.3, 4.2, 17.8, 99.0

In [11]:
y = "2016-17"
x = y[:2] + y[-2:]
x

'2017'

In [10]:
y[-2:]

'17'

In [12]:
def get_all_nba():
    req = requests.get("http://www.basketball-reference.com/awards/all_league.html")
    soup = BeautifulSoup(req.text, "html.parser")
    j = 0
    i = 0
    data_stat = 1
    
    #first_time = True
    prev_year = "2017"

    dict_allnba = {

    }
    
    players = []

    for row in soup.find_all("tr", {"class" : False}):
        content = row.text
        year = content[:7]
        year = year[:2] + year[-2:]
        
        #dict_allnba[year]   
        league = content[7:10]

        if league == "NBA":
            if ((i == 2) & (year != prev_year)):
                #print(i)
                #print("Current %s" %year)
                #print("Prev year %s"  %prev_year)
                #print("Why in here")

                #print(players)
                dict_allnba[prev_year].append(players)
                #print(dict_allnba[prev_year])
                data_stat = 1
                i = 0
                j += 1

            if (i == 3):
                i = 0
                data_stat = 1


            #for i in range(0, len(soup.find_all("td", {"class" : "left", "data-stat" : "1" }))):          
            if (soup.find_all("td", {"class" : "left", "data-stat" : "%s" % str(1)})[j].find("a") == None):
                j += 1
                #print("Inside None")
                #continue

            # Create a list of players to hold 5 players
            players = []   

            if year not in dict_allnba:
                dict_allnba[year] = []

            #print(row)
            #print("---")
            #print(i)
            #print(year)
            #print(j)

            for stat in range(data_stat, data_stat + 5):
                #print(i)
                #print(j)
                # for first team
                try:
                    if (stat > 5) & (j != 0): 
                        new_j = int(j / 2)
                        #print(new_j)
                        #print(stat)
                        #print(soup.find_all("td", {"class" : "left", "data-stat" : "%s" % str(stat)})[new_j])
                        players.append(soup.find_all("td", {"class" : "left", "data-stat" : "%s" % str(stat)})[new_j].find("a").text)
                        #print(players)

                    else:
                        players.append(soup.find_all("td", {"class" : "left", "data-stat" : "%s" % str(stat)})[j].find("a").text)
                except:
                    pass           


                #print(stat)
                if (stat == 5) | (stat == 10) | (stat == 15):
                    #print(players)
                    #print(j)
                    dict_allnba[year].append(players)
                    players = []


            data_stat += 5
            i += 1
            prev_year = year

            if (data_stat == 16):
                data_stat = 1
                j += 1
            # Add a list of players to dict with a given year    
            #dict_allnba[year].append(players)

            #j += 1
        elif (league == "ABA") | (league == "BAA") :
            j += 1
            
    return dict_allnba

In [13]:
all_nba = get_all_nba()

In [14]:
all_nba

{'1900': [["Shaquille O'Neal",
   'Tim Duncan',
   'Kevin Garnett',
   'Jason Kidd',
   'Gary Payton'],
  ['Alonzo Mourning',
   'Grant Hill',
   'Karl Malone',
   'Kobe Bryant',
   'Allen Iverson'],
  ['David Robinson',
   'Vince Carter',
   'Chris Webber',
   'Eddie Jones',
   'Stephon Marbury']],
 '1950': [['Bob Davies',
   'Alex Groza',
   'George Mikan',
   'Jim Pollard',
   'Max Zaslofsky'],
  ['Ralph Beard',
   'Frankie Brian',
   'Al Cervi',
   'Fred Schaus',
   'Dolph Schayes']],
 '1951': [['Ralph Beard',
   'Bob Davies',
   'Alex Groza',
   'Ed Macauley',
   'George Mikan'],
  ['Frankie Brian',
   'Joe Fulks',
   'Dick McGuire',
   'Vern Mikkelsen',
   'Dolph Schayes'],
  []],
 '1952': [['Paul Arizin',
   'Bob Cousy',
   'Ed Macauley',
   'George Mikan',
   'Bob Davies'],
  ['Larry Foust',
   'Vern Mikkelsen',
   'Andy Phillip',
   'Jim Pollard',
   'Bobby Wanzer'],
  []],
 '1953': [['Bob Cousy',
   'Neil Johnston',
   'Ed Macauley',
   'George Mikan',
   'Dolph Schayes'],
  