In [1]:
import pandas as pd
import requests
import time
import re
import numpy as np
from bs4 import BeautifulSoup
from sklearn.linear_model import LinearRegression

In [2]:
# Empty result table: one row per team-season, filled in by the scraping loop.
# The first four columns describe the team; the remaining nine are award flags.
# Planned but not yet collected: All-Stars, All-NBA, All-Defensive, FG%, 3P%,
# 2P%, FT%, PPG, TOV, AST, STL, BLK, SEED, GOAT Factor, Coach,
# Overall Team Score.
data = pd.DataFrame(
    columns=[
        'Name',
        'Year',
        'Wins',
        'Champion',
        'MVP',
        'Scoring Leader',
        'Rebound Leader',
        'Assist Leader',
        'WS Leader',
        'DPOY',
        'MIP',
        '6MOY',
        'Coach of Year',
    ]
)
data

Unnamed: 0,Name,Year,Wins,Champion,MVP,Scoring Leader,Rebound Leader,Assist Leader,WS Leader,DPOY,MIP,6MOY,Coach of Year


In [3]:
def check_roster(team_soup, season_awards):
    """Flag which of the season's award winners belong to this team.

    Parameters
    ----------
    team_soup : parsed team page
        Must support ``find_all`` / ``find`` the way a BeautifulSoup document
        does. Player names are read from ``td.left[data-stat=player]`` cells;
        the coach name is read from the first link near the "Coach:" label.
    season_awards : sequence of str
        Winner names; only the first nine entries are consulted.

    Returns
    -------
    list of int
        Nine 0/1 flags, ``flags[i] == 1`` when ``season_awards[i]`` is on the
        team's roster or is its coach.

    Raises
    ------
    IndexError
        If ``season_awards`` has fewer than nine entries.
    """
    player_cells = team_soup.find_all('td', {'class': 'left', 'data-stat': 'player'})
    roster = {cell.text for cell in player_cells}

    # The coach is counted as part of the roster so coaching awards can match.
    # A page without a "Coach:" label (or without a link next to it) simply
    # contributes no coach instead of raising AttributeError.
    coach_label = team_soup.find(text=re.compile('Coach:'))
    if coach_label is not None:
        coach_link = coach_label.parent.parent.find('a')
        if coach_link is not None:
            roster.add(coach_link.text)

    # An empty winner name means "award not found" and must never match,
    # even if the page happens to contain a blank player cell.
    return [
        1 if season_awards[i] and season_awards[i] in roster else 0
        for i in range(9)
    ]

In [4]:
def get_team_row(season_awards, name, team_url, year, wins):
    """Build one row of the ``data`` table for a single team-season.

    Parameters
    ----------
    season_awards : sequence of str
        Award winner names for the season, passed through to ``check_roster``.
    name : str
        Team name; any ``*`` characters are removed.
    team_url : str
        Site-relative path of the team page, appended to the
        basketball-reference.com host.
    year, wins
        Stored in the row unchanged.

    Returns
    -------
    list
        ``[name, year, wins, champion]`` followed by the flags returned by
        ``check_roster``.
    """
    url = 'http://www.basketball-reference.com' + team_url
    req = requests.get(url)
    time.sleep(0.1)  # brief pause between requests to the site
    team_soup = BeautifulSoup(req.text, 'html.parser')

    # Champion flag: True when the text of the first <br> element starts with
    # 'Won '. A page with no <br> at all yields False.
    # NOTE(review): this depends on how the parser attaches text to <br>;
    # confirm it still detects champions on current pages.
    first_break = team_soup.find('br')
    champion = first_break is not None and first_break.text[:4] == 'Won '

    row = [name.replace('*', ''), year, wins, champion]
    row += check_roster(team_soup, season_awards)
    return row

In [5]:
# Labels looked up on the basketball-reference season page.
leaders = ['Most Valuable Player', 'PPG Leader', 'RPG Leader',
           'APG Leader', 'WS Leader']

# Awards looked up on nba.com history pages (one page per award).
awards = ['Defensive Player of the Year', 'Most Improved Player',
          'Sixth Man of the Year', 'Coach of the Year']


def get_award_winners(season, year):
    """Collect the award winners for one season.

    Parameters
    ----------
    season : parsed season page
        BeautifulSoup document of a basketball-reference season page; each
        entry of ``leaders`` is searched for in its body.
    year : int
        Starting year of the season; the nba.com tables are searched for the
        label ``'<year>-<last two digits of year+1>'`` (e.g. ``'1964-65'``).

    Returns
    -------
    list of str
        A fresh list with one name per entry of ``leaders`` followed by one
        per entry of ``awards``. An award that cannot be found for the season
        is reported as ``''``.
    """
    # Built locally on every call: a shared module-level list would keep
    # growing, leaving the first season's winners in the leading slots
    # for every later season.
    season_winners = []

    for leader in leaders:
        season_winners.append(season.body.find(text=leader).parent.parent.find('a').text)

    # Using a different site that's more convenient to scrape other awards.
    season_label = '%s-%s' % (year, str(year + 1)[2:4])
    label_pattern = re.compile(re.escape(season_label))
    for award in awards:
        url = 'http://www.nba.com/history/%s-award/' % (award.lower().replace(' ', '-'))
        req = requests.get(url)
        soup = BeautifulSoup(req.text, 'html.parser')

        table = soup.find('table')
        try:
            season_cell = table.find(text=label_pattern)
            season_winners.append(season_cell.parent.parent.find_all('td')[1].text.strip())
        except (AttributeError, IndexError):
            # No table, no row for this season, or a row without a second cell.
            season_winners.append('')

    return season_winners

In [6]:
def _collect_allstars(allstar, div_id, size=14):
    """Return the player names under ``div_id``, right-padded with '' to ``size`` entries."""
    name_cells = allstar.find("div", {"id": div_id}).find_all(
        "th", {"class": "left", "scope": "row", "csk": True})
    names = [cell.find("a").text for cell in name_cells]
    # A negative repeat count yields [], so longer rosters are left as-is.
    names.extend([""] * (size - len(names)))
    return names


def get_allstar_players(year):
    """Fetch the East and West All-Star rosters for ``year``.

    Side effect: the names are appended to the module-level lists
    ``all_stars_East`` and ``all_stars_West``, which the caller must create
    before calling. Each roster is padded with empty strings up to 14 names.

    Returns
    -------
    tuple of (list of str, list of str)
        The East and West rosters that were appended by this call.
    """
    url = "http://www.basketball-reference.com/allstar/NBA_%s.html/" % str(year)
    req = requests.get(url)
    allstar = BeautifulSoup(req.text, 'html.parser')

    east_names = _collect_allstars(allstar, "all_East")
    all_stars_East.extend(east_names)

    west_names = _collect_allstars(allstar, "all_West")
    all_stars_West.extend(west_names)

    return east_names, west_names

In [7]:
# Seasons 1965..2016 with 1999 removed.
# NOTE(review): presumably 1999 is dropped because there is no All-Star page
# for that year; confirm.
index_to_be_delete = 1999 - 1965
years = np.delete(np.arange(1965, 2017), index_to_be_delete)

# Same seasons, most recent first.
reversed_year = np.flip(years)

In [8]:
# One column per roster slot: "Player 1" ... "Player 14".
num_players = ["Player %d" % slot for slot in range(1, 15)]

# Two blank (empty-string) tables, one row per season, most recent first.
east, west = (
    pd.DataFrame(columns=num_players, index=reversed_year).fillna("")
    for _ in range(2)
)
row = 0

for i in reversed_year:
    # get_allstar_players appends into these module-level lists, so each
    # season starts with fresh, empty ones.
    all_stars_East, all_stars_West = [], []

    get_allstar_players(i)
    time.sleep(0.2)

    east.loc[i, :] = all_stars_East
    west.loc[i, :] = all_stars_West

    row += 1

In [11]:
# Five roster slots per All-Defensive First Team.
num_defends = ["Player %d" % slot for slot in range(1, 6)]

# Season labels from '2015-16' back to '1968-69', most recent first.
years_array = [
    '%s-%s' % (start, str(start + 1)[2:4])
    for start in range(2015, 1967, -1)
]

# Blank (empty-string) table indexed by season label.
df_alldefend = pd.DataFrame(columns=num_defends, index=years_array).fillna("")

In [12]:
def get_defensive():
    """Scrape the All-Defensive First Team names from nba.com.

    The whole history lives on a single page, so one request is enough.
    The result is a flat list in which each consecutive group of five names
    is the First Team of one season.
    """
    url = "http://www.nba.com/history/all-defensive-teams/"
    req = requests.get(url)
    defensive = BeautifulSoup(req.text, 'html.parser')

    centered_cells = defensive.find_all(
        "td", {"class": "cnnIEColTxtL", "style": "text-align: center"})

    # Keep every matching cell except the "First Team" heading cells.
    return [cell.text for cell in centered_cells if cell.text != "First Team"]


list_alldefend = get_defensive()

In [17]:
# Give each season's row the next five names from the flat scraped list.
for position, season_label in enumerate(years_array):
    start = 5 * position
    df_alldefend.loc[season_label, :] = list_alldefend[start:start + 5]

In [18]:
# Display df_alldefend as the cell's output.
df_alldefend

Unnamed: 0,Player 1,Player 2,Player 3,Player 4,Player 5
2015-16,Kawhi Leonard,Draymond Green,DeAndre Jordan,Avery Bradley,Chris Paul
2014-15,Kawhi Leonard,Draymond Green,Tony Allen,DeAndre Jordan,Chris Paul
2013-14,Joakim Noah,Paul George,Chris Paul,Serge Ibaka,Andre Iguodala
2012-13,LeBron James,Serge Ibaka,Tyson Chandler,Joakim Noah,Tony Allen
2011-12,Chris Paul,Chris Paul,Tony Allen,Dwight Howard,LeBron James
2010-11,Serge Ibaka,Kobe Bryant,Kevin Garnett,Dwight Howard,LeBron James
2009-10,Rajon Rondo,Kobe Bryant,Gerald Wallace,Dwight Howard,LeBron James
2008-09,Rajon Rondo,Kobe Bryant,Kevin Garnett,Dwight Howard,LeBron James
2007-08,Chris Paul,Kobe Bryant,Bruce Bowen,Marcus Camby,Tim Duncan
2006-07,Kevin Garnett,Kobe Bryant,Bruce Bowen,Marcus Camby,Tim Duncan


In [24]:
# First five scraped names, i.e. the first group of five in the flat list.
list_alldefend[:5]

['Kawhi Leonard',
 'Draymond Green',
 'DeAndre Jordan',
 'Avery Bradley',
 'Chris Paul']

In [None]:
# Fill `data` with one row per team link found on each season page.
row_num = 0
# Only the 1965 season is scraped here; widen the range to cover more seasons.
for i in range(1965, 1966):
    url = 'http://www.basketball-reference.com/leagues/NBA_%s.html' % str(i)
    time.sleep(0.1)
    req = requests.get(url)
    season = BeautifulSoup(req.text, 'html.parser')
    teams = season.find_all('th', {'class': 'left', 'data-stat': 'team_name'})

    # get_award_winners builds its season label as '<year>-<year+1>', so
    # passing i-1 selects the season that ends in year i.
    season_awards = get_award_winners(season, i - 1)

    # get_allstar_players takes only the year and appends into these
    # module-level lists, so they are reset before each call.
    all_stars_East = []
    all_stars_West = []
    get_allstar_players(i)
    as_rosters = (all_stars_East, all_stars_West)

    for team in teams:
        # Skip header cells of the standings tables.
        if team.text == 'Team' or 'Conference' in team.text:
            continue
        wins = int(team.parent.find('td', {'data-stat': 'wins'}).text)

        # Keep only the text before the first '*' or non-breaking space.
        # Going through repr() left a stray quote character on names that
        # contained neither.
        team_name = team.text.split('*')[0].split('\xa0')[0]
        for a in team.find_all('a', href=True):
            new_row = get_team_row(season_awards, team_name, a['href'], i, wins)

            data.loc[row_num] = new_row
            row_num += 1
