In [1]:
import requests
from bs4 import BeautifulSoup
import re
from collections import defaultdict
import time
import pandas as pd

## Getting the data

We are getting the data beginning from 1980-81 season up to past season for train and validation.
We are beginning from 1980-81 season because previous to that season, voting was done by players.

In [6]:
def work_player_profile(param, season):
    url = "https://www.basketball-reference.com" + param
    res = requests.get(url)
    soup = BeautifulSoup(res.text)
    #print(url)
    
    data_dict = {}
    
    per_game = soup.find(attrs={'id': 'all_per_game'})
    for row in per_game.findAll("tr"):
        if 'id' in row.attrs and row.attrs['id'] == 'per_game.' + season:
            #print(float(row.find('td', attrs={'data-stat': 'fga_per_g'}).text))
            data_dict['fga'] = float(row.find('td', attrs={'data-stat': 'fga_per_g'}).text)
            data_dict['fg3a'] = float(row.find('td', attrs={'data-stat': 'fg3a_per_g'}).text)
            data_dict['fta'] = float(row.find('td', attrs={'data-stat': 'fta_per_g'}).text)
            break
    
    advanced_table = soup.find(attrs={'id': 'all_advanced'})
    for child in advanced_table.children:
        if 'table_outer_container' in child:
            other_soup = BeautifulSoup(child)
            rows = other_soup.findAll('tr')
    for row in rows:
        if 'id' in row.attrs and row.attrs['id'] == 'advanced.' + season:
            data_dict.update(
                {
                    'per': float(row.find('td', attrs={'data-stat': 'per'}).text),
                    'ts_pct': float(row.find('td', attrs={'data-stat': 'ts_pct'}).text),
                    'usg_pct': float(row.find('td', attrs={'data-stat': 'usg_pct'}).text),
                    'bpm': float(row.find('td', attrs={'data-stat': 'bpm'}).text),
                    'season': str(int(season)-1) + '-' + season[-2:],
                }
            )
            return data_dict

In [7]:
def get_stats_of_voting(url):
    res = requests.get(url)
    soup = BeautifulSoup(res.text)
    item = soup.find(attrs={'class': 'stats_table'})
    rows = item.findAll('tr')
    
    season = url.split('.html')[0][-4:]
    print(f"Season: {season}")
    
    players_stats = defaultdict(list)
    
    for index, row in enumerate(rows):
        #print(f"\fIndex: {index} of {len(rows)}")
        header_cells = row.findAll('th')
        for header_cell in header_cells:
            if ('data-stat' in header_cell.attrs and 
                header_cell['data-stat'] == 'ranker' and
                'csk' in header_cell.attrs):
                rank = int(header_cell.getText())
                print(rank)
        td_cells = row.findAll('td')
        if not td_cells:
            continue
        for cell in td_cells:
            if 'data-stat' not in cell.attrs:
                continue
            if cell['data-stat'] == 'age':
                continue
            elif cell['data-stat'] == 'team_id':
                base = "https://www.basketball-reference.com"
                try:
                    link = cell.find("a")['href']
                except Exception:
                    players_stats['win_pct'].append(0.5) # average
                    continue
                url = base + link
                time.sleep(1)
                soup = BeautifulSoup(requests.get(url).text)
                for item in soup.findAll('p'):
                    if "Record" in item.text:
                        record = re.findall('\d+\-\d+', item.text)[0]
                        splitted = record.split('-')
                        players_stats['win_pct'].append(float(splitted[0]) / (float(splitted[1]) + float(splitted[0])))
                        break
                continue
            #print('Text: ', cell.getText())
            #print(cell['data-stat'])
            if cell['data-stat'] == 'player':
                time.sleep(1)
                advanced_dict = work_player_profile(cell.find('a')['href'], season)
                for key in advanced_dict:
                    players_stats[key].append(advanced_dict[key])
                players_stats[cell['data-stat']].append(cell.getText())
            else:
                text = cell.getText() or "0"
                players_stats[cell['data-stat']].append(float(text))
    return players_stats

In [9]:
seasons = range(1981, 2019)
new_data = defaultdict(list)

for season in seasons:
    url = f"https://www.basketball-reference.com/awards/awards_{str(season)}.html"
    season_dict = get_stats_of_voting(url)
    for key in season_dict:
        new_data[key].extend(season_dict[key])

data_frame = pd.DataFrame(new_data)
data_frame.to_csv("mvp_votings.csv")

In [10]:
data_frame = pd.read_csv("mvp_votings.csv")

In [11]:
data_frame.head()

Unnamed: 0.1,Unnamed: 0,fga,fg3a,fta,per,ts_pct,usg_pct,bpm,season,player,...,pts_per_g,trb_per_g,ast_per_g,stl_per_g,blk_per_g,fg_pct,fg3_pct,ft_pct,ws,ws_per_48
0,0,18.6,0.2,6.5,25.1,0.572,28.4,8.0,1980-81,Julius Erving,...,24.6,8.0,4.4,2.1,1.8,0.521,0.222,0.787,13.8,0.231
1,1,18.3,0.9,4.0,19.9,0.528,24.3,5.1,1980-81,Larry Bird,...,21.2,10.9,5.5,2.0,0.8,0.478,0.27,0.863,10.8,0.16
2,2,18.2,0.0,6.9,25.5,0.616,26.3,5.3,1980-81,Kareem Abdul-Jabbar,...,26.2,10.3,3.4,0.7,2.9,0.574,0.0,0.766,14.3,0.23
3,3,19.3,0.0,10.1,25.1,0.585,27.6,3.7,1980-81,Moses Malone,...,27.8,14.8,1.8,1.0,1.9,0.522,0.333,0.757,13.7,0.202
4,4,21.1,0.4,7.6,22.9,0.555,32.3,1.6,1980-81,George Gervin,...,27.1,5.1,3.2,1.1,0.7,0.492,0.257,0.826,10.5,0.182


In [12]:
data_frame.tail()

Unnamed: 0.1,Unnamed: 0,fga,fg3a,fta,per,ts_pct,usg_pct,bpm,season,player,...,pts_per_g,trb_per_g,ast_per_g,stl_per_g,blk_per_g,fg_pct,fg3_pct,ft_pct,ws,ws_per_48
632,632,18.0,1.2,5.3,25.0,0.57,29.1,3.3,2017-18,LaMarcus Aldridge,...,23.1,8.5,2.0,0.6,1.2,0.51,0.293,0.837,10.9,0.209
633,633,15.6,3.4,7.2,23.7,0.59,24.9,5.0,2017-18,Jimmy Butler,...,22.2,5.3,4.9,2.0,0.4,0.474,0.35,0.854,8.9,0.198
634,634,16.9,9.8,5.9,28.2,0.675,31.0,8.6,2017-18,Stephen Curry,...,26.4,5.1,6.1,1.6,0.2,0.495,0.423,0.921,9.1,0.267
635,635,16.8,3.4,7.4,22.9,0.573,33.4,2.6,2017-18,Joel Embiid,...,22.9,11.0,3.2,0.6,1.8,0.483,0.308,0.769,6.2,0.155
636,636,17.9,5.8,4.9,23.1,0.577,30.1,4.9,2017-18,Victor Oladipo,...,23.1,5.2,4.3,2.4,0.8,0.477,0.371,0.799,8.2,0.155


In [51]:
data_frame.columns

Index(['fga', 'fg3a', 'fta', 'per', 'ts_pct', 'usg_pct', 'bpm', 'season',
       'player', 'win_pct', 'votes_first', 'points_won', 'points_max',
       'award_share', 'g', 'mp_per_g', 'pts_per_g', 'trb_per_g', 'ast_per_g',
       'stl_per_g', 'blk_per_g', 'fg_pct', 'fg3_pct', 'ft_pct', 'ws',
       'ws_per_48'],
      dtype='object')

In [52]:
len(data_frame.columns)

26

In [57]:
data_frame.iloc[0]

fga                    19.7
fg3a                    4.8
fta                     6.9
per                    23.5
ts_pct                 0.55
usg_pct                32.2
bpm                     5.9
season              2010-11
player         Derrick Rose
win_pct            0.756098
votes_first             113
points_won             1182
points_max             1210
award_share           0.977
g                        81
mp_per_g               37.4
pts_per_g                25
trb_per_g               4.1
ast_per_g               7.7
stl_per_g                 1
blk_per_g               0.6
fg_pct                0.445
fg3_pct               0.332
ft_pct                0.858
ws                     13.1
ws_per_48             0.208
Name: 0, dtype: object