In [2]:
import os
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [3]:
# Seasons processed throughout the notebook: 1991 through 2022 inclusive
# (the stop value of range() is exclusive).
years = range(1991, 2022 + 1)
# URL template for a season's awards page; str.format() fills in the year.
mvp_requests = "https://www.basketball-reference.com/awards/awards_{}.html"

Request the awards page for each season from Basketball Reference in order to determine who won MVP over the last ~30 years

In [4]:
# for year in years:
#     page = requests.get(mvp_requests.format(year))
    
#     with open('mvps/{}.html'.format(year), 'w+') as file:
#         file.write(page.text)

# DO NOT RUN AGAIN - requesting these pages again will cause errors with Basketball Reference; the saved HTML files are used instead

Open each saved HTML file, locate only the MVP table, and add the data from that table to a list of dataframes, adding a 'Year' column so we know which year each MVP candidate is from. Beautiful Soup is used to parse the HTML, get rid of the extra header row, and find the table with the necessary MVP information

In [5]:
# Parse the saved awards page of every season and collect its MVP table.
# `df` is the list of per-season DataFrames concatenated by the next cell.
df = []
for year in years:
    with open('files/mvps/{}.html'.format(year)) as f:
        page = f.read()

    soup = BeautifulSoup(page, 'html.parser')

    # Remove the first 'over_header' row in the page (the extra header row
    # mentioned in the markdown above). Guarded so that a page without such
    # a row does not fail with AttributeError on None.
    over_header = soup.find('tr', class_='over_header')
    if over_header is not None:
        over_header.decompose()

    mvp_table = soup.find(id='mvp')
    if mvp_table is None:
        # Fail with the offending season in the message instead of a generic
        # "no tables found" error from pandas.
        raise ValueError(
            'No element with id "mvp" found in files/mvps/{}.html'.format(year)
        )

    # read_html expects a path, URL or file-like object; passing literal HTML
    # as a string is deprecated in recent pandas, so wrap it in StringIO.
    mvp = pd.read_html(StringIO(str(mvp_table)))[0]
    mvp['Year'] = year
    df.append(mvp)

Concatenate the dataframes into one dataframe using pd.concat(), and then save it as a CSV called 'mvps.csv' for future use

In [6]:
# Stack the per-season MVP tables row-wise into a single frame. Row labels are
# not renumbered (ignore_index=False), and they are written to the CSV as its
# first column (index=True).
mvps = pd.concat(df, axis=0, ignore_index=False)
mvps.to_csv('files/mvps.csv', index=True)

In order to build a machine-learning model that predicts the MVP, we have to find the player stats and associate the MVP votes with how well each player did. The problem is that we do not have these stats yet, so we need to make more requests to get them

In [55]:
from selenium import webdriver
import time
# Create a Selenium WebDriver for Safari. `driver` is the browser session the
# scraping cells use to load pages and read their HTML source.
driver = webdriver.Safari()


Use Selenium to automate downloading all the stats pages, since basketball-reference uses JavaScript to load in their stats

In [8]:
# URL template for a season's per-game player stats page; str.format() fills in the year.
stats_url = 'https://www.basketball-reference.com/leagues/NBA_{}_per_game.html'

In [9]:
# import time

# for year in years:
#     with open('files/full_player_stats/{}.html'.format(year), 'w+') as file:
#         driver.get(stats_url.format(year))
#         driver.execute_script('window.scrollTo(1,10000)')
#         time.sleep(2)
#         html = driver.page_source
#         file.write(html) 

In [10]:
# df = []
# for year in years:
#     with open('files/full_player_stats/{}.html'.format(year)) as f:
#         f = f.read()
#         soup = BeautifulSoup(f, 'html.parser')
#         soup.find('tr', class_ = 'thead').decompose()
#         player_stats = soup.find_all(id = 'per_game_stats')
#         stats = pd.read_html(str(player_stats))[0]
#         stats['Year'] = year
        
#         df.append(stats)

In [11]:
# Build the per-season player stats frames in this cell from the saved HTML
# pages. The shared `df` list must not be used here: the only active cell that
# fills it before this point stores the MVP tables, so concatenating `df`
# would write MVP data into 'files/player_stats.csv' on a fresh run.
player_stats_frames = []
for year in years:
    with open('files/full_player_stats/{}.html'.format(year)) as f:
        page = f.read()

    soup = BeautifulSoup(page, 'html.parser')

    # Remove the first row with class 'thead' in the page; guarded so a page
    # without such a row does not fail with AttributeError on None.
    header_row = soup.find('tr', class_='thead')
    if header_row is not None:
        header_row.decompose()

    per_game_table = soup.find(id='per_game_stats')
    if per_game_table is None:
        raise ValueError(
            'No element with id "per_game_stats" found in '
            'files/full_player_stats/{}.html'.format(year)
        )

    # Wrap the markup in StringIO: passing literal HTML to read_html is
    # deprecated in recent pandas.
    season_stats = pd.read_html(StringIO(str(per_game_table)))[0]
    season_stats['Year'] = year
    player_stats_frames.append(season_stats)

stats = pd.concat(player_stats_frames)
stats.to_csv('files/player_stats.csv')

Finally, since the MVP race is often biased towards players who are winning (and rightfully so), we want to include the team record in the data so that the model can adjust for players on teams that win more. To do so, we look up the historical standings for each season

In [22]:
# record_url = 'https://www.basketball-reference.com/leagues/NBA_{}_standings.html' 
# for year in years:
#     data = requests.get(record_url.format(year))
#     with open('files/team_records/{}.html'.format(year), 'w+') as file:
#         file.write(data.text)


In [56]:
# Download each season's standings page through the Selenium browser session
# and save the rendered HTML under files/team_records/.
# A dedicated name is used for the template so the per-game `stats_url`
# defined earlier is not overwritten.
record_url = 'https://www.basketball-reference.com/leagues/NBA_{}_standings.html'
for year in years:
    record_path = 'files/team_records/{}.html'.format(year)

    # Skip seasons that are already saved (non-empty file) so re-running the
    # notebook does not request every page again. Delete a file to force a
    # fresh download of that season.
    if os.path.exists(record_path) and os.path.getsize(record_path) > 0:
        continue

    driver.get(record_url.format(year))
    # Scroll down and pause before reading the source, presumably to give the
    # JavaScript-loaded tables time to render (TODO confirm the delay needed).
    driver.execute_script('window.scrollTo(1,10000)')
    time.sleep(2)
    html = driver.page_source

    # Open the file only after the page has been fetched, so a failed request
    # cannot leave an empty or truncated file behind.
    with open(record_path, 'w+') as file:
        file.write(html)

In [59]:
# Parse the saved standings page of every season and collect its
# 'expanded_standings' table. `df` is the list of per-season DataFrames
# concatenated by the next cell.
df = []
for year in years:
    with open('files/team_records/{}.html'.format(year)) as f:
        page = f.read()

    soup = BeautifulSoup(page, 'html.parser')

    # Remove the first 'thead' row and then the first 'over_header' row found
    # in the page. Guarded so a page lacking either row does not fail with
    # AttributeError on None.
    for row_class in ('thead', 'over_header'):
        header_row = soup.find('tr', class_=row_class)
        if header_row is not None:
            header_row.decompose()

    standings_table = soup.find(id='expanded_standings')
    if standings_table is None:
        # Name the offending season instead of surfacing a generic
        # "no tables found" error from pandas.
        raise ValueError(
            'No element with id "expanded_standings" found in '
            'files/team_records/{}.html'.format(year)
        )

    # Wrap the markup in StringIO: passing literal HTML to read_html is
    # deprecated in recent pandas.
    team_record = pd.read_html(StringIO(str(standings_table)))[0]
    team_record['Year'] = year
    df.append(team_record)

In [61]:
# Stack the per-season standings tables row-wise into a single frame. Row
# labels are not renumbered (ignore_index=False), and they are written to the
# CSV as its first column (index=True).
# NOTE(review): this file is written to the working directory, while the other
# CSVs in this notebook go under 'files/' - confirm this is intended.
data = pd.concat(df, axis=0, ignore_index=False)
data.to_csv('team_record.csv', index=True)