In [2]:
# One-time environment setup (run these commands in a terminal, not in the notebook):
#conda create -n venv_name
#conda activate venv_name
#conda install pip
#pip install pandas
#pip install cloudscraper
#pip install beautifulsoup4
#pip install ipykernel
#pip install lxml

In [3]:
#standard library
import time
import warnings
from io import StringIO
from pathlib import Path

#remove warnings
warnings.filterwarnings('ignore')

#matrix manipulation
import pandas as pd

#scraping data
import cloudscraper
from bs4 import BeautifulSoup

### MVP List

In [4]:
# Seasons covered by every scrape below: 2000 through 2024, both inclusive.
FIRST_SEASON, LAST_SEASON = 2000, 2024
years = list(range(FIRST_SEASON, LAST_SEASON + 1))

In [5]:
# URL template for one season's awards page; "{}" is replaced by the season year.
url_start = 'https://www.basketball-reference.com/awards/awards_{}.html'

In [6]:
# Download the awards page of every season in `years` and save it as
# mvp/<year>.html so the parsing cell can work from local files.
scraper = cloudscraper.create_scraper()

# The output directory is not guaranteed to exist on a fresh checkout.
mvp_dir = Path('mvp')
mvp_dir.mkdir(parents=True, exist_ok=True)

# Pause between requests to stay under the site's request-rate limit.
# NOTE(review): Sports Reference's bot policy is understood to allow about
# 20 requests per minute -- confirm the current figure before lowering this.
request_delay_seconds = 3.5

for year in years:
    url = url_start.format(year)
    # Explicit timeout: without one a stalled connection hangs the cell forever.
    data = scraper.get(url, timeout=30)
    # Fail loudly on 4xx/5xx instead of silently saving an error page that the
    # parsing cell would later fail on with an unrelated-looking exception.
    data.raise_for_status()

    (mvp_dir / '{}.html'.format(year)).write_text(data.text, encoding="utf-8")
    time.sleep(request_delay_seconds)

In [7]:
# Parse the MVP voting table out of every saved awards page, tag each row with
# its season, and combine all seasons into the `mvps` frame.
dfs = []
for year in years:
    with open('mvp/{}.html'.format(year), encoding="utf-8") as f:
        page = f.read()

    soup = BeautifulSoup(page, "html.parser")
    mvp_table = soup.find(id="mvp")
    if mvp_table is None:
        # Typically means the download saved something other than the awards page.
        raise ValueError("No element with id='mvp' found in mvp/{}.html".format(year))

    # Remove the grouping header row(s) so read_html sees a single header row.
    # The lookup is scoped to the MVP table: a page-wide search could hit a
    # different table, or return None and crash on .decompose().
    for header_row in mvp_table.find_all('tr', class_='over_header'):
        header_row.decompose()

    # Wrap in StringIO: passing literal HTML to read_html is deprecated in
    # recent pandas releases.
    mvp = pd.read_html(StringIO(str(mvp_table)))[0]
    mvp['Year'] = year

    dfs.append(mvp)

# Concatenate and export once, after every season is parsed. Doing this inside
# the loop rebuilt the frame and rewrote the CSV on every iteration.
mvps = pd.concat(dfs)
# NOTE(review): "mvsps.csv" looks like a typo for "mvps.csv" (compare mvps.json,
# players.csv, teams.csv). The name is kept so existing readers of this file do
# not break -- rename once consumers are confirmed.
mvps.to_csv("mvsps.csv")

In [8]:
# Each season's frame kept its own 0..n row labels through the concat, so the
# labels repeat. Move them into an "index" column and give rows unique labels
# (the default-orient JSON export needs a unique index).
mvps = mvps.reset_index(drop=False)

In [9]:
# Export the combined MVP voting data as JSON (pandas' default orientation).
mvps.to_json(path_or_buf="mvps.json")

### Player Stats in the League

In [10]:
# URL template for one season's per-game player stats page; "{}" is the season year.
player_stats_url = "https://www.basketball-reference.com/leagues/NBA_{}_per_game.html"

In [11]:
# Download the per-game player stats page of every season in `years` and save
# it as player/<year>.html for the parsing cells.
scraper = cloudscraper.create_scraper()

# The output directory is not guaranteed to exist on a fresh checkout.
player_dir = Path('player')
player_dir.mkdir(parents=True, exist_ok=True)

# Pause between requests to stay under the site's request-rate limit.
# NOTE(review): Sports Reference's bot policy is understood to allow about
# 20 requests per minute -- confirm the current figure before lowering this.
request_delay_seconds = 3.5

for year in years:
    url = player_stats_url.format(year)
    # Explicit timeout: without one a stalled connection hangs the cell forever.
    data = scraper.get(url, timeout=30)
    # Fail loudly on 4xx/5xx instead of silently saving an error page.
    data.raise_for_status()

    (player_dir / '{}.html'.format(year)).write_text(data.text, encoding='utf-8')
    time.sleep(request_delay_seconds)

In [12]:
# Parse the per-game stats table out of every saved player page and collect
# one frame per season in `dfs` (concatenated in the next cell).
dfs = []
for year in years:
    with open('player/{}.html'.format(year), encoding="utf-8") as f:
        page = f.read()

    soup = BeautifulSoup(page, "html.parser")
    player_table = soup.find(id="switcher_per_game_stats")
    if player_table is None:
        # Without this guard, str(None) is handed to read_html and the failure
        # message no longer says which season's file is bad.
        raise ValueError(
            "No element with id='switcher_per_game_stats' found in player/{}.html".format(year)
        )

    # Wrap in StringIO: passing literal HTML to read_html is deprecated in
    # recent pandas releases. [0] takes the first table inside the element.
    player = pd.read_html(StringIO(str(player_table)))[0]
    player['Year'] = year

    dfs.append(player)

In [13]:
# Stack the per-season frames row-wise; each frame keeps its own row labels.
players = pd.concat(dfs, axis=0, ignore_index=False)

In [14]:
# Keep only rows that have an "Age" value.
# NOTE(review): presumably this removes non-player rows such as summary or
# filler lines in the scraped table -- confirm against a saved page.
players = players.dropna(subset=['Age'], how='any')

In [15]:
# Export the cleaned player stats as CSV (row labels are written as the first column).
players.to_csv(path_or_buf='players.csv')

In [16]:
# Row labels repeat across seasons after the concat. Move them into an "index"
# column and give rows unique labels (needed by the default-orient JSON export).
players = players.reset_index(drop=False)

In [17]:
# Export the cleaned player stats as JSON (pandas' default orientation).
players.to_json(path_or_buf='players.json')

### Active Players

In [18]:
# Build the "active players" frame from a single saved per-game stats page.
dfs = []

# Season whose page defines the active players.
# NOTE(review): hard-coded; keep in sync with the last season in `years`.
active_season = 2024
with open('player/{}.html'.format(active_season), encoding="utf-8") as f:
    page = f.read()

soup = BeautifulSoup(page, "html.parser")
active_player_table = soup.find(id="switcher_per_game_stats")
if active_player_table is None:
    # Without this guard, str(None) is handed to read_html and the failure
    # message no longer points at the offending file.
    raise ValueError(
        "No element with id='switcher_per_game_stats' found in player/{}.html".format(active_season)
    )

# Wrap in StringIO: passing literal HTML to read_html is deprecated in recent
# pandas releases. [0] takes the first table inside the element.
active_player = pd.read_html(StringIO(str(active_player_table)))[0]

# Kept as a one-element list because the next cell concatenates `dfs`.
dfs.append(active_player)

In [19]:
# Combine the collected frame(s) into the `active_players` frame.
active_players = pd.concat(dfs, axis=0, ignore_index=False)

In [20]:
# Keep only rows that have an "Age" value.
# NOTE(review): presumably this removes non-player rows in the scraped table -- confirm.
active_players = active_players.dropna(subset=['Age'], how='any')

In [21]:
# Move the current row labels into an "index" column and renumber rows from 0.
active_players = active_players.reset_index(drop=False)

In [23]:
# Export as JSON Lines: one JSON object per player row, one row per line.
active_players.to_json(
    path_or_buf='active_players.json',
    orient='records',
    lines=True,
)

### Teams

In [None]:
# URL template for one season's standings page; "{}" is replaced by the season year.
team_stats_url = 'https://www.basketball-reference.com/leagues/NBA_{}_standings.html'

In [None]:
# Download the standings page of every season in `years` and save it as
# team/<year>.html for the parsing cell.

# Create the session here, as the other download cells do, so this section
# does not depend on a `scraper` left over from an earlier cell.
scraper = cloudscraper.create_scraper()

# The output directory is not guaranteed to exist on a fresh checkout.
team_dir = Path("team")
team_dir.mkdir(parents=True, exist_ok=True)

# Pause between requests to stay under the site's request-rate limit.
# NOTE(review): Sports Reference's bot policy is understood to allow about
# 20 requests per minute -- confirm the current figure before lowering this.
request_delay_seconds = 3.5

for year in years:
    url = team_stats_url.format(year)

    # Explicit timeout: without one a stalled connection hangs the cell forever.
    data = scraper.get(url, timeout=30)
    # Fail loudly on 4xx/5xx instead of silently saving an error page.
    data.raise_for_status()

    (team_dir / "{}.html".format(year)).write_text(data.text, encoding="utf-8")
    time.sleep(request_delay_seconds)

In [None]:
# Parse the Eastern and Western conference standings tables out of every saved
# standings page and collect one frame per (season, conference) in `dfs`.
dfs = []

# (element id, name of the column holding the team names) for each conference.
conference_tables = (
    ("divs_standings_E", "Eastern Conference"),
    ("divs_standings_W", "Western Conference"),
)

for year in years:
    with open("team/{}.html".format(year), encoding="utf-8") as f:
        page = f.read()

    # Parse the page once and reuse the tree for both conferences.
    soup = BeautifulSoup(page, "html.parser")

    for table_id, conference_column in conference_tables:
        team_table = soup.find(id=table_id)
        if team_table is None:
            # Without this guard, str(None) is handed to read_html and the
            # failure message no longer says which season's file is bad.
            raise ValueError(
                "No element with id='{}' found in team/{}.html".format(table_id, year)
            )

        # Wrap in StringIO: passing literal HTML to read_html is deprecated in
        # recent pandas releases.
        team = pd.read_html(StringIO(str(team_table)))[0]
        team['Year'] = year
        # Rename the conference-specific column to a common "Team" column.
        # pop + assign leaves "Team" as the last column, after "Year".
        team['Team'] = team.pop(conference_column)
        dfs.append(team)

In [None]:
# Stack every (season, conference) frame row-wise; each keeps its own row labels.
teams = pd.concat(dfs, axis=0, ignore_index=False)

In [None]:
# Convert "W" to numbers; anything non-numeric becomes NaN and that row is dropped.
# NOTE(review): presumably this removes the division sub-header rows embedded
# in the standings tables -- confirm against a saved page.
wins = pd.to_numeric(teams["W"], errors="coerce")
teams = teams.assign(W=wins).dropna(subset=['W'])

In [None]:
# Export the cleaned standings as CSV (row labels are written as the first column).
teams.to_csv(path_or_buf='teams.csv')

In [None]:
# Row labels repeat across seasons and conferences after the concat. Move them
# into an "index" column and give rows unique labels (needed by the
# default-orient JSON export).
teams = teams.reset_index(drop=False)

In [None]:
# Export the cleaned standings as JSON (pandas' default orientation).
teams.to_json(path_or_buf='teams.json')