#### Import Necessary Packages

In [0]:
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup
from pyspark.sql.functions import regexp_extract
from pyspark.sql.functions import col, count, upper

In [0]:
# Load the team reference CSV; the first row is the header and Spark infers
# the column types.
teams = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("21.22_teamdata.csv")
)

# Keep only the full team name and its abbreviation.
teams_abbrv = teams.select(["TEAM", "TEAM_ABBRV"])

#### Save Necessary HTML Webpages

In [0]:
# Season start years 2017 through 2024 (the upper bound 2025 is exclusive).
years = [*range(2017, 2025)]

In [0]:
# URL template for a season's team-stats page; the two placeholders are
# filled with the season's start and end year.
url_start = (
    "https://www.nbastuffer.com/"
    "{}-{}-nba-team-stats/"
)

In [0]:
# Download every season's team-stats page and save it as "<start year>.html".
for year in years:
    url = url_start.format(year, year + 1)

    # A timeout keeps a stalled connection from hanging the notebook, and
    # raise_for_status() stops an HTTP error page from being saved as data.
    data = requests.get(url, timeout=30)
    data.raise_for_status()

    # The f-prefix is required: without it every season was written to the
    # same literal file "{year}.html", each download overwriting the last.
    # NOTE(review): the load step reads from /dbfs/mnt/<year>.html while this
    # writes to the current working directory -- confirm how the files get
    # there.
    with open(f"{year}.html", "w") as f:
        f.write(data.text)

#### Load Necessary HTML Webpages

In [0]:
# Read the saved HTML for each season start year (2017-2024) from /dbfs/mnt.
# One loop replaces eight copy-pasted blocks whose f-strings had no
# placeholders.
_saved_pages = []
for _season_start in range(2017, 2025):
    with open(f"/dbfs/mnt/{_season_start}.html", "r") as f:
        _saved_pages.append(f.read())

# Keep the per-season names used by the rest of the notebook:
# page -> 2017, page1 -> 2018, ..., page7 -> 2024.
page, page1, page2, page3, page4, page5, page6, page7 = _saved_pages

#### Webscrape Each Webpage and Save as DataFrame

In [0]:
def _scrape_season_table(html, table_id, season, drop_first_row=True):
    """Parse one saved season page into a pandas DataFrame.

    Args:
        html: Raw HTML text of the saved page.
        table_id: ``id`` attribute of the stats table element on that page.
        season: Label stored in the ``year`` column, e.g. ``"2017-2018"``.
        drop_first_row: When true, remove the first ``<tr class="row-1">``
            found anywhere in the page before the table is parsed.

    Returns:
        The first table pandas parses out of the selected element, with an
        added ``year`` column holding ``season``.

    Raises:
        ValueError: If the row to drop, or the table element, is not present.
    """
    soup = BeautifulSoup(html, "html.parser")

    if drop_first_row:
        # Presumably an extra header row that would otherwise end up in the
        # parsed columns -- TODO confirm against the saved pages.
        first_row = soup.find("tr", class_="row-1")
        if first_row is None:
            raise ValueError(
                f"no <tr class='row-1'> found in the {season} page"
            )
        first_row.decompose()

    table = soup.find(id=table_id)
    if table is None:
        raise ValueError(f"no element with id={table_id!r} in the {season} page")

    # Passing literal HTML to read_html is deprecated (pandas >= 2.1); a
    # file-like wrapper works on old and new versions alike.
    frame = pd.read_html(StringIO(str(table)))[0]
    frame["year"] = season
    return frame


stats = _scrape_season_table(page, "tablepress-9", "2017-2018")
stats1 = _scrape_season_table(page1, "tablepress-26", "2018-2019")
stats2 = _scrape_season_table(page2, "tablepress-49", "2019-2020")

# The 2020-2021 page (page3) is not parsed; its block was commented out in
# the original notebook. NOTE(review): that disabled code used id
# "tablepress-9", the same id as 2017-2018 -- confirm the right id before
# re-enabling.

stats4 = _scrape_season_table(page4, "tablepress-78", "2021-2022")

# From 2022-2023 onward the original notebook no longer removed the first
# row, so these seasons are parsed as-is.
stats5 = _scrape_season_table(page5, "tablepress-95", "2022-2023", drop_first_row=False)
stats6 = _scrape_season_table(page6, "tablepress-109", "2023-2024", drop_first_row=False)
stats7 = _scrape_season_table(page7, "tablepress-122", "2024-2025", drop_first_row=False)

In [0]:
# Convert each season's pandas frame into a Spark DataFrame, in the same
# order as before (2020-2021 has no frame).
(
    stats_df,
    stats1_df,
    stats2_df,
    stats4_df,
    stats5_df,
    stats6_df,
    stats7_df,
) = [
    spark.createDataFrame(season_frame)
    for season_frame in (stats, stats1, stats2, stats4, stats5, stats6, stats7)
]

#### Clean Data and Select Pertinent Variables

In [0]:

# Replace TEAM with capture group 1 of the pattern: the shortest text that
# sits between a "]" and the following "[/".
# Presumably the raw cell wraps the team name in bracketed markup -- TODO
# confirm against the scraped values.
_team_name = regexp_extract(col("TEAM"), r"\](.*?)\[/", 1)

stats_df = stats_df.withColumn("TEAM", _team_name)
stats1_df = stats1_df.withColumn("TEAM", _team_name)
stats2_df = stats2_df.withColumn("TEAM", _team_name)
stats4_df = stats4_df.withColumn("TEAM", _team_name)

In [0]:
# Columns kept for the 2022-2023 frame, in output order.
# NOTE(review): the later union is positional, so this order presumably has
# to line up with the other seasons' frames -- confirm.
# The last entry is 'year' (not 'Year'): that is how the column is created,
# so the select no longer depends on Spark's case-insensitive resolution.
_stats5_columns = [
    'RANK', 'TEAM', 'CONF', 'DIVISION', 'GP', 'PPG', 'oPPG', 'pDIFF',
    'PACE', 'oEFF', 'dEFF', 'eDIFF', 'SoS', 'rSoS', 'SAR', 'CONS', 'A4F',
    'W', 'L', 'WIN%', 'eWIN%', 'pWIN%', 'ACH', 'STRK', 'year',
]

# This season's TEAM column is treated as an abbreviation: rename it, look up
# the full name in teams_abbrv, and keep the listed columns. The select
# already leaves TEAM_ABBRV out, so no separate drop is needed.
# A left join keeps every stats row; an abbreviation missing from
# teams_abbrv yields a null TEAM.
stats5_df = (
    stats5_df
    .withColumnRenamed("TEAM", "TEAM_ABBRV")
    .join(teams_abbrv, on="TEAM_ABBRV", how="left")
    .select(*_stats5_columns)
)

In [0]:
# Stack the season frames newest-first into one DataFrame.
# union() matches columns by position, not by name.
master = stats7_df
for older_season_df in (stats6_df, stats5_df, stats4_df, stats2_df, stats1_df, stats_df):
    master = master.union(older_season_df)

In [0]:
# Remove the RANK column from the combined frame.
master_teamdata = master.drop("RANK")

#### Save DataFrame as CSV

In [0]:
# Collapse to a single partition, then write as CSV with a header row,
# replacing any existing output at this path.
single_partition = master_teamdata.coalesce(1)
csv_writer = single_partition.write.mode("overwrite").option("header", "true")
csv_writer.csv("master_agent_data.csv")