In [36]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [None]:
def scrapeData(statYear, statDay, statName, statAbbrev):
    """Scrape one TeamRankings NCAA basketball stat table into a DataFrame.

    Fetches ``https://www.teamrankings.com/ncaa-basketball/stat/{statName}``
    with the query ``?date={statYear}-{statDay}`` and reads the first
    ``<table>`` element found on the page.

    Args:
        statYear: Year part of the ``date`` query parameter. It is also
            stored, converted with ``int``, in the ``year`` column.
        statDay: Remainder of the ``date`` query parameter (e.g. ``"05-01"``).
        statName: URL slug of the stat page.
        statAbbrev: Name of the column that receives the scraped value.

    Returns:
        pandas.DataFrame with the columns ``team``, ``year`` and
        ``statAbbrev``. The team and stat values are the stripped cell text,
        so the stat column holds strings, not numbers.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the page contains no ``<table>`` element.
    """
    url = f"https://www.teamrankings.com/ncaa-basketball/stat/{statName}?date={statYear}-{statDay}"
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    table = soup.find("table")
    if table is None:
        raise ValueError(f"No <table> found at {url}")

    year = int(statYear)
    columns = ["team", "year", statAbbrev]
    data = []

    # The first <tr> is skipped; presumably it is the header row - confirm
    # against the live page layout.
    for row in table.find_all("tr")[1:]:
        cells = row.find_all("td")
        # Rows without at least three <td> cells cannot provide both the team
        # (second cell) and the value (third cell), so they are skipped
        # rather than raising IndexError.
        if len(cells) < 3:
            continue
        data.append(
            {
                "team": cells[1].text.strip(),
                "year": year,
                statAbbrev: cells[2].text.strip(),
            }
        )

    return pd.DataFrame(data, columns=columns)


def compileData(statYear, statDay, stats):
    """Scrape several stats for one date and join them into one DataFrame.

    Each entry of ``stats`` is scraped with ``scrapeData`` and sorted by
    ``team``. The first frame keeps its ``year`` column; every later frame
    has ``year`` dropped and is merged onto the running result on ``team``.
    ``DataFrame.merge`` is called with its default join type, so a team that
    is missing from any one stat table does not appear in the result.

    Args:
        statYear: Year passed through to ``scrapeData``.
        statDay: Day string passed through to ``scrapeData``.
        stats: Iterable of dicts, each with a ``"name"`` key (stat URL slug)
            and an ``"abbrev"`` key (output column name).

    Returns:
        pandas.DataFrame with ``team``, ``year`` and one column per stat.

    Raises:
        ValueError: If ``stats`` is empty.
    """
    combinedDf = None
    for stat in stats:
        # sort_values returns a new frame; no in-place mutation is needed.
        statDf = scrapeData(
            statYear, statDay, stat["name"], stat["abbrev"]
        ).sort_values(by="team")

        if combinedDf is None:
            # Only the first frame contributes the shared "year" column.
            combinedDf = statDf
        else:
            combinedDf = combinedDf.merge(statDf.drop(columns=["year"]), on="team")

    if combinedDf is None:
        # Previously this surfaced as an opaque IndexError on dfs[0].
        raise ValueError("stats must contain at least one stat definition")

    return combinedDf

In [37]:
# Full stat set: the ten summary stats first, then each team stat followed by
# its opponent counterpart. Every entry is a dict with "name" (stat URL slug)
# and "abbrev" (output column name), built from (slug, abbrev) pairs.
teamPlusOppStats = [
    {"name": slug, "abbrev": abbrev}
    for slug, abbrev in (
        ("offensive-efficiency", "OffEff"),
        ("defensive-efficiency", "DefEff"),
        ("average-scoring-margin", "ASM"),
        ("opponent-average-scoring-margin", "OpASM"),
        ("effective-possession-ratio", "EfcPos"),
        ("opponent-effective-possession-ratio", "OpEfcPos"),
        ("win-pct-all-games", "W%"),
        ("win-pct-close-games", "ClsW%"),
        ("opponent-win-pct-all-games", "OpW%"),
        ("opponent-win-pct-close-games", "OpClsW%"),
        ("points-per-game", "PPG"),
        ("opponent-points-per-game", "OpPPG"),
        ("percent-of-points-from-2-pointers", "2pPts%"),
        ("opponent-percent-of-points-from-2-pointers", "Op2pPts%"),
        ("percent-of-points-from-3-pointers", "3pPts%"),
        ("opponent-percent-of-points-from-3-pointers", "Op3pPts%"),
        ("percent-of-points-from-free-throws", "FTPts%"),
        ("opponent-percent-of-points-from-free-throws", "OpFTPts%"),
        ("shooting-pct", "FG%"),
        ("opponent-shooting-pct", "OpFG%"),
        ("effective-field-goal-pct", "EfcFG%"),
        ("opponent-effective-field-goal-pct", "OpEfcFG%"),
        ("two-point-pct", "2p%"),
        ("opponent-two-point-pct", "Op2p%"),
        ("three-point-pct", "3p%"),
        ("opponent-three-point-pct", "Op3p%"),
        ("free-throw-pct", "FT%"),
        ("opponent-free-throw-pct", "OpFT%"),
        ("total-rebounds-per-game", "RPG"),
        ("opponent-total-rebounds-per-game", "OpRPG"),
        ("offensive-rebounds-per-game", "OffRPG"),
        ("opponent-offensive-rebounds-per-game", "OpOffRPG"),
        ("defensive-rebounds-per-game", "DefRPG"),
        ("opponent-defensive-rebounds-per-game", "OpDefRPG"),
        ("assists-per-game", "AstPG"),
        ("opponent-assists-per-game", "OpAstPG"),
        ("turnovers-per-game", "TovPG"),
        ("opponent-turnovers-per-game", "OpTovPG"),
        ("blocks-per-game", "BlkPG"),
        ("opponent-blocks-per-game", "OpBlkPG"),
        ("steals-per-game", "StlPG"),
        ("opponent-steals-per-game", "OpStlPG"),
        ("personal-fouls-per-game", "FlsPG"),
        ("opponent-personal-fouls-per-game", "OpFlsPG"),
    )
]

In [41]:
# Summary stats only (efficiency, scoring margin, possession ratio and win
# percentages, each with its opponent counterpart where listed). Every entry
# is a dict with "name" (stat URL slug) and "abbrev" (output column name).
keyCompStats = [
    dict(zip(("name", "abbrev"), pair))
    for pair in (
        ("offensive-efficiency", "OffEff"),
        ("defensive-efficiency", "DefEff"),
        ("average-scoring-margin", "ASM"),
        ("opponent-average-scoring-margin", "OpASM"),
        ("effective-possession-ratio", "EfcPos"),
        ("opponent-effective-possession-ratio", "OpEfcPos"),
        ("win-pct-all-games", "W%"),
        ("win-pct-close-games", "ClsW%"),
        ("opponent-win-pct-all-games", "OpW%"),
        ("opponent-win-pct-close-games", "OpClsW%"),
    )
]

In [42]:
# Team-side box-score stats (no opponent counterparts). Every entry is a dict
# with "name" (stat URL slug) and "abbrev" (output column name).
teamOnlyStats = [
    {"name": slug, "abbrev": abbrev}
    for slug, abbrev in (
        ("points-per-game", "PPG"),
        ("percent-of-points-from-2-pointers", "2pPts%"),
        ("percent-of-points-from-3-pointers", "3pPts%"),
        ("percent-of-points-from-free-throws", "FTPts%"),
        ("shooting-pct", "FG%"),
        ("effective-field-goal-pct", "EfcFG%"),
        ("two-point-pct", "2p%"),
        ("three-point-pct", "3p%"),
        ("free-throw-pct", "FT%"),
        ("total-rebounds-per-game", "RPG"),
        ("offensive-rebounds-per-game", "OffRPG"),
        ("defensive-rebounds-per-game", "DefRPG"),
        ("assists-per-game", "AstPG"),
        ("turnovers-per-game", "TovPG"),
        ("blocks-per-game", "BlkPG"),
        ("steals-per-game", "StlPG"),
        ("personal-fouls-per-game", "FlsPG"),
    )
]

In [43]:
# Opponent-side box-score stats. Every entry is a dict with "name" (stat URL
# slug) and "abbrev" (output column name).
oppOnlyStats = [
    dict(name=slug, abbrev=abbrev)
    for slug, abbrev in (
        ("opponent-points-per-game", "OpPPG"),
        ("opponent-percent-of-points-from-2-pointers", "Op2pPts%"),
        ("opponent-percent-of-points-from-3-pointers", "Op3pPts%"),
        ("opponent-percent-of-points-from-free-throws", "OpFTPts%"),
        ("opponent-shooting-pct", "OpFG%"),
        ("opponent-effective-field-goal-pct", "OpEfcFG%"),
        ("opponent-two-point-pct", "Op2p%"),
        ("opponent-three-point-pct", "Op3p%"),
        ("opponent-free-throw-pct", "OpFT%"),
        ("opponent-total-rebounds-per-game", "OpRPG"),
        ("opponent-offensive-rebounds-per-game", "OpOffRPG"),
        ("opponent-defensive-rebounds-per-game", "OpDefRPG"),
        ("opponent-assists-per-game", "OpAstPG"),
        ("opponent-turnovers-per-game", "OpTovPG"),
        ("opponent-blocks-per-game", "OpBlkPG"),
        ("opponent-steals-per-game", "OpStlPG"),
        ("opponent-personal-fouls-per-game", "OpFlsPG"),
    )
]

In [44]:
def collectHistoric(stats, filename, dir="../data", years=None):
    """Scrape the given stats for each historic season and write one CSV each.

    For every year, ``compileData`` is called with ``statDay="05-01"`` and the
    result is written to ``{dir}/{filename}{YY}.csv`` (``YY`` is the year
    string from its third character on, e.g. ``"14"`` for ``"2014"``),
    without the index.

    Args:
        stats: Iterable of stat definitions forwarded to ``compileData``.
        filename: File name prefix placed in front of the two-digit year.
        dir: Output directory. It is created if it does not exist.
        years: Optional iterable of years (strings or ints). Defaults to
            2014-2019 and 2021-2023.

    Returns:
        None. The CSV files are the only output.
    """
    import os

    if years is None:
        # 2020 is intentionally absent from the default list; presumably
        # because that season has no usable end-of-season data - confirm.
        years = (
            "2014",
            "2015",
            "2016",
            "2017",
            "2018",
            "2019",
            "2021",
            "2022",
            "2023",
        )

    # Create the target directory up front so a missing folder does not
    # surface only after a full year of pages has already been scraped.
    if dir:
        os.makedirs(dir, exist_ok=True)

    for year in years:
        year = str(year)  # allow ints; the slice below needs a string
        df = compileData(statYear=year, statDay="05-01", stats=stats)
        df.to_csv(f"{dir}/{filename}{year[2:]}.csv", index=False)
    
def collectCurrent(stats, filename, dir="../data", year="2024", day="03-18"):
    """Scrape the given stats for the current season and write them to a CSV.

    Calls ``compileData`` for ``year``/``day`` and writes the result, without
    the index, to ``{dir}/{filename}{YY}.csv`` where ``YY`` is the year string
    from its third character on. With the defaults this is
    ``{dir}/{filename}24.csv``.

    Args:
        stats: Iterable of stat definitions forwarded to ``compileData``.
        filename: File name prefix placed in front of the two-digit year.
        dir: Output directory. It is created if it does not exist.
        year: Season year (string or int) used for the snapshot date.
        day: Month-day part of the snapshot date, e.g. ``"03-18"``.

    Returns:
        None. The CSV file is the only output.
    """
    import os

    year = str(year)  # allow ints; the slice below needs a string

    # Create the target directory before scraping so a missing folder does
    # not waste the whole scrape.
    if dir:
        os.makedirs(dir, exist_ok=True)

    df = compileData(statYear=year, statDay=day, stats=stats)
    df.to_csv(f"{dir}/{filename}{year[2:]}.csv", index=False)