In [1]:
# from database.scrape import scrape_defense
import pandas as pd
import time
import re
import requests
from bs4 import BeautifulSoup as BS

In [2]:
def scrape_defense(
    year: int,
    defense: str,
    *,
    timeout: float = 30.0,
    verify: bool = False,
    max_games: int = 17,
) -> pd.DataFrame:
    """
    Scrape a team's opponent ("defensive") game log for one season from Pro Football Reference.

    The function downloads ``/teams/{defense}/{year}/gamelog/``, reads the HTML table whose
    id is ``gamelog_opp{year}`` into a DataFrame, flattens and normalizes the column names
    (upper case, ``.`` removed, ``/`` and spaces turned into ``_``, ``%`` turned into ``_PCT``),
    and adds ``YEAR``, ``DEF_TEAM`` and ``OPP_CODE`` columns.

    Parameters:
    -----------
    year : int
        The season to scrape (e.g., 2023).
    defense : str
        The team code used in the site's URLs (e.g., "clt"). It is upper-cased for ``DEF_TEAM``.
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up (default 30).
    verify : bool, optional
        Passed to ``requests.get``. Defaults to ``False`` to keep the previous behavior;
        see the note in the body.
    max_games : int, optional
        Maximum number of opponent cells read from the page (default 17, the previously
        hard-coded value).

    Returns:
    --------
    pd.DataFrame
        One row per game-log row with the columns DEF_TEAM, OPP_CODE, YEAR, WEEK, SCORE_OPP,
        the PASSING_* columns and the RUSHING_* columns selected at the end of the function.
        ``OPP_CODE`` is ``None`` for a row whose opponent link could not be parsed.

    Raises:
    -------
    requests.HTTPError
        If the server answers with an error status (for example when rate-limited).
    ValueError
        If the expected table is not on the page, or if the number of opponent cells
        does not match the number of table rows.
    KeyError
        If an expected column (e.g. ``WEEK``) is missing after normalization.
    """
    from io import StringIO  # local import so the function does not depend on another cell

    url = f"https://www.pro-football-reference.com/teams/{defense}/{year}/gamelog/"

    # NOTE(review): verify=False disables TLS certificate checking. It is kept as the
    # default only for backward compatibility; pass verify=True unless a proxy prevents it.
    # A timeout is always set so a stalled connection cannot hang the notebook.
    res = requests.get(url, verify=verify, timeout=timeout)
    res.raise_for_status()

    soup = BS(res.content, "html.parser")

    table = soup.find("table", {"id": f"gamelog_opp{year}"})
    if table is None:
        raise ValueError(f"Table 'gamelog_opp{year}' not found at {url}")

    # Opponent codes are taken from the team links inside the page's "opp" cells.
    # The search stays page-wide, exactly as before, rather than being scoped to the table.
    opp_codes = []
    for cell in soup.find_all("td", {"data-stat": "opp"}, limit=max_games):
        # Regular expression to match the three-letter team code
        match = re.search(r"/teams/([a-z]{3})/", str(cell))
        if match:
            opp_codes.append(match.group(1).upper())
        else:
            # Record a missing value instead of re-using the previous row's code
            # (or failing with NameError on the first row).
            print("No match found")
            opp_codes.append(None)

    # Wrap the markup in StringIO: passing literal HTML to read_html is deprecated.
    df = pd.read_html(StringIO(str(table)))[0]

    if len(opp_codes) != len(df):
        raise ValueError(
            f"Found {len(opp_codes)} opponent cells but the game log has {len(df)} rows "
            f"for {defense!r} in {year}"
        )

    # Two-level headers become "TOP_BOTTOM"; a flat header is left untouched because
    # joining a plain string would insert "_" between its characters.
    if isinstance(df.columns, pd.MultiIndex):
        df.columns = ["_".join(map(str, col)).strip() for col in df.columns.values]
    df = df.rename(columns={"Unnamed: 0_level_0_Week": "WEEK"})

    # regex=False makes every replacement literal on all pandas versions and avoids the
    # warning about single-character patterns.
    columns = df.columns.str.upper()
    for old, new in ((".", ""), ("/", "_"), (" ", "_"), ("%", "_PCT")):
        columns = columns.str.replace(old, new, regex=False)
    df.columns = columns

    df["YEAR"] = year
    df["WEEK"] = df["WEEK"].astype(float)
    df["DEF_TEAM"] = defense.upper()
    df["OPP_CODE"] = opp_codes

    df_def = df[
        [
            "DEF_TEAM",
            "OPP_CODE",
            "YEAR",
            "WEEK",
            "SCORE_OPP",
            "PASSING_CMP",
            "PASSING_ATT",
            "PASSING_YDS",
            "PASSING_TD",
            "PASSING_INT",
            "PASSING_SK",
            "PASSING_Y_A",
            "PASSING_NY_A",
            "PASSING_CMP_PCT",
            "PASSING_RATE",
            "RUSHING_ATT",
            "RUSHING_YDS",
            "RUSHING_Y_A",
            "RUSHING_TD",
        ]
    ]

    return df_def

In [3]:
# Example run: 2024 opponent game log for team code "clt" (displayed as the cell output).
scrape_defense(defense="clt", year=2024)

  df.columns = df.columns.str.replace(".", "")
  df.columns = df.columns.str.replace(".", "")


Unnamed: 0,DEF_TEAM,OPP_CODE,YEAR,WEEK,SCORE_OPP,PASSING_CMP,PASSING_ATT,PASSING_YDS,PASSING_TD,PASSING_INT,PASSING_SK,PASSING_Y_A,PASSING_NY_A,PASSING_CMP_PCT,PASSING_RATE,RUSHING_ATT,RUSHING_YDS,RUSHING_Y_A,RUSHING_TD
0,CLT,HTX,2024,1.0,,,,,,,,,,,,,,,
1,CLT,GNB,2024,2.0,,,,,,,,,,,,,,,
2,CLT,CHI,2024,3.0,,,,,,,,,,,,,,,
3,CLT,PIT,2024,4.0,,,,,,,,,,,,,,,
4,CLT,JAX,2024,5.0,,,,,,,,,,,,,,,
5,CLT,OTI,2024,6.0,,,,,,,,,,,,,,,
6,CLT,MIA,2024,7.0,,,,,,,,,,,,,,,
7,CLT,HTX,2024,8.0,,,,,,,,,,,,,,,
8,CLT,MIN,2024,9.0,,,,,,,,,,,,,,,
9,CLT,BUF,2024,10.0,,,,,,,,,,,,,,,


In [4]:
# URL = f"https://www.pro-football-reference.com/teams/clt/2024/gamelog/"

# res = requests.get(URL, verify=False)

# soup = BS(res.content, "html.parser")

# table = soup.find_all("table", {"id": "gamelog_opp2024"})
# test = soup.find_all("td", {"data-stat":"opp"}, limit=17)

In [5]:
# import re
# code_list = []
# for i in test: 
#     html_string = str(i)

#     # Regular expression to match the three-letter team code
#     match = re.search(r'/teams/([a-z]{3})/', html_string)

#     if match:
#         team_code = match.group(1)
#     else:
#         print("No match found")
#     code_list.append(team_code)
# code_list

In [6]:
# len(code_list)