In [1]:
from bs4 import BeautifulSoup
import requests as req
from time import sleep
import pandas as pd
import os
import time
from datetime import datetime as dt
from tqdm import tqdm
#from tqdm import tqdm


# configure pandas
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36"
    )
}

# Historical Odds Data - data not ready until Thursday
___

In [2]:
url = f"https://www.sportsoddshistory.com/nfl-game-season/?y=2022"
page = req.get(url, headers=headers).text
soup = BeautifulSoup(page, "html.parser")
tables = soup.find_all("table", attrs={"class":"soh1"})
pd_list = []
for i, table in enumerate(tables[2:]):
    df = pd.read_html(str(table))
    df = df[0]
    df.columns = df.iloc[-2]
    df = df.iloc[:-2,: ]
    df["week"] = i + 1
    pd_list.append(df)
df = pd.concat(pd_list)
print(df.head())
df.to_csv("data/nfl_weekly_betting_summary.csv")

ValueError: Plan shapes are not aligned

# Ranking Data
____

## Team Stats

In [3]:
date = "2023-02-09"
url = "https://www.teamrankings.com/nfl/stats/"
page = req.get(url, headers=headers).text
soup = BeautifulSoup(page, "html.parser")
pbar = tqdm([a["href"] for a in soup.find(attrs={"class":"column large-2"}).find_all("a") if a["href"] != "#"])

for end_point in pbar:
    time.sleep(.25)
    url = f"https://www.teamrankings.com{end_point}?date={date}"
    page = req.get(url, headers=headers).text
    soup = BeautifulSoup(page, "html.parser")
    table = soup.find("table")
    df = pd.read_html(str(table))[0].replace("--",0)
    df["asof"] = date
    df["url"] = end_point
    file_name = end_point.replace('/', '_').replace("-","_") + "_" + date.replace("-", "")
    full_file_path = f"data/team_ranks/{file_name}.csv"
    df.to_csv(full_file_path)
    pbar.set_description(full_file_path)
print(df.head())

data/team_ranks/_nfl_stat_opponent_penalties_per_play_20230209.csv: 100%|██████████| 226/226 [02:36<00:00,  1.44it/s]                     

   Rank       Team  2022  Last 3  Last 1  Home  Away  2021        asof                                    url
0     1  Tennessee  0.05    0.03    0.03  0.05  0.06  0.05  2023-02-09  /nfl/stat/opponent-penalties-per-play
1     2   Carolina  0.05    0.04    0.02  0.05  0.05  0.05  2023-02-09  /nfl/stat/opponent-penalties-per-play
2     3  NY Giants  0.05    0.03    0.04  0.05  0.05  0.05  2023-02-09  /nfl/stat/opponent-penalties-per-play
3     4     Denver  0.05    0.03    0.04  0.06  0.04  0.05  2023-02-09  /nfl/stat/opponent-penalties-per-play
4     5  Minnesota  0.05    0.03    0.04  0.05  0.05  0.04  2023-02-09  /nfl/stat/opponent-penalties-per-play





## Player Stats

In [4]:
url = "https://www.teamrankings.com/nfl/stats/"
page = req.get(url, headers=headers).text
soup = BeautifulSoup(page, "html.parser")

pbar = tqdm([a["href"] for a in soup.find_all(attrs={"class":"column large-2"})[-1].find_all("a") if a["href"] != "#"])

for end_point in pbar:
    time.sleep(.25)
    url = f"https://www.teamrankings.com{end_point}?season_id=20"
    page = req.get(url, headers=headers).text
    soup = BeautifulSoup(page, "html.parser")
    table = soup.find("table")
    df = pd.read_html(str(table))[0].replace("--",0)
    df["asof"] = dt.now()
    df["url"] = end_point
    file_name = end_point.replace('/', '_').replace("-","_") + "_" + pd.to_datetime(date).strftime("%Y%m%d").replace("-", "")
    full_file_path = f"data/team_ranks/{file_name}.csv"
    df.to_csv(full_file_path)
    pbar.set_description(full_file_path)
print(df.head())

data/team_ranks/_nfl_player_stat_scoring_total_points_20230209.csv: 100%|██████████| 112/112 [01:27<00:00,  1.27it/s]                   

   Rank         Player                 Team Pos  Value                       asof                                    url
0     1   Robbie Gould  San Francisco 49ers   K    160 2023-02-09 11:53:33.344206  /nfl/player-stat/scoring-total-points
1     2    Jason Myers     Seattle Seahawks   K    148 2023-02-09 11:53:33.344206  /nfl/player-stat/scoring-total-points
2     3  Justin Tucker     Baltimore Ravens   K    147 2023-02-09 11:53:33.344206  /nfl/player-stat/scoring-total-points
3     4    Brett Maher       Dallas Cowboys   K    144 2023-02-09 11:53:33.344206  /nfl/player-stat/scoring-total-points
4     5     Tyler Bass        Buffalo Bills   K    143 2023-02-09 11:53:33.344206  /nfl/player-stat/scoring-total-points





## Team Schedules


In [5]:
url = "https://www.teamrankings.com/nfl/"
page = req.get(url, headers=headers).text
soup = BeautifulSoup(page, "html.parser")

pbar = tqdm([li.find("a")["href"] for li in soup.find("div", attrs={"data-section":"nfl"}).find_all(attrs={"class":"teams-group"})[0].find_all("li")])

for end_point in pbar:
    time.sleep(.25)
    url = f"https://www.teamrankings.com{end_point}"
    page = req.get(url, headers=headers).text
    soup = BeautifulSoup(page, "html.parser")
    table = soup.find("table", attrs={"class":"tr-table datatable scrollable"})
    df = pd.read_html(str(table), displayed_only=False)[0].replace("--",0)
    df["asof"] = dt.now()
    df["url"] = end_point
    file_name = end_point.replace('/', '_').replace("-","_") + "_" + pd.to_datetime(date).strftime("%Y%m%d").replace("-", "")    
    
    full_file_path = f"data/team_ranks/{file_name}.csv"
    df.to_csv(full_file_path)
    pbar.set_description(full_file_path)

print(df.head())

data/team_ranks/_nfl_team_washington_commanders__20230209.csv: 100%|██████████| 32/32 [00:20<00:00,  1.58it/s]

    Date      Opponent   Result Location  W/L  Div  Spread    Total  Money                       asof                               url
0  09/11  Jacksonville  W 28-22     Home  1-0  0-0    -3.0  Ov 43.0   -157 2023-02-09 11:53:53.948050  /nfl/team/washington-commanders/
1  09/18       Detroit  L 27-36     Away  1-1  0-0  (Pick)  Ov 48.0   -115 2023-02-09 11:53:53.948050  /nfl/team/washington-commanders/
2  09/25  Philadelphia   L 8-24     Home  1-2  0-1    +5.5  Un 47.5    200 2023-02-09 11:53:53.948050  /nfl/team/washington-commanders/
3  10/02        Dallas  L 10-25     Away  1-3  0-2    +3.0  Un 41.0    140 2023-02-09 11:53:53.948050  /nfl/team/washington-commanders/
4  10/09     Tennessee  L 17-21     Home  1-4  0-2    +0.5  Un 43.0   -110 2023-02-09 11:53:53.948050  /nfl/team/washington-commanders/





In [6]:
url = "https://www.teamrankings.com/nfl/schedules/season/?week=557"
page = req.get(url, headers=headers).text
soup = BeautifulSoup(page, "html.parser")
table = soup.find("table")
df = pd.read_html(str(table), displayed_only=False)[0]
df.columns = ["teams", "time", "location"]


# 557 == week 11
url = "https://www.teamrankings.com/nfl/schedules/season/?week=557"
page = req.get(url, headers=headers).text
soup = BeautifulSoup(page, "html.parser")
table = soup.find("table")
df_schedule = pd.read_html(str(table), displayed_only=False)[0]
df_schedule.columns = ["teams", "time", "location"]
df_schedule["home"] = df_schedule.teams.str.split("@", expand=True)[1].str.strip("")
df_schedule["away"] = df_schedule.teams.str.split("@", expand=True)[0].str.strip()
df_schedule


Unnamed: 0,teams,time,location,home,away
0,Tennessee @ Green Bay,8:15 PM,Lambeau Field,Green Bay,Tennessee
1,Detroit @ NY Giants,1:00 PM,MetLife Stadium,NY Giants,Detroit
2,LA Rams @ New Orleans,1:00 PM,Caesars Superdome,New Orleans,LA Rams
3,NY Jets @ New England,1:00 PM,Gillette Stadium,New England,NY Jets
4,Philadelphia @ Indianapolis,1:00 PM,Lucas Oil Stadium,Indianapolis,Philadelphia
5,Washington @ Houston,1:00 PM,NRG Stadium,Houston,Washington
6,Cleveland vs. Buffalo,1:00 PM,Ford Field,,Cleveland vs. Buffalo
7,Carolina @ Baltimore,1:00 PM,M&T Bank Stadium,Baltimore,Carolina
8,Chicago @ Atlanta,1:00 PM,Mercedes-Benz Stadium,Atlanta,Chicago
9,Las Vegas @ Denver,4:05 PM,Empower Field at Mile High,Denver,Las Vegas
