In [21]:
%pip install lxml
%pip install importlib
import importlib

# Check if the package is installed, and if not, install it
def install_package(package_name):
    try:
        importlib.import_module(package_name)
    except ImportError:
        import subprocess
        subprocess.check_call(['pip', 'install', package_name])

# Check and install required packages
required_packages = ['pandas', 'requests', 'beautifulsoup4', 'nbformat', 'io']

for package in required_packages:
    install_package(package)

# Now you can safely import the packages
import pandas as pd
import requests
from bs4 import BeautifulSoup
import nbformat
from io import StringIO

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


# Scrape basic stat datasets from FantasyPros.com

Scrape overall scoring data

In [5]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

# List of URLs to scrape for fantasy stats
urls = [
    'https://www.fantasypros.com/nfl/stats/qb.php?scoring=HALF&roster=y',
    'https://www.fantasypros.com/nfl/stats/rb.php?scoring=HALF&roster=y',
    'https://www.fantasypros.com/nfl/stats/wr.php?scoring=HALF&roster=y',
    'https://www.fantasypros.com/nfl/stats/te.php?scoring=HALF&roster=y'
]

# Initialize an empty list to store DataFrames
data_frames = []

for url in urls:
    # Send an HTTP GET request to the URL
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content of the page
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Find the table on the page
        table = soup.find('table')

        # Read the table into a Pandas DataFrame
        df = pd.read_html(str(table), header=[0, 1])[0]
        
        # Add a "LOC" column to the DataFrame
        loc = url.split('/')[-1][:2]
        df[("LOC", "POS")] = loc

        data_frames.append(df)
    else:
        print(f"Failed to retrieve data from {url}")

# Merge all DataFrames into one based on the first and second row headers
merged_df = pd.concat(data_frames, ignore_index=True)

# Combine values in column names (headers) and row 0
merged_df.columns = merged_df.columns.map(' '.join)

# Reset the index
merged_df.reset_index(drop=True, inplace=True)

# Rename columns as specified
merged_df = merged_df.rename(columns={"Unnamed: 0_level_0 Rank": "POS RANK", "Unnamed: 1_level_0 Player": "PLAYER", "LOC POS": "POS"})

merged_df.to_csv('datasets/overall_scoring.csv', index=False)

  df = pd.read_html(str(table), header=[0, 1])[0]


ImportError: Missing optional dependency 'lxml'.  Use pip or conda to install lxml.

Scrape snap counts data

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

# List of URLs to scrape for snap counts
snap_count_urls = [
    'https://www.fantasypros.com/nfl/reports/snap-counts/rb.php?show=perc',
    'https://www.fantasypros.com/nfl/reports/snap-counts/wr.php?show=perc',
    'https://www.fantasypros.com/nfl/reports/snap-counts/te.php?show=perc'
]

# Initialize an empty list to store DataFrames for snap counts
snap_count_data_frames = []

for url in snap_count_urls:
    # Send an HTTP GET request to the URL for snap counts
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content of the page
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Find the table on the page
        table = soup.find('table')

        # Read the table into a Pandas DataFrame
        df = pd.read_html(str(table), header=[0])[0]
        
        # Add a "POS" column to the DataFrame for snap counts
        pos = url.split('/')[-1][:2]
        df[("POS")] = pos

        snap_count_data_frames.append(df)
    else:
        print(f"Failed to retrieve data from {url} (snap counts)")

# Concatenate (append) all DataFrames for snap counts
snap_count_merged_df = pd.concat(snap_count_data_frames, ignore_index=True)

# If you want to save the data to a CSV file, you can do it like this:
snap_count_merged_df.to_csv('datasets/snap_counts.csv', index=False)

snap_count_merged_df.head(10)


  df = pd.read_html(str(table), header=[0])[0]
  df = pd.read_html(str(table), header=[0])[0]
  df = pd.read_html(str(table), header=[0])[0]


Unnamed: 0,Player,Team,1,2,3,4,5,6,7,8,...,12,13,14,15,16,17,18,TTL,AVG,POS
0,Isiah Pacheco,KC,48%,51%,42%,60%,59%,,,,...,,,,,,,,178,52%,rb
1,Clyde Edwards-Helaire,KC,22%,16%,30%,9%,14%,,,,...,,,,,,,,63,18%,rb
2,Jerick McKinnon,KC,31%,33%,29%,31%,27%,,,,...,,,,,,,,104,30%,rb
3,Jahmyr Gibbs,DET,27%,48%,60%,37%,0%,,,,...,,,,,,,,121,43%,rb
4,David Montgomery,DET,79%,45%,0%,71%,75%,,,,...,,,,,,,,183,68%,rb
5,Gus Edwards,BAL,23%,43%,44%,69%,43%,,,,...,,,,,,,,149,44%,rb
6,J.K. Dobbins,BAL,47%,0%,0%,0%,0%,,,,...,,,,,,,,30,47%,rb
7,Justice Hill,BAL,30%,57%,0%,12%,56%,,,,...,,,,,,,,108,40%,rb
8,Dameon Pierce,HOU,47%,45%,54%,59%,59%,,,,...,,,,,,,,180,52%,rb
9,Devin Singletary,HOU,21%,36%,39%,35%,29%,,,,...,,,,,,,,110,32%,rb


Push updated data file

In [None]:
import subprocess

# List of file paths to push
file_paths = ["datasets/"]

# Specify the GitHub repository URL
repo_url = "https://github.com/jtaylor515/FFanalysis.git"

# Specify your commit message
commit_message = "Update files"

# Git commands to add, commit, and push each file in the list
for file_path in file_paths:
    try:
        subprocess.run(["git", "add", file_path])
        subprocess.run(["git", "commit", "-m", commit_message])
        subprocess.run(["git", "push", repo_url])
        print(f"File {file_path} successfully pushed to the repository.")
    except Exception as e:
        print(f"Error: {e}")


[main 70f8cf6] Update files
 2 files changed, 1272 insertions(+)
 create mode 100644 datasets/overall_scoring.csv
 create mode 100644 datasets/snap_counts.csv
File datasets/ successfully pushed to the repository.


To https://github.com/jtaylor515/FFanalysis.git
   9648ec5..70f8cf6  main -> main


Scrape weekly scoring data

Generate URLs to scrape

In [22]:
# Get user input for the number of URLs to generate
num_urls = int(input("Enter the current week: "))

# Define the base URL and the page options
base_url = "https://www.fantasypros.com/nfl/stats/"

# List of page options
pages = ['qb.php', 'wr.php', 'rb.php', 'te.php']

# Initialize the list to store the generated URLs
urls = []

# Generate URLs based on user input
for page in pages:
    for week in range(1, num_urls + 1):
        url = f"{base_url}{page}?range=week&week={week}"
        urls.append(url)


In [25]:
# Initialize an empty DataFrame to store the data
final_dataset = pd.DataFrame()

# Iterate through the URLs
for url in urls:
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Assuming the data is in a table, you may need to adjust the code based on the actual structure
        table = soup.find('table')

        # Use io.StringIO to wrap the HTML content
        table_string = str(table)
        table_io = StringIO(table_string)
        
        # Read the table into a DataFrame
        df = pd.read_html(table_io)[0]

        # Add a "LOC" column to the DataFrame
        loc = url.split('/')[-1][:2]
        df[("LOC", "POS")] = loc
        
        # Extract week value from the URL
        week_value = int(url.split('week=')[1])
        
        # Add a new 'Week' column with the week value
        df['WEEK'] = week_value
        
        # Concatenate the DataFrame to the final dataset
        final_dataset = pd.concat([final_dataset, df], ignore_index=True)
    else:
        print(f"Failed to fetch data from URL: {url}")

# Now, final_dataset contains the combined data with a 'Week' column
# final_dataset.head(10)

# Combine values in column names (headers) and row 0
final_dataset.columns = final_dataset.columns.map(' '.join)

# Reset the index
final_dataset.reset_index(drop=True, inplace=True)

# Rename columns as specified
final_dataset = final_dataset.rename(columns={"Unnamed: 0_level_0 Rank": "POS RANK", "Unnamed: 1_level_0 Player": "PLAYER", "LOC POS": "POS"})

final_dataset.to_csv('datasets/weekly_scoring.csv', index=False)

final_dataset.head(10)


Unnamed: 0,POS RANK,PLAYER,PASSING CMP,PASSING ATT,PASSING PCT,PASSING YDS,PASSING Y/A,PASSING TD,PASSING INT,PASSING SACKS,...,RECEIVING REC,RECEIVING TGT,RECEIVING YDS,RECEIVING Y/R,RECEIVING LG,RECEIVING 20+,RECEIVING TD,RUSHING Y/A,RUSHING LG,RUSHING 20+
0,1,Tua Tagovailoa (MIA),28.0,45.0,62.2,466.0,10.4,3.0,1.0,0.0,...,,,,,,,,,,
1,2,Mac Jones (NE),35.0,54.0,64.8,316.0,5.9,3.0,1.0,2.0,...,,,,,,,,,,
2,3,Jordan Love (GB),15.0,27.0,55.6,245.0,9.1,3.0,0.0,1.0,...,,,,,,,,,,
3,4,Anthony Richardson (IND),24.0,37.0,64.9,223.0,6.0,1.0,1.0,4.0,...,,,,,,,,,,
4,5,Deshaun Watson (CLE),16.0,29.0,55.2,154.0,5.3,1.0,1.0,3.0,...,,,,,,,,,,
5,6,Justin Herbert (LAC),23.0,33.0,69.7,229.0,6.9,1.0,0.0,3.0,...,,,,,,,,,,
6,7,Patrick Mahomes II (KC),21.0,39.0,53.8,226.0,5.8,2.0,1.0,0.0,...,,,,,,,,,,
7,8,Trevor Lawrence (JAC),24.0,32.0,75.0,241.0,7.5,2.0,1.0,2.0,...,,,,,,,,,,
8,9,Kirk Cousins (MIN),33.0,44.0,75.0,344.0,7.8,2.0,1.0,2.0,...,,,,,,,,,,
9,10,Brock Purdy (SF),19.0,29.0,65.5,220.0,7.6,2.0,0.0,3.0,...,,,,,,,,,,
