In [40]:
# import modules
import os
import requests
import pandas as pd
from bs4 import BeautifulSoup
import csv
import re

In [None]:
# FantasyPros draft-projection pages, keyed by position group.
# The RB, WR, TE and FLEX pages are requested with `scoring=PPR`;
# the QB, K and DEF pages carry no scoring parameter.

url_dict = dict(
    QB="https://www.fantasypros.com/nfl/projections/qb.php?week=draft",
    RB="https://www.fantasypros.com/nfl/projections/rb.php?week=draft&scoring=PPR&week=draft",
    WR="https://www.fantasypros.com/nfl/projections/wr.php?week=draft&scoring=PPR&week=draft",
    TE="https://www.fantasypros.com/nfl/projections/te.php?week=draft&scoring=PPR&week=draft",
    FLEX="https://www.fantasypros.com/nfl/projections/flex.php?week=draft&scoring=PPR&week=draft",
    K="https://www.fantasypros.com/nfl/projections/k.php?week=draft",
    DEF="https://www.fantasypros.com/nfl/projections/dst.php?week=draft",
)

In [2]:
# Scrape the first HTML table on each projections page and save it as
# data/scraped/<position>_projections.csv.

# Upper bound (in seconds) on each HTTP request so a stalled connection
# cannot hang the notebook indefinitely.
REQUEST_TIMEOUT_SECONDS = 30

# to_csv does not create missing parent directories, so make sure the
# output directory exists before the first write.
os.makedirs("data/scraped", exist_ok=True)

for name, url in url_dict.items():
    # A failed request (network error, timeout, or HTTP error status) is
    # reported and skipped so the remaining positions are still scraped.
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"Skipping {name}: request to {url} failed ({exc})")
        continue

    soup = BeautifulSoup(response.text, "html.parser")

    # Only the first <table> element on the page is used.
    table = soup.find("table")
    if table is None:
        print(f"Skipping {name}: no <table> found at {url}")
        continue

    # Column names come from every <th> in the table; data rows come from
    # every <tr> that contains at least one <td>.
    headers = [header.get_text().strip() for header in table.find_all("th")]
    rows = []
    for row in table.find_all("tr"):
        cells = [cell.get_text().strip() for cell in row.find_all("td")]
        if cells:  # rows without <td> cells (e.g. <th>-only rows) are skipped
            rows.append(cells)

    # Combine headers and rows into a DataFrame
    df = pd.DataFrame(rows, columns=headers)

    # Save each DataFrame to its own CSV file, named after the position key
    filename = os.path.join("data/scraped", f"{name}_projections.csv")
    df.to_csv(filename, index=False)

    print(f"Data from {name} ({url}) saved to {filename}")

print("Scraping and data saving completed.")

Data from QB (https://www.fantasypros.com/nfl/projections/qb.php?week=draft) saved to data/scraped/QB_projections.csv
Data from RB (https://www.fantasypros.com/nfl/projections/rb.php?week=draft&scoring=PPR&week=draft) saved to data/scraped/RB_projections.csv
Data from WR (https://www.fantasypros.com/nfl/projections/wr.php?week=draft&scoring=PPR&week=draft) saved to data/scraped/WR_projections.csv
Data from TE (https://www.fantasypros.com/nfl/projections/te.php?week=draft&scoring=PPR&week=draft) saved to data/scraped/TE_projections.csv
Data from FLEX (https://www.fantasypros.com/nfl/projections/flex.php?week=draft&scoring=PPR&week=draft) saved to data/scraped/FLEX_projections.csv
Data from K (https://www.fantasypros.com/nfl/projections/k.php?week=draft) saved to data/scraped/K_projections.csv
Data from DEF (https://www.fantasypros.com/nfl/projections/dst.php?week=draft) saved to data/scraped/DEF_projections.csv
Scraping and data saving completed.


In [50]:
# Clean the scraped projections so they can be manipulated numerically,
# then write each result to data/clean/<name>_clean.csv.

clean_files = [
    "QB_projections",
    "RB_projections",
    "WR_projections",
    "TE_projections",
    "FLEX_projections",
    "K_projections",
    "DEF_projections",
]

# Some of the positions have an extra row of column headers that ends up as
# the first data row of the scraped CSV; that row is deleted for these files.
FILES_WITH_EXTRA_HEADER_ROW = {
    "QB_projections",
    "RB_projections",
    "WR_projections",
    "TE_projections",
    "FLEX_projections",
}

# Per-file column renames. The ".1" names are the second occurrence of a
# repeated header, as de-duplicated by pd.read_csv.
COLUMN_RENAMES = {
    "QB_projections": {"YDS": "PASS_YDS", "TDS": "PASS_TDS"},
    "RB_projections": {
        "YDS": "RUSH_YDS",
        "TDS": "RUSH_TDS",
        "YDS.1": "REC_YDS",
        "TDS.1": "REC_TDS",
    },
    "WR_projections": {
        "YDS": "REC_YDS",
        "TDS": "REC_TDS",
        "YDS.1": "RUSH_YDS",
        "TDS.1": "RUSH_TDS",
    },
    "TE_projections": {"YDS": "REC_YDS", "TDS": "REC_TDS"},
    "FLEX_projections": {
        "YDS": "RUSH_YDS",
        "TDS": "RUSH_TDS",
        "YDS.1": "REC_YDS",
        "TDS.1": "REC_TDS",
    },
}


def clean_projection_frame(raw_df: pd.DataFrame, file: str) -> pd.DataFrame:
    """Return a cleaned copy of one scraped projections table.

    Parameters
    ----------
    raw_df : pd.DataFrame
        Scraped table as loaded by ``pd.read_csv``. The first column is
        treated as the player/team label and is never converted.
    file : str
        Entry of ``clean_files`` (e.g. ``"RB_projections"``); selects the
        header-row removal and column renames to apply.

    Returns
    -------
    pd.DataFrame
        New frame; ``raw_df`` is left unmodified.
    """
    cleaned = raw_df.copy()

    # Every column after the first is numeric except the text "POS" column
    # (present in the FLEX table).
    # NOTE(review): assumes no other text column exists in these tables —
    # anything else non-numeric would be coerced to NaN; confirm against the site.
    numeric_cols = [col for col in cleaned.columns[1:] if col != "POS"]
    if numeric_cols:
        # Assign by column label rather than through .iloc so the columns are
        # replaced outright and actually take the float dtype.
        cleaned[numeric_cols] = (
            cleaned[numeric_cols]
            .replace({",": ""}, regex=True)  # strip thousands separators
            .apply(pd.to_numeric, errors="coerce")
            .astype(float)
        )

    if file in FILES_WITH_EXTRA_HEADER_ROW:
        cleaned = cleaned.iloc[1:]

    cleaned = cleaned.rename(columns=COLUMN_RENAMES.get(file, {}))

    if file == "FLEX_projections":
        # Keep only the letters of the position label (drops digits/symbols).
        cleaned["POS"] = (
            cleaned["POS"].astype(str).str.replace(r"[^a-zA-Z]", "", regex=True)
        )

    return cleaned


for file in clean_files:
    source_path = f"data/scraped/{file}.csv"
    if not os.path.exists(source_path):
        # One missing position should not block cleaning of the others.
        print(f"Skipping {file}: {source_path} not found")
        continue

    df = clean_projection_frame(pd.read_csv(source_path), file)

    # Save cleaned files to the clean subdirectory (created on demand, since
    # to_csv does not create missing parent directories).
    filename = os.path.join("data/clean", f"{file}_clean.csv")
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    df.to_csv(filename, index=False)

    print(f"Data from {file} saved to {filename}")

print("Data cleaning completed.")

Data from QB_projections saved to data/clean/QB_projections_clean.csv
Data from RB_projections saved to data/clean/RB_projections_clean.csv
Data from WR_projections saved to data/clean/WR_projections_clean.csv
Data from TE_projections saved to data/clean/TE_projections_clean.csv
Data from FLEX_projections saved to data/clean/FLEX_projections_clean.csv
Data from K_projections saved to data/clean/K_projections_clean.csv
Data from DEF_projections saved to data/clean/DEF_projections_clean.csv
Data cleaning completed.


In [57]:
# Spot-check the cleaned DEF file: load it back, print the column dtypes,
# and preview the first rows.
def_clean_path = "data/clean/DEF_projections_clean.csv"
df = pd.read_csv(def_clean_path)
print(df.dtypes)
df.head()

Player      object
SACK       float64
INT        float64
FR         float64
FF         float64
TD         float64
SAFETY     float64
PA         float64
YDS AGN    float64
FPTS       float64
dtype: object


Unnamed: 0,Player,SACK,INT,FR,FF,TD,SAFETY,PA,YDS AGN,FPTS
0,Dallas Cowboys,50.1,14.2,10.9,16.7,3.1,0.0,347.5,5446.7,118.9
1,Baltimore Ravens,51.1,14.2,11.0,16.5,2.7,0.0,323.9,5261.9,117.4
2,New York Jets,47.9,14.5,9.7,15.5,2.8,1.0,339.6,5162.8,115.2
3,Houston Texans,49.3,13.3,10.7,15.2,2.4,0.5,392.9,5696.1,112.4
4,Philadelphia Eagles,51.0,13.1,10.2,15.2,2.5,0.0,382.4,5572.7,112.4


In [3]:
# # URLs to scrape from

# QB = "https://www.fantasypros.com/nfl/projections/qb.php?week=draft"
# RB = "https://www.fantasypros.com/nfl/projections/rb.php?week=draft&scoring=PPR&week=draft"
# WR = "https://www.fantasypros.com/nfl/projections/wr.php?week=draft&scoring=PPR&week=draft"
# TE = "https://www.fantasypros.com/nfl/projections/te.php?week=draft&scoring=PPR&week=draft"
# FLEX = "https://www.fantasypros.com/nfl/projections/flex.php?week=draft&scoring=PPR&week=draft"
# K = "https://www.fantasypros.com/nfl/projections/k.php?week=draft"
# DEF = "https://www.fantasypros.com/nfl/projections/dst.php?week=draft"

# urls = [QB, RB, WR, TE, FLEX, K, DEF]

In [4]:
# # Make a get request
# r = requests.get(url)

# # Initialize soup object
# soup = BeautifulSoup(r.text, "html.parser")

In [5]:
# # Find the table
# table = soup.find("table")

In [6]:
# # print(table)

# # Extract table data

# # Extract headers
# headers = []
# for header in table.find_all("th"):
#     headers.append(header.get_text().strip())


# # Extract rows
# rows = []
# for row in table.find_all("tr"):
#     cells = row.find_all("td")
#     cells = [cell.get_text().strip() for cell in cells]
#     if cells:
#         rows.append(cells)

In [7]:
# # Check if we can see the data

# for header in headers:
#     print(header, end=" | ")
# # print('\n' + '-'*40)

# for row in rows:
#     print(" | ".join(row))

In [8]:
# print(headers)
# print(rows[1:])

In [9]:
# # Save data as csv

# rows = rows[1:]

# with open("rb_projections", "w") as file:
#     writer = csv.writer(file)
#     writer.writerow(headers)
#     for row in rows:
#         writer.writerow(row)

In [10]:
# data = pd.read_csv("rb_projections")

# data.head(25)

# # data.dtypes

# data["YDS"] = data["YDS"].str.replace(",", "")
# data["YDS"] = pd.to_numeric(data["YDS"], errors="coerce")
# data["YDS"] = data["YDS"].astype("float64")

# data.head()

In [11]:
# # Overwrite the csv file with the new cleaned data

# data.to_csv("rb_projections", index=False)