In [1]:
!pip install selenium webdriver-manager

Collecting selenium
  Obtaining dependency information for selenium from https://files.pythonhosted.org/packages/2f/a6/fc66ea71ec0769f72abdf15cb9ec9269517abe68a160839383ddff7478f1/selenium-4.29.0-py3-none-any.whl.metadata
  Downloading selenium-4.29.0-py3-none-any.whl.metadata (7.1 kB)
Collecting webdriver-manager
  Obtaining dependency information for webdriver-manager from https://files.pythonhosted.org/packages/b5/b5/3bd0b038d80950ec13e6a2c8d03ed8354867dc60064b172f2f4ffac8afbe/webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata
  Using cached webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata (12 kB)
Collecting trio~=0.17 (from selenium)
  Obtaining dependency information for trio~=0.17 from https://files.pythonhosted.org/packages/c9/55/c4d9bea8b3d7937901958f65124123512419ab0eb73695e5f382521abbfb/trio-0.29.0-py3-none-any.whl.metadata
  Downloading trio-0.29.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Obtaining dependency infor

In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
import time
from bs4 import BeautifulSoup

# Set up Chrome with WebDriver Manager
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)

try:
    # Load the page and wait
    driver.get("https://walterfootball.com/scoutingreports.php")
    time.sleep(20)  # Wait for 20 seconds (or use explicit waits)

    # Capture the page source to a variable
    page_snapshot = driver.page_source
finally:
    # Close the browser - we don't need it anymore. Doing this in `finally`
    # guarantees the Chrome session is shut down even when loading the page
    # or reading its source raises.
    driver.quit()

print("We got the snapshot of the website in the html format. We will be working on that. ")


In [6]:

# Parse the saved HTML snapshot with BeautifulSoup (no browser needed here).
soup = BeautifulSoup(page_snapshot, 'html.parser')

# Candidate anchors: <a> tags nested under <font><b> inside div.entry-content.
link_elements = soup.select('div.entry-content font b a')

# Keep only anchors whose href is a site-relative scouting-report path and
# record the visible link text together with that href.
link_list = [
    {
        'text': anchor.get_text(strip=True),
        'href': anchor.get('href'),
    }
    for anchor in link_elements
    if anchor.get('href') and anchor.get('href').startswith('/scoutingreport')
]

In [7]:
# Show how many report links were collected, then list each one as
# "<n>. <link text>" followed by its href and a 50-dash separator.
separator = "-" * 50
print(f"Found {len(link_list)} scouting reports:")
for number, entry in enumerate(link_list, start=1):
    print(f"{number}. {entry['text']}")
    print(f"   {entry['href']}")
    print(separator)

Found 1316 scouting reports:
1. Jayden Daniels, LSU
   /scoutingreport2024jdaniels.php
--------------------------------------------------
2. Sam Hartman, Notre Dame
   /scoutingreport2024shartman.php
--------------------------------------------------
3. Drake Maye, North Carolina
   /scoutingreport2024dmaye.php
--------------------------------------------------
4. J.J. McCarthy, Michigan
   /scoutingreport2024jjmccarthy.php
--------------------------------------------------
5. Joe Milton, Tennessee
   /scoutingreport2024jmilton.php
--------------------------------------------------
6. Bo Nix, Oregon
   /scoutingreport2024bnix.php
--------------------------------------------------
7. Michael Penix Jr., Washington
   /scoutingreport2024mpenix.php
--------------------------------------------------
8. Spencer Rattler, South Carolina
   /scoutingreport2024srattler.php
--------------------------------------------------
9. Caleb Williams, USC
   /scoutingreport2024cwilliams.php
--------------

In [11]:
import pandas as pd
from urllib.parse import urljoin

# Assuming link_list contains your scraped data
base_url = "https://walterfootball.com"


def _split_link_text(text):
    """Split a report link's text into (name, position, school).

    The link text is comma separated. Two fields are read as
    "Name, School" (e.g. "Jayden Daniels, LSU"); three or more fields are
    read as "Name, Position, School". Any field that is absent is 'N/A'.
    """
    parts = [p.strip() for p in text.split(',')]
    name = parts[0]  # str.split always yields at least one element
    if len(parts) >= 3:
        position, school = parts[1], parts[2]
    elif len(parts) == 2:
        # Only one field after the name: it is the school, not a position.
        position, school = 'N/A', parts[1]
    else:
        position, school = 'N/A', 'N/A'
    return name, position, school


# Create structured data
parsed_data = []
for item in link_list:
    # Convert to absolute URL
    full_url = urljoin(base_url, item['href'])

    # Split the text into components
    name, position, school = _split_link_text(item['text'])

    parsed_data.append({
        'Name': name,
        'Position': position,
        'School': school,  # previously computed but never stored
        'Full_URL': full_url
    })

# Create DataFrame
df = pd.DataFrame(parsed_data)

In [14]:
# Persist the link table to CSV. With pandas' defaults the row index is
# written as well, as the first column.
links_csv_path = "all_scout_report_link.csv"
df.to_csv(links_csv_path)

In [18]:
# Keep an independent (deep) copy of the link table as a backup.
df_real = df.copy(deep=True)

In [27]:
# Restore the working frame from the backup. Take a copy rather than binding
# df to the same object: columns added to df afterwards would otherwise be
# added to df_real too, and the backup would no longer be pristine.
df = df_real.copy()

In [28]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time
from selenium.common.exceptions import TimeoutException, WebDriverException

# Set up Chrome with WebDriver Manager
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)

# Create list to store reports (one entry per URL, None where scraping failed)
reports = []
PAGE_LOAD_TIMEOUT = 20     # seconds allowed for driver.get() before stopping the load
CONTENT_WAIT_TIMEOUT = 15  # seconds to wait for div.entry-content to appear
REQUEST_DELAY = 2          # seconds to pause between consecutive requests

try:
    # The page-load timeout applies to the whole browser session, so it is
    # set once here instead of on every iteration.
    driver.set_page_load_timeout(PAGE_LOAD_TIMEOUT)

    for url in df['Full_URL']:
        try:
            try:
                driver.get(url)
            except TimeoutException:
                # The page did not finish loading in time; stop the load and
                # try to read whatever has been rendered so far.
                print(f"Partial load: {url}")
                driver.execute_script("window.stop();")

            # Wait for the content container; until() returns the located
            # element, so no second lookup is needed.
            content_div = WebDriverWait(driver, CONTENT_WAIT_TIMEOUT).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "div.entry-content")))
            report_text = content_div.text.strip()

            reports.append(report_text)
            print(f"Successfully scraped: {url}")

        except Exception as e:
            # Best effort: record the failure and keep going so that `reports`
            # stays aligned with the rows of df.
            print(f"Error scraping {url}: {str(e)}")
            reports.append(None)

        # Add delay between requests
        time.sleep(REQUEST_DELAY)
finally:
    # Always shut the browser down, including when the loop is interrupted.
    driver.quit()

# Add reports to DataFrame
df['Scouting_Report'] = reports

print("\nFinal DataFrame:")
print(df.head())

Successfully scraped: https://walterfootball.com/scoutingreport2024jdaniels.php
Successfully scraped: https://walterfootball.com/scoutingreport2024shartman.php
Successfully scraped: https://walterfootball.com/scoutingreport2024dmaye.php
Successfully scraped: https://walterfootball.com/scoutingreport2024jjmccarthy.php
Successfully scraped: https://walterfootball.com/scoutingreport2024jmilton.php
Successfully scraped: https://walterfootball.com/scoutingreport2024bnix.php
Successfully scraped: https://walterfootball.com/scoutingreport2024mpenix.php
Successfully scraped: https://walterfootball.com/scoutingreport2024srattler.php
Successfully scraped: https://walterfootball.com/scoutingreport2024cwilliams.php
Successfully scraped: https://walterfootball.com/scoutingreport2024ballen.php
Successfully scraped: https://walterfootball.com/scoutingreport2024tbenson.php
Successfully scraped: https://walterfootball.com/scoutingreport2024jbrooks.php
Successfully scraped: https://walterfootball.com/sc

In [40]:
import pandas as pd
import re

def extract_cleaned_report(text):
    """Return the report body that starts at the third 'Scouting Report'.

    The phrase 'Scouting Report' is matched case-insensitively; the result
    begins at its third occurrence and runs up to (not including) the first
    'Choose Player' found after that point. If 'Choose Player' is absent,
    'RELATED LINKS' is used as the end marker instead; if neither is present
    the rest of the text is returned. Both end markers are matched
    case-sensitively. The returned string is stripped of surrounding
    whitespace.

    Returns an empty string when `text` is not a str or contains fewer than
    three occurrences of 'Scouting Report'.
    """
    if not isinstance(text, str):
        return ""

    # Offsets of every (case-insensitive) "Scouting Report" occurrence.
    heading_offsets = [
        hit.start()
        for hit in re.finditer(r'Scouting Report', text, re.IGNORECASE)
    ]

    try:
        begin = heading_offsets[2]
    except IndexError:
        # Fewer than three occurrences: nothing to extract.
        return ""

    # Try the end markers in priority order; the first one present wins.
    for marker in ("Choose Player", "RELATED LINKS"):
        cut = text.find(marker, begin)
        if cut != -1:
            return text[begin:cut].strip()

    # No end marker at all: keep everything from the third heading onwards.
    return text[begin:].strip()

# Clean every scraped page text and store the result in a new column.
raw_reports = df['Scouting_Report']
df['Cleaned_report'] = raw_reports.apply(extract_cleaned_report)

# Preview the first two rows (name plus cleaned text).
preview = df[['Name', 'Cleaned_report']].head(2)
print(preview)

             Name                                     Cleaned_report
0  Jayden Daniels  Scouting Report\nBy Charlie Campbell\nStrength...
1     Sam Hartman  Scouting Report\nBy Charlie Campbell\nStrength...


In [41]:
# Print the cleaned report stored under index label 10.
print(df.loc[10, 'Cleaned_report'])

Scouting Report
By Charlie Campbell
Strengths:
Power build and excellent size
True power back
Keeps legs going after contact
Consistently breaks tackles
Finishes runs well
Burst to the hole
Drags tacklers
Has the build for a big work load
Runs well in the second half
Can contribute as a receiver
Durable
Ready to contribute quickly
Good knee bend
Runs behind his pads
Straight-line speed
Upside
Weaknesses:
Can be indecisive as a runner
Will have to learn NFL blitz protection
Blitz awareness needs improvement
Prospect Summary:
Florida State experienced a resurgence in the 2023 season, going undefeated before getting blown out by Georgia in a meaningless bowl game. The Seminoles’ resurgence was led by a bunch of transfer players who they landed out of the transfer portal, including Jared Verse, Braden Fiske, Keon Coleman and Trey Benson. Benson started out his collegiate career at Oregon before landing with Florida State for 2022. As a Seminole in 2022, he averaged 6.4 yards per carry for 

In [43]:
# Export the full table as a pipe-delimited file, without the row index.
reports_csv_path = 'scouting_reports_with_text.csv'
df.to_csv(reports_csv_path, sep="|", index=False)