In [39]:
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By

# URL of the webpage to scrape
url = "https://fbref.com/en/comps/73/schedule/USL-Championship-Scores-and-Fixtures"

# Output column -> value of the `data-stat` attribute on the table cell it is read from.
# Insertion order is the column order of the resulting DataFrame (after "Row").
TEXT_COLUMNS = {
    "Start_Time": "start_time",
    "Date": "date",
    "Home_Team": "home_team",
    "Away_Team": "away_team",
    "Score": "score",
    "Attendance": "attendance",
    "Venue": "venue",
    "Referee": "referee",
}
OUTPUT_COLUMNS = ["Row", *TEXT_COLUMNS, "Match_Report_Link"]


def _cell_text(row, stat):
    """Return the text of the cell tagged `data-stat="<stat>"` inside `row`.

    Returns None when the row has no such cell. Only the "element not found"
    case is treated as missing data; any other WebDriver error propagates.
    """
    try:
        return row.find_element(By.CSS_SELECTOR, f'[data-stat="{stat}"]').text
    except NoSuchElementException:
        return None


def _match_report_href(row):
    """Return the href of the match-report link inside `row`, or None if there is no link."""
    try:
        link = row.find_element(By.CSS_SELECTOR, '[data-stat="match_report"] a')
    except NoSuchElementException:
        return None
    return link.get_attribute("href")


# Records are collected here; defined before scraping so the DataFrame below
# can always be built, even when extraction fails part-way.
data = []

# Set up the Selenium WebDriver
driver = webdriver.Chrome()  # Make sure to have the ChromeDriver installed
try:
    driver.get(url)

    # Wait for the page to load
    time.sleep(5)

    try:
        # Locate the table. The container carries several classes, so they are
        # written as one compound CSS selector.
        table_div = driver.find_element(
            By.CSS_SELECTOR, ".table_container.tabbed.current.is_setup"
        )
        print("Table found successfully!")

        for row in table_div.find_elements(By.TAG_NAME, "tr"):
            row_id = row.get_attribute("data-row")
            # Rows without a data-row attribute are never stored, so skip them
            # before doing any per-cell lookups.
            if not row_id:
                continue

            record = {"Row": row_id}
            for column, stat in TEXT_COLUMNS.items():
                record[column] = _cell_text(row, stat)
            record["Match_Report_Link"] = _match_report_href(row)
            data.append(record)

    except Exception as e:
        print(f"Failed to locate or extract table data: {e}")
finally:
    # Close the browser even if loading or extraction raised
    driver.quit()

# Create a pandas DataFrame; explicit columns keep the schema stable when no rows were scraped
scraped_df = pd.DataFrame(data, columns=OUTPUT_COLUMNS)

# Drop blank/bad rows: empty dates and repeated header rows whose date cell reads "Date"
scraped_df = scraped_df[~scraped_df["Date"].isin(["", "Date"])]

# Display the DataFrame
scraped_df.head(20)

Table found successfully!


Unnamed: 0,Row,Start_Time,Date,Home_Team,Away_Team,Score,Attendance,Venue,Referee,Match_Report_Link
0,0,14:00 (16:00),2024-03-09,New Mexico Utd,Pittsburgh,1–0,11347,Isotopes Park,,https://fbref.com/en/matches/b1544fc3/New-Mexi...
1,1,15:00 (16:00),2024-03-09,Memphis,LV Lights FC,2–1,3290,AutoZone Park,,https://fbref.com/en/matches/778d92a1/Memphis-...
2,2,19:00 (22:00),2024-03-09,Roots,Indy Eleven,2–1,5146,Pioneer Stadium,,https://fbref.com/en/matches/6ce79fb0/Oakland-...
3,3,19:00,2024-03-09,Miami FC,CS Switchbacks,2–0,1122,South Dade Kia Field at Pitbull Stadium,Sergii Demianchuk,https://fbref.com/en/matches/7a4237a4/Miami-FC...
4,4,19:00 (22:00),2024-03-09,Sac Republic,Orange County,2–2,11569,Heart Health Park,,https://fbref.com/en/matches/82d360f7/Sacramen...
5,5,19:00,2024-03-09,North Carolina,Charleston,0–0,2576,Sahlen's Stadium at WakeMed Soccer Park,,https://fbref.com/en/matches/860234cf/North-Ca...
6,6,19:30 (20:30),2024-03-09,San Antonio,Loudoun Utd,2–2,7263,Toyota Field,,https://fbref.com/en/matches/f8251d3a/San-Anto...
7,7,19:30 (21:30),2024-03-09,Phx Rising,B'ham Legion,0–1,8187,Wild Horse Pass Stadium,,https://fbref.com/en/matches/ff3c2f48/Phoenix-...
8,8,20:00 (21:00),2024-03-09,El Paso,Hartford,0–1,6111,Southwest University Park,,https://fbref.com/en/matches/3e0b5f68/El-Paso-...
10,10,20:00 (21:00),2024-03-13,El Paso,Monterey Bay,1–1,4566,Southwest University Park,Brad Jensen,https://fbref.com/en/matches/e0ef2679/El-Paso-...


# Add Attendance to matches

In [40]:
import numpy as np

# Build the update frame in one chain: keep the columns needed for the
# matches update, parse 'Date' as datetime (unparseable values become NaT),
# then take the calendar year of 'Date' as 'Season'.
updatematches_df = (
    scraped_df.loc[:, ['Row', 'Date', 'Score', 'Attendance']]
    .copy()
    .assign(Date=lambda frame: pd.to_datetime(frame['Date'], errors='coerce'))
    .assign(Season=lambda frame: frame['Date'].dt.year)
)

# Function to extract home/away scores and shootout scores
def extract_scores(score):
    """Split a score string into regulation goals and shootout goals.

    Accepted layouts (the separator may be an en dash or a plain hyphen):
        "2–1"          -> (2, 1, nan, nan)
        "1–1 (4–3)"    -> (1, 1, 4, 3)    shootout result in trailing parentheses
        "(4) 1–1 (3)"  -> (1, 1, 4, 3)    shootout goals flanking the score

    Parameters
    ----------
    score : str or missing value
        Raw score text. None, NaN and any other non-string value are treated
        as "no score".

    Returns
    -------
    tuple
        (home_score, away_score, home_shootout_score, away_shootout_score).
        Each element is an int, or np.nan when that part is absent or not numeric.
    """
    import re

    def _to_int(token):
        # isdecimal() (unlike isdigit()) only accepts characters int() can parse.
        token = token.strip()
        return int(token) if token.isdecimal() else np.nan

    # Unplayed fixtures / missing cells: nothing to parse.
    if not isinstance(score, str):
        return np.nan, np.nan, np.nan, np.nan

    text = score.strip()
    home_shootout_score = np.nan
    away_shootout_score = np.nan

    flanking = re.fullmatch(r'\((\d+)\)\s*(.*?)\s*\((\d+)\)', text)
    if flanking:
        # "(4) 1–1 (3)": shootout goals sit on either side of the regulation score.
        home_shootout_score = int(flanking.group(1))
        away_shootout_score = int(flanking.group(3))
        regulation = flanking.group(2)
    else:
        # Regulation score is everything before the first parenthesis.
        regulation = text.split('(')[0]
        if '(' in text and ')' in text:
            # "1–1 (4–3)": shootout result inside the last parenthesised group.
            inner = text.split('(')[-1].split(')')[0]
            shootout_parts = re.split(r'[–-]', inner)
            if len(shootout_parts) == 2:
                home_shootout_score = _to_int(shootout_parts[0])
                away_shootout_score = _to_int(shootout_parts[1])

    # Split the regulation score into home and away goals; a missing side stays NaN.
    parts = re.split(r'[–-]', regulation.strip())
    home_score = _to_int(parts[0])
    away_score = _to_int(parts[1]) if len(parts) > 1 else np.nan

    return home_score, away_score, home_shootout_score, away_shootout_score

# Parse every 'Score' value; wrapping each result tuple in a Series makes
# apply() return one column per tuple element, which are then attached to the
# frame under the four score column names.
score_columns = ['Home_Score', 'Away_Score', 'Home_Shootout_Score', 'Away_Shootout_Score']
parsed_scores = updatematches_df['Score'].apply(
    lambda raw_score: pd.Series(extract_scores(raw_score))
)
updatematches_df[score_columns] = parsed_scores

# Show the first rows of the updated frame
updatematches_df.head(20)


Unnamed: 0,Row,Date,Score,Attendance,Season,Home_Score,Away_Score,Home_Shootout_Score,Away_Shootout_Score
0,0,2024-03-09,1–0,11347,2024,1.0,0.0,,
1,1,2024-03-09,2–1,3290,2024,2.0,1.0,,
2,2,2024-03-09,2–1,5146,2024,2.0,1.0,,
3,3,2024-03-09,2–0,1122,2024,2.0,0.0,,
4,4,2024-03-09,2–2,11569,2024,2.0,2.0,,
5,5,2024-03-09,0–0,2576,2024,0.0,0.0,,
6,6,2024-03-09,2–2,7263,2024,2.0,2.0,,
7,7,2024-03-09,0–1,8187,2024,0.0,1.0,,
8,8,2024-03-09,0–1,6111,2024,0.0,1.0,,
10,10,2024-03-13,1–1,4566,2024,1.0,1.0,,


In [47]:
#Handle is cup and is playoff fields. 
#Write to matches table based on season and row matches
import sqlite3
import time

# Path to the SQLite database
db_path = r'C:\Users\Jordan\Documents\Projects\Data Projects\USL Championship\USLChampionship.db'

# How long (seconds) SQLite itself waits on a locked database before raising
busy_timeout = 30

# Retry mechanism parameters (applied on top of the busy timeout)
max_retries = 5
retry_delay = 1  # seconds

# One parameterised statement, built once and reused for every row.
# Placeholder order: home score, away score, attendance, away shootout,
# home shootout, then the Source_Row / Season key.
update_query = """
    UPDATE matches
    SET
        Home_Score = ?,
        Away_Score = ?,
        Attendance = ?,
        Away_Shootout_Score = ?,
        Home_Shootout_Score = ?,
        Is_Cup = FALSE,
        Is_Playoff = FALSE
    WHERE
        Source_Row = ? AND
        Season = ?
"""

# Track if any updates were made
updates_made = False

conn = sqlite3.connect(db_path, timeout=busy_timeout)
try:
    # Everything that can raise sits inside this try so the connection is
    # always closed; a connection left open by a failed run can keep the
    # database locked for later runs in the same kernel.
    conn.execute('PRAGMA journal_mode=WAL;')  # Enable WAL mode for better concurrency
    cursor = conn.cursor()

    # Iterate over the DataFrame rows
    for _, row in updatematches_df.iterrows():
        params = (
            row['Home_Score'],
            row['Away_Score'],
            row['Attendance'],
            row['Away_Shootout_Score'],
            row['Home_Shootout_Score'],
            row['Row'],
            row['Season'],
        )

        # Retry mechanism
        for attempt in range(max_retries):
            try:
                cursor.execute(update_query, params)
                if cursor.rowcount > 0:
                    updates_made = True
                break  # Exit retry loop if successful
            except sqlite3.OperationalError as e:
                if "database is locked" in str(e):
                    print(f"Database is locked, retrying... ({attempt + 1}/{max_retries})")
                    time.sleep(retry_delay)
                else:
                    raise  # Re-raise other operational errors
        else:
            # Reached only when every attempt hit a locked database
            print(f"Failed to update Row {row['Row']} after {max_retries} retries.")

    # Commit changes if updates were made
    if updates_made:
        conn.commit()
        print("Database updates completed.")
    else:
        print("No updates were made.")
finally:
    # Close the connection on success and on error alike; on the error path
    # nothing has been committed.
    conn.close()


OperationalError: database is locked

In [None]:
#For each value in match report, open the link, pull in stats

In [None]:
#Create match stats table, pull in data for each row in the table
#Fouls, corners, crosses, interceptions, offsides, possession, cards?
#validate against scores table?

In [None]:
#Create player game stats table
#Name, Player_ID,Match_ID, Season, Was_Home, Position, Minutes, Goals, Assists, PK_Goals, PKGoals_Attempted, Shots
#Shots_on_Target, Yellow_Cards, Red_Cards, Fouls_Commit, Fouls_Drawns, Offsides, 
#Crosses, Tackles_Won, Interceptions, Own_Goals, PKs_Won, PKs_Conceded

#Read in home player stats
#Read in away player stats
#write to player game Stats table


In [None]:
#Create GK stats table
#Name, Player_ID,Match_ID, Season, Was_Home, Position, Minutes, Shots_on_Target_Against, 
#Goals_Against, Saves, Save_Percent

#Read in Home GK Stats
#Read in Away GK Stats

#write to GK game Stats table