In [9]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd
import time

# URL of the webpage to scrape
url = "https://fbref.com/en/comps/73/schedule/USL-Championship-Scores-and-Fixtures"

# Set up the Selenium WebDriver
driver = webdriver.Chrome()  # Make sure to have the ChromeDriver installed
driver.get(url)

# Wait for the page to load
time.sleep(5)

# Locate the table
try:
    table_div = driver.find_element(By.CLASS_NAME, "table_container.tabbed.current.is_setup")
    print("Table found successfully!")
    
    # Get the table rows
    rows = table_div.find_elements(By.TAG_NAME, "tr")

    # Define the data structure to store extracted data
    data = []

    for row in rows:
        # Extract data attributes
        row_id = row.get_attribute("data-row")
        try:
            date = row.find_element(By.CSS_SELECTOR, '[data-stat="date"]').text
        except:
            date = None
        try:
            home_team = row.find_element(By.CSS_SELECTOR, '[data-stat="home_team"]').text
        except:
            home_team = None
        try:
            away_team = row.find_element(By.CSS_SELECTOR, '[data-stat="away_team"]').text
        except:
            away_team = None
        try:
            attendance = row.find_element(By.CSS_SELECTOR, '[data-stat="attendance"]').text
        except:
            attendance = None
        try:
            match_report = row.find_element(By.CSS_SELECTOR, '[data-stat="match_report"] a')
            match_report_link = match_report.get_attribute("href")
        except:
            match_report_link = None
        
        if row_id:
            data.append({
                "Row": row_id,
                "Date": date,
                "Home_Team": home_team,
                "Away_Team": away_team,
                "Attendance": attendance,
                "Match_Report_Link": match_report_link
            })

    # Create a pandas DataFrame
    scraped_df = pd.DataFrame(data)

    # Display the DataFrame
    print(df.head())

    # Save the DataFrame to a CSV file
    df.to_csv("USL_Championship_Scores_and_Fixtures.csv", index=False)

except Exception as e:
    print(f"Failed to locate or extract table data: {e}")

# Close the browser
driver.quit()


Table found successfully!
  Row        Date       Home_Team       Away_Team Attendance  \
0   0  2024-03-09  New Mexico Utd      Pittsburgh     11,347   
1   1  2024-03-09         Memphis    LV Lights FC      3,290   
2   2  2024-03-09           Roots     Indy Eleven      5,146   
3   3  2024-03-09        Miami FC  CS Switchbacks      1,122   
4   4  2024-03-09    Sac Republic   Orange County     11,569   

                                   Match_Report_Link  
0  https://fbref.com/en/matches/b1544fc3/New-Mexi...  
1  https://fbref.com/en/matches/778d92a1/Memphis-...  
2  https://fbref.com/en/matches/6ce79fb0/Oakland-...  
3  https://fbref.com/en/matches/7a4237a4/Miami-FC...  
4  https://fbref.com/en/matches/82d360f7/Sacramen...  


In [10]:
scraped_df.head(25)

Unnamed: 0,Row,Date,Home_Team,Away_Team,Attendance,Match_Report_Link
0,0,2024-03-09,New Mexico Utd,Pittsburgh,11347,https://fbref.com/en/matches/b1544fc3/New-Mexi...
1,1,2024-03-09,Memphis,LV Lights FC,3290,https://fbref.com/en/matches/778d92a1/Memphis-...
2,2,2024-03-09,Roots,Indy Eleven,5146,https://fbref.com/en/matches/6ce79fb0/Oakland-...
3,3,2024-03-09,Miami FC,CS Switchbacks,1122,https://fbref.com/en/matches/7a4237a4/Miami-FC...
4,4,2024-03-09,Sac Republic,Orange County,11569,https://fbref.com/en/matches/82d360f7/Sacramen...
5,5,2024-03-09,North Carolina,Charleston,2576,https://fbref.com/en/matches/860234cf/North-Ca...
6,6,2024-03-09,San Antonio,Loudoun Utd,7263,https://fbref.com/en/matches/f8251d3a/San-Anto...
7,7,2024-03-09,Phx Rising,B'ham Legion,8187,https://fbref.com/en/matches/ff3c2f48/Phoenix-...
8,8,2024-03-09,El Paso,Hartford,6111,https://fbref.com/en/matches/3e0b5f68/El-Paso-...
9,9,,,,,


In [17]:
import sqlite3

# Connect to the SQLite database
db_path = r'C:\Users\Jordan\Documents\Projects\Data Projects\USL Championship\USLChampionship.db'
connection = sqlite3.connect(db_path)

# Create a cursor object to execute SQL commands
cursor = connection.cursor()

# Define the query
query = """
select tm.source_team_name, tm.team_id
from team_mapping tm
where tm.source ='https://fbref.com/en/comps/73/schedule/USL-Championship-Scores-and-Fixtures'
"""

# Execute the query
cursor.execute(query)

# Fetch all results
results = cursor.fetchall()

## Convert results to a dictionary
# Assuming each source_name is unique and serves as the dictionary key
team_id_mapping = {row[0]: row[1] for row in results}

# Close the connection
connection.close()

#replace team names with team id
scraped_df["Home_Team"] = scraped_df["Home_Team"].replace(team_id_mapping)
scraped_df["Away_Team"] = scraped_df["Away_Team"].replace(team_id_mapping)

#displaydf
scraped_df.head(10)


Unnamed: 0,Row,Date,Home_Team,Away_Team,Attendance,Match_Report_Link
0,0,2024-03-09,2.0,15.0,11347.0,https://fbref.com/en/matches/b1544fc3/New-Mexi...
1,1,2024-03-09,6.0,8.0,3290.0,https://fbref.com/en/matches/778d92a1/Memphis-...
2,2,2024-03-09,14.0,7.0,5146.0,https://fbref.com/en/matches/6ce79fb0/Oakland-...
3,3,2024-03-09,23.0,4.0,1122.0,https://fbref.com/en/matches/7a4237a4/Miami-FC...
4,4,2024-03-09,10.0,12.0,11569.0,https://fbref.com/en/matches/82d360f7/Sacramen...
5,5,2024-03-09,13.0,3.0,2576.0,https://fbref.com/en/matches/860234cf/North-Ca...
6,6,2024-03-09,18.0,21.0,7263.0,https://fbref.com/en/matches/f8251d3a/San-Anto...
7,7,2024-03-09,16.0,17.0,8187.0,https://fbref.com/en/matches/ff3c2f48/Phoenix-...
8,8,2024-03-09,24.0,19.0,6111.0,https://fbref.com/en/matches/3e0b5f68/El-Paso-...
9,9,,,,,


In [21]:
value_to_drop = ""
scraped_df = scraped_df[scraped_df["Attendance"] != value_to_drop]
value_to_drop = "Attendance"
scraped_df = scraped_df[scraped_df["Attendance"] != value_to_drop]
scraped_df.head(20)

Unnamed: 0,Row,Date,Home_Team,Away_Team,Attendance,Match_Report_Link
0,0,2024-03-09,2,15,11347,https://fbref.com/en/matches/b1544fc3/New-Mexi...
1,1,2024-03-09,6,8,3290,https://fbref.com/en/matches/778d92a1/Memphis-...
2,2,2024-03-09,14,7,5146,https://fbref.com/en/matches/6ce79fb0/Oakland-...
3,3,2024-03-09,23,4,1122,https://fbref.com/en/matches/7a4237a4/Miami-FC...
4,4,2024-03-09,10,12,11569,https://fbref.com/en/matches/82d360f7/Sacramen...
5,5,2024-03-09,13,3,2576,https://fbref.com/en/matches/860234cf/North-Ca...
6,6,2024-03-09,18,21,7263,https://fbref.com/en/matches/f8251d3a/San-Anto...
7,7,2024-03-09,16,17,8187,https://fbref.com/en/matches/ff3c2f48/Phoenix-...
8,8,2024-03-09,24,19,6111,https://fbref.com/en/matches/3e0b5f68/El-Paso-...
10,10,2024-03-13,24,22,4566,https://fbref.com/en/matches/e0ef2679/El-Paso-...


In [22]:
import sqlite3
import pandas as pd

# Connect to the SQLite database
db_path = r'C:\Users\Jordan\Documents\Projects\Data Projects\USL Championship\USLChampionship.db'
connection = sqlite3.connect(db_path)

# Create a cursor object to execute SQL commands
cursor = connection.cursor()

# Assuming 'scraped_df' is already defined and has the necessary columns ('Date', 'Home_Team', 'Away_Team')
match_ids = []

for index, row in scraped_df.iterrows():
    # Define the SQL query
    query = """
    SELECT Match_ID 
    FROM schedule
    WHERE match_date = ?
    AND home_team_id = ?
    AND away_team_id = ?
    """
    
    # Execute the query with parameters from scraped_df row
    cursor.execute(query, (row['Date'], row['Home_Team'], row['Away_Team']))
    
    # Fetch the result and append the Match_ID to the list
    match_id = cursor.fetchone()
    if match_id:
        match_ids.append(match_id[0])  # Append the Match_ID
    else:
        match_ids.append(None)  # No match found, append None

# Add the Match_ID column to scraped_df
scraped_df['Match_ID'] = match_ids

# Close the connection
connection.close()

# Print the updated scraped_df
scraped_df.head(15)


Unnamed: 0,Row,Date,Home_Team,Away_Team,Attendance,Match_Report_Link,Match_ID
0,0,2024-03-09,2,15,11347,https://fbref.com/en/matches/b1544fc3/New-Mexi...,1.0
1,1,2024-03-09,6,8,3290,https://fbref.com/en/matches/778d92a1/Memphis-...,2.0
2,2,2024-03-09,14,7,5146,https://fbref.com/en/matches/6ce79fb0/Oakland-...,3.0
3,3,2024-03-09,23,4,1122,https://fbref.com/en/matches/7a4237a4/Miami-FC...,4.0
4,4,2024-03-09,10,12,11569,https://fbref.com/en/matches/82d360f7/Sacramen...,5.0
5,5,2024-03-09,13,3,2576,https://fbref.com/en/matches/860234cf/North-Ca...,6.0
6,6,2024-03-09,18,21,7263,https://fbref.com/en/matches/f8251d3a/San-Anto...,7.0
7,7,2024-03-09,16,17,8187,https://fbref.com/en/matches/ff3c2f48/Phoenix-...,8.0
8,8,2024-03-09,24,19,6111,https://fbref.com/en/matches/3e0b5f68/El-Paso-...,9.0
10,10,2024-03-13,24,22,4566,https://fbref.com/en/matches/e0ef2679/El-Paso-...,10.0


# Add Attendance to matches

In [26]:
# Assuming scraped_df already exists
Match_Attendance = scraped_df[['Match_ID', 'Attendance']].copy()

db_path = r'C:\Users\Jordan\Documents\Projects\Data Projects\USL Championship\USLChampionship.db'
conn = sqlite3.connect(db_path)
cursor = conn.cursor()

# Update each row in the Matches table where Match_ID matches, setting the Attendance value
for _, row in Match_Attendance.iterrows():
    Match_ID = row['Match_ID']
    Attendance = row['Attendance']
    
    # SQL query to update the Attendance value in Matches table
    update_query = """
    UPDATE Matches
    SET Attendance = ?
    WHERE Match_ID = ?
    """
    
    # Execute the update query
    cursor.execute(update_query, (attendance, match_id))

# Commit the changes and close the connection
conn.commit()
conn.close()

print("Attendance data has been updated successfully.")


OperationalError: no such column: Match_ID