# Scraper for Houston Dynamo

In [3]:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
import time
import pandas as pd
import numpy as np

In [16]:
def scrape_houston_dynamo_games(headless=False, stop_month="November 2024", max_pages=200):
    """Collect match-page URLs for games listed at Shell Energy Stadium.

    Opens the club schedule at March 2019, then walks forward one month at a
    time via the page's "next" button. For every match whose venue text is
    "Shell Energy Stadium", the first link inside the match element is stored
    together with the month heading shown on the page.

    Parameters
    ----------
    headless : bool, default False
        Pass ``--headless`` to Chrome. Off by default, which matches the
        previous behaviour (the flag was commented out).
    stop_month : str, default "November 2024"
        Month heading at which to stop. The loop breaks *before* matches of
        this month are collected, so the stop month itself is not scraped.
    max_pages : int, default 200
        Safety cap on the number of month pages visited, so the loop ends
        even if the month heading can no longer be located on the page.

    Returns
    -------
    pandas.DataFrame
        Columns ``Month`` and ``URL``. The same data is written to
        ``houston_dynamo_game_urls.csv`` (without the index).
    """
    options = Options()
    if headless:
        options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")

    driver = webdriver.Chrome(options=options)
    game_urls = []

    # try/finally guarantees the browser is closed even if scraping fails midway.
    try:
        driver.get("https://www.houstondynamofc.com/schedule/#date=2019-03-01")

        # Best effort: dismiss the cookie banner when it is shown.
        try:
            accept_cookies = driver.find_element(By.ID, "onetrust-accept-btn-handler")
            accept_cookies.click()
            time.sleep(1)  # Brief pause to allow for any page updates
        except Exception:
            # No banner (or it is not clickable): carry on with the scrape.
            pass

        for _ in range(max_pages):
            # Allow time for the month view to load
            time.sleep(2)

            soup = BeautifulSoup(driver.page_source, 'html.parser')

            # Month heading, e.g. "March 2019". NOTE(review): the "sc-..." class
            # names look auto-generated; if they change, the heading is not
            # found and only max_pages / a missing next button end the loop.
            current_month_element = soup.find('h2', class_="sc-dkrFOg dOlYGg")
            current_month = current_month_element.text.strip() if current_month_element else None

            if current_month == stop_month:
                print(f"Reached {stop_month}.")
                break

            # Keep only matches whose venue is Shell Energy Stadium
            for match in soup.find_all('div', class_="sc-eJDSGI iXMEvt mls-c-match-list__match"):
                stadium = match.find('p', class_="sc-bBABsx hpINxp")
                # strip() so surrounding whitespace in the markup cannot hide a match
                if stadium and stadium.text.strip() == "Shell Energy Stadium":
                    url_tag = match.find('a', href=True)
                    if url_tag:
                        game_urls.append({
                            "Month": current_month,
                            "URL": url_tag['href']
                        })

            # Advance to the next month; a missing button means the schedule ended.
            try:
                next_button = driver.find_element(By.XPATH, "//button[@value='next']")
                ActionChains(driver).move_to_element(next_button).click(next_button).perform()
            except Exception:
                print("Reached the last available month.")
                break
        else:
            print(f"Stopped after {max_pages} pages without reaching {stop_month}.")
    finally:
        driver.quit()

    # Explicit columns keep the frame well-formed even when nothing was found.
    df = pd.DataFrame(game_urls, columns=["Month", "URL"])
    df.to_csv("houston_dynamo_game_urls.csv", index=False)
    return df

In [17]:
# Run the schedule scraper; the function itself writes houston_dynamo_game_urls.csv
# and returns the same rows as a DataFrame.
df = scrape_houston_dynamo_games()
print("Scraping completed. Data saved to 'houston_dynamo_game_urls.csv'.")

Reached November 2024.
Scraping completed. Data saved to 'houston_dynamo_game_urls.csv'.


In [18]:
# Check row count, column names and non-null counts of the scraped URL table.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 117 entries, 0 to 116
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Month   117 non-null    object
 1   URL     117 non-null    object
dtypes: object(2)
memory usage: 2.0+ KB


In [19]:
# Preview the first rows of the scraped (Month, URL) table.
df.head()

Unnamed: 0,Month,URL
0,March 2019,https://www.houstondynamofc.com/competitions/m...
1,March 2019,https://www.houstondynamofc.com/competitions/c...
2,March 2019,https://www.houstondynamofc.com/competitions/m...
3,March 2019,https://www.houstondynamofc.com/competitions/m...
4,April 2019,https://www.houstondynamofc.com/competitions/m...


In [12]:
def _stripped_text(element, default=None):
    """Return the element's text without surrounding whitespace, or ``default`` if it is missing."""
    return element.text.strip() if element else default


def _parse_game_page(soup):
    """Pull date, competition, opponent, score and attendance out of one parsed match page."""
    date = _stripped_text(
        soup.find('div', class_="sc-dkrFOg paOFY mls-c-blockheader__subtitle")
    )
    competition_name = _stripped_text(
        soup.find('div', class_="sc-gswNZR hhlcUG mls-c-blockheader__title")
    )

    # Opponent is read from the "--away" club element: short name first,
    # abbreviation as a fallback.
    opponent = None
    away_club = soup.find('div', class_="sc-dmctIk kXXzOo mls-c-club --away")
    if away_club:
        name_span = away_club.find('span', class_="mls-c-club__shortname") or \
                    away_club.find('span', class_="mls-c-club__abbreviation")
        opponent = _stripped_text(name_span)

    # First score span -> goals scored, second -> goals received.
    goals_scored = None
    goals_received = None
    scorebug = soup.find('div', class_="sc-kMjNwy jTBsRm mls-c-scorebug__post")
    if scorebug:
        scores = scorebug.find_all('span', class_="mls-c-scorebug__score")
        goals_scored = scores[0].text.strip() if len(scores) > 0 else None
        goals_received = scores[1].text.strip() if len(scores) > 1 else None

    # Attendance is the value of the match fact labelled "Attendance:";
    # it stays NaN when the page has no such fact.
    attendance = np.nan
    match_facts_section = soup.find('section', class_="mls-l-module mls-l-module--match-facts")
    if match_facts_section:
        for fact in match_facts_section.find_all('div', class_="mls-o-match-facts__fact"):
            label = fact.find('span', class_="mls-o-match-facts__label")
            if label and "Attendance:" in label.text:
                value = fact.find('span', class_="mls-o-match-facts__value")
                attendance = _stripped_text(value, default=np.nan)
                break  # Only the first attendance fact is used

    return {
        "Date": date,
        "Competition": competition_name,
        "Opponent": opponent,
        "Goals Scored": goals_scored,
        "Goals Received": goals_received,
        "Attendance": attendance,
    }


def scrape_game_details(df):
    """Visit every match URL in ``df`` and scrape its details.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``Month`` and ``URL``.

    Returns
    -------
    pandas.DataFrame
        One row per input row with the columns ``Year``, ``URL``, ``Date``,
        ``Competition``, ``Opponent``, ``Goals Scored``, ``Goals Received``
        and ``Attendance``. ``Year`` carries the input ``Month`` value
        unchanged. Fields that cannot be found on a page are ``None``
        (``NaN`` for attendance). The frame is also written to
        ``dynamo_game_details_complete.csv`` (without the index).
    """
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    driver = webdriver.Chrome(options=options)

    game_details = []

    # try/finally guarantees the browser is closed even if a page load fails.
    try:
        for _, row in df.iterrows():
            month_label = row['Month']
            url = row['URL']

            driver.get(url)
            time.sleep(2)  # Allow time for the page to load

            # Best effort: dismiss the cookie banner when it is shown.
            try:
                accept_cookies = driver.find_element(By.ID, "onetrust-accept-btn-handler")
                accept_cookies.click()
                time.sleep(1)  # Brief pause to allow for any page updates
            except Exception:
                pass  # No banner on this page: continue

            details = _parse_game_page(BeautifulSoup(driver.page_source, 'html.parser'))

            # Progress / debugging output, one pair of lines per match
            print(f"Attendance Value: {details['Attendance']}")
            print(f"Date: {details['Date']}, Competition: {details['Competition']}, Opponent: {details['Opponent']}, Goals Scored: {details['Goals Scored']}, Goals Received: {details['Goals Received']}")

            # The "Year" key is kept for compatibility with the saved CSV layout.
            game_details.append({"Year": month_label, "URL": url, **details})
    finally:
        driver.quit()

    details_df = pd.DataFrame(game_details)
    details_df.to_csv("dynamo_game_details_complete.csv", index=False)
    return details_df

In [13]:
# Reload the saved schedule URLs and scrape the details of every listed match.
df = pd.read_csv("houston_dynamo_game_urls.csv")  # DataFrame with 'Month' and 'URL' columns
details_df = scrape_game_details(df)
# scrape_game_details writes dynamo_game_details_complete.csv, so report that exact name.
print("Scraping completed. Data saved to 'dynamo_game_details_complete.csv'.")

Attendance Value: 16,827
Date: Saturday March 2 2019, Competition: MLS Regular Season, Opponent: Salt Lake, Goals Scored: 1, Goals Received: 1
Attendance Value: nan
Date: Tuesday March 5 2019, Competition: CONCACAF Champions Cup, Opponent: Tigres, Goals Scored: 0, Goals Received: 2
Attendance Value: 12,601
Date: Saturday March 9 2019, Competition: MLS Regular Season, Opponent: Montréal, Goals Scored: 2, Goals Received: 1
Attendance Value: 13,549
Date: Saturday March 16 2019, Competition: MLS Regular Season, Opponent: Vancouver, Goals Scored: 3, Goals Received: 2
Attendance Value: 14,476
Date: Saturday April 13 2019, Competition: MLS Regular Season, Opponent: San Jose, Goals Scored: 2, Goals Received: 1
Attendance Value: 15,557
Date: Saturday April 27 2019, Competition: MLS Regular Season, Opponent: Columbus, Goals Scored: 2, Goals Received: 0
Attendance Value: 16,521
Date: Saturday May 4 2019, Competition: MLS Regular Season, Opponent: Dallas, Goals Scored: 2, Goals Received: 1
Attenda

In [14]:
# Preview the first scraped match rows.
details_df.head()

Unnamed: 0,Year,URL,Date,Competition,Opponent,Goals Scored,Goals Received,Attendance
0,March 2019,https://www.houstondynamofc.com/competitions/m...,Saturday March 2 2019,MLS Regular Season,Salt Lake,1,1,16827.0
1,March 2019,https://www.houstondynamofc.com/competitions/c...,Tuesday March 5 2019,CONCACAF Champions Cup,Tigres,0,2,
2,March 2019,https://www.houstondynamofc.com/competitions/m...,Saturday March 9 2019,MLS Regular Season,Montréal,2,1,12601.0
3,March 2019,https://www.houstondynamofc.com/competitions/m...,Saturday March 16 2019,MLS Regular Season,Vancouver,3,2,13549.0
4,April 2019,https://www.houstondynamofc.com/competitions/m...,Saturday April 13 2019,MLS Regular Season,San Jose,2,1,14476.0


# Playing with data

In [66]:
# Start the cleaning steps from the CSV written by scrape_game_details.
details_df = pd.read_csv("dynamo_game_details_complete.csv")

In [67]:
# The scraped 'Year' column actually holds "Month YYYY" labels (e.g. "March 2019").
# Split on the first space: the month name goes to 'Month', the year back into 'Year'.
# NOTE: this overwrites 'Year', so reload the CSV before running this cell again.
details_df[['Month', 'Year']] = details_df['Year'].str.split(' ', expand=True, n=1)

In [68]:
# Preview: 'Year' should now hold only the year and 'Month' the month name.
details_df.head()

Unnamed: 0,Year,URL,Date,Competition,Opponent,Goals Scored,Goals Received,Attendance,Month
0,2019,https://www.houstondynamofc.com/competitions/m...,Saturday March 2 2019,MLS Regular Season,Salt Lake,1.0,1.0,16827.0,March
1,2019,https://www.houstondynamofc.com/competitions/c...,Tuesday March 5 2019,CONCACAF Champions Cup,Tigres,0.0,2.0,,March
2,2019,https://www.houstondynamofc.com/competitions/m...,Saturday March 9 2019,MLS Regular Season,Montréal,2.0,1.0,12601.0,March
3,2019,https://www.houstondynamofc.com/competitions/m...,Saturday March 16 2019,MLS Regular Season,Vancouver,3.0,2.0,13549.0,March
4,2019,https://www.houstondynamofc.com/competitions/m...,Saturday April 13 2019,MLS Regular Season,San Jose,2.0,1.0,14476.0,April


In [69]:
# 'Date' text starts with the weekday (e.g. "Saturday March 2 2019"), so the first
# space-separated token is the day of week; rows without a date get NaN.
details_df['DOW'] = details_df['Date'].str.split(' ').str[0]

In [70]:
# Preview the new 'DOW' column.
details_df.head()

Unnamed: 0,Year,URL,Date,Competition,Opponent,Goals Scored,Goals Received,Attendance,Month,DOW
0,2019,https://www.houstondynamofc.com/competitions/m...,Saturday March 2 2019,MLS Regular Season,Salt Lake,1.0,1.0,16827.0,March,Saturday
1,2019,https://www.houstondynamofc.com/competitions/c...,Tuesday March 5 2019,CONCACAF Champions Cup,Tigres,0.0,2.0,,March,Tuesday
2,2019,https://www.houstondynamofc.com/competitions/m...,Saturday March 9 2019,MLS Regular Season,Montréal,2.0,1.0,12601.0,March,Saturday
3,2019,https://www.houstondynamofc.com/competitions/m...,Saturday March 16 2019,MLS Regular Season,Vancouver,3.0,2.0,13549.0,March,Saturday
4,2019,https://www.houstondynamofc.com/competitions/m...,Saturday April 13 2019,MLS Regular Season,San Jose,2.0,1.0,14476.0,April,Saturday


In [71]:
# Attendance is scraped as text with thousands separators (e.g. "16,827").
# Strip the commas and convert to float. The numeric branch keeps this cell
# safe to re-run and covers a CSV whose column was already parsed as numbers,
# where the .str accessor would raise AttributeError.
attendance_raw = details_df['Attendance']
if pd.api.types.is_numeric_dtype(attendance_raw):
    details_df['Attendance'] = attendance_raw.astype(float)
else:
    details_df['Attendance'] = attendance_raw.str.replace(',', '', regex=False).astype(float)

In [72]:
# Preview 'Attendance' after the numeric conversion.
details_df.head()

Unnamed: 0,Year,URL,Date,Competition,Opponent,Goals Scored,Goals Received,Attendance,Month,DOW
0,2019,https://www.houstondynamofc.com/competitions/m...,Saturday March 2 2019,MLS Regular Season,Salt Lake,1.0,1.0,16827.0,March,Saturday
1,2019,https://www.houstondynamofc.com/competitions/c...,Tuesday March 5 2019,CONCACAF Champions Cup,Tigres,0.0,2.0,,March,Tuesday
2,2019,https://www.houstondynamofc.com/competitions/m...,Saturday March 9 2019,MLS Regular Season,Montréal,2.0,1.0,12601.0,March,Saturday
3,2019,https://www.houstondynamofc.com/competitions/m...,Saturday March 16 2019,MLS Regular Season,Vancouver,3.0,2.0,13549.0,March,Saturday
4,2019,https://www.houstondynamofc.com/competitions/m...,Saturday April 13 2019,MLS Regular Season,San Jose,2.0,1.0,14476.0,April,Saturday


In [73]:
# Mean attendance per (Year, Month), kept as a flat frame with both keys as columns.
grouped_df = details_df.groupby(['Year', 'Month'], as_index=False)['Attendance'].mean()

In [74]:
# Turn the monthly means into a Series indexed by (Year, Month) so that a
# mean can be looked up with a (year, month) tuple.
attendance_mapping = grouped_df.set_index(['Year', 'Month'])['Attendance']

In [75]:
# Fill missing attendance with the mean attendance of the same (Year, Month).
# Joining on the two key columns aligns each row with its monthly mean without
# a row-by-row apply; months with no recorded attendance at all stay NaN.
monthly_mean = details_df.join(
    attendance_mapping.rename('MonthlyMeanAttendance'), on=['Year', 'Month']
)['MonthlyMeanAttendance']
details_df['Attendance'] = details_df['Attendance'].fillna(monthly_mean)

In [77]:
# Round to whole numbers: values filled from a monthly mean can be fractional.
details_df["Attendance"] = details_df["Attendance"].round(0)

In [78]:
# Preview 'Attendance' after filling and rounding.
details_df.head()

Unnamed: 0,Year,URL,Date,Competition,Opponent,Goals Scored,Goals Received,Attendance,Month,DOW
0,2019,https://www.houstondynamofc.com/competitions/m...,Saturday March 2 2019,MLS Regular Season,Salt Lake,1.0,1.0,16827.0,March,Saturday
1,2019,https://www.houstondynamofc.com/competitions/c...,Tuesday March 5 2019,CONCACAF Champions Cup,Tigres,0.0,2.0,14326.0,March,Tuesday
2,2019,https://www.houstondynamofc.com/competitions/m...,Saturday March 9 2019,MLS Regular Season,Montréal,2.0,1.0,12601.0,March,Saturday
3,2019,https://www.houstondynamofc.com/competitions/m...,Saturday March 16 2019,MLS Regular Season,Vancouver,3.0,2.0,13549.0,March,Saturday
4,2019,https://www.houstondynamofc.com/competitions/m...,Saturday April 13 2019,MLS Regular Season,San Jose,2.0,1.0,14476.0,April,Saturday


In [80]:
# Count the remaining missing values per column.
details_df.isna().sum()

Year              0
URL               0
Date              3
Competition       3
Opponent          3
Goals Scored      3
Goals Received    3
Attendance        2
Month             0
DOW               3
dtype: int64

In [81]:
# Show the rows whose attendance is still missing after the monthly-mean fill.
details_df[details_df['Attendance'].isnull()]

Unnamed: 0,Year,URL,Date,Competition,Opponent,Goals Scored,Goals Received,Attendance,Month,DOW
22,2020,https://www.houstondynamofc.com/competitions/m...,Friday August 21 2020,MLS Regular Season,Dallas,0.0,0.0,,August,Friday
68,2023,https://www.houstondynamofc.com/competitions/u...,,,,,,,February,


In [82]:
# Drop the rows whose attendance could not be filled from a monthly mean.
details_df = details_df.dropna(subset=['Attendance'])

In [83]:
# Re-check row and non-null counts after dropping rows without attendance.
details_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 115 entries, 0 to 116
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Year            115 non-null    object 
 1   URL             115 non-null    object 
 2   Date            113 non-null    object 
 3   Competition     113 non-null    object 
 4   Opponent        113 non-null    object 
 5   Goals Scored    113 non-null    float64
 6   Goals Received  113 non-null    float64
 7   Attendance      115 non-null    float64
 8   Month           115 non-null    object 
 9   DOW             113 non-null    object 
dtypes: float64(3), object(7)
memory usage: 9.9+ KB


In [84]:
# Drop rows where the match date is missing (the page yielded no date).
details_df = details_df.dropna(subset=['Date'])

In [85]:
# Re-check row and non-null counts after dropping rows without a date.
details_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 113 entries, 0 to 116
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Year            113 non-null    object 
 1   URL             113 non-null    object 
 2   Date            113 non-null    object 
 3   Competition     113 non-null    object 
 4   Opponent        112 non-null    object 
 5   Goals Scored    112 non-null    float64
 6   Goals Received  112 non-null    float64
 7   Attendance      113 non-null    float64
 8   Month           113 non-null    object 
 9   DOW             113 non-null    object 
dtypes: float64(3), object(7)
memory usage: 9.7+ KB


In [87]:
# Result per match: 'W' if goals scored > goals received, 'D' if equal, 'L' if fewer.
# Comparisons against NaN are always False, so the nested np.where alone would
# label a match with missing goals as 'L'; such rows get a missing Result instead.
goals_for = details_df['Goals Scored']
goals_against = details_df['Goals Received']
outcome = np.where(goals_for > goals_against, 'W',
                   np.where(goals_for == goals_against, 'D', 'L'))
has_score = goals_for.notna() & goals_against.notna()
# assign() builds a new frame, which avoids SettingWithCopyWarning on the
# frame returned by the earlier dropna calls.
details_df = details_df.assign(
    Result=pd.Series(outcome, index=details_df.index).where(has_score)
)

In [89]:
# Preview the new 'Result' column.
details_df.head()

Unnamed: 0,Year,URL,Date,Competition,Opponent,Goals Scored,Goals Received,Attendance,Month,DOW,Result
0,2019,https://www.houstondynamofc.com/competitions/m...,Saturday March 2 2019,MLS Regular Season,Salt Lake,1.0,1.0,16827.0,March,Saturday,D
1,2019,https://www.houstondynamofc.com/competitions/c...,Tuesday March 5 2019,CONCACAF Champions Cup,Tigres,0.0,2.0,14326.0,March,Tuesday,L
2,2019,https://www.houstondynamofc.com/competitions/m...,Saturday March 9 2019,MLS Regular Season,Montréal,2.0,1.0,12601.0,March,Saturday,W
3,2019,https://www.houstondynamofc.com/competitions/m...,Saturday March 16 2019,MLS Regular Season,Vancouver,3.0,2.0,13549.0,March,Saturday,W
4,2019,https://www.houstondynamofc.com/competitions/m...,Saturday April 13 2019,MLS Regular Season,San Jose,2.0,1.0,14476.0,April,Saturday,W


In [90]:
# Save the cleaned table to CSV without the index column.
details_df.to_csv("dynamo_games_initial.csv", index=False)