In [33]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common import exceptions  
import time
import re

### Premier League 2023-2024 xG

In [29]:
# Download the 2023-2024 Premier League "Scores and Fixtures" page from fbref
# and parse it into a BeautifulSoup tree for the next cell.
url = "https://fbref.com/en/comps/9/2023-2024/schedule/2023-2024-Premier-League-Scores-and-Fixtures"

# A timeout keeps the cell from hanging forever if the server never answers.
page = requests.get(url, timeout=30)

# Stop here on 4xx/5xx responses instead of parsing an error page as if it
# were the fixtures table.
page.raise_for_status()

soup = BeautifulSoup(page.content, 'html.parser')

In [15]:
# Build `df` from the first <tbody> found in `soup` (assumes the fixtures
# table is the first/only table body on the page).
tbody = soup.find('tbody')

# Labels for the <td> cells of each row, in the order they appear on the page.
columns = ['Day', 'Date', 'Time', 'Home', 'xG_Home', 'Score', 'xG_Away', 'Away', 'Attendance', 'Venue', 'Referee', 'Match Report', 'Notes']

# One list of stripped cell texts per table row; rows without any <td> cells
# are skipped.
data = []
if tbody:
    for row in tbody.find_all('tr'):
        cells = row.find_all('td')
        if cells:
            data.append([cell.get_text(strip=True) for cell in cells])

# Fail loudly when nothing was scraped. Only printing a message would leave
# `df` undefined on a fresh kernel (NameError below) or, worse, leave a stale
# `df` from an earlier run to be displayed and saved by the following cells.
if not data:
    raise ValueError("No data found in the <tbody>.")

df = pd.DataFrame(data, columns=columns)

df.head()

Unnamed: 0,Day,Date,Time,Home,xG_Home,Score,xG_Away,Away,Attendance,Venue,Referee,Match Report,Notes
0,Fri,2023-08-11,20:00,Burnley,0.3,0–3,1.9,Manchester City,21572,Turf Moor,Craig Pawson,Match Report,
1,Sat,2023-08-12,12:30,Arsenal,0.8,2–1,1.2,Nott'ham Forest,59984,Emirates Stadium,Michael Oliver,Match Report,
2,Sat,2023-08-12,15:00,Everton,2.7,0–1,1.5,Fulham,39940,Goodison Park,Stuart Attwell,Match Report,
3,Sat,2023-08-12,15:00,Sheffield Utd,0.5,0–1,1.9,Crystal Palace,31194,Bramall Lane,John Brooks,Match Report,
4,Sat,2023-08-12,15:00,Brighton,4.0,4–1,1.5,Luton Town,31872,The American Express Community Stadium,David Coote,Match Report,


In [16]:
# Write the scraped fixtures table to the raw-data folder, without the index column.
df.to_csv(path_or_buf='../data/raw/xG.csv', index=False)

### Premier League 2023-2024 Head-to-Head

In [57]:
# Scrape the head-to-head record for every match listed on the FotMob
# Premier League 2023-2024 fixture pages and collect them in `head_to_head_df`.
#
# NOTE: extract_teams() and extract_results() are defined in later cells of this
# notebook; on a fresh kernel those cells must be run before this one.

# Selectors used on the FotMob pages (class names copied from the site markup).
MATCH_LINK_SELECTOR = "a.css-s4hjf6-MatchWrapper.e1ek4pst2"
H2H_BUTTON_XPATH = "//button[text()='Head-to-Head']"
H2H_NUMBERS_SELECTOR = "div.css-1e2kjkx-H2hNumbers.e3lka4k2"

# One DataFrame per fixture page. Frames are concatenated once after the loop
# instead of re-copying a growing frame on every iteration; if a later page
# fails, the pages scraped so far are still available in this list.
page_frames = []

for pg_num in range(0, 37):
    data = []

    driver = webdriver.Chrome()
    try:
        url = f"https://www.fotmob.com/leagues/47/matches/premier-league?season=2023-2024&page={pg_num}"

        driver.get(url)

        # Wait for the page to load and JavaScript-rendered content to appear
        WebDriverWait(driver, 15).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, MATCH_LINK_SELECTOR))
        )

        # Get all anchor tags matching the specified class
        anchor_tags = driver.find_elements(By.CSS_SELECTOR, MATCH_LINK_SELECTOR)

        # Iterate through the links
        for idx in range(len(anchor_tags)):
            # Re-fetch the list of anchor tags before each click to avoid stale references
            anchor_tags = driver.find_elements(By.CSS_SELECTOR, MATCH_LINK_SELECTOR)
            link = anchor_tags[idx]

            # Read the href and visible text before clicking; the element is
            # gone once the browser navigates away.
            href = link.get_attribute("href")
            link_text = link.text
            print(f"Visiting link {idx + 1}: {href}")

            # Open the link
            link.click()

            # Wait for the match page to load (the Head-to-Head tab button must exist)
            WebDriverWait(driver, 45).until(
                EC.presence_of_element_located((By.XPATH, H2H_BUTTON_XPATH))
            )

            time.sleep(3)

            # Wait for the "Head-to-Head" button to be clickable
            head_to_head = WebDriverWait(driver, 15).until(
                EC.element_to_be_clickable((By.XPATH, H2H_BUTTON_XPATH))
            )
            head_to_head.click()

            # Wait for the wins/draws/wins summary to appear and scrape its text
            wins_draws_wins = WebDriverWait(driver, 15).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, H2H_NUMBERS_SELECTOR))
            )
            head_to_head_data = wins_draws_wins.text

            # Append the scraped data as a dictionary to the list
            data.append({
                "Match": link_text,
                "Head-to-Head Data": head_to_head_data
            })

            # Navigate back to the fixture list
            driver.back()

            # Wait for the anchor tags to reappear on the main page
            WebDriverWait(driver, 45).until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, MATCH_LINK_SELECTOR))
            )
    finally:
        # Always close the browser, even when a wait times out or a click
        # fails; otherwise every failed page leaves a Chrome process behind.
        driver.quit()

    # Flatten the multi-line element texts into single lines and drop the
    # " FT" marker from the match text.
    cleaned_data = [
        {
            'Match': entry['Match'].replace('\n', ' ').replace(' FT',''),
            'Head-to-Head Data': entry['Head-to-Head Data'].replace('\n', ' ')
        }
        for entry in data
    ]

    temp_df = pd.DataFrame(cleaned_data, columns = ['Match','Head-to-Head Data'])

    # Apply extraction
    temp_df[['Team1', 'Team2']] = temp_df['Match'].apply(lambda x: pd.Series(extract_teams(x)))
    temp_df[['Wins1', 'Draws', 'Wins2']] = temp_df['Head-to-Head Data'].apply(lambda x: pd.Series(extract_results(x), dtype=int))
    temp_df = temp_df.drop(columns='Head-to-Head Data')

    page_frames.append(temp_df)

# Each page keeps its own 0-based index, so index labels repeat across pages.
head_to_head_df = pd.concat(page_frames) if page_frames else pd.DataFrame()

head_to_head_df

Visiting link 1: https://www.fotmob.com/matches/burnley-vs-manchester-city/2ai7j8#4193450
Visiting link 2: https://www.fotmob.com/matches/arsenal-vs-nottingham-forest/3bf4p3#4193451
Visiting link 3: https://www.fotmob.com/matches/west-ham-united-vs-afc-bournemouth/2hfnrc#4193452
Visiting link 4: https://www.fotmob.com/matches/luton-town-vs-brighton-hove-albion/2ug16v#4193453
Visiting link 5: https://www.fotmob.com/matches/everton-vs-fulham/2uevfp#4193454
Visiting link 6: https://www.fotmob.com/matches/sheffield-united-vs-crystal-palace/2tph1s#4193455
Visiting link 7: https://www.fotmob.com/matches/aston-villa-vs-newcastle-united/3h9v0m#4193456
Visiting link 8: https://www.fotmob.com/matches/tottenham-hotspur-vs-brentford/2u5c8b#4193457
Visiting link 9: https://www.fotmob.com/matches/chelsea-vs-liverpool/2f3vr7#4193458
Visiting link 10: https://www.fotmob.com/matches/wolverhampton-wanderers-vs-manchester-united/2xx4sz#4193459
Visiting link 1: https://www.fotmob.com/matches/sheffield-uni

Unnamed: 0,Match,Team1,Team2,Wins1,Draws,Wins2
0,Burnley 0 - 3 Manchester City,Burnley,Manchester City,1,2,15
1,Arsenal 2 - 1 Nottingham Forest,Arsenal,Nottingham Forest,3,0,3
2,AFC Bournemouth 1 - 1 West Ham United,AFC Bournemouth,West Ham United,5,3,6
3,Brighton & Hove Albion 4 - 1 Luton Town,Brighton & Hove Albion,Luton Town,1,0,1
4,Everton 0 - 1 Fulham,Everton,Fulham,9,3,4
...,...,...,...,...,...,...
9,Crystal Palace 5 - 0 Aston Villa,Crystal Palace,Aston Villa,6,2,7
10,Liverpool 2 - 0 Wolverhampton Wanderers,Liverpool,Wolverhampton Wanderers,15,1,4
11,Luton Town 2 - 4 Fulham,Luton Town,Fulham,0,2,3
12,Manchester City 3 - 1 West Ham United,Manchester City,West Ham United,24,5,2


In [39]:
# Split a match string into its two team names
def extract_teams(match):
    """Return the two team names found around a "<n> - <n>" score.

    The text before the score is the first name and everything after the
    score is the second. Returns ``(None, None)`` when `match` does not
    contain a score in that form.
    """
    score_split = re.match(r'(.+?)\s\d+\s-\s\d+\s(.+)', match)
    return score_split.groups() if score_split else (None, None)

In [47]:
# Pull the wins / draws / wins counts out of a head-to-head summary
def extract_results(record):
    """Parse a "<n> Wins <n> Draws <n> Wins" summary string.

    Returns the three counts as digit strings, in the order they appear
    (the caller stores them as Wins1, Draws, Wins2). The pattern must match
    at the start of `record`; otherwise ``(None, None, None)`` is returned.
    """
    parsed = re.match(r'(\d+)\sWins\s(\d+)\sDraws\s(\d+)\sWins', record)
    if parsed is None:
        return None, None, None
    return parsed.groups()

In [62]:
# Collapse the whitespace around the score so that e.g. "0 - 3" becomes "0-3".
head_to_head_df['Match'] = head_to_head_df['Match'].map(
    lambda fixture: re.sub(r'(\d+)\s*-\s*(\d+)', r'\1-\2', fixture)
)

In [63]:
# Write the head-to-head table to the raw-data folder, without the index column.
head_to_head_df.to_csv(path_or_buf='../data/raw/head_to_head_df.csv', index=False)