# Installing Libraries and tools

In [None]:
# All needed Libraries and tools
!pip install requests
!pip install beautifulsoup4
!pip install pandas
!pip3 install lxml
!pip install selenium==3.5

# Downloading HTML pages for all the seasons starting from 2003

In [1]:
# Standard-library modules: filesystem paths and pauses between page loads
import os
import time

# Selenium automates a real browser from Python: opening web pages,
# filling forms, clicking buttons, and extracting information
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

# Start a Chrome session through the chromedriver binary in the working directory
driver = webdriver.Chrome(executable_path="./chromedriver.exe")

In [11]:
# Starting year of every season to process: 2003 up to and including 2024
years = [*range(2003, 2025)]

In [10]:
import os
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

# Define the base directory for storing the data
base_directory = "Html_Data"

# Seconds to wait after a scroll or a click so the page can load new content
SCROLL_PAUSE_TIME = 2


def scroll_to_bottom(driver, pause=SCROLL_PAUSE_TIME):
    """Scroll the current page down until its height stops growing.

    Each pass jumps to the current bottom of the document (not to a fixed
    pixel offset), waits `pause` seconds, and stops once the document height
    is unchanged between two consecutive passes.
    """
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)  # Wait for the page to load new content
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height


# Create the base directory if it doesn't exist
os.makedirs(base_directory, exist_ok=True)

for year in years:

    # Define the full path for the season folder and make sure it exists
    season_folder = os.path.join(base_directory, f"Premier_League_{year}_{year+1}")
    os.makedirs(season_folder, exist_ok=True)

    # URL of the page by season
    url_historical_data = f"https://www.oddsportal.com/football/england/premier-league-{year}-{year+1}/results/"

    # URL of the live season page to be extracted separately
    ## url_historical_data = "https://www.oddsportal.com/football/england/premier-league/results/"

    driver.get(url_historical_data)

    # HTML source of every results page visited for this season
    data = []

    while True:
        # Scroll dynamically to the bottom of the page
        scroll_to_bottom(driver)

        # Wait for pagination; without it the page is not collected
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "pagination"))
            )
        except Exception as e:
            print(f"Error while waiting for pagination: {e}")
            break

        # Save the HTML content of the current page
        data.append(driver.page_source)

        # Locate and click the 'Next' button
        try:
            next_button = driver.find_element(By.XPATH, '//a[contains(@class, "pagination-link") and contains(text(), "Next")]')
            next_button.click()
        except NoSuchElementException:
            # No 'Next' link on the page: the last page has been reached
            print("No more pages to navigate.")
            break
        except Exception as e:
            # Any other failure also ends this season, but is reported as an
            # error instead of being mistaken for the end of the pagination
            print(f"Stopping pagination after unexpected error: {e}")
            break
        time.sleep(SCROLL_PAUSE_TIME)  # Wait for the next page to load

    # Save all HTML pages to the season's folder
    for i, page in enumerate(data):
        file_path = os.path.join(season_folder, f"Page_{i+1}.html")
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(page)


No more pages to navigate.


# Scraping HTML pages and creating a dataset

In [2]:
import os
import re
from bs4 import BeautifulSoup
import pandas as pd

# Path to the directory containing all folders
base_path = "./Html_Data"

# Directory that receives one CSV file per season
output_path = "./Data_By_Season"

# Exact class strings of the odds paragraphs that produce a result flag.
# The variant containing "gradient-green" is recorded as 1, the other as 0.
HIGHLIGHTED_ODD_CLASS = "height-content !text-black-main next-m:min-w-[100%] flex-center min-h-full min-w-[50px] gradient-green hover:!bg-gray-medium default-odds-bg-bgcolor border gradient-green-added-border"
PLAIN_ODD_CLASS = "height-content !text-black-main next-m:min-w-[100%] flex-center min-h-full min-w-[50px] hover:!bg-gray-medium default-odds-bg-bgcolor border gradient-green-added-border"

# Each match block has 6 entries: start time, two teams, three odds
FIELDS_PER_MATCH = 6


def _page_sort_key(file_name):
    """Sort key that compares the first number in the file name numerically.

    Plain string sorting would place "Page_10.html" before "Page_2.html".
    Files without a number are ordered last, by name.
    """
    number = re.search(r"\d+", file_name)
    return (int(number.group()) if number else float("inf"), file_name)


def _parse_results_page(soup):
    """Extract the matches of one saved results page.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed HTML of a saved page.

    Returns
    -------
    tuple(list[dict], list[list[str]]) or None
        ``(rows, skipped)`` where ``rows`` holds one dict per complete match
        and ``skipped`` holds the raw entries of matches that could not be
        assembled; ``None`` when the page has no ``min-h-[206px]`` container.
    """
    target_div = soup.find("div", class_="min-h-[206px]")
    if target_div is None:
        return None

    match_data = []  # paragraph texts: start time, teams, odds
    results = []     # one 0/1 flag per odds paragraph

    for p in target_div.find_all("p"):
        text = p.text.strip()
        # Join the class list to a string for easier checks
        class_attr = " ".join(p.get("class", []))

        # "-" marks a missing value and is stored as "0"
        match_data.append("0" if text == "-" else text)

        if class_attr == "height-content" and text == "-":
            results.append(0)  # placeholder for missing data
        elif class_attr == HIGHLIGHTED_ODD_CLASS:
            results.append(1)
        elif class_attr == PLAIN_ODD_CLASS:
            results.append(0)

    # Remove the first element (e.g., "England"); slicing is safe on an empty list
    match_data = match_data[1:]

    # Extract scores and group them into pairs (Team1Score, Team2Score)
    score_divs = soup.select('div[class^="min-mt:!hidden ml-auto mr-3 flex font-bold"]')
    scores = [div.text.strip() for div in score_divs]
    team_scores = [scores[i:i+2] for i in range(0, len(scores), 2)]

    # Group results into tuples of three (home, draw, away)
    tuples_of_three = [tuple(results[i:i+3]) for i in range(0, len(results), 3)]

    rows = []
    skipped = []
    for i in range(0, len(match_data), FIELDS_PER_MATCH):
        try:
            match_start_time = match_data[i]
            team1 = match_data[i+1]
            team2 = match_data[i+2]
            home_odd = match_data[i+3]
            draw_odd = match_data[i+4]
            away_odd = match_data[i+5]
            team1_score, team2_score = team_scores.pop(0)  # Extract corresponding scores
            home_odd_win, draw_odd_win, away_odd_win = tuples_of_three.pop(0)  # Extract win conditions
        except (IndexError, ValueError):
            # IndexError: missing entries / exhausted lists;
            # ValueError: a trailing score or result group that is too short to unpack
            print(f"Incomplete data for match starting at index {i}. Skipping.")
            skipped.append(match_data[i:i+6])  # Collect the incomplete match
            continue

        rows.append({
            "team1": team1,
            "team2": team2,
            "matchStartTime": match_start_time,
            "team1score": team1_score,
            "team2score": team2_score,
            "homeodd": home_odd,
            "drawodd": draw_odd,
            "awayodd": away_odd,
            "home_odd_win": home_odd_win,
            "draw_odd_win": draw_odd_win,
            "away_odd_win": away_odd_win,
        })

    return rows, skipped


# Matches that could not be assembled, across all seasons
skipped_matches = []

# One DataFrame per season; concatenated once after the loop
season_frames = []

# Make sure the CSV output directory exists before writing into it
os.makedirs(output_path, exist_ok=True)

# Iterate through all folders (each representing a season) in a stable order
for folder_name in sorted(os.listdir(base_path)):

    folder_path = os.path.join(base_path, folder_name)

    # Skip if not a directory
    if not os.path.isdir(folder_path):
        continue

    print(f"Processing folder: {folder_name}")

    # Collected match rows for the current folder
    all_data = []

    # Get all HTML files in the folder, ordered by page number
    html_files = sorted(
        (file for file in os.listdir(folder_path) if file.endswith(".html")),
        key=_page_sort_key,
    )

    # Process each HTML file
    for html_file in html_files:

        file_path = os.path.join(folder_path, html_file)

        # Load the HTML file (the parser reads the whole file on construction)
        with open(file_path, "r", encoding="utf-8") as f:
            soup = BeautifulSoup(f, "html.parser")

        parsed = _parse_results_page(soup)
        if parsed is None:
            # Nothing to extract from this page; move on instead of failing
            print(f"Target div not found in {file_path}. Skipping.")
            continue

        structured_data, page_skipped = parsed

        # Add page data to the all_data list
        all_data.extend(structured_data)
        skipped_matches.extend(page_skipped)

    # Convert to a DataFrame for the current folder
    season_data = pd.DataFrame(all_data)

    # Save the DataFrame to a CSV file
    csv_path = os.path.join(output_path, f"{folder_name}_data.csv")
    season_data.to_csv(csv_path, index=False)
    print(f"Saved data for {folder_name} to {csv_path}")

    season_frames.append(season_data)

# Combine all seasons in a single concat; fall back to an empty frame when
# no season folder was found (pd.concat rejects an empty list)
all_seasons_data = (
    pd.concat(season_frames, ignore_index=True) if season_frames else pd.DataFrame()
)

Processing folder: Premier_League_2003_2004
Saved data for Premier_League_2003_2004 to ./Data_By_Season\Premier_League_2003_2004_data.csv
Processing folder: Premier_League_2004_2005
Saved data for Premier_League_2004_2005 to ./Data_By_Season\Premier_League_2004_2005_data.csv
Processing folder: Premier_League_2005_2006
Saved data for Premier_League_2005_2006 to ./Data_By_Season\Premier_League_2005_2006_data.csv
Processing folder: Premier_League_2006_2007
Saved data for Premier_League_2006_2007 to ./Data_By_Season\Premier_League_2006_2007_data.csv
Processing folder: Premier_League_2007_2008
Saved data for Premier_League_2007_2008 to ./Data_By_Season\Premier_League_2007_2008_data.csv
Processing folder: Premier_League_2008_2009
Saved data for Premier_League_2008_2009 to ./Data_By_Season\Premier_League_2008_2009_data.csv
Processing folder: Premier_League_2009_2010
Saved data for Premier_League_2009_2010 to ./Data_By_Season\Premier_League_2009_2010_data.csv
Processing folder: Premier_League_

In [3]:
# Odds and result-flag columns: convert to numbers, turning unparseable values into NaN
columns_to_check = ["homeodd", "drawodd", "awayodd", "home_odd_win", "draw_odd_win", "away_odd_win"]
all_seasons_data[columns_to_check] = all_seasons_data[columns_to_check].apply(pd.to_numeric, errors='coerce')

# Keep every row except those where all six checked columns are exactly 0
all_seasons_data = all_seasons_data.loc[
    ~all_seasons_data[columns_to_check].eq(0).all(axis=1)
]

In [4]:
# Function to determine if the bookmaker predicted correctly
def bookmakers_prediction(row):
    """Return 1 if the outcome with the smallest odd is the one flagged as won, else 0.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide 'homeodd', 'drawodd', 'awayodd' and the flags
        'home_odd_win', 'draw_odd_win', 'away_odd_win'.

    Returns
    -------
    int
        1 when the smallest of the three odds equals the odd of the flagged
        outcome; 0 otherwise, including when no outcome is flagged or when
        any of the three odds is missing.
    """
    odds = (row['homeodd'], row['drawodd'], row['awayodd'])

    # Missing odds (stored as 0 upstream, or NaN after numeric coercion) make
    # the favourite unknowable. `odd > 0` is False for both 0 and NaN, so this
    # check rejects them without depending on the argument order of min().
    if any(not (odd > 0) for odd in odds):
        return 0

    # Find the smallest odd
    smallest_odd = min(odds)

    # Determine the winning odd
    if row['home_odd_win'] == 1:
        winning_odd = row['homeodd']
    elif row['draw_odd_win'] == 1:
        winning_odd = row['drawodd']
    elif row['away_odd_win'] == 1:
        winning_odd = row['awayodd']
    else:
        return 0  # No winning odd is flagged

    # Compare smallest odd with winning odd
    return 1 if smallest_odd == winning_odd else 0

# Evaluate every row and store the 0/1 outcome in a new column
all_seasons_data['bookmakers_prediction'] = all_seasons_data.apply(
    bookmakers_prediction, axis="columns"
)

# Write the combined data for all seasons to disk, without the index column
all_seasons_data.to_csv("all_seasons_data.csv", index=False)
print("Saved combined data for all seasons to all_seasons_data.csv")

# Show the first 50 rows of the updated DataFrame
display(all_seasons_data.head(50))

Saved combined data for all seasons to all_seasons_data.csv


Unnamed: 0,team1,team2,matchStartTime,team1score,team2score,homeodd,drawodd,awayodd,home_odd_win,draw_odd_win,away_odd_win,bookmakers_prediction
0,Arsenal,Leicester,16:00,2,1,1.26,4.57,9.0,1,0,0,1
1,Aston Villa,Manchester Utd,16:00,0,2,2.03,3.28,3.1,0,0,1,0
2,Blackburn,Birmingham,16:00,1,1,2.08,3.17,3.1,0,1,0,0
3,Bolton,Fulham,16:00,0,2,1.95,3.22,3.37,0,0,1,0
4,Charlton,Southampton,16:00,2,1,2.0,3.22,3.25,1,0,0,1
5,Chelsea,Leeds,16:00,1,0,1.34,4.04,7.83,1,0,0,1
6,Liverpool,Newcastle,16:00,1,1,2.03,3.25,3.13,0,1,0,0
7,Manchester City,Everton,16:00,5,1,1.97,3.2,3.35,1,0,0,1
8,Portsmouth,Middlesbrough,16:00,5,1,2.08,3.18,3.08,1,0,0,1
9,Wolves,Tottenham,16:00,0,2,2.26,3.25,2.66,0,0,1,0
