<a href="https://colab.research.google.com/github/leakysam/Soccer-Scraper/blob/League-%26-HT-score/Scrapper_Branch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from datetime import datetime, timedelta
import re
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Function to format date as needed in the URL
def format_date_url(date):
    """Return *date* rendered as ``YYYY-MM-DD``, the form placed in the page URL."""
    return f"{date:%Y-%m-%d}"

# Specify the starting date
start_date = datetime(2024, 1, 1)

# Number of days to scrape
num_days = 21

# Seconds to wait for the server before giving up on a request; without a
# timeout a single stalled connection would hang the whole run
request_timeout = 30

# Set the user-agent header (built once: it is identical for every request)
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

# Matches the first decimal number found in a coefficient cell's text
coef_pattern = re.compile(r'\d+\.\d+')

# Create an empty list to store the extracted data
all_data = []

# Iterate over the specified number of days
for day in range(num_days):
    # Calculate the current date and its URL form
    current_date = start_date + timedelta(days=day)
    date_str = format_date_url(current_date)

    # Construct the URL with the dynamic date
    url = f"https://www.forebet.com/en/football-predictions/under-over-25-goals/{date_str}"

    # Send an HTTP request to the URL with the specified headers.
    # A network failure on one day must not discard the rows already
    # collected for the other days, so report it and move on.
    try:
        response = requests.get(url, headers=headers, timeout=request_timeout)
    except requests.RequestException as e:
        print(f"Error: Unable to fetch data for {date_str}: {e}")
        continue

    # Skip the day if the request was not successful (status code 200)
    if response.status_code != 200:
        print(f"Error: Unable to fetch data. Status code: {response.status_code}")
        continue

    try:
        # Parse the HTML content of the page
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract team names and average goals as plain text
        teams = [div.get_text(strip=True) for div in soup.find_all('div', class_='tnms')]
        avg_goals = [div.get_text(strip=True) for div in soup.find_all('div', class_='avg_sc tabonly')]

        # Extract coefficients: prefer the 'lscrsp' span, fall back to the
        # first span in the cell, and record 'N/A' when no decimal is present
        coef_values = []
        for div in soup.find_all('div', class_='bigOnly prmod'):
            coef_span = div.find('span', class_='lscrsp') or div.find('span')
            match = coef_pattern.search(coef_span.get_text(strip=True)) if coef_span else None
            coef_values.append(match.group() if match else 'N/A')

        # Extract full-time and half-time scores in one pass; both are read
        # from the same 'lscr_td' cells
        score_values = []
        ht_values = []
        for div in soup.find_all('div', class_='lscr_td'):
            score_tag = div.find('b', class_='l_scr')
            score_values.append(score_tag.get_text(strip=True) if score_tag else 'N/A')
            ht_span = div.find('span', class_='ht_scr')
            ht_values.append(ht_span.get_text(strip=True) if ht_span else 'N/A')

        # Extract league_country short tags
        league_country_values = []
        for div in soup.find_all('div', class_='shortagDiv tghov'):
            short_tag_span = div.find('span', class_='shortTag')
            league_country_values.append(short_tag_span.get_text(strip=True) if short_tag_span else 'N/A')

        # Append the extracted data to the list.
        # NOTE: columns are paired purely by position and zip() stops at the
        # shortest list, so if any selector matches more or fewer elements
        # than the others, rows are silently dropped or misaligned.
        for team, avg_goal, coef_value, score, ht_score, league_country in zip(
            teams, avg_goals, coef_values, score_values, ht_values, league_country_values
        ):
            all_data.append({
                'Date': date_str,
                'Team': team,
                'Avg. Goals': avg_goal,
                'Coef. Value': coef_value,
                'Score': score,
                'HT Score': ht_score,
                'League_Country': league_country,
            })

    except AttributeError as e:
        print(f"Error extracting data: {e}")

# Convert the list of dictionaries to a Pandas DataFrame
df = pd.DataFrame(all_data)

# Save the DataFrame to an Excel file
output_file = 'output_data.xlsx'
df.to_excel(output_file, index=False)

print(f"Data has been successfully scraped and stored in '{output_file}'")
