<a href="https://colab.research.google.com/github/leakysam/Soccer-Scraper/blob/main/Soccer_Scrapper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from datetime import datetime, timedelta

# Function to format date as needed in the URL
def format_date_url(date):
    return date.strftime("%Y-%m-%d")

# Specify the starting date
start_date = datetime(2024, 1, 1)

# Number of days to scrape
num_days = 21

# Create an empty list to store the extracted data
all_data = []

# Iterate over the specified number of days
for day in range(num_days):
    # Calculate the current date
    current_date = start_date + timedelta(days=day)

    # Construct the URL with the dynamic date
    url = f"https://www.forebet.com/en/football-predictions/under-over-25-goals/{format_date_url(current_date)}"

    # Set the user-agent header
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }

    # Send an HTTP request to the URL with the specified headers
    response = requests.get(url, headers=headers)

    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        try:
            # Parse the HTML content of the page
            soup = BeautifulSoup(response.text, 'html.parser')

            # Extract data from specific HTML elements
            team_divs = soup.find_all('div', class_='tnms')
            avg_goals_divs = soup.find_all('div', class_='avg_sc tabonly')

            # Extract coefficients from both div and span structures
            coef_values = []
            coef_divs = soup.find_all('div', class_='bigOnly prmod')
            for div in coef_divs:
                span = div.find('span', class_='lscrsp')
                coef = div.find('span', class_='lscrsp') or div.find('span')
                match = re.search(r'\d+\.\d+', coef.get_text(strip=True)) if coef else None
                coef_values.append(match.group() if match else 'N/A')

            # Extract score values from both div and span structures
            score_values = []
            score_divs = soup.find_all('div', class_='lscr_td')
            for div in score_divs:
                b = div.find('b', class_='l_scr')
                score_text = b.get_text(strip=True) if b else 'N/A'
                score_values.append(score_text)

            # Create lists to store the extracted data
            teams = [div.get_text(strip=True) for div in team_divs]
            avg_goals = [div.get_text(strip=True) for div in avg_goals_divs]

            # Determine the minimum length of all lists
            min_length = min(len(teams), len(avg_goals), len(coef_values), len(score_values))

            # Append the extracted data to the list
            for i in range(min_length):
                data = {
                    'Date': format_date_url(current_date),
                    'Team': teams[i],
                    'Avg. Goals': avg_goals[i],
                    'Coef. Value': coef_values[i],
                    'Score': score_values[i]
                }
                all_data.append(data)

        except AttributeError as e:
            print(f"Error extracting data: {e}")
    else:
        print(f"Error: Unable to fetch data. Status code: {response.status_code}")

# Convert the list of dictionaries to a Pandas DataFrame
df = pd.DataFrame(all_data)

# Save the DataFrame to an Excel file
output_file = 'output_data.xlsx'
df.to_excel(output_file, index=False)

print(f"Data has been successfully scraped and stored in '{output_file}'")


Data has been successfully scraped and stored in 'output_data.xlsx'


In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime, timedelta

# Function to format date as needed in the URL
def format_date_url(date):
    return date.strftime("%Y-%m-%d")

# Specify the starting date
start_date = datetime(2024, 1, 1)

# Number of days to scrape
num_days = 21

# Create an empty list to store the extracted data
all_data = []

# Iterate over the specified number of days
for day in range(num_days):
    # Calculate the current date
    current_date = start_date + timedelta(days=day)

    # Construct the URL with the dynamic date
    url = f"https://www.forebet.com/en/football-predictions/corners/{format_date_url(current_date)}"

    # Set the user-agent header
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }

    # Send an HTTP request to the URL with the specified headers
    response = requests.get(url, headers=headers)

    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        try:
            # Parse the HTML content of the page
            soup = BeautifulSoup(response.text, 'html.parser')

            # Extract data from specific HTML elements
            team_divs = soup.find_all('div', class_='tnms')
            avg_corners_divs = soup.find_all('div', class_='avg_sc tabonly')
            corners_divs = soup.find_all('div', class_='lscr_td lResTdSmall')

            # Extract Avg. Corners values
            avg_corners_values = [div.get_text(strip=True) for div in avg_corners_divs]

            # Extract Corners values
            corners_values = []
            for div in corners_divs:
                b = div.find('b', class_='l_scr')
                corners_text = b.get_text(strip=True) if b else 'N/A'
                corners_values.append(corners_text)

            # Create lists to store the extracted data
            teams = [div.get_text(strip=True) for div in team_divs]

            # Determine the minimum length of all lists
            min_length = min(len(teams), len(avg_corners_values), len(corners_values))

            # Append the extracted data to the list
            for i in range(min_length):
                data = {
                    'Date': format_date_url(current_date),
                    'Home Team': teams[i].split('vs.')[0].strip(),
                    'Away Team': teams[i].split('vs.')[1].strip() if 'vs.' in teams[i] else 'N/A',
                    'Avg. Corners': avg_corners_values[i],
                    'Corners': corners_values[i]
                }
                all_data.append(data)

        except AttributeError as e:
            print(f"Error extracting data: {e}")
    else:
        print(f"Error: Unable to fetch data. Status code: {response.status_code}")


# Convert the list of dictionaries to a Pandas DataFrame
df = pd.DataFrame(all_data)

# Print the DataFrame to see if the data looks correct
print(df)

# Save the DataFrame to an Excel file
output_file = 'output_corners.xlsx'
df.to_excel(output_file, index=False)

print(f"Data has been successfully scraped and stored in '{output_file}'")



           Date                                    Home Team Away Team  \
0    2024-01-01                           Home teamAway team       N/A   
1    2024-01-01     WS WanderersMacarthur FC01/01/2024 07:00       N/A   
2    2024-01-01  SunderlandPreston North End01/01/2024 13:30       N/A   
3    2024-01-01    Swansea CityWest Bromwich01/01/2024 16:00       N/A   
4    2024-01-01       Stoke CityIpswich Town01/01/2024 16:00       N/A   
..          ...                                          ...       ...   
428  2024-01-21             GD ChavesRio Ave21/01/2024 19:00       N/A   
429  2024-01-21   Aris SalonicaOlympiacos FC21/01/2024 19:30       N/A   
430  2024-01-21          US LecceJuventus FC21/01/2024 20:45       N/A   
431  2024-01-21          Girona FCSevilla FC21/01/2024 21:00       N/A   
432  2024-01-21      Elche CFReal Valladolid21/01/2024 21:00       N/A   

     Avg. Corners Corners  
0    Avg. corners       9  
1           11.98       5  
2           10.45       8  