In [206]:
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
from bs4 import BeautifulSoup
import requests
import time
import random
from collections import Counter
from itertools import groupby
from concurrent.futures import ThreadPoolExecutor


In [227]:
# Load fotmob_odds_df.csv, using the file's first column as the row index.
# The 'url' column of this frame drives the scraping further down.
flat_file = pd.read_csv('fotmob_odds_df.csv', index_col = [0])

In [228]:
def calculate_streaks(form):
    """Return the win, loss and winless streak lengths found at the start of ``form``.

    ``form`` is walked from its last element to its first, and each counter is
    reset whenever a result breaks that kind of streak.  The values left once
    the walk finishes therefore describe the unbroken run beginning at
    ``form[0]``.

    Args:
        form: Reversible sequence of result codes: ``'W'`` (win), ``'L'``
            (loss) or ``'D'`` (draw).  Any other value is skipped; it neither
            extends nor breaks a streak.

    Returns:
        Tuple ``(win_streak, loss_streak, winless_streak)``, where
        ``winless_streak`` counts consecutive draws and losses.

    NOTE(review): because the run at index 0 is what gets reported, this is
    the *current* streak only if ``form`` is ordered most-recent-first --
    confirm against the order in which the scraper returns matches.
    """
    win_streak = 0
    loss_streak = 0
    winless_streak = 0

    for result in reversed(form):
        if result == 'W':
            win_streak += 1
            loss_streak = 0
            winless_streak = 0
        elif result == 'L':
            loss_streak += 1
            winless_streak += 1
            win_streak = 0
        elif result == 'D':
            # A draw ends both a winning and a losing run but keeps the
            # winless run going.
            winless_streak += 1
            win_streak = 0
            loss_streak = 0

    return win_streak, loss_streak, winless_streak

def calculate_team_stats(matches, team):
    """Aggregate results, goals and streaks for ``team`` over ``matches``.

    Args:
        matches: Sequence of dicts, each with a ``'home_team'`` name and a
            ``'score'`` string formatted as ``'<home goals> - <away goals>'``.
        team: Name of the team the statistics are computed for.  In any match
            whose ``'home_team'`` differs from ``team`` the team is treated
            as the away side.

    Returns:
        Dict of totals, per-match ratios and averages (``0`` when ``matches``
        is empty) and the streaks reported by ``calculate_streaks``.
    """
    # Re-express every score as (goals for, goals against) from the team's side.
    scorelines = []
    for fixture in matches:
        played_at_home = fixture['home_team'] == team
        home_goals, away_goals = map(int, fixture['score'].split(' - '))
        if played_at_home:
            scorelines.append((home_goals, away_goals))
        else:
            scorelines.append((away_goals, home_goals))

    form = [
        'W' if scored > conceded else 'L' if scored < conceded else 'D'
        for scored, conceded in scorelines
    ]
    wins = form.count('W')
    draws = form.count('D')
    losses = form.count('L')

    goals_scored = sum(scored for scored, _ in scorelines)
    goals_conceded = sum(conceded for _, conceded in scorelines)
    clean_sheets = sum(1 for _, conceded in scorelines if conceded == 0)
    failed_to_score = sum(1 for scored, _ in scorelines if scored == 0)

    win_streak, loss_streak, winless_streak = calculate_streaks(form)

    played = len(matches)

    def per_match(total):
        # Every ratio/average falls back to 0 when there are no matches.
        return total / played if played else 0

    return {
        'goals_scored': goals_scored,
        'goals_conceded': goals_conceded,
        'goal_difference': goals_scored - goals_conceded,
        'wins': wins,
        'draws': draws,
        'losses': losses,
        'points_gained': 3 * wins + draws,
        'win_ratio': per_match(wins),
        'draw_ratio': per_match(draws),
        'loss_ratio': per_match(losses),
        'win_streak': win_streak,
        'loss_streak': loss_streak,
        'winless_streak': winless_streak,
        'average_goals_scored': per_match(goals_scored),
        'average_goals_conceded': per_match(goals_conceded),
        'clean_sheets': clean_sheets,
        'failed_to_score': failed_to_score,
        'scoring_ratio': per_match(played - failed_to_score),
        'conceding_ratio': per_match(played - clean_sheets),
    }


def _team_in_every_match(matches, side):
    """Return the one team that takes part in every match of ``matches``.

    ``side`` is only used to label the error message.

    Raises:
        IndexError: if ``matches`` is empty, or if zero or several teams
            appear in all of them.
    """
    appearances = Counter()
    for match in matches:
        # A set keeps a team from being counted twice for a single match.
        appearances.update({match['home_team'], match['away_team']})

    ever_present = [team for team, seen in appearances.items() if seen == len(matches)]
    if len(ever_present) != 1:
        # IndexError (rather than ValueError) matches what the former
        # teams[0] / teams[1] lookup raised on unusable data.
        raise IndexError(
            f"cannot identify the {side} team: expected exactly one team in all "
            f"{len(matches)} of its matches, found {ever_present}"
        )
    return ever_present[0]


def summarize_team_performance(data):
    """Build prefixed form features for both sides of a fixture.

    ``data[:5]`` is taken as the home side's matches and ``data[5:10]`` as the
    away side's.  Each team is identified from its own slice (the team present
    in every match of that slice) rather than from a frequency ranking over
    all ten matches, whose tie order depends on which name is seen first and
    could pair a team with the other side's matches.

    Args:
        data: Sliceable sequence of match dicts with ``'home_team'``,
            ``'away_team'`` and ``'score'`` keys.

    Returns:
        Dict holding the output of ``calculate_team_stats`` for each side, with
        keys prefixed ``'home_form_'`` and ``'away_form_'`` respectively.

    Raises:
        IndexError: if either slice is empty or does not single out one team.
    """
    home_matches, away_matches = data[:5], data[5:10]

    home_team = _team_in_every_match(home_matches, 'home')
    away_team = _team_in_every_match(away_matches, 'away')

    home_form = calculate_team_stats(home_matches, home_team)
    away_form = calculate_team_stats(away_matches, away_team)

    home_form = {'home_form_' + key: value for key, value in home_form.items()}
    away_form = {'away_form_' + key: value for key, value in away_form.items()}

    return {**home_form, **away_form}

In [229]:
def scrape_form(url, timeout=30):
    """Download ``url`` and extract up to ten team-form results from it.

    Args:
        url: Address of the page to download.
        timeout: Seconds ``requests`` waits for the server before giving up,
            so a stalled connection cannot block a worker indefinitely.

    Returns:
        List of at most 10 dicts with keys ``'home_team'``, ``'away_team'``
        and ``'score'``, in the order the results are found in the page.

    Raises:
        requests.RequestException: on a connection error, a time-out, or a
            4xx/5xx response (via ``raise_for_status``).
    """
    # CSS class strings the form links, team names and score boxes are looked up by.
    form_link_classes = ['right css-skyz2k-TeamFormContainer e3w5gu46',
                         'left css-skyz2k-TeamFormContainer e3w5gu46',
                         'right css-1ac4ee9-TeamFormContainer e3w5gu46',
                         'left css-1ac4ee9-TeamFormContainer e3w5gu46']
    team_name_class = 'css-1lje8ql-TeamName e3w5gu40'
    result_box_classes = ['css-la90e9-ResultBox ecz4wo12',
                          'css-udltjo-ResultBox ecz4wo12',
                          'css-1ef1lvo-ResultBox ecz4wo12']
    max_matches = 10

    page = requests.get(url, timeout=timeout)
    # Fail loudly on an error page instead of parsing it into an empty list.
    page.raise_for_status()

    # Pause for 0, 1 or 2 whole seconds (randint includes both ends) to avoid
    # getting banned.
    time.sleep(random.randint(0, 2))

    soup = BeautifulSoup(page.content, 'html.parser')

    matches = []
    for ul_element in soup.find_all('ul'):
        for li_element in ul_element.find_all('li'):
            a_element = li_element.find('a', {'class': form_link_classes})
            if a_element is None:
                continue

            teams = a_element.find_all('span', class_=team_name_class)
            score_div = a_element.find('div', {'class': result_box_classes})
            if score_div is None or len(teams) != 2:
                continue

            matches.append({
                'home_team': teams[0].text,
                'away_team': teams[1].text,
                'score': score_div.span.text,
            })
            # Only the first ten results are used, so stop scanning early.
            if len(matches) == max_matches:
                return matches

    return matches

In [230]:
def process_url(url):
    """Scrape one match page and summarise both teams' form.

    Returns a dict with the input ``'url'`` and the ``'stats'`` produced by
    ``summarize_team_performance`` from the matches ``scrape_form`` found.
    """
    return {'url': url, 'stats': summarize_team_performance(scrape_form(url))}

def chunked_scrape(urls, chunk_size=50, sleep_interval=2, max_workers=5):
    """Run ``process_url`` over ``urls`` in chunks, pausing between chunks.

    Args:
        urls: List of page URLs to process.
        chunk_size: Number of URLs handed to the thread pool at a time.
        sleep_interval: Seconds to sleep after every chunk except the last.
        max_workers: Number of threads used within each chunk.

    Returns:
        List with one ``process_url`` result per URL, in the same order as
        ``urls``.
    """
    num_urls = len(urls)
    stats_list = []

    for start in range(0, num_urls, chunk_size):
        # Cap the upper bound so the last progress line does not overshoot.
        stop = min(start + chunk_size, num_urls)
        print(f"processing {start} to {stop}...")

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            # Executor.map yields results in input order, which keeps the
            # output aligned with ``urls``.
            chunk_stats = list(executor.map(process_url, urls[start:stop]))

        stats_list.extend(chunk_stats)
        if stop < num_urls:
            time.sleep(sleep_interval)

    return stats_list

# Scrape the page behind every URL in the loaded table.
urls = flat_file['url'].tolist()
scrape_results = chunked_scrape(urls)

# Split the {'url': ..., 'stats': ...} records into two parallel lists.
url_list = [result['url'] for result in scrape_results]
stats_list = [result['stats'] for result in scrape_results]

# One row of form features per URL, with the URL added as the last column.
stats_df = pd.DataFrame(stats_list).assign(url=url_list)

Processing 0 to 50...
Processing 50 to 100...
Processing 100 to 150...
Processing 150 to 200...
Processing 200 to 250...
Processing 250 to 300...
Processing 300 to 350...
Processing 350 to 400...
Processing 400 to 450...
Processing 450 to 500...
Processing 500 to 550...
Processing 550 to 600...
Processing 600 to 650...
Processing 650 to 700...
Processing 700 to 750...
Processing 750 to 800...
Processing 800 to 850...
Processing 850 to 900...
Processing 900 to 950...
Processing 950 to 1000...
Processing 1000 to 1050...
Processing 1050 to 1100...
Processing 1100 to 1150...
Processing 1150 to 1200...
Processing 1200 to 1250...
Processing 1250 to 1300...
Processing 1300 to 1350...
Processing 1350 to 1400...
Processing 1400 to 1450...
Processing 1450 to 1500...
Processing 1500 to 1550...
Processing 1550 to 1600...
Processing 1600 to 1650...
Processing 1650 to 1700...
Processing 1700 to 1750...
Processing 1750 to 1800...
Processing 1800 to 1850...
Processing 1850 to 1900...
Processing 1900 t

In [238]:
# Place the scraped form features beside the original rows.  The two frames
# are paired purely by row position (both indexes are reset), not by matching
# on 'url', so stats_df must still be in flat_file's row order.
# NOTE(review): both frames carry a 'url' column, so combined_df ends up with
# two columns named 'url' -- confirm that is intended.
combined_df = pd.concat([flat_file.reset_index(drop=True), stats_df.reset_index(drop=True)], axis=1)

In [239]:
# Write the combined table to ml_df.csv (relative path).  With to_csv's
# default settings the row index is written as the first column.
combined_df.to_csv('ml_df.csv')