In [4]:
from selenium import webdriver
import pandas as pd
import numpy as np
from selenium.webdriver.common.by import By
import random
from datetime import datetime, timedelta
from pycaret.classification import *
import pickle
from fuzzywuzzy import fuzz, process
from fractions import Fraction
import time
import collections
import concurrent.futures
from collections import defaultdict
import json
import requests
from bs4 import BeautifulSoup
import statistics
import math
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
# Date stamp (YYYYMMDD) used to pick today's links file and to name the output file.
today = f"{datetime.now():%Y%m%d}"
# NOTE(review): this literal overrides the computed date above, so the notebook
# always processes the 2022-10-24 files. Remove it to run against the current day.
today = '20221024'
# SECURITY: pickle.load can execute arbitrary code; only load files this project wrote.
# The context manager closes the file even when unpickling fails.
with open(f'links/{today}_links.pkl', "rb") as open_file:
    days_links = pickle.load(open_file)

In [1]:
# Work on a copy so processed timestamps can be removed without touching days_links.
c = days_links.copy()
todays_data = []

# Poll once a minute: every timestamp key that has passed gets its matches scraped
# in parallel and is then dropped from the pending dict.
# NOTE: stats_odds is defined in a later cell of this notebook; run that cell first.
while True:
    # list(c) snapshots the keys because entries are deleted inside the loop.
    for clock in list(c):
        if datetime.strptime(clock, "%Y-%m-%d %H:%M:%S") <= datetime.now():
            # Each entry is indexed as (match name, league name, stats url, odds url).
            match_names = [x[0] for x in c[clock]]
            league_names = [x[1] for x in c[clock]]
            stats_urls = [x[2] for x in c[clock]]
            odds_urls = [x[3] for x in c[clock]]
            print(stats_urls, odds_urls)
            with concurrent.futures.ThreadPoolExecutor() as executor:
                r = executor.map(stats_odds, match_names, league_names, stats_urls, odds_urls)
            # Leaving the ``with`` block waits for all workers, so every result is ready here.
            todays_data.extend(r)
            del c[clock]
    print(f'{len(c)} halftimes left -- {len(todays_data)}\n')
    if len(c) == 0:
        print('all done')
        break
    time.sleep(60)

# The context manager closes the output file even when pickling fails.
with open(f'data/{today}_data.pkl', "wb") as open_file:
    pickle.dump(todays_data, open_file)

In [2]:
def getname(name, l):
    """Return the entry of ``l`` that fuzzy-matches ``name`` best.

    Args:
        name: The string to look up.
        l: Candidate strings to match against.

    Returns:
        The first element of ``process.extractOne(name, l)``, or ``''`` when
        ``l`` is empty.
    """
    if len(l) > 0:
        # extractOne returns a sequence whose first item is the matched choice.
        return process.extractOne(name, l)[0]
    return ''

In [20]:
def _fetch_match_soup(url):
    """Download ``url`` and return its HTML parsed with BeautifulSoup."""
    # A timeout keeps a stalled connection from blocking the worker forever.
    page = requests.get(url, timeout=30)
    return BeautifulSoup(page.content, 'html.parser')


def get_stats(url):
    """Scrape the statistics shown on one match page.

    The page status is read first. If it is not ``'Half-Time'`` the function
    waits two minutes and downloads the page again, so the status and the
    statistics come from a fresh copy rather than the stale first download.
    Only one retry is made.

    Args:
        url: Address of the match page. Its last path segment is stored as
            ``'match_name'``.

    Returns:
        dict mapping each statistic name to a two-item list (one value per
        side), plus ``'posession'``, ``'score'``, ``'match_name'``,
        ``'half_time'`` and ``'url'``. Statistics written as ``'N (P%)'`` are
        split into the counts and a ``'<name> percentage'`` entry holding the
        fractions. Any failure is printed and an empty dict is returned.
    """
    # Rows of the stats list that are section headings rather than statistics.
    section_titles = ['Shots', 'Passes', 'Expected goals (xG)', 'Discipline', 'Defence', 'Duels']
    status_attrs = {"class": "css-lv1jm0-bottomRow"}
    try:
        soup = _fetch_match_soup(url)
        time.sleep(random.randint(0, 4))
        ht = soup.find("span", status_attrs).text
        if ht != 'Half-Time':
            print('not halftime just yet ... sleeping\n')
            time.sleep(2*60)
            # Re-download: re-reading the old soup would return the same status again.
            soup = _fetch_match_soup(url)
            ht = soup.find("span", status_attrs).text

        # Each stat row reads "<first value>|<stat name>|<second value>".
        all_stats = {}
        for row in soup.find_all("li", {"class": "css-radwzz-Stat e683amr6"}):
            text = row.get_text('|')
            if text in section_titles:
                continue
            parts = text.split('|')
            all_stats[parts[1]] = [parts[0], parts[2]]

        score = soup.find_all("span", {"class": "css-bw7eig-topRow"})[0].text.split(' - ')
        score = [score[0], score[1]]
        posession = soup.find("div", {"class": "css-7s52se-PossessionWheel e683amr3"}).get_text('|').split('|')
        posession = [posession[1], posession[3]]
        all_stats['posession'] = posession
        all_stats['score'] = score

        game_stats = {}
        for name, stats in all_stats.items():
            if '(' in ''.join(stats):
                # Values such as "120 (80%)" carry a count and a percentage.
                s = stats[0].split(' ') + stats[1].split(' ')
                if len(s) == 4:
                    s = [i.replace('(', '').replace(')', '').replace('%', '') for i in s]
                    game_stats[name] = [s[0], s[2]]
                    game_stats[name + ' percentage'] = [int(s[1])/100, int(s[3])/100]
                else:
                    # Unexpected layout: keep every token that is not a percentage.
                    game_stats[name] = [i for i in s if '%' not in i]
            else:
                game_stats[name] = stats
        game_stats['match_name'] = url.split('/')[-1]
        game_stats['half_time'] = ht
        game_stats['url'] = url
    except Exception as e:
        # Best effort: one broken page must not stop the other matches.
        print(e)
        game_stats = {}
    return game_stats

In [19]:
def get_odds(url):
    """Scrape the odds table of one odds page with a Chrome driver.

    Args:
        url: Address of the odds page; ``''`` means no odds page is known.

    Returns:
        dict with keys ``'odds_url'``, ``'odds'`` (row name -> median decimal
        price), ``'full_odds'`` (list of decimal prices per row) and
        ``'odds_sum'`` (sum of ``1 / median`` over the rows). When ``url`` is
        empty or anything fails, the last three keys hold ``''``; failures are
        printed rather than raised.
    """
    no_odds = {'odds_url': url, 'odds': '', 'full_odds': '', 'odds_sum': ''}
    # Check the URL before starting Chrome so no browser is launched for nothing.
    if url == '':
        return no_odds
    driver = None
    try:
        driver = webdriver.Chrome(ChromeDriverManager().install())
        driver.implicitly_wait(1)
        driver.get(url)
        html = driver.page_source
        soup = BeautifulSoup(html, features="html.parser")
        odds_containers = soup.find_all("div", {"class": "oddsAreaWrapper_o17xb9rs RowLayout_refg9ta"})
        names = [n.text for n in soup.find_all("div", {"class": "BetRowLeftBetContent_b2f00kt"})]
        odds = [[button.text for button in container.find_all('button')]
                for container in odds_containers]
        # Button texts are fractional prices ("5/2"); adding 1 gives decimal odds.
        # The last three prices of every row are dropped, as in the original code
        # (NOTE(review): presumably non-bookmaker columns - confirm).
        odds = [[float(Fraction(j.replace(' ', ''))) + 1.0 for j in o][:-3] for o in odds]
        median_odds = dict(zip(names, [statistics.median(o) for o in odds]))
        odds_sum = np.sum([1/o for o in median_odds.values()])
        return {'odds_url': url, 'odds': median_odds, 'full_odds': odds, 'odds_sum': odds_sum}
    except Exception as e:
        # Best effort: report the problem and hand back the empty result.
        print(e)
        return no_odds
    finally:
        # Always shut the browser down, including on failure.
        if driver is not None:
            driver.quit()

In [18]:
def epoch_dt(t):
    """Format a Unix timestamp as a ``'YYYY-MM-DD HH:MM:SS'`` string in UTC.

    Args:
        t: Seconds since the epoch; anything ``int()`` accepts (int, float,
            numeric string).

    Returns:
        The formatted UTC date-time string.
    """
    # Local import: the notebook's import cell only pulls datetime and timedelta.
    from datetime import timezone

    # datetime.utcfromtimestamp is deprecated since Python 3.12; the
    # timezone-aware call below produces the same formatted text.
    return datetime.fromtimestamp(int(t), tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S')

In [17]:
def odds_api_odds(match_name, league_name):
    """Fetch the head-to-head odds of one league and pick the entry for a match.

    Args:
        match_name: Match identifier that is fuzzy-matched (via ``getname``)
            against the ``"<home>-vs-<away>"`` names built from the response.
        league_name: Value sent as the API's ``sport`` parameter.

    Returns:
        ``{'api_odds': match}`` where ``match`` maps each bookmaker update time
        (formatted by ``epoch_dt``) to a list of reordered price triples and
        also holds ``'best_match'``, the matched game name. On any failure the
        error is printed and a placeholder string is returned under
        ``'api_odds'`` instead.
    """
    # Local import: the notebook's import cell does not import os.
    import os

    MARKET = 'h2h' # h2h | spreads | totals
    try:
        # SECURITY: set the ODDS_API_KEY environment variable instead of relying on
        # the literal below; it is kept only as a fallback for existing setups and
        # should be rotated and removed.
        api_key = os.environ.get('ODDS_API_KEY', '346a781fdbfebac05ca9ff0cffb0c71a')
        odds_response = requests.get('https://api.the-odds-api.com/v3/odds',
                                     params={'api_key': api_key, 'sport': league_name,
                                             'region': ['uk','eu','us'], # uk | us | eu | au,
                                             'mkt': MARKET},
                                     timeout=30)  # do not let a stalled request block the worker
        odds_json = odds_response.json()
        data = {}
        for game in odds_json['data']:
            # Put the home team first. idx reorders each price triple to match;
            # NOTE(review): assumes prices are listed as [teams[0], teams[1], draw],
            # giving home, draw, away - confirm against the API documentation.
            if game['teams'].index(game['home_team']) == 1:
                teams = game['teams'][::-1]
                idx = [1,2,0]
            else:
                teams = game['teams']
                idx = [0,2,1]
            game_name = "-vs-".join(teams).lower().replace(' ', '-')
            odds = defaultdict(list)
            for d in game['sites']:
                odds[epoch_dt(d['last_update'])].append([d['odds'][MARKET][i] for i in idx])
            # Order the entries by their update-time string.
            data[game_name] = dict(sorted(odds.items(), key=lambda x: x[0]))
        best_match = getname(match_name, list(data.keys()))
        match = data[best_match]
        match['best_match'] = best_match
    except Exception as e:
        # Best effort: unsupported leagues and API errors end up here.
        print(e)
        return {'api_odds': 'Austrian Bundespoopoo/Conpoorence league'}
    return {'api_odds': match}

In [16]:
def stats_odds(match_name, league_name, stats_url, odds_url):
    """Collect everything known about one match and print a prediction for it.

    Args:
        match_name: Match identifier passed on to ``odds_api_odds``.
        league_name: League label; looked up in ``name_match`` to get the
            name sent to the odds API.
        stats_url: Page handed to ``get_stats``.
        odds_url: Page handed to ``get_odds``.

    Returns:
        One dict merging the results of ``get_stats``, ``get_odds`` and
        ``odds_api_odds`` plus a ``'league_name'`` entry.
    """
    stats = get_stats(stats_url)
    oddschecker_odds = get_odds(odds_url)
    # An unmapped league must not raise here: the exception would escape the worker
    # thread and abort the whole polling loop. Fall back to 'xx', the placeholder
    # name_match already uses for several leagues (presumably the API lookup then
    # fails and odds_api_odds returns its error value).
    api_league_name = name_match.get(league_name, 'xx')
    api_odds = odds_api_odds(match_name, api_league_name)
    row = {**stats, **oddschecker_odds, **api_odds}
    row['league_name'] = league_name
    # make_pred only prints; it receives a copy so the returned row keeps its keys.
    make_pred(row.copy(), gbc)
    return row

In [15]:
# Saved model pipeline used for predictions; stats_odds reads this global and
# passes it to make_pred. load_model is not defined in this notebook - presumably
# it comes from the `from pycaret.classification import *` star import (confirm).
gbc = load_model('gbc')
def make_pred(row, pipeline):
    """Turn a scraped match row into model features and print the prediction.

    Keys are lower-cased with spaces replaced by underscores. A value that is
    a two-item list starting with a str or float becomes ``<key>_diff`` (first
    minus second) when both items convert to float; every other value is
    copied unchanged. ``'odds'`` is renamed to ``'median_odds'`` and, when it
    is non-empty, ``'std_odds'`` is added. A copy of the features is then
    handed to ``print_pred``.

    Args:
        row: dict describing one match.
        pipeline: Model object forwarded to ``print_pred``.

    Returns:
        None. Any exception is printed instead of raised.
    """
    try:
        print('\033[1m' + row['match_name'] + '\033[0m')
        print(f'current score is {row["score"][0]} - {row["score"][1]}\n')

        features = {}
        for raw_key, value in row.items():
            key = raw_key.replace(' ', '_').lower()
            is_pair = (isinstance(value, list) and len(value) == 2
                       and isinstance(value[0], (str, float)))
            if not is_pair:
                features[key] = value
                continue
            try:
                features[f'{key}_diff'] = float(value[0]) - float(value[1])
            except Exception:
                # Not numeric (for example "55%"): keep the raw pair.
                features[key] = value

        features['median_odds'] = features.pop('odds')
        if len(features['median_odds']) > 0:
            labels = list(features['median_odds'].keys())
            # Spread of each row's prices after dropping the two lowest and two highest.
            spreads = [np.std(sorted(prices)[2:-2]) for prices in features['full_odds']]
            features['std_odds'] = dict(zip(labels, spreads))
        print_pred(features.copy(), pipeline)
    except Exception as e:
        print(e)

Transformation Pipeline and Model Successfully Loaded


In [14]:
def print_pred(row, pipeline):
    """Print the model's outcome probabilities next to the scraped odds.

    Args:
        row: Feature dict for one match. It must contain 'api_odds',
            'full_odds', 'odds_url', 'median_odds' and 'std_odds'. The dict is
            modified: 'api_odds' and 'full_odds' are deleted.
        pipeline: Object whose ``predict_proba(df)`` returns one sequence of
            probabilities per row.

    Returns:
        None; everything is printed.
    """
    # These two entries are not passed to the model.
    del row['api_odds']
    del row['full_odds']
    # Names to look up in the odds: 'draw' plus the pieces of the second-to-last
    # URL path segment split on '-v-' (presumably the home and away team slugs -
    # confirm against the odds URLs).
    d_h_a = ['draw'] + row['odds_url'].split('/')[-2].split('-v-')
    # Fuzzy-match each of those names to a key of the median odds dict.
    order_decimal_odds = [getname(odds_name, list(row['median_odds'].keys())) for odds_name in d_h_a]
    decimal_odds = [row['median_odds'][key] for key in order_decimal_odds]
    # Dict keys become the index, and the transpose turns them into columns.
    df = pd.DataFrame.from_dict(row, orient = 'index').T
    df['o_names'] = df['odds_url'].apply(lambda x: ['draw'] + x.split('/')[-2].split('-v-'))
    # Replace the name-keyed std/median dicts with lists ordered like o_names.
    for name in ['std','median']:
        r = []
        for i in df[[f'{name}_odds','o_names']].to_numpy():
            # The comprehension's `name` is its own variable; the loop's `name`
            # is still 'std'/'median' when the column is assigned below.
            match = [getname(name, list(i[0].keys())) for name in i[1]]
            match = [i[0][key] for key in match]
            r.append(match)
        df[f'{name}_odds'] = r
    # Spread the ordered lists over three numbered columns each.
    ordo_df = pd.DataFrame(df['median_odds'].tolist(), columns=['med_0', 'med_1', 'med_2'])
    std_o = pd.DataFrame(df['std_odds'].tolist(), columns=['std_0', 'std_1', 'std_2'])
    df = pd.concat([df, std_o, ordo_df], axis=1)
    # Probabilities for the first (only) row passed in.
    p_probs = pipeline.predict_proba(df)[0]
    # Index of the highest probability; it is printed as an index, not a label.
    predicted = np.argmax(p_probs)
    print(f'Predicted winner is {predicted}\n')
    # NOTE(review): the labels assume the probabilities come in draw, home, away
    # order, matching decimal_odds - confirm against the model's class order.
    for p_prob, d_odd, win in zip(p_probs, decimal_odds, ['draw', 'home','away']):
        p_prob = round(p_prob, 4)
        d_odd = round(d_odd, 4)
        # 0.05 is the weight handed to kelly_bet.
        size = kelly_bet(d_odd, p_prob, 0.05)
        print(f'{win}: predicted prob: {p_prob*100}% - oddschecker: {round(1/d_odd*100,2)}% ({d_odd}) ---- bet {size}')

In [13]:
def kelly_bet(decimal_odds, predicted_prob, k_weight):
    """Suggest a stake only when the model sees more chance than the odds imply.

    Args:
        decimal_odds: Decimal price of the outcome.
        predicted_prob: Model probability of the outcome.
        k_weight: Weight forwarded to ``kelly``.

    Returns:
        ``'no value'`` when the odds are not above 1 or the implied
        probability ``1 / decimal_odds`` is at least ``predicted_prob``;
        otherwise the result of ``kelly(predicted_prob, decimal_odds, k_weight)``.
    """
    # Odds of 1 or less pay no profit. Guarding here also avoids dividing by
    # zero and stops negative odds from reaching the stake formula.
    if decimal_odds <= 1:
        return 'no value'
    if 1/decimal_odds >= predicted_prob:
        return 'no value'
    return kelly(predicted_prob, decimal_odds, k_weight)
    
def kelly(p, b, weight):
    """Return the weighted Kelly stake fraction for one bet.

    The fraction is ``(p * (b - 1) - (1 - p)) / (b - 1)``, rounded to four
    decimals and then multiplied by ``weight``.

    Args:
        p: Probability of winning.
        b: Decimal odds, so ``b - 1`` is the profit per unit staked.
        weight: Multiplier applied to the rounded fraction.

    Returns:
        The stake when it is positive, otherwise ``'no value'``. Odds of 1 or
        less also give ``'no value'``.
    """
    net_odds = b - 1
    # Without profit the formula divides by zero (b == 1) or flips sign (b < 1).
    if net_odds <= 0:
        return 'no value'
    stake = round((p * net_odds - (1 - p)) / net_odds, 4) * weight
    return stake if stake > 0.0 else 'no value'

In [12]:
# Maps the league labels found in the scraped links to the name that stats_odds
# passes on as the odds API's `sport` parameter. 'xx' / 'xxx' are placeholder
# values (presumably leagues the API does not offer - confirm).
name_match = {
    'UNITED STATES - MAJOR LEAGUE SOCCER': 'soccer_usa_mls',
    'ENGLAND - CHAMPIONSHIP': 'soccer_efl_champ',
    'ENGLAND - PREMIER LEAGUE': 'soccer_epl',
    'GERMANY - BUNDESLIGA': 'soccer_germany_bundesliga',
    'FRANCE - LIGUE 1': 'soccer_france_ligue_one',
    'AUSTRIA - BUNDESLIGA': 'xxx',
    'PORTUGAL - LIGA PORTUGAL': 'soccer_portugal_primeira_liga',
    'BELGIUM - FIRST DIVISION A': 'soccer_belgium_first_div',
    'SWITZERLAND - SUPER LEAGUE': 'soccer_switzerland_superleague',
    'SPAIN - LALIGA': 'soccer_spain_la_liga',
    'DENMARK - SUPERLIGAEN': 'soccer_denmark_superliga',
    'MEXICO - LIGA MX APERTURA': 'soccer_mexico_ligamx',
    'NETHERLANDS - EREDIVISIE': 'soccer_netherlands_eredivisie',
    'SCOTLAND - PREMIERSHIP': 'soccer_spl',
    'ITALY - SERIE A': 'soccer_italy_serie_a',
    # European competitions: one entry per group A-H, all sharing one API name.
    **{f'CHAMPIONS LEAGUE GRP. {group}': 'soccer_uefa_champs_league' for group in 'ABCDEFGH'},
    **{f'EUROPA LEAGUE GRP. {group}': 'soccer_uefa_europa_league' for group in 'ABCDEFGH'},
    **{f'EUROPA CONFERENCE LEAGUE GRP. {group}': 'xx' for group in 'ABCDEFGH'},
    'NORWAY - ELITESERIEN': 'soccer_norway_eliteserien',
    'TÜRKIYE - SUPER LIG': 'soccer_turkey_super_league',
    'GERMANY - 2. BUNDESLIGA': 'soccer_germany_bundesliga2',
    'SWEDEN - ALLSVENSKAN': 'soccer_sweden_allsvenskan',
    'ENGLAND - LEAGUE ONE': 'soccer_england_league1',
    'ENGLAND - LEAGUE TWO': 'soccer_england_league2',
    'AUSTRALIA - A-LEAGUE MEN': 'xx',
}