# Fixture Observation

#### This script creates a fixture observation for each match, combining the home and away teams statistics, as well as the prediction target - pre-match odds

In [1]:
import boto3
import pandas as pd
import numpy as np
import json
from io import StringIO


s3 = boto3.resource('s3')
bucket_name = 'betfairex'


def read_csv_from_s3(bucket_name, object_key):
    response = s3.Object(bucket_name, object_key).get()

    s3_contents = response['Body'].read()
    df = pd.read_csv(StringIO(s3_contents.decode('utf-8')))

    return df

In [2]:
bucket = s3.Bucket(bucket_name)

In [3]:
all_keys = [obj.key for obj in bucket.objects.all()]

# keys are in the regular expression as follows:
# YYYY-MM-DD_[HOME_TEAM] v [AWAY_TEAM].csv

In [4]:
LEAGUE_INFO = pd.read_csv('league_info.csv')
TEAM_INFO = pd.read_csv('team_info.csv')

def get_match_info(object_key):
    match_name = object_key[:-4]
    match_date_str = match_name.split('_')[0]
    fixture = match_name.split('_')[1]
    home_team = fixture.split(' v ')[0]
    away_team = fixture.split(' v ')[1]
    
    match_info = {
        'match_date': match_date_str,
        'home_team': home_team,
        'away_team': away_team,
    }    
    return match_info

def get_betfair_exchange_prematch_odds(bucket_name, object_key):
    match_info = get_match_info(object_key)
    
    odds_df = read_csv_from_s3(bucket_name, object_key)
    
    odds_df = odds_df.sort_values("timestamp", ascending=True)
    prematch_df = odds_df[~odds_df['inplay']]
    
    home_team_df = prematch_df[prematch_df['selection'] == match_info['home_team']]
    away_team_df = prematch_df[prematch_df['selection'] == match_info['away_team']]
    draw_df = prematch_df[prematch_df['selection'] == "Draw"]
    
    home_prematch_odds = home_team_df.tail(1)['odds'].iloc[0]
    away_prematch_odds = away_team_df.tail(1)['odds'].iloc[0]
    draw_prematch_odds = draw_df.tail(1)['odds'].iloc[0]
    
    prematch_odds = {
        'match_date': match_info['match_date'],
        match_info['home_team']: home_prematch_odds,
        match_info['away_team']: away_prematch_odds,
        'Draw': draw_prematch_odds
    }
    return prematch_odds    
    

In [5]:
TEAM_MAPPING_BETFAIR_TO_APIFOOTBALL = {
    "Athletic Bilbao": "Athletic Club",
    "Betis": "Real Betis"
}

In [6]:
# Configuration constants

SERVER = "v3.football.api-sports.io"
API_KEY = "1f5008c4f33481203631d90c7a81c5e5"

from pandas.tseries.offsets import Day
import http.client
import datetime as dt

SERVER = "v3.football.api-sports.io"



conn = http.client.HTTPSConnection(SERVER)

headers = {
  'x-rapidapi-host': SERVER,
  'x-rapidapi-key': API_KEY
}


def get_team_stats(season, league_id, team_id, as_of_date):
    # as_of_date in format YYYY-MM-DD
    conn.request("GET", "/teams/statistics?season=%s&league=%s&team=%s&date=%s" % (season, league_id, team_id, as_of_date), headers=headers)
    res = conn.getresponse()
    data = res.read()

    team_stats_raw = data.decode("utf-8")
    team_stats = json.loads(team_stats_raw)["response"]
    
    return team_stats
    
def get_team_stats_for_fixture(object_key):
    match_info = get_match_info(object_key)
    
    match_date = match_info['match_date']
    match_date = dt.datetime.strptime(match_date, "%Y-%m-%d")
    as_of_date = match_date - Day(1)
    as_of_date = as_of_date.strftime("%Y-%m-%d")

    match_year = match_date.year
    match_month = match_date.month
    
    if match_month <= 6:
        match_year -= 1

    
    home_team = TEAM_MAPPING_BETFAIR_TO_APIFOOTBALL.get(match_info['home_team'], match_info['home_team'])
    home_team_info = TEAM_INFO.query("name == @home_team").to_dict('records')[0]
    home_team_id = home_team_info['team_id']
    home_team_league_id = home_team_info['league_id']
    home_team_stats = get_team_stats(match_year, home_team_league_id, home_team_id, as_of_date)
    
    
    away_team = TEAM_MAPPING_BETFAIR_TO_APIFOOTBALL.get(match_info['away_team'], match_info['away_team'])
    away_team_info = TEAM_INFO.query("name == @away_team").to_dict('records')[0]
    away_team_id = away_team_info['team_id']
    away_team_league_id = away_team_info['league_id']
    away_team_stats = get_team_stats(match_year, away_team_league_id, away_team_id, as_of_date)
    
    return home_team_stats, away_team_stats
    

In [7]:
def calc_form_score(form):
    form = form[-6:]
    form_dict = {
        'W': 1,
        'D': 0,
        'L': -1
    }
    
    form_score = 0
    for res in form[:3]:
        form_score += 0.5 * form_dict[res]
    for res in form[3:]:
        form_score += 1 * form_dict[res]
    return form_score

def generate_features(object_key):
    row = {}
    match_info = get_match_info(object_key)
    
    # Index
    row['match_date'] = match_info['match_date']
    row['home_team'] = match_info['home_team']
    row['away_team'] = match_info['away_team']
    
    # Features
    home_team_stats, away_team_stats = get_team_stats_for_fixture(object_key)
    
    row['home_team_avg_home_goals'] = home_team_stats['goals']['for']['average']['home']
    row['home_team_avg_total_goals'] = home_team_stats['goals']['for']['average']['total']
    row['away_team_avg_away_goals'] = away_team_stats['goals']['for']['average']['away']
    row['away_team_avg_total_goals'] = away_team_stats['goals']['for']['average']['total']
    
    row['home_team_form_score'] = calc_form_score(home_team_stats['form'])
    row['away_team_form_score'] = calc_form_score(away_team_stats['form'])
    
    # Labels
    prematch_odds = get_betfair_exchange_prematch_odds(bucket_name, object_key)
    row['odds_home_team'] = prematch_odds[match_info['home_team']]
    row['odds_away_team'] = prematch_odds[match_info['away_team']]
    row['odds_draw'] = prematch_odds['Draw']
    
    return row

In [8]:
from tqdm import tqdm
import time
rows = []


for i in tqdm(range(1, len(all_keys) + 1)):
    if i % 50 == 0:
        time.sleep(10)
    
    key = all_keys[-i]
    
    match_info = get_match_info(key)
    if match_info['match_date'][5:7] == '08':
        continue
    
    try:
        row = generate_features(key)
        rows.append(row)
    except:
        print("Failed to generate feature for %s." % key)

 43%|█████████████████▌                       | 201/469 [01:13<10:19,  2.31s/it]

Failed to generate feature for 2024-05-11_Valladolid v Espanyol.csv.


 49%|███████████████████▉                     | 228/469 [01:19<00:59,  4.08it/s]

Failed to generate feature for 2024-04-12_Leganes v Espanyol.csv.


 62%|█████████████████████████▎               | 290/469 [01:42<00:37,  4.82it/s]

Failed to generate feature for 2024-02-03_Leganes v Valladolid.csv.


 70%|████████████████████████████▊            | 329/469 [02:00<00:23,  6.05it/s]

Failed to generate feature for 2024-01-06_Espanyol v Getafe.csv.


 77%|███████████████████████████████▍         | 360/469 [02:16<00:28,  3.81it/s]

Failed to generate feature for 2023-12-05_Espanyol v Valladolid.csv.


 80%|████████████████████████████████▊        | 375/469 [02:19<00:15,  6.04it/s]

Failed to generate feature for 2023-11-17_Valladolid v Leganes.csv.


 86%|███████████████████████████████████▎     | 404/469 [02:34<01:01,  1.06it/s]

Failed to generate feature for 2023-10-20_Espanyol v Leganes.csv.
Failed to generate feature for 2023-10-14_Espanyol v Valladolid.csv.


100%|█████████████████████████████████████████| 469/469 [02:52<00:00,  2.71it/s]


In [9]:
df = pd.DataFrame.from_records(rows)

In [16]:
df.to_csv('basic_dataset.csv')

In [17]:
# remove outliers
df = df.query("odds_home_team <= 20 and odds_away_team <= 20")

# sort by match date
df = df.sort_values("match_date")

# calculate implied probability
df['home_team_probability'] = 1 / df['odds_home_team']

In [24]:
# Ridge regression

from sklearn.linear_model import Ridge
df = df.set_index(['match_date', 'home_team', 'away_team'])
X_train = df.values[:300, :-1]
Y_train = df.values[:300, -1]
X_val = df.values[300:, :-1]
Y_val = df.values[300:, -1]

# Fit data
clf = Ridge(alpha=1.0)
clf.fit(X_train, Y_train)

# Prediction
Y_pred = clf.predict(df.values[:, :-1])
df['home_team_probability_pred'] = Y_pred

# Evaluation
def rmse(series_a, series_b):
    return np.sqrt(np.mean((series_a - series_b) ** 2))

df['odds_home_team_pred'] = 1 / df['home_team_probability_pred']

In [46]:
rmse(df[:300]['home_team_probability'], df[:300]['home_team_probability_pred'])

0.032771465894335876

In [47]:
rmse(df[300:]['home_team_probability'], df[300:]['home_team_probability_pred'])

0.04058550463962281