In [1]:
import boto3
from dotenv import load_dotenv
import os
import warnings
from io import StringIO, BytesIO
import pandas as pd
import socceraction.spadl as spadl
from tqdm import tqdm
import numpy as np
import requests
import importlib
from unidecode import unidecode
from fuzzywuzzy import fuzz, process
import pickle


import sys
# --- Import path -----------------------------------------------------------
# Add the parent directory to the module search path.
sys.path.append('..')
# import data.utils

# --- Display / progress / warnings ----------------------------------------
tqdm.pandas()  # enable tqdm's pandas integration (progress_apply etc.)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
warnings.filterwarnings('ignore')  # NOTE: silences *all* warnings for the session

# --- AWS credentials (read from the environment after loading .env) --------
load_dotenv()
aws_access_key = os.environ.get('AWS_ACCESS_KEY')
aws_secret_access = os.environ.get('AWS_SECRET_ACCESS')
aws_region = os.environ.get('AWS_REGION')

s3 = boto3.client(
    's3',
    aws_access_key_id=aws_access_key,
    aws_secret_access_key=aws_secret_access,
    region_name=aws_region,
)

# --- Experiment-wide settings ----------------------------------------------
bucket = 'footballbets'
league = "ENG-Premier League"
season = 2223

In [4]:
def _load_season_pickle(league_name):
  """Download and unpickle the season file of one league from S3.

  Reads ``season_pickles/<league_name>.pkl`` from the bucket named by the
  module-level ``bucket`` setting, using the module-level ``s3`` client.

  Args:
    league_name: League identifier used in the object key,
      e.g. ``'GER-Bundesliga'``.

  Returns:
    A ``(response, seasons)`` tuple: the raw ``get_object`` response and the
    object obtained by unpickling its body.
  """
  response = s3.get_object(
      Bucket=bucket,
      Key=f"season_pickles/{league_name}.pkl",
  )
  # SECURITY: pickle can execute arbitrary code while loading. Only use this
  # on objects written by trusted code into a bucket you control.
  seasons = pickle.load(BytesIO(response['Body'].read()))
  return response, seasons


ger_pickles, ger_seasons = _load_season_pickle('GER-Bundesliga')
ita_pickles, ita_seasons = _load_season_pickle('ITA-Serie A')

# League name -> unpickled season data for that league.
master_season = {'GER-Bundesliga':ger_seasons,
                 'ITA-Serie A':ita_seasons}

In [5]:
from types import SimpleNamespace
from dataset.team_features import TeamFeatures

def get_team_features(master, diff, norm, feats):
  """Build one feature table covering every league and season in ``master``.

  For each season entry a ``TeamFeatures`` object is constructed and its
  ``features`` frame is collected; all frames are then stacked row-wise with
  a fresh 0..n-1 index.

  Args:
    master: Mapping of league name -> mapping of season key -> dict of
      season data. Each season dict is exposed to ``TeamFeatures`` as
      attributes via ``SimpleNamespace``.
    diff: Forwarded to ``TeamFeatures`` as ``use_diff``.
    norm: Forwarded to ``TeamFeatures`` as ``normalize``.
    feats: Forwarded to ``TeamFeatures`` as ``feat_group``.

  Returns:
    A ``pandas.DataFrame`` with the concatenated features, or an empty
    ``DataFrame`` when ``master`` contains no seasons.
  """
  frames = []
  for seasons in master.values():
    for data in seasons.values():
      # The positional arguments 4 and 50 are passed through unchanged; their
      # meaning is defined by TeamFeatures (not visible here) -- TODO confirm.
      team_feats = TeamFeatures(SimpleNamespace(**data), 4, 50,
                                use_diff=diff,
                                use_dist=False,
                                normalize=norm,
                                feat_group=feats)
      frames.append(team_feats.features)

  # pd.concat raises on an empty list, so keep the "no data" result explicit.
  if not frames:
    return pd.DataFrame()

  # Concatenate once instead of re-copying the growing frame per season.
  return pd.concat(frames, ignore_index=True)

In [None]:
import itertools
from sklearn.model_selection import train_test_split
import statistics
from datetime import datetime
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss, accuracy_score

# Identifier columns that are dropped before training.
config_cols = ["season", "game", "date", "home_team", "away_team", "matchday"]
# Feature groups; every non-empty subset of these is evaluated.
cols =  ["last_cols", "momentum", "venue", "general", "elo"]

N_REPEATS = 10             # random train/test splits per configuration
ACCURACY_TOLERANCE = 0.02  # accuracy is a fraction in [0, 1]; 0.02 == 2 points
RESULTS_PATH = 'experiments_run.txt'

# All non-empty combinations of feature groups, each as a list.
combinations_list = []
for r in range(1, len(cols) + 1):
    combinations_list.extend(list(comb) for comb in itertools.combinations(cols, r))

best_accuracy = -float('inf')
best_log_loss = float('inf')
best_config = None

for diff in [True, False]:
  for norm in [True, False]:
    for combo in combinations_list:

      master_df = get_team_features(master_season, diff, norm, combo)
      lookback_data = master_df[master_df['lookback'] == 0]
      # Only identifier columns and the target are removed; every other
      # column (including the now-constant 'lookback') stays in X.
      X = lookback_data.drop(config_cols + ['target'], axis=1)
      y = lookback_data['target']

      test_accuracies = []
      test_log_losses = []

      for i in range(N_REPEATS):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=i)

        # Seed the forest as well as the split so reruns give the same numbers.
        clf = RandomForestClassifier(random_state=i)
        clf.fit(X_train, y_train)

        y_pred = clf.predict(X_test)
        y_pred_proba = clf.predict_proba(X_test)

        test_accuracies.append(accuracy_score(y_test, y_pred))
        # Pass the classifier's class order explicitly so the probability
        # columns line up even if a class is absent from this test split.
        test_log_losses.append(log_loss(y_test, y_pred_proba, labels=clf.classes_))

      test_acc_mean = statistics.mean(test_accuracies)
      test_acc_std = statistics.stdev(test_accuracies)

      test_log_mean = statistics.mean(test_log_losses)
      test_log_std = statistics.stdev(test_log_losses)

      current_time = datetime.now().strftime("%H:%M:%S")

      # A run becomes the new best when it is strictly more accurate, or when
      # its accuracy is within ACCURACY_TOLERANCE of the best and its log loss
      # is lower. The previous check compared the accuracy gap against 2,
      # which is always satisfied for values in [0, 1], so log loss alone
      # decided the winner.
      # NOTE(review): 0.02 assumes the intent was "within 2 percentage
      # points" -- confirm.
      is_more_accurate = test_acc_mean > best_accuracy
      is_close_with_better_loss = (
          best_accuracy - test_acc_mean <= ACCURACY_TOLERANCE
          and test_log_mean < best_log_loss
      )
      if is_more_accurate or is_close_with_better_loss:
        best_accuracy = test_acc_mean
        best_log_loss = test_log_mean
        best_config = {
            'features': combo,
            'normalization': norm,
            'differences': diff,
            'accuracy': test_acc_mean,
            'log_loss': test_log_mean,
            'accuracy_std': test_acc_std,
            'log_loss_std': test_log_std,
            # Same timestamp as the log line written below for this run.
            'time': current_time
        }

      # Append after every configuration so partial results survive a crash.
      with open(RESULTS_PATH, 'a') as file:
        file.write(f"{current_time} | feature_list: {combo} | normalization: {norm} | differences: {diff} \n")
        file.write(f"accuracy: {round(test_acc_mean, 3)} +- {round(test_acc_std, 3)} | log loss: {round(test_log_mean, 3)} +- {round(test_log_std, 3)} \n")

# Summarise the winning configuration at the end of the log.
with open(RESULTS_PATH, 'a') as file:
    file.write("\nBest Experiment:\n")
    file.write(f"{best_config['time']} | feature_list: {best_config['features']} | normalization: {best_config['normalization']} | differences: {best_config['differences']}\n")
    file.write(f"accuracy: {round(best_config['accuracy'], 3)} +- {round(best_config['accuracy_std'], 3)} | log loss: {round(best_config['log_loss'], 3)} +- {round(best_config['log_loss_std'], 3)}\n")