# MODULE INSTALLATIONS

In [38]:
import datetime
import traceback
import time
import requests
import pandas as pd
from sklearn.model_selection import train_test_split

# INITIALIZE THE WORKSPACE

In [40]:
MODEL_NAME = "premier_league"
MODEL_VERSION = "v1.0"
MODEL_TYPE = "win_outcome"

# FUNCTION DEFINATIONS

In [9]:
# INDICATOR PERIODS
MA_RSI_PERIOD = 14
BOLLINGER_PERIOD = 20
MACD_SIGNAL_PERIOD = 9
MACD_SHORT_PERIOD = 12
MACD_LONG_PERIOD = 26
BOLLINGER_N_STD = 2


# FINANCIAL INDICATORS
def rsi(data, window = MA_RSI_PERIOD, modify = True):
    delta = data['close'].diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=window).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=window).mean()
    rs = gain / loss
    rsi = 100 - (100 / (1 + rs))

    if modify:
      data["rsi"] = rsi
      return data
    else:
      return rsi


def sma(data, window = MA_RSI_PERIOD, modify = True):
    _sma = data['close'].rolling(window=window).mean()

    if modify:
      data["sma"] = _sma
      return data
    else:
      return _sma


def ema(data, span = MA_RSI_PERIOD, modify = True):
    _ema = data['close'].ewm(span=span, adjust=False).mean()

    if modify:
      data["ema"] = _ema
      return data
    else:
      return _ema


def stochastic_oscillator(data, window = MA_RSI_PERIOD, modify = True):
    lowest_low = data['low'].rolling(window=window).min()
    highest_high = data['high'].rolling(window=window).max()
    _stochastic = ((data['close'] - lowest_low) / (highest_high - lowest_low)) * 100

    if modify:
      data["stochastic"] = _stochastic
      return data
    else:
      return _stochastic


def bollinger_bands(data, window = BOLLINGER_PERIOD, num_std = 2, modify = True):
    rolling_mean = data['close'].rolling(window=window).mean()
    rolling_std = data['close'].rolling(window=window).std()

    _lower_band = rolling_mean - (rolling_std * num_std)
    _upper_band = rolling_mean + (rolling_std * num_std)

    if modify:
      data["bollinger_lower"] = _lower_band
      data["bollinger_upper"] = _upper_band
      return data
    else:
      return _lower_band, _upper_band


def macd(data, short_window = MACD_SHORT_PERIOD, long_window = MACD_LONG_PERIOD, signal_window = MACD_SIGNAL_PERIOD, modify = True):
    _ema_short = ema(data, span=short_window, modify=False)
    _ema_long = ema(data, span=long_window, modify=False)

    macd_line = _ema_short - _ema_long
    signal_line = macd_line.rolling(window=signal_window).mean()

    if modify:
      data["macd"] = macd_line
      data["macd_signal"] = signal_line
      return data
    else:
      return macd_line, signal_line


def to_lowercase_columns(data):
    data.columns = [column.replace(" ", "_").lower() for column in data.columns]
    return data


def append_technical_indicators(data, add_outcomes = True):
  # data = to_lowercase_columns(data)
  data = rsi(data)
  data = sma(data)
  data = ema(data)
  data = stochastic_oscillator(data)
  data = bollinger_bands(data)
  data = macd(data)

  data.index = pd.to_datetime(data.pop("timestamp"))
  data.index.names = ["timestamp"]
  data = data.sort_values("timestamp")

  data["hour"] = data.index.hour
  data["day_of_week"] = data.index.dayofweek
  data = data.dropna()

  # OUTCOMES OF THE TICKER
  if add_outcomes:
    data["target"] = (data.close > data.open).apply(lambda x: 1 if x > 0 else 0)
  return data.astype(float)

# DATA ENGENEERING

In [11]:
DATAFILE_PATH = "../relative_datasets/raw/all_leagues_final_raw.csv"
dataframe = pd.read_csv(f"{DATAFILE_PATH}")
dataframe = dataframe.dropna().drop_duplicates().reset_index(drop=True)
dataframe = dataframe[['id', 'day', 'month', 'year', 'weekday', 'hour', 'minute', 'matchday',
       'status', 'leagueId', 'leagueType',
       'homeId', 'awayId', 'scoreHomeHt', 'scoreAwayHt',
       'scoreHomeFt', 'scoreAwayFt']]
dataframe


Unnamed: 0,id,day,month,year,weekday,hour,minute,matchday,status,leagueId,leagueType,homeId,awayId,scoreHomeHt,scoreAwayHt,scoreHomeFt,scoreAwayFt
0,285418,11,6,2021,5,19,0,1.0,FINISHED,2018,CUP,803,784,0.0,0.0,0.0,3.0
1,285419,12,6,2021,6,13,0,1.0,FINISHED,2018,CUP,833,788,0.0,0.0,1.0,1.0
2,285420,16,6,2021,3,16,0,2.0,FINISHED,2018,CUP,803,833,0.0,1.0,0.0,2.0
3,285421,16,6,2021,3,19,0,2.0,FINISHED,2018,CUP,784,788,1.0,0.0,3.0,0.0
4,285422,20,6,2021,7,16,0,3.0,FINISHED,2018,CUP,784,833,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6924,494884,25,4,2024,4,0,30,3.0,FINISHED,2152,CUP,9379,6667,1.0,1.0,1.0,2.0
6925,494885,8,5,2024,3,0,0,4.0,FINISHED,2152,CUP,5171,9379,1.0,0.0,1.0,1.0
6926,494886,8,5,2024,3,0,0,4.0,FINISHED,2152,CUP,7055,6667,0.0,2.0,2.0,2.0
6927,495747,12,5,2024,7,11,0,1.0,FINISHED,2016,LEAGUE,68,341,0.0,0.0,0.0,0.0


In [None]:
try:
    # Add head2head stats for better comparability
    copy = dataframe.copy()
    copy2 = pd.DataFrame([], columns=[*copy.columns, "h2hHomeWins", "h2hHomeLosses", "h2hAwayWins", "h2hAwayLosses", "h2hDraws", "h2hAvgGoals", "h2hMatches"])
    count = 0
    for index in range(3699, len(copy)-1):
        row = copy.iloc[index]
        head2head = get_head2head(row.id)
        count = index
        print(f"Progress: {index}")
        print(head2head)
        if head2head.get("numberOfMatches"):
            copy.loc[index, "h2hHomeWins"] = head2head["homeTeam"]["wins"]
            copy.loc[index, "h2hHomeLosses"] = -head2head["homeTeam"]["losses"]
            copy.loc[index, "h2hAwayWins"] = head2head["awayTeam"]["wins"]
            copy.loc[index, "h2hAwayLosses"] = -head2head["awayTeam"]["losses"]
            copy.loc[index, "h2hDraws"] = head2head["homeTeam"]["draws"]
            copy.loc[index, "h2hAvgGoals"] = head2head["totalGoals"]
            copy.loc[index, "h2hMatches"] = head2head["numberOfMatches"]
            copy2 = pd.concat([copy2, copy.iloc[[index]]], ignore_index=True)
except:
    copy2.to_csv(f"../relative_datasets/raw/{MODEL_NAME}-{MODEL_VERSION}_{count}_copy.csv")
    raise Exception(traceback.format_exc())

copy2.to_csv(f"../relative_datasets/raw/{MODEL_NAME}-{MODEL_VERSION}_{count}_copy.csv")

In [18]:
df1 = pd.read_csv("../relative_datasets/raw/all_leagues-v1.0_6927_copy.csv")
df2 = pd.read_csv("../relative_datasets/raw/all_leagues-v1.0_3699_copy.csv")
dataframe = pd.concat([df1, df2]).drop_duplicates().dropna().reset_index(drop=True).drop(["Unnamed: 0"], axis=1)
dataframe

Unnamed: 0,id,day,month,year,weekday,hour,minute,matchday,status,leagueId,...,scoreAwayHt,scoreHomeFt,scoreAwayFt,h2hHomeWins,h2hHomeLosses,h2hAwayWins,h2hAwayLosses,h2hDraws,h2hAvgGoals,h2hMatches
0,417220,31,8,2022,3,19,0,5.0,FINISHED,2015,...,0.0,3.0,1.0,3.0,-1.0,1.0,-3.0,1.0,18.0,5.0
1,417221,31,8,2022,3,17,0,5.0,FINISHED,2015,...,0.0,1.0,1.0,3.0,-1.0,1.0,-3.0,1.0,12.0,5.0
2,417222,31,8,2022,3,19,0,5.0,FINISHED,2015,...,1.0,5.0,2.0,3.0,-1.0,1.0,-3.0,1.0,15.0,5.0
3,417223,31,8,2022,3,19,0,5.0,FINISHED,2015,...,0.0,1.0,0.0,4.0,-1.0,1.0,-4.0,0.0,14.0,5.0
4,417224,31,8,2022,3,17,0,5.0,FINISHED,2015,...,0.0,2.0,1.0,3.0,-2.0,2.0,-3.0,0.0,16.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6742,417216,4,9,2022,7,13,0,6.0,FINISHED,2015,...,0.0,2.0,0.0,2.0,-2.0,2.0,-2.0,1.0,15.0,5.0
6743,417217,3,9,2022,6,15,0,6.0,FINISHED,2015,...,1.0,0.0,2.0,0.0,-4.0,4.0,0.0,1.0,14.0,5.0
6744,417218,4,9,2022,7,13,0,6.0,FINISHED,2015,...,0.0,0.0,1.0,1.0,-3.0,3.0,-1.0,1.0,6.0,5.0
6745,417219,31,8,2022,3,19,0,5.0,FINISHED,2015,...,1.0,0.0,3.0,1.0,-3.0,3.0,-1.0,1.0,16.0,5.0


In [19]:
# Clean the data and remove duplicated values in process of combining datasets
dataframe = dataframe.sort_values("id").reset_index(drop=True)
duplicated = dataframe.id.duplicated(keep="first")
for duplicate_index in duplicated.loc[duplicated == True].index:
    dataframe = dataframe.drop(index=duplicate_index)
dataframe = dataframe.reset_index(drop=True)
dataframe

Unnamed: 0,id,day,month,year,weekday,hour,minute,matchday,status,leagueId,...,scoreAwayHt,scoreHomeFt,scoreAwayFt,h2hHomeWins,h2hHomeLosses,h2hAwayWins,h2hAwayLosses,h2hDraws,h2hAvgGoals,h2hMatches
0,285418,11,6,2021,5,19,0,1.0,FINISHED,2018,...,0.0,0.0,3.0,0.0,-2.0,2.0,0.0,0.0,6.0,2.0
1,285419,12,6,2021,6,13,0,1.0,FINISHED,2018,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,2.0,1.0
2,285420,16,6,2021,3,16,0,2.0,FINISHED,2018,...,1.0,0.0,2.0,1.0,-1.0,1.0,-1.0,1.0,6.0,3.0
3,285421,16,6,2021,3,19,0,2.0,FINISHED,2018,...,0.0,3.0,0.0,2.0,-1.0,1.0,-2.0,2.0,13.0,5.0
4,285422,20,6,2021,7,16,0,3.0,FINISHED,2018,...,0.0,1.0,0.0,1.0,0.0,0.0,-1.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6741,494883,24,4,2024,3,22,0,3.0,FINISHED,2152,...,1.0,2.0,1.0,2.0,0.0,0.0,-2.0,0.0,4.0,2.0
6742,494884,25,4,2024,4,0,30,3.0,FINISHED,2152,...,1.0,1.0,2.0,0.0,-2.0,2.0,0.0,0.0,5.0,2.0
6743,494885,8,5,2024,3,0,0,4.0,FINISHED,2152,...,0.0,1.0,1.0,0.0,-1.0,1.0,0.0,1.0,5.0,2.0
6744,494886,8,5,2024,3,0,0,4.0,FINISHED,2152,...,2.0,2.0,2.0,0.0,-1.0,1.0,0.0,1.0,6.0,2.0


# MANIPULATIONS

In [20]:
# REMOVE INPLAY MATCHES
dataframe = dataframe.loc[dataframe.status == "FINISHED"].reset_index(drop=True)
dataframe

Unnamed: 0,id,day,month,year,weekday,hour,minute,matchday,status,leagueId,...,scoreAwayHt,scoreHomeFt,scoreAwayFt,h2hHomeWins,h2hHomeLosses,h2hAwayWins,h2hAwayLosses,h2hDraws,h2hAvgGoals,h2hMatches
0,285418,11,6,2021,5,19,0,1.0,FINISHED,2018,...,0.0,0.0,3.0,0.0,-2.0,2.0,0.0,0.0,6.0,2.0
1,285419,12,6,2021,6,13,0,1.0,FINISHED,2018,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,2.0,1.0
2,285420,16,6,2021,3,16,0,2.0,FINISHED,2018,...,1.0,0.0,2.0,1.0,-1.0,1.0,-1.0,1.0,6.0,3.0
3,285421,16,6,2021,3,19,0,2.0,FINISHED,2018,...,0.0,3.0,0.0,2.0,-1.0,1.0,-2.0,2.0,13.0,5.0
4,285422,20,6,2021,7,16,0,3.0,FINISHED,2018,...,0.0,1.0,0.0,1.0,0.0,0.0,-1.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6736,494883,24,4,2024,3,22,0,3.0,FINISHED,2152,...,1.0,2.0,1.0,2.0,0.0,0.0,-2.0,0.0,4.0,2.0
6737,494884,25,4,2024,4,0,30,3.0,FINISHED,2152,...,1.0,1.0,2.0,0.0,-2.0,2.0,0.0,0.0,5.0,2.0
6738,494885,8,5,2024,3,0,0,4.0,FINISHED,2152,...,0.0,1.0,1.0,0.0,-1.0,1.0,0.0,1.0,5.0,2.0
6739,494886,8,5,2024,3,0,0,4.0,FINISHED,2152,...,2.0,2.0,2.0,0.0,-1.0,1.0,0.0,1.0,6.0,2.0


In [21]:
dataframe = dataframe.reset_index(drop=True)
dataframe = dataframe[["id", "day", "month", "year", "weekday", "hour", "minute", "matchday",
                      "leagueType", "homeId", "awayId", "scoreHomeHt", "scoreAwayHt",
                      "scoreHomeFt", "scoreAwayFt", "h2hHomeWins", "h2hHomeLosses", "h2hAwayWins",
                      "h2hAwayLosses", "h2hDraws", "h2hAvgGoals", "h2hMatches"]]
dataframe.leagueType = (dataframe.leagueType == "CUP").astype(float)
dataframe

Unnamed: 0,id,day,month,year,weekday,hour,minute,matchday,leagueType,homeId,...,scoreAwayHt,scoreHomeFt,scoreAwayFt,h2hHomeWins,h2hHomeLosses,h2hAwayWins,h2hAwayLosses,h2hDraws,h2hAvgGoals,h2hMatches
0,285418,11,6,2021,5,19,0,1.0,1.0,803,...,0.0,0.0,3.0,0.0,-2.0,2.0,0.0,0.0,6.0,2.0
1,285419,12,6,2021,6,13,0,1.0,1.0,833,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,2.0,1.0
2,285420,16,6,2021,3,16,0,2.0,1.0,803,...,1.0,0.0,2.0,1.0,-1.0,1.0,-1.0,1.0,6.0,3.0
3,285421,16,6,2021,3,19,0,2.0,1.0,784,...,0.0,3.0,0.0,2.0,-1.0,1.0,-2.0,2.0,13.0,5.0
4,285422,20,6,2021,7,16,0,3.0,1.0,784,...,0.0,1.0,0.0,1.0,0.0,0.0,-1.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6736,494883,24,4,2024,3,22,0,3.0,1.0,7055,...,1.0,2.0,1.0,2.0,0.0,0.0,-2.0,0.0,4.0,2.0
6737,494884,25,4,2024,4,0,30,3.0,1.0,9379,...,1.0,1.0,2.0,0.0,-2.0,2.0,0.0,0.0,5.0,2.0
6738,494885,8,5,2024,3,0,0,4.0,1.0,5171,...,0.0,1.0,1.0,0.0,-1.0,1.0,0.0,1.0,5.0,2.0
6739,494886,8,5,2024,3,0,0,4.0,1.0,7055,...,2.0,2.0,2.0,0.0,-1.0,1.0,0.0,1.0,6.0,2.0


In [24]:
dataframe.to_csv(f"../relative_datasets/raw/{MODEL_NAME}-{MODEL_VERSION}-final.csv", index=False)

In [32]:
# dataframe = dataframe[dataframe.scoreHomeFt != dataframe.scoreAwayFt]
dataframe["target"] = (dataframe.scoreHomeFt > dataframe.scoreAwayFt).astype(int)
# dataframe["target"] = ((dataframe.scoreHomeFt + dataframe.scoreAwayFt)/2 > 0.5).astype(int)
dataframe = dataframe.astype(float)
dataframe

Unnamed: 0,id,day,month,year,weekday,hour,minute,matchday,leagueType,homeId,...,scoreHomeFt,scoreAwayFt,h2hHomeWins,h2hHomeLosses,h2hAwayWins,h2hAwayLosses,h2hDraws,h2hAvgGoals,h2hMatches,target
0,285418.0,11.0,6.0,2021.0,5.0,19.0,0.0,1.0,1.0,803.0,...,0.0,3.0,0.0,-2.0,2.0,0.0,0.0,6.0,2.0,0.0
1,285419.0,12.0,6.0,2021.0,6.0,13.0,0.0,1.0,1.0,833.0,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,2.0,1.0,0.0
2,285420.0,16.0,6.0,2021.0,3.0,16.0,0.0,2.0,1.0,803.0,...,0.0,2.0,1.0,-1.0,1.0,-1.0,1.0,6.0,3.0,0.0
3,285421.0,16.0,6.0,2021.0,3.0,19.0,0.0,2.0,1.0,784.0,...,3.0,0.0,2.0,-1.0,1.0,-2.0,2.0,13.0,5.0,1.0
4,285422.0,20.0,6.0,2021.0,7.0,16.0,0.0,3.0,1.0,784.0,...,1.0,0.0,1.0,0.0,0.0,-1.0,0.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6736,494883.0,24.0,4.0,2024.0,3.0,22.0,0.0,3.0,1.0,7055.0,...,2.0,1.0,2.0,0.0,0.0,-2.0,0.0,4.0,2.0,1.0
6737,494884.0,25.0,4.0,2024.0,4.0,0.0,30.0,3.0,1.0,9379.0,...,1.0,2.0,0.0,-2.0,2.0,0.0,0.0,5.0,2.0,0.0
6738,494885.0,8.0,5.0,2024.0,3.0,0.0,0.0,4.0,1.0,5171.0,...,1.0,1.0,0.0,-1.0,1.0,0.0,1.0,5.0,2.0,0.0
6739,494886.0,8.0,5.0,2024.0,3.0,0.0,0.0,4.0,1.0,7055.0,...,2.0,2.0,0.0,-1.0,1.0,0.0,1.0,6.0,2.0,0.0


In [33]:
dataframe.h2hAvgGoals = dataframe.h2hAvgGoals/dataframe.h2hMatches
dataframe

Unnamed: 0,id,day,month,year,weekday,hour,minute,matchday,leagueType,homeId,...,scoreHomeFt,scoreAwayFt,h2hHomeWins,h2hHomeLosses,h2hAwayWins,h2hAwayLosses,h2hDraws,h2hAvgGoals,h2hMatches,target
0,285418.0,11.0,6.0,2021.0,5.0,19.0,0.0,1.0,1.0,803.0,...,0.0,3.0,0.0,-2.0,2.0,0.0,0.0,3.0,2.0,0.0
1,285419.0,12.0,6.0,2021.0,6.0,13.0,0.0,1.0,1.0,833.0,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,2.0,1.0,0.0
2,285420.0,16.0,6.0,2021.0,3.0,16.0,0.0,2.0,1.0,803.0,...,0.0,2.0,1.0,-1.0,1.0,-1.0,1.0,2.0,3.0,0.0
3,285421.0,16.0,6.0,2021.0,3.0,19.0,0.0,2.0,1.0,784.0,...,3.0,0.0,2.0,-1.0,1.0,-2.0,2.0,2.6,5.0,1.0
4,285422.0,20.0,6.0,2021.0,7.0,16.0,0.0,3.0,1.0,784.0,...,1.0,0.0,1.0,0.0,0.0,-1.0,0.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6736,494883.0,24.0,4.0,2024.0,3.0,22.0,0.0,3.0,1.0,7055.0,...,2.0,1.0,2.0,0.0,0.0,-2.0,0.0,2.0,2.0,1.0
6737,494884.0,25.0,4.0,2024.0,4.0,0.0,30.0,3.0,1.0,9379.0,...,1.0,2.0,0.0,-2.0,2.0,0.0,0.0,2.5,2.0,0.0
6738,494885.0,8.0,5.0,2024.0,3.0,0.0,0.0,4.0,1.0,5171.0,...,1.0,1.0,0.0,-1.0,1.0,0.0,1.0,2.5,2.0,0.0
6739,494886.0,8.0,5.0,2024.0,3.0,0.0,0.0,4.0,1.0,7055.0,...,2.0,2.0,0.0,-1.0,1.0,0.0,1.0,3.0,2.0,0.0


In [34]:
del dataframe["h2hMatches"]
dataframe

Unnamed: 0,id,day,month,year,weekday,hour,minute,matchday,leagueType,homeId,...,scoreAwayHt,scoreHomeFt,scoreAwayFt,h2hHomeWins,h2hHomeLosses,h2hAwayWins,h2hAwayLosses,h2hDraws,h2hAvgGoals,target
0,285418.0,11.0,6.0,2021.0,5.0,19.0,0.0,1.0,1.0,803.0,...,0.0,0.0,3.0,0.0,-2.0,2.0,0.0,0.0,3.0,0.0
1,285419.0,12.0,6.0,2021.0,6.0,13.0,0.0,1.0,1.0,833.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0
2,285420.0,16.0,6.0,2021.0,3.0,16.0,0.0,2.0,1.0,803.0,...,1.0,0.0,2.0,1.0,-1.0,1.0,-1.0,1.0,2.0,0.0
3,285421.0,16.0,6.0,2021.0,3.0,19.0,0.0,2.0,1.0,784.0,...,0.0,3.0,0.0,2.0,-1.0,1.0,-2.0,2.0,2.6,1.0
4,285422.0,20.0,6.0,2021.0,7.0,16.0,0.0,3.0,1.0,784.0,...,0.0,1.0,0.0,1.0,0.0,0.0,-1.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6736,494883.0,24.0,4.0,2024.0,3.0,22.0,0.0,3.0,1.0,7055.0,...,1.0,2.0,1.0,2.0,0.0,0.0,-2.0,0.0,2.0,1.0
6737,494884.0,25.0,4.0,2024.0,4.0,0.0,30.0,3.0,1.0,9379.0,...,1.0,1.0,2.0,0.0,-2.0,2.0,0.0,0.0,2.5,0.0
6738,494885.0,8.0,5.0,2024.0,3.0,0.0,0.0,4.0,1.0,5171.0,...,0.0,1.0,1.0,0.0,-1.0,1.0,0.0,1.0,2.5,0.0
6739,494886.0,8.0,5.0,2024.0,3.0,0.0,0.0,4.0,1.0,7055.0,...,2.0,2.0,2.0,0.0,-1.0,1.0,0.0,1.0,3.0,0.0


# SPORTS FEATURE ENGINEERING

In [None]:
# CONVERT DATES TO DATETIME TO EXTRACT MORE FEATURES FROM THEM
dataframe[["HourOfDay", "MinuteOfHour"]] = dataframe["Time"].str.split(":", expand=True)
del dataframe["Time"]

dataframe.index = pd.to_datetime(dataframe.pop("Date"), format="mixed")
dataframe["DayOfWeek"] = dataframe.index.day_of_week
dataframe["DayOfYear"] = dataframe.index.day_of_year
dataframe = dataframe.sort_index()
dataframe

Unnamed: 0_level_0,AC,AS,Avg<2.5,Avg>2.5,AvgA,AvgD,AvgH,AwayTeam,Div,FTAG,FTHG,FTR,HC,HS,HTR,HomeTeam,HourOfDay,MinuteOfHour,DayOfWeek,DayOfYear
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2019-01-09,7,8,1.66,2.25,3.65,3.29,2.17,Wolves,E0,2,3,H,7,15,H,Everton,14,00,2,9
2019-01-09,6,13,2.40,1.57,2.90,3.63,2.38,Tottenham,E0,2,2,D,11,26,A,Arsenal,16,30,2,9
2019-01-10,3,12,1.96,1.85,2.54,3.42,2.75,Sheffield Weds,E1,0,1,H,8,8,D,Hull,19,45,3,10
2019-01-10,3,10,1.89,1.92,3.56,3.46,2.08,Nott'm Forest,E1,1,1,D,11,21,D,Blackburn,19,45,3,10
2019-01-10,6,10,2.15,1.70,4.42,3.75,1.80,West Brom,E1,0,1,H,7,19,H,Leeds,19,45,3,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-12-03,2,5,1.87,1.89,4.94,3.80,1.66,Reading,E2,1,2,H,7,18,D,Derby,19,45,1,338
2024-12-03,9,24,2.07,1.73,1.68,3.87,4.68,Barnsley,E2,3,2,A,6,7,D,Carlisle,19,45,1,338
2024-12-03,4,12,2.04,1.75,2.38,3.52,2.87,Middlesbrough,E1,1,0,A,3,9,A,Birmingham,19,45,1,338
2024-12-03,5,13,1.66,2.18,2.86,3.16,2.50,Exeter,E2,3,0,A,0,6,A,Shrewsbury,19,45,1,338


In [None]:
# ENCODING THE TEAMS
team_encodings = dict()
div_encodings = {"E0": 0, "E1": 1, "E2": 2}
result_encoding = dict(H=0, A=1, D=-1)
encodings_counter = 0
div_counter = 0
for index, row in dataframe.iterrows():
    if team_encodings.get(row.HomeTeam) is None:
        team_encodings[row.HomeTeam] = encodings_counter
        encodings_counter += 1

    if team_encodings.get(row.AwayTeam) is None:
        team_encodings[row.AwayTeam] = encodings_counter
        encodings_counter += 1

    if div_encodings.get(row.Div) is None:
        div_encodings[row.Div] = encodings_counter
        encodings_counter += 1

    dataframe.loc[index, 'HomeTeam'] = team_encodings[row.HomeTeam]
    dataframe.loc[index, 'AwayTeam'] = team_encodings[row.AwayTeam]
    dataframe.loc[index, 'Div'] = div_encodings[row.Div]
    dataframe.loc[index, 'FTR'] = result_encoding[row.FTR]
    dataframe.loc[index, 'HTR'] = result_encoding[row.HTR]
dataframe = dataframe.astype(float)
dataframe

Unnamed: 0_level_0,AC,AS,Avg<2.5,Avg>2.5,AvgA,AvgD,AvgH,AwayTeam,Div,FTAG,FTHG,FTR,HC,HS,HTR,HomeTeam,HourOfDay,MinuteOfHour,DayOfWeek,DayOfYear
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2019-01-09,7.0,8.0,1.66,2.25,3.65,3.29,2.17,3.0,0.0,2.0,3.0,-1.0,7.0,15.0,1.0,2.0,14.0,0.0,2.0,9.0
2019-01-09,6.0,13.0,2.40,1.57,2.90,3.63,2.38,3.0,0.0,2.0,2.0,-1.0,11.0,26.0,1.0,2.0,16.0,30.0,2.0,9.0
2019-01-10,3.0,12.0,1.96,1.85,2.54,3.42,2.75,17.0,1.0,0.0,1.0,0.0,8.0,8.0,-1.0,16.0,19.0,45.0,3.0,10.0
2019-01-10,3.0,10.0,1.89,1.92,3.56,3.46,2.08,17.0,1.0,1.0,1.0,0.0,11.0,21.0,-1.0,16.0,19.0,45.0,3.0,10.0
2019-01-10,6.0,10.0,2.15,1.70,4.42,3.75,1.80,17.0,1.0,0.0,1.0,0.0,7.0,19.0,-1.0,16.0,19.0,45.0,3.0,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-12-03,2.0,5.0,1.87,1.89,4.94,3.80,1.66,75.0,2.0,1.0,2.0,1.0,7.0,18.0,1.0,54.0,19.0,45.0,1.0,338.0
2024-12-03,9.0,24.0,2.07,1.73,1.68,3.87,4.68,75.0,2.0,3.0,2.0,1.0,6.0,7.0,1.0,54.0,19.0,45.0,1.0,338.0
2024-12-03,4.0,12.0,2.04,1.75,2.38,3.52,2.87,75.0,2.0,1.0,0.0,1.0,3.0,9.0,1.0,54.0,19.0,45.0,1.0,338.0
2024-12-03,5.0,13.0,1.66,2.18,2.86,3.16,2.50,75.0,2.0,3.0,0.0,1.0,0.0,6.0,1.0,54.0,19.0,45.0,1.0,338.0


In [None]:
dataframe["target"] = dataframe.pop("FTR")
dataframe = dataframe[["HomeTeam", "AwayTeam", "AvgH", "AvgD", "AvgA", "DayOfYear", "DayOfWeek", "HourOfDay", "MinuteOfHour", "target"]]
dataframe

Unnamed: 0_level_0,HomeTeam,AwayTeam,AvgH,AvgD,AvgA,DayOfYear,DayOfWeek,HourOfDay,MinuteOfHour,target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2019-01-09,2.0,3.0,2.17,3.29,3.65,9.0,2.0,14.0,0.0,-1.0
2019-01-09,2.0,3.0,2.38,3.63,2.90,9.0,2.0,16.0,30.0,-1.0
2019-01-10,16.0,17.0,2.75,3.42,2.54,10.0,3.0,19.0,45.0,0.0
2019-01-10,16.0,17.0,2.08,3.46,3.56,10.0,3.0,19.0,45.0,0.0
2019-01-10,16.0,17.0,1.80,3.75,4.42,10.0,3.0,19.0,45.0,0.0
...,...,...,...,...,...,...,...,...,...,...
2024-12-03,54.0,75.0,1.66,3.80,4.94,338.0,1.0,19.0,45.0,1.0
2024-12-03,54.0,75.0,4.68,3.87,1.68,338.0,1.0,19.0,45.0,1.0
2024-12-03,54.0,75.0,2.87,3.52,2.38,338.0,1.0,19.0,45.0,1.0
2024-12-03,54.0,75.0,2.50,3.16,2.86,338.0,1.0,19.0,45.0,1.0


In [None]:
dataframe = dataframe.loc[dataframe.target != -1]
dataframe

Unnamed: 0_level_0,HomeTeam,AwayTeam,AvgH,AvgD,AvgA,DayOfYear,DayOfWeek,HourOfDay,MinuteOfHour,target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2019-01-10,16.0,17.0,2.75,3.42,2.54,10.0,3.0,19.0,45.0,0.0
2019-01-10,16.0,17.0,2.08,3.46,3.56,10.0,3.0,19.0,45.0,0.0
2019-01-10,16.0,17.0,1.80,3.75,4.42,10.0,3.0,19.0,45.0,0.0
2019-01-10,16.0,17.0,2.66,3.20,2.75,10.0,3.0,19.0,45.0,0.0
2019-01-10,16.0,17.0,1.79,3.57,4.73,10.0,3.0,20.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
2024-12-03,54.0,75.0,1.66,3.80,4.94,338.0,1.0,19.0,45.0,1.0
2024-12-03,54.0,75.0,4.68,3.87,1.68,338.0,1.0,19.0,45.0,1.0
2024-12-03,54.0,75.0,2.87,3.52,2.38,338.0,1.0,19.0,45.0,1.0
2024-12-03,54.0,75.0,2.50,3.16,2.86,338.0,1.0,19.0,45.0,1.0


In [None]:
# ADD FINANCIAL INDICATORS AND TARGET CLASS
dataframe = append_technical_indicators(dataframe, add_outcomes=True)

# DATA SPLITTING

In [36]:
# TRAIN AND TEST DATASET SPLIT
targets = dataframe.pop("target")
X, X_test, Y, Y_test = train_test_split(dataframe, targets, test_size=0.2, random_state=False)
f"""TRAIN: {X.shape}, TEST: {X_test.shape}"""

'TRAIN: (5392, 21), TEST: (1349, 21)'

In [37]:
# SAVING THE TRAIN TEST SPLITS TO A FINALS FOLDER
X.join(Y).to_csv(f"../relative_datasets/cleaned/{MODEL_NAME}-{MODEL_VERSION}-train-set.csv", index=False)
X_test.join(Y_test).to_csv(f"../relative_datasets/cleaned/{MODEL_NAME}-{MODEL_VERSION}-test-set.csv", index=False)

In [163]:
import numpy as np
np.where(dataframe.homeId.unique().astype(int) == )

(array([93]),)

# SAVING THE SOCCER LABELS

In [None]:
teams_encoding_dataframe = pd.DataFrame(
                [[team_name, team_encodings[team_name]] for team_name in team_encodings.keys()],
                columns=["TeamName", "Encoding"])
teams_encoding_dataframe.to_csv(f"../relative_datasets/cleaned/{MODEL_NAME}-{MODEL_VERSION}-team-labels-set.csv", index=False)

In [None]:
# UPDATING THE GITHUB OF THE DATASETS
!git add -A
!git commit -m "updates: {MODEL_NAME}-{MODEL_VERSION}: bot commit"
!git push

[main 15598e9] updates: eng_prem_league-v1.0: bot commit
 3 files changed, 19 insertions(+), 10 deletions(-)
 create mode 100644 relative_datasets/cleaned/mapping_labels_bank.json
Enumerating objects: 14, done.
Counting objects: 100% (14/14), done.
Delta compression using up to 2 threads
Compressing objects: 100% (8/8), done.
Writing objects: 100% (8/8), 1.03 KiB | 1.03 MiB/s, done.
Total 8 (delta 3), reused 0 (delta 0), pack-reused 0
remote: Resolving deltas: 100% (3/3), completed with 3 local objects.[K
To https://github.com/lebyanelm/neural-trained-models.git
   13805c1..15598e9  main -> main
