# MODULE INSTALLATIONS

In [1]:
import pathlib
import pandas as pd
from sklearn.model_selection import train_test_split

# INITIALIZE THE WORKSPACE

In [2]:
MODEL_NAME = "eng_prem_league"
MODEL_VERSION = "v1.0"

# FUNCTION DEFINATIONS

In [5]:
# INDICATOR PERIODS
MA_RSI_PERIOD = 14
BOLLINGER_PERIOD = 20
MACD_SIGNAL_PERIOD = 9
MACD_SHORT_PERIOD = 12
MACD_LONG_PERIOD = 26
BOLLINGER_N_STD = 2


# FINANCIAL INDICATORS
def rsi(data, window = MA_RSI_PERIOD, modify = True):
    delta = data['close'].diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=window).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=window).mean()
    rs = gain / loss
    rsi = 100 - (100 / (1 + rs))

    if modify:
      data["rsi"] = rsi
      return data
    else:
      return rsi


def sma(data, window = MA_RSI_PERIOD, modify = True):
    _sma = data['close'].rolling(window=window).mean()

    if modify:
      data["sma"] = _sma
      return data
    else:
      return _sma


def ema(data, span = MA_RSI_PERIOD, modify = True):
    _ema = data['close'].ewm(span=span, adjust=False).mean()

    if modify:
      data["ema"] = _ema
      return data
    else:
      return _ema


def stochastic_oscillator(data, window = MA_RSI_PERIOD, modify = True):
    lowest_low = data['low'].rolling(window=window).min()
    highest_high = data['high'].rolling(window=window).max()
    _stochastic = ((data['close'] - lowest_low) / (highest_high - lowest_low)) * 100

    if modify:
      data["stochastic"] = _stochastic
      return data
    else:
      return _stochastic


def bollinger_bands(data, window = BOLLINGER_PERIOD, num_std = 2, modify = True):
    rolling_mean = data['close'].rolling(window=window).mean()
    rolling_std = data['close'].rolling(window=window).std()

    _lower_band = rolling_mean - (rolling_std * num_std)
    _upper_band = rolling_mean + (rolling_std * num_std)

    if modify:
      data["bollinger_lower"] = _lower_band
      data["bollinger_upper"] = _upper_band
      return data
    else:
      return _lower_band, _upper_band


def macd(data, short_window = MACD_SHORT_PERIOD, long_window = MACD_LONG_PERIOD, signal_window = MACD_SIGNAL_PERIOD, modify = True):
    _ema_short = ema(data, span=short_window, modify=False)
    _ema_long = ema(data, span=long_window, modify=False)

    macd_line = _ema_short - _ema_long
    signal_line = macd_line.rolling(window=signal_window).mean()

    if modify:
      data["macd"] = macd_line
      data["macd_signal"] = signal_line
      return data
    else:
      return macd_line, signal_line


def to_lowercase_columns(data):
    data.columns = [column.replace(" ", "_").lower() for column in data.columns]
    return data


def append_technical_indicators(data, add_outcomes = True):
  # data = to_lowercase_columns(data)
  data = rsi(data)
  data = sma(data)
  data = ema(data)
  data = stochastic_oscillator(data)
  data = bollinger_bands(data)
  data = macd(data)

  data.index = pd.to_datetime(data.pop("timestamp"))
  data.index.names = ["timestamp"]
  data = data.sort_values("timestamp")

  data["hour"] = data.index.hour
  data["day_of_week"] = data.index.dayofweek
  data = data.dropna()

  # OUTCOMES OF THE TICKER
  if add_outcomes:
    data["target"] = (data.close > data.open).apply(lambda x: 1 if x > 0 else 0)
  return data.astype(float)

# DATA ENGENEERING AND TRAIN/TEST SPLIT

In [34]:
DATAFILE_PATH = "../relative_datasets/raw/eng-prem-league-v1-data-all.csv"
dataframe = pd.read_csv(f"{DATAFILE_PATH}")
dataframe = dataframe.dropna().drop_duplicates()
dataframe

Unnamed: 0,AC,AS,Avg<2.5,Avg>2.5,AvgA,AvgD,AvgH,AwayTeam,Date,Div,FTAG,FTHG,FTR,HC,HS,HTR,HomeTeam,Time
0,5,17,2.27,1.65,1.35,5.35,9.02,Man City,11/08/2023,E0,3,0,A,6,6,A,Burnley,20:00
1,3,6,2.85,1.42,15.67,7.64,1.18,Nott'm Forest,12/08/2023,E0,1,2,H,8,15,H,Arsenal,12:30
2,4,16,1.94,1.88,2.64,3.44,2.69,West Ham,12/08/2023,E0,1,1,D,10,14,D,Bournemouth,15:00
3,7,9,2.34,1.61,9.61,5.52,1.33,Luton,12/08/2023,E0,1,4,H,6,27,H,Brighton,15:00
4,4,9,1.86,1.97,3.30,3.43,2.24,Fulham,12/08/2023,E0,1,0,A,10,19,D,Everton,15:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6687,6,14,1.58,2.31,4.58,3.41,1.81,Gillingham,07/03/2020,E2,2,2,D,9,13,D,Sunderland,15:00
6688,6,9,1.89,1.89,5.12,3.74,1.67,Tranmere,10/03/2020,E2,2,1,A,8,16,A,Blackpool,19:45
6689,2,6,1.73,2.08,1.65,3.69,5.35,Sunderland,10/03/2020,E2,0,2,H,5,8,H,Bristol Rvs,19:45
6690,9,11,1.89,1.89,6.11,4.14,1.52,Bolton,10/03/2020,E2,2,2,D,7,16,A,Burton,19:45


# SPORTS FEATURE ENGINEERING

In [35]:
# CONVERT DATES TO DATETIME TO EXTRACT MORE FEATURES FROM THEM
dataframe[["HourOfDay", "MinuteOfHour"]] = dataframe["Time"].str.split(":", expand=True)
del dataframe["Time"]

dataframe.index = pd.to_datetime(dataframe.pop("Date"), format="mixed")
dataframe["DayOfWeek"] = dataframe.index.day_of_week
dataframe["DayOfYear"] = dataframe.index.day_of_year
dataframe = dataframe.sort_index()
dataframe

Unnamed: 0_level_0,AC,AS,Avg<2.5,Avg>2.5,AvgA,AvgD,AvgH,AwayTeam,Div,FTAG,FTHG,FTR,HC,HS,HTR,HomeTeam,HourOfDay,MinuteOfHour,DayOfWeek,DayOfYear
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2019-01-09,7,8,1.66,2.25,3.65,3.29,2.17,Wolves,E0,2,3,H,7,15,H,Everton,14,00,2,9
2019-01-09,6,13,2.40,1.57,2.90,3.63,2.38,Tottenham,E0,2,2,D,11,26,A,Arsenal,16,30,2,9
2019-01-10,3,12,1.96,1.85,2.54,3.42,2.75,Sheffield Weds,E1,0,1,H,8,8,D,Hull,19,45,3,10
2019-01-10,3,10,1.89,1.92,3.56,3.46,2.08,Nott'm Forest,E1,1,1,D,11,21,D,Blackburn,19,45,3,10
2019-01-10,6,10,2.15,1.70,4.42,3.75,1.80,West Brom,E1,0,1,H,7,19,H,Leeds,19,45,3,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-12-03,2,5,1.87,1.89,4.94,3.80,1.66,Reading,E2,1,2,H,7,18,D,Derby,19,45,1,338
2024-12-03,9,24,2.07,1.73,1.68,3.87,4.68,Barnsley,E2,3,2,A,6,7,D,Carlisle,19,45,1,338
2024-12-03,4,12,2.04,1.75,2.38,3.52,2.87,Middlesbrough,E1,1,0,A,3,9,A,Birmingham,19,45,1,338
2024-12-03,5,13,1.66,2.18,2.86,3.16,2.50,Exeter,E2,3,0,A,0,6,A,Shrewsbury,19,45,1,338


In [36]:
# ENCODING THE TEAMS
team_encodings = dict()
div_encodings = {"E0": 0, "E1": 1, "E2": 2}
result_encoding = dict(H=0, A=1, D=-1)
encodings_counter = 0
div_counter = 0
for index, row in dataframe.iterrows():
    if team_encodings.get(row.HomeTeam) is None:
        team_encodings[row.HomeTeam] = encodings_counter
        encodings_counter += 1

    if team_encodings.get(row.AwayTeam) is None:
        team_encodings[row.AwayTeam] = encodings_counter
        encodings_counter += 1

    if div_encodings.get(row.Div) is None:
        div_encodings[row.Div] = encodings_counter
        encodings_counter += 1

    dataframe.loc[index, 'HomeTeam'] = team_encodings[row.HomeTeam]
    dataframe.loc[index, 'AwayTeam'] = team_encodings[row.AwayTeam]
    dataframe.loc[index, 'Div'] = div_encodings[row.Div]
    dataframe.loc[index, 'FTR'] = result_encoding[row.FTR]
    dataframe.loc[index, 'HTR'] = result_encoding[row.HTR]
dataframe = dataframe.astype(float)
dataframe

Unnamed: 0_level_0,AC,AS,Avg<2.5,Avg>2.5,AvgA,AvgD,AvgH,AwayTeam,Div,FTAG,FTHG,FTR,HC,HS,HTR,HomeTeam,HourOfDay,MinuteOfHour,DayOfWeek,DayOfYear
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2019-01-09,7.0,8.0,1.66,2.25,3.65,3.29,2.17,3.0,0.0,2.0,3.0,-1.0,7.0,15.0,1.0,2.0,14.0,0.0,2.0,9.0
2019-01-09,6.0,13.0,2.40,1.57,2.90,3.63,2.38,3.0,0.0,2.0,2.0,-1.0,11.0,26.0,1.0,2.0,16.0,30.0,2.0,9.0
2019-01-10,3.0,12.0,1.96,1.85,2.54,3.42,2.75,17.0,1.0,0.0,1.0,0.0,8.0,8.0,-1.0,16.0,19.0,45.0,3.0,10.0
2019-01-10,3.0,10.0,1.89,1.92,3.56,3.46,2.08,17.0,1.0,1.0,1.0,0.0,11.0,21.0,-1.0,16.0,19.0,45.0,3.0,10.0
2019-01-10,6.0,10.0,2.15,1.70,4.42,3.75,1.80,17.0,1.0,0.0,1.0,0.0,7.0,19.0,-1.0,16.0,19.0,45.0,3.0,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-12-03,2.0,5.0,1.87,1.89,4.94,3.80,1.66,75.0,2.0,1.0,2.0,1.0,7.0,18.0,1.0,54.0,19.0,45.0,1.0,338.0
2024-12-03,9.0,24.0,2.07,1.73,1.68,3.87,4.68,75.0,2.0,3.0,2.0,1.0,6.0,7.0,1.0,54.0,19.0,45.0,1.0,338.0
2024-12-03,4.0,12.0,2.04,1.75,2.38,3.52,2.87,75.0,2.0,1.0,0.0,1.0,3.0,9.0,1.0,54.0,19.0,45.0,1.0,338.0
2024-12-03,5.0,13.0,1.66,2.18,2.86,3.16,2.50,75.0,2.0,3.0,0.0,1.0,0.0,6.0,1.0,54.0,19.0,45.0,1.0,338.0


In [37]:
dataframe["target"] = dataframe.pop("FTR")
dataframe = dataframe[["HomeTeam", "AwayTeam", "AvgH", "AvgD", "AvgA", "DayOfYear", "DayOfWeek", "HourOfDay", "MinuteOfHour", "target"]]
dataframe

Unnamed: 0_level_0,HomeTeam,AwayTeam,AvgH,AvgD,AvgA,DayOfYear,DayOfWeek,HourOfDay,MinuteOfHour,target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2019-01-09,2.0,3.0,2.17,3.29,3.65,9.0,2.0,14.0,0.0,-1.0
2019-01-09,2.0,3.0,2.38,3.63,2.90,9.0,2.0,16.0,30.0,-1.0
2019-01-10,16.0,17.0,2.75,3.42,2.54,10.0,3.0,19.0,45.0,0.0
2019-01-10,16.0,17.0,2.08,3.46,3.56,10.0,3.0,19.0,45.0,0.0
2019-01-10,16.0,17.0,1.80,3.75,4.42,10.0,3.0,19.0,45.0,0.0
...,...,...,...,...,...,...,...,...,...,...
2024-12-03,54.0,75.0,1.66,3.80,4.94,338.0,1.0,19.0,45.0,1.0
2024-12-03,54.0,75.0,4.68,3.87,1.68,338.0,1.0,19.0,45.0,1.0
2024-12-03,54.0,75.0,2.87,3.52,2.38,338.0,1.0,19.0,45.0,1.0
2024-12-03,54.0,75.0,2.50,3.16,2.86,338.0,1.0,19.0,45.0,1.0


In [38]:
dataframe = dataframe.loc[dataframe["target"] != -1]
dataframe

Unnamed: 0_level_0,HomeTeam,AwayTeam,AvgH,AvgD,AvgA,DayOfYear,DayOfWeek,HourOfDay,MinuteOfHour,target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2019-01-10,16.0,17.0,2.75,3.42,2.54,10.0,3.0,19.0,45.0,0.0
2019-01-10,16.0,17.0,2.08,3.46,3.56,10.0,3.0,19.0,45.0,0.0
2019-01-10,16.0,17.0,1.80,3.75,4.42,10.0,3.0,19.0,45.0,0.0
2019-01-10,16.0,17.0,2.66,3.20,2.75,10.0,3.0,19.0,45.0,0.0
2019-01-10,16.0,17.0,1.79,3.57,4.73,10.0,3.0,20.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
2024-12-03,54.0,75.0,1.66,3.80,4.94,338.0,1.0,19.0,45.0,1.0
2024-12-03,54.0,75.0,4.68,3.87,1.68,338.0,1.0,19.0,45.0,1.0
2024-12-03,54.0,75.0,2.87,3.52,2.38,338.0,1.0,19.0,45.0,1.0
2024-12-03,54.0,75.0,2.50,3.16,2.86,338.0,1.0,19.0,45.0,1.0


In [None]:
# ADD FINANCIAL INDICATORS AND TARGET CLASS
dataframe = append_technical_indicators(dataframe, add_outcomes=True)

In [39]:
# TRAIN AND TEST DATASET SPLIT
targets = dataframe.pop("target")
X, X_test, Y, Y_test = train_test_split(dataframe, targets, test_size=0.2, random_state=False)
f"""TRAIN: {X.shape}, TEST: {X_test.shape}"""

'TRAIN: (4088, 9), TEST: (1023, 9)'

In [40]:
# SAVING THE TRAIN TEST SPLITS TO A FINALS FOLDER
X.join(Y).to_csv(f"../relative_datasets/cleaned/{MODEL_NAME}-{MODEL_VERSION}-train-set.csv", index=False)
X_test.join(Y_test).to_csv(f"../relative_datasets/cleaned/{MODEL_NAME}-{MODEL_VERSION}-test-set.csv", index=False)

# SAVING THE SOCCER LABELS

In [41]:
teams_encoding_dataframe = pd.DataFrame(
                [[team_name, team_encodings[team_name]] for team_name in team_encodings.keys()],
                columns=["TeamName", "Encoding"])
teams_encoding_dataframe.to_csv(f"../relative_datasets/cleaned/{MODEL_NAME}-{MODEL_VERSION}-team-labels-set.csv", index=False)

In [44]:
# UPDATING THE GITHUB OF THE DATASETS
!git add -A
!git commit -m "updates: {MODEL_NAME}-{MODEL_VERSION}: bot commit"
!git push

[main ecba617] updates: eng_prem_league-v1.0: bot commit
 2 files changed, 259 insertions(+), 28 deletions(-)
Enumerating objects: 11, done.
Counting objects: 100% (11/11), done.
Delta compression using up to 2 threads
Compressing objects: 100% (6/6), done.
Writing objects: 100% (6/6), 1.32 KiB | 1.32 MiB/s, done.
Total 6 (delta 2), reused 0 (delta 0), pack-reused 0
remote: Resolving deltas: 100% (2/2), completed with 2 local objects.[K
To https://github.com/lebyanelm/neural-trained-models.git
   2ddb3ef..ecba617  main -> main
