In [9]:
import logging

import pandas as pd
import numpy as np

import core.finance as cofinanc
import dataflow.core as dtfcore
import helpers.hdbg as hdbg
import helpers.henv as henv
import helpers.hpandas as hpandas
import helpers.hprint as hprint

import research_amp.soccer_prediction.utils as rasoprut

In [2]:
hdbg.init_logger(verbosity=logging.DEBUG)

_LOG = logging.getLogger(__name__)

_LOG.info("%s", henv.get_system_signature()[0])

hprint.config_notebook()

DEBUG Effective logging level=10
DEBUG Shut up 116 modules: botocore.loaders, matplotlib.ticker, botocore.args, botocore.retries.adaptive, matplotlib.axis, matplotlib.lines, botocore.endpoint, matplotlib.axes._axes, matplotlib._constrained_layout, botocore.endpoint_provider, matplotlib.dates, matplotlib._layoutgrid, urllib3.poolmanager, botocore.handlers, boto3.resources.factory, urllib3, matplotlib.axes, urllib3.util, botocore.regions, invoke, boto3.resources.base, botocore.waiter, matplotlib.pyplot, boto3, botocore.retryhandler, botocore.configprovider, boto3.resources.action, matplotlib.dviread, boto3.resources, matplotlib.figure, matplotlib.font_manager, botocore.auth, matplotlib.category, botocore.history, botocore.parsers, botocore.utils, matplotlib.text, botocore.compat, fsspec, botocore.awsrequest, matplotlib.mathtext, matplotlib._afm, urllib3.connectionpool, botocore.retries, asyncio, botocore.monitoring, matplotlib.textpath, botocore.discovery, botocore.credentials, botocore.

In [52]:
def preprocess_data(df: pd.DataFrame()) -> pd.DataFrame():
    """
    Preprocess the loaded ISDB dataframe of interest.
        - Filter and select match from seasons starting from 2009.
        - Convert column formats.
        - Add epsilon = 0.5 to scores with value as `0` to avoid log(0).
        - Check for NaN and infinite values and drop the rows.
    
    :param df: Input DataFrame. 
    :return: Preprocessed DataFrame.
    """
    df["season"] = df["Sea"].apply(lambda x: int("20" + str(x)[:2]))
    filtered_df = df[df["season"] >= 2009]
    # Preprocess the dataset.
    df["Date"] = pd.to_datetime(df["Date"], dayfirst=True)
    df.sort_values(by="Date", inplace=True)
    # Covert the categorical columns to category type. 
    categorical_columns = ["HT", "AT"]
    for col in categorical_columns:
        filtered_df[col] = filtered_df[col].astype("category")
    # Adding a small constant to goals to avoid log(0).
    columns = ['AS', 'HS']
    epsilon = 0.0
    for column in columns:    
        filtered_df[column] = filtered_df[column].apply(lambda x: x + epsilon if x == 0 else x)
        # Check if there are any infinite or NaN weights and handle them.
        if filtered_df.isna().sum().sum() > 0:
            _LOG.debug("NaN values found in the data. Removing rows with NaNs.")
            filtered_df.dropna(inplace=True)
        if filtered_df.isin([-np.inf, np.inf]).sum().sum() > 0:
            _LOG.debug("Infinite values found in the data. Removing rows with Infs.")
            filtered_df = filtered_df[~np.isinf(filtered_df.select_dtypes(include=[np.number])).any(1)]
    # Return the preprocessed DataFrame.
    return filtered_df

In [None]:
# Define the S3 Buckets, dataset path and local directory for download.
bucket = "cryptokaizen-data-test"
dataset_path = "kaizen_ai/soccer_prediction/datasets/OSF_football/"
local_dir = "datasets/OSF_football"
# Download data from S3.
rasoprut.download_data_from_s3(
    bucket_name=bucket, dataset_path=dataset_path, local_path=local_dir
)
# Load the data from S3 into pandas dataframe objects.
dataframes = rasoprut.load_data_to_dataframe(local_path=local_dir)

In [54]:
# Access the dataframes directly from the dictionary.
ISDBv1_df = dataframes.get("ISDBv1_df")
ISDBv2_df = dataframes.get("ISDBv2_df")
# Preprocess the selected dataframe (ISDBv2_df).
preprocessed_df = preprocess_data(ISDBv2_df)
# preprocessed_df.set_index('Date', inplace=True)

In [60]:
preprocessed_df[preprocessed_df["AT"] == 'FC Barcelona']

Unnamed: 0,Sea,Lge,Date,HT,AT,HS,AS,GD,WDL,season
102926,09-10,SPA1,12/09/2009,Getafe,FC Barcelona,0.0,2.0,-2,L,2009
102944,09-10,SPA1,22/09/2009,Santander,FC Barcelona,1.0,4.0,-3,L,2009
102955,09-10,SPA1,26/09/2009,Malaga,FC Barcelona,0.0,2.0,-2,L,2009
102976,09-10,SPA1,17/10/2009,Valencia,FC Barcelona,0.0,0.0,0,D,2009
102995,09-10,SPA1,31/10/2009,Osasuna,FC Barcelona,1.0,1.0,0,D,2009
103014,09-10,SPA1,21/11/2009,Athletic Bilbao,FC Barcelona,1.0,1.0,0,D,2009
103034,09-10,SPA1,02/12/2009,Xerez,FC Barcelona,0.0,2.0,-2,L,2009
103035,09-10,SPA1,05/12/2009,La Coruna,FC Barcelona,1.0,3.0,-2,L,2009
103082,09-10,SPA1,10/01/2010,Tenerife,FC Barcelona,0.0,5.0,-5,L,2009
103096,09-10,SPA1,23/01/2010,Valladolid,FC Barcelona,0.0,3.0,-3,L,2009


In [61]:
def get_home_team(df, team):
    df = df[df["HT"] == team]
    df["is_home"] = 1
    df = df.rename(columns={"AT" : "opponent", "HS" : "goals_scored", "AS": "goals_scored_by_opponent"})
    df = df.drop(["HT"], axis=1)
    return df.to_dict(orient='records')

def get_away_team(df, team):
    df = df[df["AT"] == team]
    df["is_home"] = 0
    df = df.rename(columns={"HT" : "opponent", "AS" : "goals_scored", "HS": "goals_scored_by_opponent"})
    df = df.drop(["AT"], axis=1)
    # Define the mapping to flip 'W' and 'L'
    flip_mapping = {'W': 'L', 'L': 'W'}
    # Apply the mapping to the 'WDL' column
    df['WDL'] = df['WDL'].replace(flip_mapping)
    df['GD'] = df['GD'].apply(lambda x : -x)
    return df.to_dict(orient='records')

get_away_team(preprocessed_df, 'FC Barcelona')

[{'Sea': '09-10',
  'Lge': 'SPA1',
  'Date': '12/09/2009',
  'opponent': 'Getafe',
  'goals_scored_by_opponent': 0.0,
  'goals_scored': 2.0,
  'GD': 2,
  'WDL': 'W',
  'season': 2009,
  'is_home': 0},
 {'Sea': '09-10',
  'Lge': 'SPA1',
  'Date': '22/09/2009',
  'opponent': 'Santander',
  'goals_scored_by_opponent': 1.0,
  'goals_scored': 4.0,
  'GD': 3,
  'WDL': 'W',
  'season': 2009,
  'is_home': 0},
 {'Sea': '09-10',
  'Lge': 'SPA1',
  'Date': '26/09/2009',
  'opponent': 'Malaga',
  'goals_scored_by_opponent': 0.0,
  'goals_scored': 2.0,
  'GD': 2,
  'WDL': 'W',
  'season': 2009,
  'is_home': 0},
 {'Sea': '09-10',
  'Lge': 'SPA1',
  'Date': '17/10/2009',
  'opponent': 'Valencia',
  'goals_scored_by_opponent': 0.0,
  'goals_scored': 0.0,
  'GD': 0,
  'WDL': 'D',
  'season': 2009,
  'is_home': 0},
 {'Sea': '09-10',
  'Lge': 'SPA1',
  'Date': '31/10/2009',
  'opponent': 'Osasuna',
  'goals_scored_by_opponent': 1.0,
  'goals_scored': 1.0,
  'GD': 0,
  'WDL': 'D',
  'season': 2009,
  'is_

In [67]:
teams = set(preprocessed_df["HT"].to_list() +  preprocessed_df["AT"].to_list())
data = {}
teams = ['FC Barcelona', 'Real Madrid']
for team in teams:
    data[team] = []
    data[team].extend(get_home_team(preprocessed_df, team))
    data[team].extend(get_away_team(preprocessed_df, team))


In [72]:
data

{'FC Barcelona': [{'Sea': '09-10',
   'Lge': 'SPA1',
   'Date': '31/08/2009',
   'opponent': 'Sporting Gijon',
   'goals_scored': 3.0,
   'goals_scored_by_opponent': 0.0,
   'GD': 3,
   'WDL': 'W',
   'season': 2009,
   'is_home': 1},
  {'Sea': '09-10',
   'Lge': 'SPA1',
   'Date': '19/09/2009',
   'opponent': 'Athletico Madrid',
   'goals_scored': 5.0,
   'goals_scored_by_opponent': 2.0,
   'GD': 3,
   'WDL': 'W',
   'season': 2009,
   'is_home': 1},
  {'Sea': '09-10',
   'Lge': 'SPA1',
   'Date': '03/10/2009',
   'opponent': 'Almeria',
   'goals_scored': 1.0,
   'goals_scored_by_opponent': 0.0,
   'GD': 1,
   'WDL': 'W',
   'season': 2009,
   'is_home': 1},
  {'Sea': '09-10',
   'Lge': 'SPA1',
   'Date': '25/10/2009',
   'opponent': 'Zaragoza',
   'goals_scored': 6.0,
   'goals_scored_by_opponent': 1.0,
   'GD': 5,
   'WDL': 'W',
   'season': 2009,
   'is_home': 1},
  {'Sea': '09-10',
   'Lge': 'SPA1',
   'Date': '07/11/2009',
   'opponent': 'Mallorca',
   'goals_scored': 4.0,
   'go

In [74]:
# Flatten the dictionary of lists of dictionaries to create a list of DataFrames
flattened_data = {
    (outer_key, inner_key): [d.get(inner_key, None) for d in outer_list]
    for outer_key, outer_list in data.items()
    for inner_key in {k for d in outer_list for k in d.keys()}
}

# Convert the flattened dictionary to a DataFrame
df = pd.DataFrame(flattened_data)

# Sort the columns to ensure the outer keys are grouped together
df = df.sort_index(axis=1, level=0)
# Switch outer and inner levels
df.stack(level=0).unstack()

Unnamed: 0_level_0,Date,Date,GD,GD,Lge,Lge,Sea,Sea,WDL,WDL,goals_scored,goals_scored,goals_scored_by_opponent,goals_scored_by_opponent,is_home,is_home,opponent,opponent,season,season
Unnamed: 0_level_1,FC Barcelona,Real Madrid,FC Barcelona,Real Madrid,FC Barcelona,Real Madrid,FC Barcelona,Real Madrid,FC Barcelona,Real Madrid,FC Barcelona,Real Madrid,FC Barcelona,Real Madrid,FC Barcelona,Real Madrid,FC Barcelona,Real Madrid,FC Barcelona,Real Madrid
0,31/08/2009,29/08/2009,3,1,SPA1,SPA1,09-10,09-10,W,W,3.0,3.0,0.0,2.0,1,1,Sporting Gijon,La Coruna,2009,2009
1,19/09/2009,20/09/2009,3,5,SPA1,SPA1,09-10,09-10,W,W,5.0,5.0,2.0,0.0,1,1,Athletico Madrid,Xerez,2009,2009
2,03/10/2009,26/09/2009,1,3,SPA1,SPA1,09-10,09-10,W,W,1.0,3.0,0.0,0.0,1,1,Almeria,Tenerife,2009,2009
3,25/10/2009,17/10/2009,5,2,SPA1,SPA1,09-10,09-10,W,W,6.0,4.0,1.0,2.0,1,1,Zaragoza,Valladolid,2009,2009
4,07/11/2009,31/10/2009,2,2,SPA1,SPA1,09-10,09-10,W,W,4.0,2.0,2.0,0.0,1,1,Mallorca,Getafe,2009,2009
5,29/11/2009,21/11/2009,1,1,SPA1,SPA1,09-10,09-10,W,W,1.0,1.0,0.0,0.0,1,1,Real Madrid,Santander,2009,2009
6,12/12/2009,05/12/2009,1,2,SPA1,SPA1,09-10,09-10,W,W,1.0,4.0,0.0,2.0,1,1,Espanyol Barcelona,Almeria,2009,2009
7,02/01/2010,19/12/2009,0,6,SPA1,SPA1,09-10,09-10,D,W,1.0,6.0,1.0,0.0,1,1,Villarreal,Zaragoza,2009,2009
8,16/01/2010,10/01/2010,4,2,SPA1,SPA1,09-10,09-10,W,W,4.0,2.0,0.0,0.0,1,1,Sevilla FC,Mallorca,2009,2009
9,06/02/2010,24/01/2010,1,2,SPA1,SPA1,09-10,09-10,W,W,2.0,2.0,1.0,0.0,1,1,Getafe,Malaga,2009,2009


In [23]:
# `nid` is short for "node id"
nid = "df_data_source"
df_data_source = dtfcore.DfDataSource(nid, preprocessed_df)

In [24]:
df_out_fit = df_data_source.fit()["df_out"]
_LOG.debug(hpandas.df_to_str(df_out_fit))

Unnamed: 0,Sea,Lge,HT,AT,HS,AS,GD,WDL,season
2009-01-31 00:00:00,09-10,ECU1,Manta FC,Barcelona SC,0.5,0.5,0,D,2009
2009-01-31 00:00:00,09-10,CHL1,Curico Unido,Colo Colo,2.0,2.0,0,D,2009
2009-01-31 00:00:00,09-10,CHL1,Everton CD,Union Espanola,0.5,1.0,-1,L,2009
,...,...,...,...,...,...,...,...,...
2017-06-27 00:00:00,16-17,ARG1,Estudiantes,Quilmes AC,1.0,0.5,1,W,2016
2017-06-28 00:00:00,16-17,ARG1,Talleres Cordoba,San Lorenzo,1.0,1.0,0,D,2016
2017-06-28 00:00:00,16-17,ARG1,Independiente,CA Lanus,1.0,1.0,0,D,2016


DEBUG None


In [None]:
"""
  is_home              GS
Manta FC

"""

In [25]:
def unravel_df(df: pd.DataFrame()) -> pd.DataFrame():
    """    
    Unravel the dataset by creating two entries for each row as team-opponent
    pair.
    
    :param df: Input dataframe.
    :return: unraveled dataframe.
    """
    # Create entry for home team `HT`.
    home_df = df[["Sea", "Lge", "HT", "AT", "HS"]].copy()
    home_df.rename(
                columns={"HT": "team", "AT": "opponent", "HS": "goals"}, inplace=True
                  )
    home_df["is_home"] = 1
    # Create entry for away team `AT`.
    away_df = df[["Sea", "Lge", "HT", "AT", "AS"]].copy()
    away_df.rename(
                columns={"AT": "team", "HT": "opponent", "AS": "goals"}, inplace=True
                  )
    away_df["is_home"] = 0
    # Concatenate the two splits.
    unraveled_df = pd.concat([home_df, away_df], ignore_index=True)
    # return the unraveled dataframe.
    return unraveled_df

In [40]:
# Prepare a DataFlow transformer node to compute percentage returns
#  and only return percentage returns.
nid = "unravel_dataset"
unravel_node_v2 = dtfcore.GroupedColDfToDfTransformer(
    nid,
    transformer_func=unravel_df,
    in_col_groups=[('Sea', 'Lge', 'HT', 'AT', 'HS', 'AS', 'GD', 'WDL', 'season')],
    out_col_group=(),
)

In [41]:
unravel_node_v2.fit(preprocessed_df)["df_out"]

AssertionError: 
################################################################################
* Failed assertion *
'('Sea', 'Lge', 'HT', 'AT', 'HS', 'AS', 'GD', 'WDL', 'season')' in 'Index(['Sea', 'Lge', 'HT', 'AT', 'HS', 'AS', 'GD', 'WDL', 'season'], dtype='object')'
################################################################################
