In [9]:
import logging

import pandas as pd
import numpy as np

import core.finance as cofinanc
import dataflow.core as dtfcore
import helpers.hdbg as hdbg
import helpers.henv as henv
import helpers.hpandas as hpandas
import helpers.hprint as hprint

import research_amp.soccer_prediction.utils as rasoprut

In [2]:
hdbg.init_logger(verbosity=logging.DEBUG)

_LOG = logging.getLogger(__name__)

_LOG.info("%s", henv.get_system_signature()[0])

hprint.config_notebook()

DEBUG Effective logging level=10
DEBUG Shut up 116 modules: botocore.loaders, matplotlib.ticker, botocore.args, botocore.retries.adaptive, matplotlib.axis, matplotlib.lines, botocore.endpoint, matplotlib.axes._axes, matplotlib._constrained_layout, botocore.endpoint_provider, matplotlib.dates, matplotlib._layoutgrid, urllib3.poolmanager, botocore.handlers, boto3.resources.factory, urllib3, matplotlib.axes, urllib3.util, botocore.regions, invoke, boto3.resources.base, botocore.waiter, matplotlib.pyplot, boto3, botocore.retryhandler, botocore.configprovider, boto3.resources.action, matplotlib.dviread, boto3.resources, matplotlib.figure, matplotlib.font_manager, botocore.auth, matplotlib.category, botocore.history, botocore.parsers, botocore.utils, matplotlib.text, botocore.compat, fsspec, botocore.awsrequest, matplotlib.mathtext, matplotlib._afm, urllib3.connectionpool, botocore.retries, asyncio, botocore.monitoring, matplotlib.textpath, botocore.discovery, botocore.credentials, botocore.

In [52]:
def preprocess_data(df: pd.DataFrame()) -> pd.DataFrame():
    """
    Preprocess the loaded ISDB dataframe of interest.
        - Filter and select match from seasons starting from 2009.
        - Convert column formats.
        - Add epsilon = 0.5 to scores with value as `0` to avoid log(0).
        - Check for NaN and infinite values and drop the rows.
    
    :param df: Input DataFrame. 
    :return: Preprocessed DataFrame.
    """
    df["season"] = df["Sea"].apply(lambda x: int("20" + str(x)[:2]))
    filtered_df = df[df["season"] >= 2009]
    # Preprocess the dataset.
    df["Date"] = pd.to_datetime(df["Date"], dayfirst=True)
    df.sort_values(by="Date", inplace=True)
    # Covert the categorical columns to category type. 
    categorical_columns = ["HT", "AT"]
    for col in categorical_columns:
        filtered_df[col] = filtered_df[col].astype("category")
    # Adding a small constant to goals to avoid log(0).
    columns = ['AS', 'HS']
    epsilon = 0.0
    for column in columns:    
        filtered_df[column] = filtered_df[column].apply(lambda x: x + epsilon if x == 0 else x)
        # Check if there are any infinite or NaN weights and handle them.
        if filtered_df.isna().sum().sum() > 0:
            _LOG.debug("NaN values found in the data. Removing rows with NaNs.")
            filtered_df.dropna(inplace=True)
        if filtered_df.isin([-np.inf, np.inf]).sum().sum() > 0:
            _LOG.debug("Infinite values found in the data. Removing rows with Infs.")
            filtered_df = filtered_df[~np.isinf(filtered_df.select_dtypes(include=[np.number])).any(1)]
    # Return the preprocessed DataFrame.
    return filtered_df

In [None]:
# Define the S3 Buckets, dataset path and local directory for download.
bucket = "cryptokaizen-data-test"
dataset_path = "kaizen_ai/soccer_prediction/datasets/OSF_football/"
local_dir = "datasets/OSF_football"
# Download data from S3.
rasoprut.download_data_from_s3(
    bucket_name=bucket, dataset_path=dataset_path, local_path=local_dir
)
# Load the data from S3 into pandas dataframe objects.
dataframes = rasoprut.load_data_to_dataframe(local_path=local_dir)

In [54]:
# Access the dataframes directly from the dictionary.
ISDBv1_df = dataframes.get("ISDBv1_df")
ISDBv2_df = dataframes.get("ISDBv2_df")
# Preprocess the selected dataframe (ISDBv2_df).
preprocessed_df = preprocess_data(ISDBv2_df)
# preprocessed_df.set_index('Date', inplace=True)

In [84]:
preprocessed_df["Lge"].value_counts()

Lge
ENG3    4416
ENG4    4416
ENG2    4416
ENG5    3818
ITA2    3696
SPA2    3696
GER3    3040
ENG1    3040
FRA1    3040
FRA2    3040
SPA1    3040
ITA1    3040
ARG1    3040
BRA1    3039
RUS2    2819
FRA3    2784
JPN2    2690
MEX1    2476
VEN1    2449
GER2    2448
HOL1    2448
GER1    2448
JPN1    2448
USA1    2420
CHL1    2316
BRA2    2280
POR1    2118
ECU1    2052
DZA1    1985
KOR1    1962
MAR1    1920
SWE1    1920
NOR1    1920
ZAF1    1920
RUS1    1920
CHN1    1920
BEL1    1890
GRE1    1811
TUR1    1802
SCO1    1794
ISR1    1630
DNK1    1568
FIN1    1552
TUN1    1546
SCO2    1440
SCO3    1440
AUT1    1440
CHE1    1422
SCO4    1260
AUS1    1110
USA2     809
NZL1     498
Name: count, dtype: int64

In [85]:
SPA1_df = preprocessed_df[preprocessed_df["Lge"] == "SPA1"]
SPA1_df

Unnamed: 0,Sea,Lge,Date,HT,AT,HS,AS,GD,WDL,season
102914,09-10,SPA1,29/08/2009,Real Madrid,La Coruna,3.0,2.0,1,W,2009
102915,09-10,SPA1,29/08/2009,Zaragoza,Tenerife,1.0,0.0,1,W,2009
102916,09-10,SPA1,30/08/2009,Almeria,Valladolid,0.0,0.0,0,D,2009
102917,09-10,SPA1,30/08/2009,Athletic Bilbao,Espanyol Barcelona,1.0,0.0,1,W,2009
102918,09-10,SPA1,30/08/2009,Malaga,Athletico Madrid,3.0,0.0,3,W,2009
...,...,...,...,...,...,...,...,...,...,...
216920,16-17,SPA1,21/05/2017,Athletico Madrid,Athletic Bilbao,3.0,1.0,2,W,2016
216921,16-17,SPA1,21/05/2017,Valencia,Villarreal,1.0,3.0,-2,L,2016
216922,16-17,SPA1,21/05/2017,Celta de Vigo,Real Sociedad,2.0,2.0,0,D,2016
216923,16-17,SPA1,21/05/2017,FC Barcelona,Eibar,4.0,2.0,2,W,2016


In [89]:
SPA1_df["HT"].value_counts()[:50]

HT
FC Barcelona              152
Malaga                    152
Athletic Bilbao           152
Athletico Madrid          152
Sevilla FC                152
Valencia                  152
Real Madrid               152
Espanyol Barcelona        152
Real Sociedad             133
Villarreal                133
Getafe                    133
Osasuna                   114
La Coruna                 114
Granada                   114
Levante                   114
Celta de Vigo              95
Sporting Gijon             95
Real Betis                 95
Vallecano                  95
Mallorca                   76
Zaragoza                   76
Almeria                    76
Valladolid                 57
Santander                  57
Eibar                      57
Las Palmas                 38
Elche                      38
Xerez                      19
Hercules                   19
Tenerife                   19
Alaves                     19
Leganes                    19
Cordoba                    19
Olympiq

In [61]:
def get_home_team(df, team):
    df = df[df["HT"] == team]
    df["is_home"] = 1
    df = df.rename(columns={"AT" : "opponent", "HS" : "goals_scored", "AS": "goals_scored_by_opponent"})
    df = df.drop(["HT"], axis=1)
    return df.to_dict(orient='records')

def get_away_team(df, team):
    df = df[df["AT"] == team]
    df["is_home"] = 0
    df = df.rename(columns={"HT" : "opponent", "AS" : "goals_scored", "HS": "goals_scored_by_opponent"})
    df = df.drop(["AT"], axis=1)
    # Define the mapping to flip 'W' and 'L'
    flip_mapping = {'W': 'L', 'L': 'W'}
    # Apply the mapping to the 'WDL' column
    df['WDL'] = df['WDL'].replace(flip_mapping)
    df['GD'] = df['GD'].apply(lambda x : -x)
    return df.to_dict(orient='records')

get_away_team(preprocessed_df, 'FC Barcelona')

[{'Sea': '09-10',
  'Lge': 'SPA1',
  'Date': '12/09/2009',
  'opponent': 'Getafe',
  'goals_scored_by_opponent': 0.0,
  'goals_scored': 2.0,
  'GD': 2,
  'WDL': 'W',
  'season': 2009,
  'is_home': 0},
 {'Sea': '09-10',
  'Lge': 'SPA1',
  'Date': '22/09/2009',
  'opponent': 'Santander',
  'goals_scored_by_opponent': 1.0,
  'goals_scored': 4.0,
  'GD': 3,
  'WDL': 'W',
  'season': 2009,
  'is_home': 0},
 {'Sea': '09-10',
  'Lge': 'SPA1',
  'Date': '26/09/2009',
  'opponent': 'Malaga',
  'goals_scored_by_opponent': 0.0,
  'goals_scored': 2.0,
  'GD': 2,
  'WDL': 'W',
  'season': 2009,
  'is_home': 0},
 {'Sea': '09-10',
  'Lge': 'SPA1',
  'Date': '17/10/2009',
  'opponent': 'Valencia',
  'goals_scored_by_opponent': 0.0,
  'goals_scored': 0.0,
  'GD': 0,
  'WDL': 'D',
  'season': 2009,
  'is_home': 0},
 {'Sea': '09-10',
  'Lge': 'SPA1',
  'Date': '31/10/2009',
  'opponent': 'Osasuna',
  'goals_scored_by_opponent': 1.0,
  'goals_scored': 1.0,
  'GD': 0,
  'WDL': 'D',
  'season': 2009,
  'is_

In [93]:
teams = set(preprocessed_df["HT"].to_list() +  preprocessed_df["AT"].to_list())
data = {}
# teams = ['FC Barcelona', 'Real Madrid']
for team in teams:
    data[team] = []
    data[team].extend(get_home_team(SPA1_df, team))
    data[team].extend(get_away_team(SPA1_df, team))
    if len(data[team]) == 0:
        del data[team]

In [94]:
data

{'Real Betis': [{'Sea': '11-12',
   'Lge': 'SPA1',
   'Date': '11/09/2011',
   'opponent': 'Mallorca',
   'goals_scored': 1.0,
   'goals_scored_by_opponent': 0.0,
   'GD': 1,
   'WDL': 'W',
   'season': 2011,
   'is_home': 1},
  {'Sea': '11-12',
   'Lge': 'SPA1',
   'Date': '22/09/2011',
   'opponent': 'Zaragoza',
   'goals_scored': 4.0,
   'goals_scored_by_opponent': 3.0,
   'GD': 1,
   'WDL': 'W',
   'season': 2011,
   'is_home': 1},
  {'Sea': '11-12',
   'Lge': 'SPA1',
   'Date': '02/10/2011',
   'opponent': 'Levante',
   'goals_scored': 0.0,
   'goals_scored_by_opponent': 1.0,
   'GD': -1,
   'WDL': 'L',
   'season': 2011,
   'is_home': 1},
  {'Sea': '11-12',
   'Lge': 'SPA1',
   'Date': '23/10/2011',
   'opponent': 'Vallecano',
   'goals_scored': 0.0,
   'goals_scored_by_opponent': 2.0,
   'GD': -2,
   'WDL': 'L',
   'season': 2011,
   'is_home': 1},
  {'Sea': '11-12',
   'Lge': 'SPA1',
   'Date': '05/11/2011',
   'opponent': 'Malaga',
   'goals_scored': 0.0,
   'goals_scored_by_o

In [97]:
dfs = []

for key, inner_list in data.items():
    df_inner = pd.DataFrame(inner_list)
    df_inner['outer_key'] = key  # Add outer key as a column
    dfs.append(df_inner)

# Concatenate all DataFrames
df_concat = pd.concat(dfs, ignore_index=True)

df_concat['Date'] = pd.to_datetime(df_concat['Date'], format='mixed')

# Pivot the DataFrame to have 'date' as index and 'outer_key' as columns
df_pivot = df_concat.pivot_table(index='Date', columns='outer_key', aggfunc='first')

# Sort the columns to ensure the outer keys are grouped together
df_pivot = df_pivot.sort_index(axis=1, level=0)

df_pivot

Unnamed: 0_level_0,GD,GD,GD,GD,GD,GD,GD,GD,GD,GD,GD,GD,GD,GD,GD,GD,GD,GD,GD,GD,GD,GD,GD,GD,GD,GD,GD,GD,GD,GD,GD,GD,GD,Lge,Lge,Lge,Lge,Lge,Lge,Lge,Lge,Lge,Lge,Lge,Lge,Lge,Lge,Lge,Lge,Lge,Lge,Lge,Lge,Lge,Lge,Lge,Lge,Lge,Lge,Lge,Lge,Lge,Lge,Lge,Lge,Lge,Sea,Sea,Sea,Sea,Sea,Sea,Sea,Sea,Sea,Sea,Sea,Sea,Sea,Sea,Sea,Sea,Sea,Sea,Sea,Sea,Sea,Sea,Sea,Sea,Sea,Sea,Sea,Sea,Sea,Sea,Sea,Sea,Sea,WDL,WDL,WDL,WDL,WDL,WDL,WDL,WDL,WDL,WDL,WDL,WDL,WDL,WDL,WDL,WDL,WDL,WDL,WDL,WDL,WDL,WDL,WDL,WDL,WDL,WDL,WDL,WDL,WDL,WDL,WDL,WDL,WDL,goals_scored,goals_scored,goals_scored,goals_scored,goals_scored,goals_scored,goals_scored,goals_scored,goals_scored,goals_scored,goals_scored,goals_scored,goals_scored,goals_scored,goals_scored,goals_scored,goals_scored,goals_scored,goals_scored,goals_scored,goals_scored,goals_scored,goals_scored,goals_scored,goals_scored,goals_scored,goals_scored,goals_scored,goals_scored,goals_scored,goals_scored,goals_scored,goals_scored,goals_scored_by_opponent,goals_scored_by_opponent,goals_scored_by_opponent,goals_scored_by_opponent,goals_scored_by_opponent,goals_scored_by_opponent,goals_scored_by_opponent,goals_scored_by_opponent,goals_scored_by_opponent,goals_scored_by_opponent,goals_scored_by_opponent,goals_scored_by_opponent,goals_scored_by_opponent,goals_scored_by_opponent,goals_scored_by_opponent,goals_scored_by_opponent,goals_scored_by_opponent,goals_scored_by_opponent,goals_scored_by_opponent,goals_scored_by_opponent,goals_scored_by_opponent,goals_scored_by_opponent,goals_scored_by_opponent,goals_scored_by_opponent,goals_scored_by_opponent,goals_scored_by_opponent,goals_scored_by_opponent,goals_scored_by_opponent,goals_scored_by_opponent,goals_scored_by_opponent,goals_scored_by_opponent,goals_scored_by_opponent,goals_scored_by_opponent,is_home,is_home,is_home,is_home,is_home,is_home,is_home,is_home,is_home,is_home,is_home,is_home,is_home,is_home,is_home,is_home,is_home,is_home,is_home,is_home,is_home,is_home,is_home,is_home,is_home,is_home,is_home,is_home,is_home,is_home,is_home,is_home,is_home,opponent,opponent,opponent,opponent,opponent,opponent,opponent,opponent,opponent,opponent,opponent,opponent,opponent,opponent,opponent,opponent,opponent,opponent,opponent,opponent,opponent,opponent,opponent,opponent,opponent,opponent,opponent,opponent,opponent,opponent,opponent,opponent,opponent,season,season,season,season,season,season,season,season,season,season,season,season,season,season,season,season,season,season,season,season,season,season,season,season,season,season,season,season,season,season,season,season,season
outer_key,Alaves,Almeria,Athletic Bilbao,Athletico Madrid,Celta de Vigo,Cordoba,Eibar,Elche,Espanyol Barcelona,FC Barcelona,Getafe,Granada,Hercules,La Coruna,Las Palmas,Leganes,Levante,Malaga,Mallorca,Osasuna,Real Betis,Real Madrid,Real Sociedad,Santander,Sevilla FC,Sporting Gijon,Tenerife,Valencia,Valladolid,Vallecano,Villarreal,Xerez,Zaragoza,Alaves,Almeria,Athletic Bilbao,Athletico Madrid,Celta de Vigo,Cordoba,Eibar,Elche,Espanyol Barcelona,FC Barcelona,Getafe,Granada,Hercules,La Coruna,Las Palmas,Leganes,Levante,Malaga,Mallorca,Osasuna,Real Betis,Real Madrid,Real Sociedad,Santander,Sevilla FC,Sporting Gijon,Tenerife,Valencia,Valladolid,Vallecano,Villarreal,Xerez,Zaragoza,Alaves,Almeria,Athletic Bilbao,Athletico Madrid,Celta de Vigo,Cordoba,Eibar,Elche,Espanyol Barcelona,FC Barcelona,Getafe,Granada,Hercules,La Coruna,Las Palmas,Leganes,Levante,Malaga,Mallorca,Osasuna,Real Betis,Real Madrid,Real Sociedad,Santander,Sevilla FC,Sporting Gijon,Tenerife,Valencia,Valladolid,Vallecano,Villarreal,Xerez,Zaragoza,Alaves,Almeria,Athletic Bilbao,Athletico Madrid,Celta de Vigo,Cordoba,Eibar,Elche,Espanyol Barcelona,FC Barcelona,Getafe,Granada,Hercules,La Coruna,Las Palmas,Leganes,Levante,Malaga,Mallorca,Osasuna,Real Betis,Real Madrid,Real Sociedad,Santander,Sevilla FC,Sporting Gijon,Tenerife,Valencia,Valladolid,Vallecano,Villarreal,Xerez,Zaragoza,Alaves,Almeria,Athletic Bilbao,Athletico Madrid,Celta de Vigo,Cordoba,Eibar,Elche,Espanyol Barcelona,FC Barcelona,Getafe,Granada,Hercules,La Coruna,Las Palmas,Leganes,Levante,Malaga,Mallorca,Osasuna,Real Betis,Real Madrid,Real Sociedad,Santander,Sevilla FC,Sporting Gijon,Tenerife,Valencia,Valladolid,Vallecano,Villarreal,Xerez,Zaragoza,Alaves,Almeria,Athletic Bilbao,Athletico Madrid,Celta de Vigo,Cordoba,Eibar,Elche,Espanyol Barcelona,FC Barcelona,Getafe,Granada,Hercules,La Coruna,Las Palmas,Leganes,Levante,Malaga,Mallorca,Osasuna,Real Betis,Real Madrid,Real Sociedad,Santander,Sevilla FC,Sporting Gijon,Tenerife,Valencia,Valladolid,Vallecano,Villarreal,Xerez,Zaragoza,Alaves,Almeria,Athletic Bilbao,Athletico Madrid,Celta de Vigo,Cordoba,Eibar,Elche,Espanyol Barcelona,FC Barcelona,Getafe,Granada,Hercules,La Coruna,Las Palmas,Leganes,Levante,Malaga,Mallorca,Osasuna,Real Betis,Real Madrid,Real Sociedad,Santander,Sevilla FC,Sporting Gijon,Tenerife,Valencia,Valladolid,Vallecano,Villarreal,Xerez,Zaragoza,Alaves,Almeria,Athletic Bilbao,Athletico Madrid,Celta de Vigo,Cordoba,Eibar,Elche,Espanyol Barcelona,FC Barcelona,Getafe,Granada,Hercules,La Coruna,Las Palmas,Leganes,Levante,Malaga,Mallorca,Osasuna,Real Betis,Real Madrid,Real Sociedad,Santander,Sevilla FC,Sporting Gijon,Tenerife,Valencia,Valladolid,Vallecano,Villarreal,Xerez,Zaragoza,Alaves,Almeria,Athletic Bilbao,Athletico Madrid,Celta de Vigo,Cordoba,Eibar,Elche,Espanyol Barcelona,FC Barcelona,Getafe,Granada,Hercules,La Coruna,Las Palmas,Leganes,Levante,Malaga,Mallorca,Osasuna,Real Betis,Real Madrid,Real Sociedad,Santander,Sevilla FC,Sporting Gijon,Tenerife,Valencia,Valladolid,Vallecano,Villarreal,Xerez,Zaragoza
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2,Unnamed: 27_level_2,Unnamed: 28_level_2,Unnamed: 29_level_2,Unnamed: 30_level_2,Unnamed: 31_level_2,Unnamed: 32_level_2,Unnamed: 33_level_2,Unnamed: 34_level_2,Unnamed: 35_level_2,Unnamed: 36_level_2,Unnamed: 37_level_2,Unnamed: 38_level_2,Unnamed: 39_level_2,Unnamed: 40_level_2,Unnamed: 41_level_2,Unnamed: 42_level_2,Unnamed: 43_level_2,Unnamed: 44_level_2,Unnamed: 45_level_2,Unnamed: 46_level_2,Unnamed: 47_level_2,Unnamed: 48_level_2,Unnamed: 49_level_2,Unnamed: 50_level_2,Unnamed: 51_level_2,Unnamed: 52_level_2,Unnamed: 53_level_2,Unnamed: 54_level_2,Unnamed: 55_level_2,Unnamed: 56_level_2,Unnamed: 57_level_2,Unnamed: 58_level_2,Unnamed: 59_level_2,Unnamed: 60_level_2,Unnamed: 61_level_2,Unnamed: 62_level_2,Unnamed: 63_level_2,Unnamed: 64_level_2,Unnamed: 65_level_2,Unnamed: 66_level_2,Unnamed: 67_level_2,Unnamed: 68_level_2,Unnamed: 69_level_2,Unnamed: 70_level_2,Unnamed: 71_level_2,Unnamed: 72_level_2,Unnamed: 73_level_2,Unnamed: 74_level_2,Unnamed: 75_level_2,Unnamed: 76_level_2,Unnamed: 77_level_2,Unnamed: 78_level_2,Unnamed: 79_level_2,Unnamed: 80_level_2,Unnamed: 81_level_2,Unnamed: 82_level_2,Unnamed: 83_level_2,Unnamed: 84_level_2,Unnamed: 85_level_2,Unnamed: 86_level_2,Unnamed: 87_level_2,Unnamed: 88_level_2,Unnamed: 89_level_2,Unnamed: 90_level_2,Unnamed: 91_level_2,Unnamed: 92_level_2,Unnamed: 93_level_2,Unnamed: 94_level_2,Unnamed: 95_level_2,Unnamed: 96_level_2,Unnamed: 97_level_2,Unnamed: 98_level_2,Unnamed: 99_level_2,Unnamed: 100_level_2,Unnamed: 101_level_2,Unnamed: 102_level_2,Unnamed: 103_level_2,Unnamed: 104_level_2,Unnamed: 105_level_2,Unnamed: 106_level_2,Unnamed: 107_level_2,Unnamed: 108_level_2,Unnamed: 109_level_2,Unnamed: 110_level_2,Unnamed: 111_level_2,Unnamed: 112_level_2,Unnamed: 113_level_2,Unnamed: 114_level_2,Unnamed: 115_level_2,Unnamed: 116_level_2,Unnamed: 117_level_2,Unnamed: 118_level_2,Unnamed: 119_level_2,Unnamed: 120_level_2,Unnamed: 121_level_2,Unnamed: 122_level_2,Unnamed: 123_level_2,Unnamed: 124_level_2,Unnamed: 125_level_2,Unnamed: 126_level_2,Unnamed: 127_level_2,Unnamed: 128_level_2,Unnamed: 129_level_2,Unnamed: 130_level_2,Unnamed: 131_level_2,Unnamed: 132_level_2,Unnamed: 133_level_2,Unnamed: 134_level_2,Unnamed: 135_level_2,Unnamed: 136_level_2,Unnamed: 137_level_2,Unnamed: 138_level_2,Unnamed: 139_level_2,Unnamed: 140_level_2,Unnamed: 141_level_2,Unnamed: 142_level_2,Unnamed: 143_level_2,Unnamed: 144_level_2,Unnamed: 145_level_2,Unnamed: 146_level_2,Unnamed: 147_level_2,Unnamed: 148_level_2,Unnamed: 149_level_2,Unnamed: 150_level_2,Unnamed: 151_level_2,Unnamed: 152_level_2,Unnamed: 153_level_2,Unnamed: 154_level_2,Unnamed: 155_level_2,Unnamed: 156_level_2,Unnamed: 157_level_2,Unnamed: 158_level_2,Unnamed: 159_level_2,Unnamed: 160_level_2,Unnamed: 161_level_2,Unnamed: 162_level_2,Unnamed: 163_level_2,Unnamed: 164_level_2,Unnamed: 165_level_2,Unnamed: 166_level_2,Unnamed: 167_level_2,Unnamed: 168_level_2,Unnamed: 169_level_2,Unnamed: 170_level_2,Unnamed: 171_level_2,Unnamed: 172_level_2,Unnamed: 173_level_2,Unnamed: 174_level_2,Unnamed: 175_level_2,Unnamed: 176_level_2,Unnamed: 177_level_2,Unnamed: 178_level_2,Unnamed: 179_level_2,Unnamed: 180_level_2,Unnamed: 181_level_2,Unnamed: 182_level_2,Unnamed: 183_level_2,Unnamed: 184_level_2,Unnamed: 185_level_2,Unnamed: 186_level_2,Unnamed: 187_level_2,Unnamed: 188_level_2,Unnamed: 189_level_2,Unnamed: 190_level_2,Unnamed: 191_level_2,Unnamed: 192_level_2,Unnamed: 193_level_2,Unnamed: 194_level_2,Unnamed: 195_level_2,Unnamed: 196_level_2,Unnamed: 197_level_2,Unnamed: 198_level_2,Unnamed: 199_level_2,Unnamed: 200_level_2,Unnamed: 201_level_2,Unnamed: 202_level_2,Unnamed: 203_level_2,Unnamed: 204_level_2,Unnamed: 205_level_2,Unnamed: 206_level_2,Unnamed: 207_level_2,Unnamed: 208_level_2,Unnamed: 209_level_2,Unnamed: 210_level_2,Unnamed: 211_level_2,Unnamed: 212_level_2,Unnamed: 213_level_2,Unnamed: 214_level_2,Unnamed: 215_level_2,Unnamed: 216_level_2,Unnamed: 217_level_2,Unnamed: 218_level_2,Unnamed: 219_level_2,Unnamed: 220_level_2,Unnamed: 221_level_2,Unnamed: 222_level_2,Unnamed: 223_level_2,Unnamed: 224_level_2,Unnamed: 225_level_2,Unnamed: 226_level_2,Unnamed: 227_level_2,Unnamed: 228_level_2,Unnamed: 229_level_2,Unnamed: 230_level_2,Unnamed: 231_level_2,Unnamed: 232_level_2,Unnamed: 233_level_2,Unnamed: 234_level_2,Unnamed: 235_level_2,Unnamed: 236_level_2,Unnamed: 237_level_2,Unnamed: 238_level_2,Unnamed: 239_level_2,Unnamed: 240_level_2,Unnamed: 241_level_2,Unnamed: 242_level_2,Unnamed: 243_level_2,Unnamed: 244_level_2,Unnamed: 245_level_2,Unnamed: 246_level_2,Unnamed: 247_level_2,Unnamed: 248_level_2,Unnamed: 249_level_2,Unnamed: 250_level_2,Unnamed: 251_level_2,Unnamed: 252_level_2,Unnamed: 253_level_2,Unnamed: 254_level_2,Unnamed: 255_level_2,Unnamed: 256_level_2,Unnamed: 257_level_2,Unnamed: 258_level_2,Unnamed: 259_level_2,Unnamed: 260_level_2,Unnamed: 261_level_2,Unnamed: 262_level_2,Unnamed: 263_level_2,Unnamed: 264_level_2,Unnamed: 265_level_2,Unnamed: 266_level_2,Unnamed: 267_level_2,Unnamed: 268_level_2,Unnamed: 269_level_2,Unnamed: 270_level_2,Unnamed: 271_level_2,Unnamed: 272_level_2,Unnamed: 273_level_2,Unnamed: 274_level_2,Unnamed: 275_level_2,Unnamed: 276_level_2,Unnamed: 277_level_2,Unnamed: 278_level_2,Unnamed: 279_level_2,Unnamed: 280_level_2,Unnamed: 281_level_2,Unnamed: 282_level_2,Unnamed: 283_level_2,Unnamed: 284_level_2,Unnamed: 285_level_2,Unnamed: 286_level_2,Unnamed: 287_level_2,Unnamed: 288_level_2,Unnamed: 289_level_2,Unnamed: 290_level_2,Unnamed: 291_level_2,Unnamed: 292_level_2,Unnamed: 293_level_2,Unnamed: 294_level_2,Unnamed: 295_level_2,Unnamed: 296_level_2,Unnamed: 297_level_2
2009-01-11,,-1.0,,,,,,,0.0,,,,,0.0,,,,-1.0,1.0,,,,,-1.0,,0.0,-5.0,1.0,0.0,,5.0,,1.0,,SPA1,,,,,,,SPA1,,,,,SPA1,,,,SPA1,SPA1,,,,,SPA1,,SPA1,SPA1,SPA1,SPA1,,SPA1,,SPA1,,09-10,,,,,,,09-10,,,,,09-10,,,,09-10,09-10,,,,,09-10,,09-10,09-10,09-10,09-10,,09-10,,09-10,,L,,,,,,,D,,,,,D,,,,L,W,,,,,L,,D,L,W,D,,W,,W,,1.0,,,,,,,1.0,,,,,1.0,,,,0.0,1.0,,,,,0.0,,1.0,0.0,1.0,1.0,,5.0,,2.0,,2.0,,,,,,,1.0,,,,,1.0,,,,1.0,0.0,,,,,1.0,,1.0,5.0,0.0,1.0,,0.0,,1.0,,0.0,,,,,,,1.0,,,,,1.0,,,,1.0,1.0,,,,,0.0,,0.0,0.0,0.0,0.0,,1.0,,1.0,,Zaragoza,,,,,,,Valladolid,,,,,Sporting Gijon,,,,Valencia,Santander,,,,,Mallorca,,La Coruna,Villarreal,Malaga,Espanyol Barcelona,,Tenerife,,Almeria,,2009.0,,,,,,,2009.0,,,,,2009.0,,,,2009.0,2009.0,,,,,2009.0,,2009.0,2009.0,2009.0,2009.0,,2009.0,,2009.0
2009-02-12,,,,,,,,,,2.0,,,,,,,,,,,,,,,,,,,,,,-2.0,,,,,,,,,,,SPA1,,,,,,,,,,,,,,,,,,,,,,SPA1,,,,,,,,,,,09-10,,,,,,,,,,,,,,,,,,,,,,09-10,,,,,,,,,,,W,,,,,,,,,,,,,,,,,,,,,,L,,,,,,,,,,,2.0,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,2.0,,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,1.0,,,,,,,,,,,Xerez,,,,,,,,,,,,,,,,,,,,,,FC Barcelona,,,,,,,,,,,2009.0,,,,,,,,,,,,,,,,,,,,,,2009.0,
2009-03-10,,-1.0,,1.0,,,,,,1.0,,,,1.0,,,,,,,,,,,,,-1.0,,,,,,-1.0,,SPA1,,SPA1,,,,,,SPA1,,,,SPA1,,,,,,,,,,,,,SPA1,,,,,,SPA1,,09-10,,09-10,,,,,,09-10,,,,09-10,,,,,,,,,,,,,09-10,,,,,,09-10,,L,,W,,,,,,W,,,,W,,,,,,,,,,,,,L,,,,,,L,,0.0,,2.0,,,,,,1.0,,,,1.0,,,,,,,,,,,,,0.0,,,,,,1.0,,1.0,,1.0,,,,,,0.0,,,,0.0,,,,,,,,,,,,,1.0,,,,,,2.0,,0.0,,1.0,,,,,,1.0,,,,0.0,,,,,,,,,,,,,1.0,,,,,,0.0,,FC Barcelona,,Zaragoza,,,,,,Almeria,,,,Tenerife,,,,,,,,,,,,,La Coruna,,,,,,Athletico Madrid,,2009.0,,2009.0,,,,,,2009.0,,,,2009.0,,,,,,,,,,,,,2009.0,,,,,,2009.0
2009-04-10,,,0.0,,,,,,0.0,,1.0,,,,,,,0.0,-3.0,-1.0,,-1.0,,-1.0,1.0,3.0,,1.0,0.0,,0.0,0.0,,,,SPA1,,,,,,SPA1,,SPA1,,,,,,,SPA1,SPA1,SPA1,,SPA1,,SPA1,SPA1,SPA1,,SPA1,SPA1,,SPA1,SPA1,,,,09-10,,,,,,09-10,,09-10,,,,,,,09-10,09-10,09-10,,09-10,,09-10,09-10,09-10,,09-10,09-10,,09-10,09-10,,,,D,,,,,,D,,W,,,,,,,D,L,L,,L,,L,W,W,,W,D,,D,D,,,,2.0,,,,,,0.0,,2.0,,,,,,,1.0,1.0,1.0,,1.0,,0.0,2.0,4.0,,1.0,2.0,,0.0,1.0,,,,2.0,,,,,,0.0,,1.0,,,,,,,1.0,4.0,2.0,,2.0,,1.0,1.0,1.0,,0.0,2.0,,0.0,1.0,,,,0.0,,,,,,0.0,,1.0,,,,,,,0.0,0.0,0.0,,0.0,,1.0,1.0,1.0,,0.0,1.0,,1.0,1.0,,,,Valladolid,,,,,,Villarreal,,Osasuna,,,,,,,Xerez,Sporting Gijon,Getafe,,Sevilla FC,,Valencia,Real Madrid,Mallorca,,Santander,Athletic Bilbao,,Espanyol Barcelona,Malaga,,,,2009.0,,,,,,2009.0,,2009.0,,,,,,,2009.0,2009.0,2009.0,,2009.0,,2009.0,2009.0,2009.0,,2009.0,2009.0,,2009.0,2009.0,
2009-05-12,,-2.0,,2.0,,,,,,2.0,,,,-2.0,,,,,,,,2.0,,,0.0,,,,0.0,,,-2.0,,,SPA1,,SPA1,,,,,,SPA1,,,,SPA1,,,,,,,,SPA1,,,SPA1,,,,SPA1,,,SPA1,,,09-10,,09-10,,,,,,09-10,,,,09-10,,,,,,,,09-10,,,09-10,,,,09-10,,,09-10,,,L,,W,,,,,,W,,,,L,,,,,,,,W,,,D,,,,D,,,L,,,2.0,,2.0,,,,,,3.0,,,,1.0,,,,,,,,4.0,,,1.0,,,,1.0,,,0.0,,,4.0,,0.0,,,,,,1.0,,,,3.0,,,,,,,,2.0,,,1.0,,,,1.0,,,2.0,,,0.0,,0.0,,,,,,0.0,,,,1.0,,,,,,,,1.0,,,1.0,,,,0.0,,,1.0,,,Real Madrid,,Xerez,,,,,,La Coruna,,,,FC Barcelona,,,,,,,,Almeria,,,Valladolid,,,,Sevilla FC,,,Athletico Madrid,,,2009.0,,2009.0,,,,,,2009.0,,,,2009.0,,,,,,,,2009.0,,,2009.0,,,,2009.0,,,2009.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-10-04,,,,,,,,,,,,,,,,,,,,,,,2.0,,,-2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,SPA1,,,SPA1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,16-17,,,16-17,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,W,,,L,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3.0,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,3.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Sporting Gijon,,,Real Sociedad,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2016.0,,,2016.0,,,,,,,
2017-11-02,-6.0,,1.0,,,,,,,6.0,,,,-1.0,,,,,,-2.0,0.0,2.0,,,,,,0.0,,,,,,SPA1,,SPA1,,,,,,,SPA1,,,,SPA1,,,,,,SPA1,SPA1,SPA1,,,,,,SPA1,,,,,,16-17,,16-17,,,,,,,16-17,,,,16-17,,,,,,16-17,16-17,16-17,,,,,,16-17,,,,,,L,,W,,,,,,,W,,,,L,,,,,,L,D,W,,,,,,D,,,,,,0.0,,2.0,,,,,,,6.0,,,,1.0,,,,,,1.0,0.0,3.0,,,,,,0.0,,,,,,6.0,,1.0,,,,,,,0.0,,,,2.0,,,,,,3.0,0.0,1.0,,,,,,0.0,,,,,,1.0,,1.0,,,,,,,0.0,,,,0.0,,,,,,1.0,1.0,0.0,,,,,,0.0,,,,,,FC Barcelona,,La Coruna,,,,,,,Alaves,,,,Athletic Bilbao,,,,,,Real Madrid,Valencia,Osasuna,,,,,,Real Betis,,,,,,2016.0,,2016.0,,,,,,,2016.0,,,,2016.0,,,,,,2016.0,2016.0,2016.0,,,,,,2016.0,,,,,
2017-11-03,1.0,,,1.0,,,,,,,,-1.0,,,,0.0,,-1.0,,,,,,,0.0,0.0,,0.0,,,,,,SPA1,,,SPA1,,,,,,,,SPA1,,,,SPA1,,SPA1,,,,,,,SPA1,SPA1,,SPA1,,,,,,16-17,,,16-17,,,,,,,,16-17,,,,16-17,,16-17,,,,,,,16-17,16-17,,16-17,,,,,,W,,,W,,,,,,,,L,,,,D,,L,,,,,,,D,D,,D,,,,,,2.0,,,1.0,,,,,,,,0.0,,,,1.0,,1.0,,,,,,,1.0,1.0,,1.0,,,,,,1.0,,,0.0,,,,,,,,1.0,,,,1.0,,2.0,,,,,,,1.0,1.0,,1.0,,,,,,0.0,,,0.0,,,,,,,,1.0,,,,0.0,,1.0,,,,,,,1.0,0.0,,1.0,,,,,,Malaga,,,Granada,,,,,,,,Athletico Madrid,,,,Sevilla FC,,Alaves,,,,,,,Leganes,Valencia,,Sporting Gijon,,,,,,2016.0,,,2016.0,,,,,,,,2016.0,,,,2016.0,,2016.0,,,,,,,2016.0,2016.0,,2016.0,,,,,
2017-12-02,,,,1.0,-1.0,,,,,,,,,,-1.0,-2.0,,0.0,,,,,,,1.0,2.0,,,,,0.0,,,,,,SPA1,SPA1,,,,,,,,,,SPA1,SPA1,,SPA1,,,,,,,SPA1,SPA1,,,,,SPA1,,,,,,16-17,16-17,,,,,,,,,,16-17,16-17,,16-17,,,,,,,16-17,16-17,,,,,16-17,,,,,,W,L,,,,,,,,,,L,L,,D,,,,,,,W,W,,,,,D,,,,,,3.0,2.0,,,,,,,,,,0.0,0.0,,1.0,,,,,,,1.0,2.0,,,,,1.0,,,,,,2.0,3.0,,,,,,,,,,1.0,2.0,,1.0,,,,,,,0.0,0.0,,,,,1.0,,,,,,1.0,0.0,,,,,,,,,,1.0,1.0,,0.0,,,,,,,0.0,0.0,,,,,1.0,,,,,,Celta de Vigo,Athletico Madrid,,,,,,,,,,Sevilla FC,Sporting Gijon,,Villarreal,,,,,,,Las Palmas,Leganes,,,,,Malaga,,,,,,2016.0,2016.0,,,,,,,,,,2016.0,2016.0,,2016.0,,,,,,,2016.0,2016.0,,,,,2016.0,,


In [23]:
# `nid` is short for "node id"
nid = "df_data_source"
df_data_source = dtfcore.DfDataSource(nid, preprocessed_df)

In [24]:
df_out_fit = df_data_source.fit()["df_out"]
_LOG.debug(hpandas.df_to_str(df_out_fit))

Unnamed: 0,Sea,Lge,HT,AT,HS,AS,GD,WDL,season
2009-01-31 00:00:00,09-10,ECU1,Manta FC,Barcelona SC,0.5,0.5,0,D,2009
2009-01-31 00:00:00,09-10,CHL1,Curico Unido,Colo Colo,2.0,2.0,0,D,2009
2009-01-31 00:00:00,09-10,CHL1,Everton CD,Union Espanola,0.5,1.0,-1,L,2009
,...,...,...,...,...,...,...,...,...
2017-06-27 00:00:00,16-17,ARG1,Estudiantes,Quilmes AC,1.0,0.5,1,W,2016
2017-06-28 00:00:00,16-17,ARG1,Talleres Cordoba,San Lorenzo,1.0,1.0,0,D,2016
2017-06-28 00:00:00,16-17,ARG1,Independiente,CA Lanus,1.0,1.0,0,D,2016


DEBUG None


In [None]:
"""
  is_home              GS
Manta FC

"""

In [25]:
def unravel_df(df: pd.DataFrame()) -> pd.DataFrame():
    """    
    Unravel the dataset by creating two entries for each row as team-opponent
    pair.
    
    :param df: Input dataframe.
    :return: unraveled dataframe.
    """
    # Create entry for home team `HT`.
    home_df = df[["Sea", "Lge", "HT", "AT", "HS"]].copy()
    home_df.rename(
                columns={"HT": "team", "AT": "opponent", "HS": "goals"}, inplace=True
                  )
    home_df["is_home"] = 1
    # Create entry for away team `AT`.
    away_df = df[["Sea", "Lge", "HT", "AT", "AS"]].copy()
    away_df.rename(
                columns={"AT": "team", "HT": "opponent", "AS": "goals"}, inplace=True
                  )
    away_df["is_home"] = 0
    # Concatenate the two splits.
    unraveled_df = pd.concat([home_df, away_df], ignore_index=True)
    # return the unraveled dataframe.
    return unraveled_df

In [40]:
# Prepare a DataFlow transformer node to compute percentage returns
#  and only return percentage returns.
nid = "unravel_dataset"
unravel_node_v2 = dtfcore.GroupedColDfToDfTransformer(
    nid,
    transformer_func=unravel_df,
    in_col_groups=[('Sea', 'Lge', 'HT', 'AT', 'HS', 'AS', 'GD', 'WDL', 'season')],
    out_col_group=(),
)

In [41]:
unravel_node_v2.fit(preprocessed_df)["df_out"]

AssertionError: 
################################################################################
* Failed assertion *
'('Sea', 'Lge', 'HT', 'AT', 'HS', 'AS', 'GD', 'WDL', 'season')' in 'Index(['Sea', 'Lge', 'HT', 'AT', 'HS', 'AS', 'GD', 'WDL', 'season'], dtype='object')'
################################################################################
