In [2]:
import pandas as pd
import numpy as np
from math import sqrt
import os
import pysubgroup as ps

In [3]:
PATH: str = "../data/spadl_format/"
LEAGUES: list[str] = ["England", "Spain", "France", "Italy", "Germany"]

teams_df = pd.read_json("../data/wyscout/teams/teams.json")
players_df = pd.read_json("../data/wyscout/players/players.json")
ranking_df = pd.read_json('../data/wyscout/playeranks/playeranks.json')

df_dict = {}
if not os.path.exists("../data/processed/"):
    os.mkdir("../data/processed")
for league in LEAGUES:

    df = pd.read_csv(f"{PATH}{league}.csv", index_col=0)

    # remove not used columns
    df.drop(["original_event_id", "result_name","bodypart_id", "type_id"], inplace=True, axis=1)

    df_dict[league] = df
all_df = pd.concat([df for df in df_dict.values()])
all_df

Unnamed: 0,game_id,period_id,time_seconds,team_id,player_id,start_x,start_y,end_x,end_y,result_id,action_id,type_name,bodypart_name,player_name
0,2499719,1,2.758649,1609,25413,51.45,34.68,32.55,14.96,1,0,pass,foot,A. Lacazette
1,2499719,1,4.946850,1609,370224,32.55,14.96,53.55,17.00,1,1,pass,foot,R. Holding
2,2499719,1,6.542188,1609,3319,53.55,17.00,36.75,19.72,1,2,pass,head,M. Özil
3,2499719,1,8.143395,1609,120339,36.75,19.72,43.05,3.40,1,3,pass,head,Mohamed Elneny
4,2499719,1,10.302366,1609,167145,43.05,3.40,75.60,8.16,1,4,pass,foot,Bellerín
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
389146,2517044,2,2817.761761,2463,94831,87.15,36.72,87.15,36.72,0,1148,shot,foot_right,Y. Ōsako
389147,2517044,2,2818.280436,2451,14917,17.85,31.96,23.10,27.20,1,1149,interception,foot,R. Knoche
389148,2517044,2,2823.180681,2451,14804,23.10,27.20,0.00,3.40,1,1150,dribble,foot,J. Błaszczykowski
389149,2517044,2,2828.080925,2451,14804,0.00,3.40,5.25,4.76,0,1151,pass,foot,J. Błaszczykowski


In [4]:
print(all_df["type_name"].unique())

['pass' 'interception' 'dribble' 'take_on' 'tackle' 'foul'
 'freekick_short' 'cross' 'shot' 'clearance' 'throw_in' 'goalkick'
 'corner_short' 'corner_crossed' 'keeper_save' 'freekick_crossed'
 'shot_freekick' 'bad_touch' 'shot_penalty']


## Pre-process

Falta decidir se posições serão float ou discretizadas. 

In [5]:
GOAL_CENTER_X: int = 105
GOAL_CENTER_Y: int = 34

UPPER_CROSSBAR_X: int = 105
UPPER_CROSSBAR_Y: int = 38

LOWER_CROSSBAR_X: int = 105
LOWER_CROSSBAR_Y: int = 30


def get_shot_angle(shot_pos_x, shot_pos_y):
    v1 = np.array([UPPER_CROSSBAR_X - shot_pos_x, UPPER_CROSSBAR_Y - shot_pos_y])
    v2 = np.array([LOWER_CROSSBAR_X - shot_pos_x, LOWER_CROSSBAR_Y - shot_pos_y])
    return np.arccos(np.dot(v1 / np.linalg.norm(v1), v2 / np.linalg.norm(v2)))

def calcular_media_global():
    # Filtrar jogadores com dados válidos
    jogadores_validos = ranking_df[ranking_df['playerankScore'].notna()]
    media_global = np.sum(jogadores_validos['playerankScore'] * jogadores_validos['minutesPlayed']) / jogadores_validos['minutesPlayed'].sum()
    return media_global

media_rank_global = calcular_media_global()

def calcular_ranking_medio(player_id):
    jogador_rankings = ranking_df[ranking_df['playerId'] == player_id]
    if jogador_rankings.empty:
        return media_rank_global
    ranking_ponderado = np.sum(jogador_rankings['playerankScore'] * jogador_rankings['minutesPlayed']) / jogador_rankings['minutesPlayed'].sum()
    return ranking_ponderado

def calculate_distance(x1, y1, x2, y2):
    return sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2)

def discretize_start_x(x):
    if x < 26.25:
        return 'defensive'
    elif x < 52.5:
        return 'pre-defensive'
    elif x < 78.75:
        return 'pre-attacking'
    else:
        return 'attacking'

def discretize_start_y(y):
    if y < 22.67:
        return 'left'
    elif y < 45.33:
        return 'center'
    else:
        return 'right'

def generate_shots_with_counts_events(df):
    shot_data = []
    result_ids = []
    grouped = df.groupby(['game_id', 'period_id'])
    
    for (game_id, period_id), group in grouped:
        group = group.sort_values(by='time_seconds').reset_index(drop=True)
        start_index = 0

        while start_index < len(group):
            shot_index = group[start_index:].index[group['type_name'][start_index:] == 'shot']
            if len(shot_index) == 0:
                break
            shot_index = shot_index[0]

            shot_row = group.loc[shot_index]
            play_events = group.loc[start_index:shot_index]
            
            for idx in play_events.index[::-1]:
                if play_events.loc[idx, 'team_id'] != shot_row['team_id']:
                    start_index = idx + 1
                    break
            else:
                start_index = play_events.index[0]

            play_events = group.loc[start_index:shot_index]

            player_rank = calcular_ranking_medio(shot_row['player_id'])
            
            play_distance = 0
            play_distance_towards_goal = 0
            play_distances_to_goal = []

            for i in range(1, len(play_events)):
                x1, y1 = play_events.iloc[i - 1][['start_x', 'start_y']]
                x2, y2 = play_events.iloc[i][['start_x', 'start_y']]
                dist = calculate_distance(x1, y1, x2, y2)
                play_distance += dist
                play_distance_towards_goal += x2 - x1
                play_distances_to_goal.append(sqrt((x2 - GOAL_CENTER_X) ** 2 + (y2 - GOAL_CENTER_Y) ** 2))

            if len(play_events) > 1:
                play_mean_distance_to_the_goal = np.mean(play_distances_to_goal)
                play_std_distance_to_the_goal = np.std(play_distances_to_goal)
            else:
                play_mean_distance_to_the_goal = 0
                play_std_distance_to_the_goal = 0

            play_duration = play_events['time_seconds'].iloc[-1] - play_events['time_seconds'].iloc[0]
            ratio_distance = play_distance_towards_goal / play_distance if play_distance != 0 else 0
            total_time_per_play = play_duration / len(play_events) if len(play_events) != 0 else 0
            play_speed = play_distance / play_duration if play_duration != 0 else 0
            play_speed_towards_goal = play_distance_towards_goal / play_duration if play_duration != 0 else 0

            shot_data.append({
                'game_id': game_id,
                'period_id': period_id,
                'team_id': shot_row['team_id'],
                'player_id': shot_row['player_id'],
                'time_seconds': shot_row['time_seconds'],
                'start_x': shot_row['start_x'],
                'start_y': shot_row['start_y'],
                'disc_start_x': discretize_start_x(shot_row['start_x']),
                'disc_start_y': discretize_start_y(shot_row['start_y']),
                'num_events': len(play_events),
                'num_passes': (play_events['type_name'] == 'pass').sum(),
                'num_dribbles': (play_events['type_name'] == 'dribble').sum(),
                'play_duration': play_duration,
                'player_rank': player_rank,
                'bodypart_name': shot_row['bodypart_name'],
                'play_distance': play_distance,
                'play_mean_distance_to_the_goal': play_mean_distance_to_the_goal,
                'play_std_distance_to_the_goal': play_std_distance_to_the_goal,
                'play_distance_towards_goal': play_distance_towards_goal,
                'ratio_distance': ratio_distance,
                'total_time_per_play': total_time_per_play,
                'play_speed': play_speed,
                'play_speed_towards_goal': play_speed_towards_goal,
            })

            result_ids.append(shot_row['result_id'])
            start_index = shot_index + 1

    shots_df = pd.DataFrame(shot_data)
    shots_df["shot_distance_from_goal"] = shots_df.apply(lambda x: sqrt((x["start_x"] - GOAL_CENTER_X)**2 + (x["start_y"] - GOAL_CENTER_Y)**2), axis=1)
    shots_df["shot_angle_from_goal"] = shots_df[["start_x", "start_y"]].apply(lambda pos: get_shot_angle(pos["start_x"], pos["start_y"]), axis=1)
    shots_df["result_id"] = result_ids
    return shots_df

shots_df = generate_shots_with_counts_events(all_df)

In [6]:
shots_df.head()

Unnamed: 0,game_id,period_id,team_id,player_id,time_seconds,start_x,start_y,disc_start_x,disc_start_y,num_events,...,play_mean_distance_to_the_goal,play_std_distance_to_the_goal,play_distance_towards_goal,ratio_distance,total_time_per_play,play_speed,play_speed_towards_goal,shot_distance_from_goal,shot_angle_from_goal,result_id
0,2499719,1,1609,25413,94.595788,92.4,40.12,attacking,center,7,...,30.932864,8.413575,24.15,0.15487,2.83695,7.852377,1.216095,14.007655,0.509981,1
1,2499719,1,1631,26150,179.854785,89.25,32.64,attacking,center,2,...,15.808608,0.0,-9.45,-0.25819,2.273328,8.050087,-2.07845,15.808608,0.494098,0
2,2499719,1,1631,14763,254.745027,100.8,32.64,attacking,center,7,...,21.08692,12.262251,4.2,0.034728,2.705084,6.386817,0.221805,4.414703,1.46731,1
3,2499719,1,1609,7868,425.824035,85.05,45.56,attacking,right,6,...,57.591081,22.443334,75.6,0.783021,2.161246,7.44548,5.82997,23.057235,0.300168,0
4,2499719,1,1609,7868,815.462015,78.75,47.6,attacking,right,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,29.563872,0.24003,0


## Binary Goal/Not Goal

In [32]:
shots_df_cp = shots_df.copy()
shots_df_cp.drop(["game_id", "period_id", "team_id", "player_id", "time_seconds"], inplace=True, axis=1)

# Definir o alvo (target) da descoberta de subgrupos
# Aqui, vamos assumir que queremos encontrar subgrupos de chutes bem-sucedidos (result_name == 'Goal')
target = ps.BinaryTarget('result_id', 1)

search_space = ps.create_selectors(shots_df_cp, ignore=['result_id'])

# Configurar a tarefa de descoberta de subgrupos
task = ps.SubgroupDiscoveryTask(shots_df_cp, target, search_space, result_set_size=10, depth=3, qf=ps.WRAccQF())

# Configurar e executar o Beam Search
search_algorithm = ps.BeamSearch()
result = search_algorithm.execute(task)

# Exibir os resultados
for sg_result in result.to_dataframe().head(10).itertuples(index=False):
    print(f"Quality: {sg_result.quality}")
    print(f"Subgroup: {sg_result.subgroup}")
    print(f"Size of Subgroup: {sg_result.size_sg}")
    # print(f"Size of Dataset: {sg_result.size_dataset}")
    # print(f"Positives in Subgroup: {sg_result.positives_sg}")
    # print(f"Positives in Dataset: {sg_result.positives_dataset}")
    # print(f"Size of Complement: {sg_result.size_complement}")
    # print(f"Relative Size of Subgroup: {sg_result.relative_size_sg}")
    # print(f"Relative Size of Complement: {sg_result.relative_size_complement}")
    # print(f"Coverage of Subgroup: {sg_result.coverage_sg}")
    # print(f"Coverage of Complement: {sg_result.coverage_complement}")
    # print(f"Target Share in Subgroup: {sg_result.target_share_sg}")
    # print(f"Target Share in Complement: {sg_result.target_share_complement}")
    # print(f"Target Share in Dataset: {sg_result.target_share_dataset}")
    # print(f"Lift: {sg_result.lift}")
    print("-" * 40)

result.to_dataframe()

Quality: 0.031155481651607888
Subgroup: shot_angle_from_goal>=0.60
Size of Subgroup: 8189
----------------------------------------
Quality: 0.031155481651607888
Subgroup: disc_start_y=='center' AND shot_angle_from_goal>=0.60
Size of Subgroup: 8189
----------------------------------------
Quality: 0.031155481651607888
Subgroup: disc_start_x=='attacking' AND shot_angle_from_goal>=0.60
Size of Subgroup: 8189
----------------------------------------
Quality: 0.031155481651607888
Subgroup: disc_start_x=='attacking' AND disc_start_y=='center' AND shot_angle_from_goal>=0.60
Size of Subgroup: 8189
----------------------------------------
Quality: 0.030839137893783095
Subgroup: shot_distance_from_goal<11.26
Size of Subgroup: 8045
----------------------------------------
Quality: 0.030839137893783095
Subgroup: disc_start_y=='center' AND shot_distance_from_goal<11.26
Size of Subgroup: 8045
----------------------------------------
Quality: 0.030839137893783095
Subgroup: disc_start_x=='attacking' A

Unnamed: 0,quality,subgroup,size_sg,size_dataset,positives_sg,positives_dataset,size_complement,relative_size_sg,relative_size_complement,coverage_sg,coverage_complement,target_share_sg,target_share_complement,target_share_dataset,lift
0,0.031155,shot_angle_from_goal>=0.60,8189,40461,2125,4271,32272,0.202392,0.797608,0.497542,0.502458,0.259494,0.066497,0.105558,2.458301
1,0.031155,disc_start_y=='center' AND shot_angle_from_goa...,8189,40461,2125,4271,32272,0.202392,0.797608,0.497542,0.502458,0.259494,0.066497,0.105558,2.458301
2,0.031155,disc_start_x=='attacking' AND shot_angle_from_...,8189,40461,2125,4271,32272,0.202392,0.797608,0.497542,0.502458,0.259494,0.066497,0.105558,2.458301
3,0.031155,disc_start_x=='attacking' AND disc_start_y=='c...,8189,40461,2125,4271,32272,0.202392,0.797608,0.497542,0.502458,0.259494,0.066497,0.105558,2.458301
4,0.030839,shot_distance_from_goal<11.26,8045,40461,2097,4271,32416,0.198833,0.801167,0.490986,0.509014,0.260659,0.067066,0.105558,2.469332
5,0.030839,disc_start_y=='center' AND shot_distance_from_...,8045,40461,2097,4271,32416,0.198833,0.801167,0.490986,0.509014,0.260659,0.067066,0.105558,2.469332
6,0.030839,disc_start_x=='attacking' AND shot_distance_fr...,8045,40461,2097,4271,32416,0.198833,0.801167,0.490986,0.509014,0.260659,0.067066,0.105558,2.469332
7,0.030839,disc_start_x=='attacking' AND disc_start_y=='c...,8045,40461,2097,4271,32416,0.198833,0.801167,0.490986,0.509014,0.260659,0.067066,0.105558,2.469332
8,0.02972,shot_angle_from_goal>=0.60 AND shot_distance_f...,6797,40461,1920,4271,33664,0.167989,0.832011,0.449543,0.550457,0.282478,0.069837,0.105558,2.67603
9,0.02972,disc_start_y=='center' AND shot_angle_from_goa...,6797,40461,1920,4271,33664,0.167989,0.832011,0.449543,0.550457,0.282478,0.069837,0.105558,2.67603


In [8]:
import pandas as pd

# Supondo que `result_df` é o DataFrame contendo os resultados dos subgrupos
# E que `shots_df_cp` é o dataframe original

def get_covered_indices(subgroup, df):
    """
    Retorna os índices dos registros no dataframe que são cobertos pelo subgrupo.
    """
    condition = subgroup.covers(df)
    return df[condition].index

# Lista para armazenar todos os índices cobertos
covered_indices = set()

# Iterar sobre os subgrupos e adicionar os índices cobertos ao conjunto
for sg_result in result.to_dataframe().itertuples():
    subgroup = sg_result.subgroup  # Ajuste conforme a coluna correta
    indices = get_covered_indices(subgroup, shots_df_cp)
    covered_indices.update(indices)
    print(len(covered_indices) / len(shots_df_cp))

# Cobertura total (proporção de registros cobertos)
total_coverage = len(covered_indices) / len(shots_df_cp)

print(f"Total Coverage: {total_coverage:.4f}")


0.20239242727564816
0.20239242727564816
0.20239242727564816
0.20239242727564816
0.23323694421788882
0.23323694421788882
0.23323694421788882
0.23323694421788882
0.23323694421788882
0.23323694421788882
Total Coverage: 0.2332


## XG

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from xgboost import XGBClassifier

In [10]:
# constans
RANDOM_STATE: int = 123
TEST_SIZE: float  = 0.3

shots_df_cp = shots_df.copy()


# Random Forest Classifier
rfc = RandomForestClassifier(random_state=RANDOM_STATE)
X = shots_df_cp[["bodypart_name", "shot_distance_from_goal", "shot_angle_from_goal"]]
X["bodypart_name"] = X["bodypart_name"].apply(lambda val: 0 if val == "foot_right" else 1 if val == "foot_left" else 2)
y = shots_df_cp["result_id"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE)
rfc.fit(X=X_train, y=y_train)
y_pred = rfc.predict(X=X_test)
classification_report(y_test, y_pred, output_dict=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["bodypart_name"] = X["bodypart_name"].apply(lambda val: 0 if val == "foot_right" else 1 if val == "foot_left" else 2)


{'0': {'precision': 0.9070434415858287,
  'recall': 0.9876010286554004,
  'f1-score': 0.9456096381304138,
  'support': 10888.0},
 '1': {'precision': 0.5246478873239436,
  'recall': 0.11910471622701839,
  'f1-score': 0.19413680781758957,
  'support': 1251.0},
 'accuracy': 0.8980970425899992,
 'macro avg': {'precision': 0.7158456644548862,
  'recall': 0.5533528724412095,
  'f1-score': 0.5698732229740017,
  'support': 12139.0},
 'weighted avg': {'precision': 0.8676351840372977,
  'recall': 0.8980970425899992,
  'f1-score': 0.8681656550410867,
  'support': 12139.0}}

In [11]:
shots_df_cp["xg"] = rfc.predict(X=X)


In [12]:
print("Comparação 0/1: ", shots_df_cp[shots_df_cp["xg"]==0].shape[0], " VS " , shots_df_cp[shots_df_cp["xg"]==1].shape[0])


Comparação 0/1:  39526  VS  935


In [13]:
shots_df_cp.drop(["game_id", "period_id", "team_id", "player_id", "time_seconds", "result_id"], inplace=True, axis=1)

# Definir o alvo (target) da descoberta de subgrupos
# Aqui, vamos assumir que queremos encontrar subgrupos de chutes bem-sucedidos (result_name == 'Goal')
target = ps.BinaryTarget('xg', 1)

search_space = ps.create_selectors(shots_df_cp, ignore=['xg'])

# Configurar a tarefa de descoberta de subgrupos
task = ps.SubgroupDiscoveryTask(shots_df_cp, target, search_space, result_set_size=10, depth=2, qf=ps.WRAccQF())

# Configurar e executar o Beam Search
search_algorithm = ps.BeamSearch()
result = search_algorithm.execute(task)

# Exibir os resultados
for sg_result in result.to_dataframe().head(10).itertuples(index=False):
    print(f"Quality: {sg_result.quality}")
    print(f"Subgroup: {sg_result.subgroup}")
    # print(f"Size of Subgroup: {sg_result.size_sg}")
    # print(f"Size of Dataset: {sg_result.size_dataset}")
    # print(f"Positives in Subgroup: {sg_result.positives_sg}")
    # print(f"Positives in Dataset: {sg_result.positives_dataset}")
    # print(f"Size of Complement: {sg_result.size_complement}")
    # print(f"Relative Size of Subgroup: {sg_result.relative_size_sg}")
    # print(f"Relative Size of Complement: {sg_result.relative_size_complement}")
    # print(f"Coverage of Subgroup: {sg_result.coverage_sg}")
    # print(f"Coverage of Complement: {sg_result.coverage_complement}")
    # print(f"Target Share in Subgroup: {sg_result.target_share_sg}")
    # print(f"Target Share in Complement: {sg_result.target_share_complement}")
    # print(f"Target Share in Dataset: {sg_result.target_share_dataset}")
    # print(f"Lift: {sg_result.lift}")
    print("-" * 40)

result.to_dataframe()

Quality: 0.018716662379767844
Subgroup: shot_angle_from_goal>=0.60 AND start_x>=96.60
----------------------------------------
Quality: 0.018398288803861684
Subgroup: shot_distance_from_goal<11.26 AND start_x>=96.60
----------------------------------------
Quality: 0.018238065116663007
Subgroup: shot_angle_from_goal>=0.60 AND shot_distance_from_goal<11.26
----------------------------------------
Quality: 0.017915680160877117
Subgroup: disc_start_y=='center' AND start_x>=96.60
----------------------------------------
Quality: 0.017846586326198236
Subgroup: shot_distance_from_goal<11.26
----------------------------------------
Quality: 0.017846586326198236
Subgroup: disc_start_y=='center' AND shot_distance_from_goal<11.26
----------------------------------------
Quality: 0.017846586326198236
Subgroup: disc_start_x=='attacking' AND shot_distance_from_goal<11.26
----------------------------------------
Quality: 0.017443045908338128
Subgroup: shot_angle_from_goal>=0.60
---------------------

Unnamed: 0,quality,subgroup,size_sg,size_dataset,positives_sg,positives_dataset,size_complement,relative_size_sg,relative_size_complement,coverage_sg,coverage_complement,target_share_sg,target_share_complement,target_share_dataset,lift
0,0.018717,shot_angle_from_goal>=0.60 AND start_x>=96.60,5007,40461,873,935,35454,0.123749,0.876251,0.93369,0.06631,0.174356,0.001749,0.023109,7.545042
1,0.018398,shot_distance_from_goal<11.26 AND start_x>=96.60,6127,40461,886,935,34334,0.15143,0.84857,0.947594,0.052406,0.144606,0.001427,0.023109,6.257644
2,0.018238,shot_angle_from_goal>=0.60 AND shot_distance_f...,6797,40461,895,935,33664,0.167989,0.832011,0.957219,0.042781,0.131676,0.001188,0.023109,5.698109
3,0.017916,disc_start_y=='center' AND start_x>=96.60,6972,40461,886,935,33489,0.172314,0.827686,0.947594,0.052406,0.12708,0.001463,0.023109,5.499223
4,0.017847,shot_distance_from_goal<11.26,8045,40461,908,935,32416,0.198833,0.801167,0.971123,0.028877,0.112865,0.000833,0.023109,4.884103
5,0.017847,disc_start_y=='center' AND shot_distance_from_...,8045,40461,908,935,32416,0.198833,0.801167,0.971123,0.028877,0.112865,0.000833,0.023109,4.884103
6,0.017847,disc_start_x=='attacking' AND shot_distance_fr...,8045,40461,908,935,32416,0.198833,0.801167,0.971123,0.028877,0.112865,0.000833,0.023109,4.884103
7,0.017443,shot_angle_from_goal>=0.60,8189,40461,895,935,32272,0.202392,0.797608,0.957219,0.042781,0.109293,0.001239,0.023109,4.729521
8,0.017443,disc_start_y=='center' AND shot_angle_from_goa...,8189,40461,895,935,32272,0.202392,0.797608,0.957219,0.042781,0.109293,0.001239,0.023109,4.729521
9,0.017443,disc_start_x=='attacking' AND shot_angle_from_...,8189,40461,895,935,32272,0.202392,0.797608,0.957219,0.042781,0.109293,0.001239,0.023109,4.729521


In [14]:
def get_covered_indices(subgroup, df):
    """
    Retorna os índices dos registros no dataframe que são cobertos pelo subgrupo.
    """
    condition = subgroup.covers(df)
    return df[condition].index

# Lista para armazenar todos os índices cobertos
covered_indices = set()

# Iterar sobre os subgrupos e adicionar os índices cobertos ao conjunto
for sg_result in result.to_dataframe().itertuples():
    subgroup = sg_result.subgroup  # Ajuste conforme a coluna correta
    indices = get_covered_indices(subgroup, shots_df_cp)
    covered_indices.update(indices)
    print(len(covered_indices) / len(shots_df_cp))

# Cobertura total (proporção de registros cobertos)
total_coverage = len(covered_indices) / len(shots_df_cp)

print(f"Total Coverage: {total_coverage:.4f}")


0.12374879513605694
0.15142977187909346
0.1956699043523393
0.21655421269864808
0.21971775289785225
0.21971775289785225
0.21971775289785225
0.2541212525641976
0.2541212525641976
0.2541212525641976
Total Coverage: 0.2541


## VAEP

In [15]:
from tqdm import tqdm
import socceraction.spadl as spd
from socceraction.vaep import features as ft
import socceraction.vaep.labels as lab
import socceraction.vaep.formula as fm
import xgboost as xgb
import sklearn.metrics as mt

In [16]:
def features_transform(spadl):
    spadl.loc[spadl.result_id.isin([2, 3]), ["result_id"]] = 0
    spadl.loc[spadl.result_name.isin(["offside", "owngoal"]), ["result_name"]] = "fail"

    xfns = [
        ft.actiontype_onehot,
        ft.bodypart_onehot,
        ft.result_onehot,
        ft.goalscore,
        ft.startlocation,
        ft.endlocation,
        ft.team,
        ft.time,
        ft.time_delta
    ]

    features = []
    for game in tqdm(np.unique(spadl.game_id).tolist()):
        match_actions = spadl.loc[spadl.game_id == game].reset_index(drop=True)
        match_states = ft.gamestates(actions=match_actions)
        match_feats = pd.concat([fn(match_states) for fn in xfns], axis=1)
        features.append(match_feats)
    features = pd.concat(features).reset_index(drop=True)

    return features

def labels_transform(spadl):
    yfns = [lab.scores, lab.concedes]

    labels = []
    for game in tqdm(np.unique(spadl.game_id).tolist()):
        match_actions = spadl.loc[spadl.game_id == game].reset_index(drop=True)
        labels.append(pd.concat([fn(actions=match_actions) for fn in yfns], axis=1))

    labels = pd.concat(labels).reset_index(drop=True)

    return labels

def train_vaep(X_train, y_train, X_test, y_test):
    models = {}
    for m in ["scores", "concedes"]:
        models[m] = xgb.XGBClassifier(random_state=0, n_estimators=50, max_depth=3)

        print("training " + m + " model")
        models[m].fit(X_train, y_train[m])

        p = sum(y_train[m]) / len(y_train[m])
        base = [p] * len(y_train[m])
        y_train_pred = models[m].predict_proba(X_train)[:, 1]
        train_brier = mt.brier_score_loss(y_train[m], y_train_pred) / mt.brier_score_loss(y_train[m], base)
        print(m + " Train NBS: " + str(train_brier))
        print()

        p = sum(y_test[m]) / len(y_test[m])
        base = [p] * len(y_test[m])
        y_test_pred = models[m].predict_proba(X_test)[:, 1]
        test_brier = mt.brier_score_loss(y_test[m], y_test_pred) / mt.brier_score_loss(y_test[m], base)
        print(m + " Test NBS: " + str(test_brier))
        print()

        print("----------------------------------------")

    return models

def generate_predictions(features, models):
    preds = {}
    for m in ["scores", "concedes"]:
        preds[m] = models[m].predict_proba(features)[:, 1]
    preds = pd.DataFrame(preds)

    return preds

def calculate_action_values(spadl, predictions):
    action_values = fm.value(actions=spadl, Pscores=predictions["scores"], Pconcedes=predictions["concedes"])
    action_values = pd.concat([
        spadl[["original_event_id", "player_id", "action_id", "game_id", "start_x", "start_y", "end_x", "end_y", "type_name", "result_name"]],
        predictions.rename(columns={"scores": "Pscores", "concedes": "Pconcedes"}),
        action_values
    ], axis=1)

    return action_values


In [17]:
spadl = {}
for league in LEAGUES:
    spadl[league] = pd.read_csv(f"../data/spadl_format/{league}.csv")

features = {}
for league in LEAGUES:
    features[league] = features_transform(spadl[league])

labels = {}
for league in LEAGUES:
    labels[league] = labels_transform(spadl[league])

models = train_vaep(X_train=features["England"], y_train=labels["England"], X_test=features["Spain"], y_test=labels["Spain"])


100%|██████████| 380/380 [00:06<00:00, 62.72it/s]
100%|██████████| 380/380 [00:06<00:00, 63.17it/s]
100%|██████████| 380/380 [00:06<00:00, 63.14it/s]
100%|██████████| 380/380 [00:06<00:00, 63.09it/s]
100%|██████████| 306/306 [00:04<00:00, 62.88it/s]
100%|██████████| 380/380 [00:07<00:00, 54.25it/s]
100%|██████████| 380/380 [00:06<00:00, 54.45it/s]
100%|██████████| 380/380 [00:06<00:00, 54.70it/s]
100%|██████████| 380/380 [00:07<00:00, 53.55it/s]
100%|██████████| 306/306 [00:05<00:00, 54.14it/s]


training scores model
scores Train NBS: 0.8452471194228581

scores Test NBS: 0.8503677630926355

----------------------------------------
training concedes model
concedes Train NBS: 0.9660641623881886

concedes Test NBS: 0.9766251611701147

----------------------------------------


In [18]:
preds = {}
action_values = {}

for league in LEAGUES:
    preds[league] = generate_predictions(features=features[league], models=models)
    action_values[league] = calculate_action_values(spadl=spadl[league], predictions=preds[league])

all_action_values = pd.concat([df for df in action_values.values()])

In [19]:
shots_df

Unnamed: 0,game_id,period_id,team_id,player_id,time_seconds,start_x,start_y,disc_start_x,disc_start_y,num_events,...,play_mean_distance_to_the_goal,play_std_distance_to_the_goal,play_distance_towards_goal,ratio_distance,total_time_per_play,play_speed,play_speed_towards_goal,shot_distance_from_goal,shot_angle_from_goal,result_id
0,2499719,1,1609,25413,94.595788,92.40,40.12,attacking,center,7,...,30.932864,8.413575,24.15,0.154870,2.836950,7.852377,1.216095,14.007655,0.509981,1
1,2499719,1,1631,26150,179.854785,89.25,32.64,attacking,center,2,...,15.808608,0.000000,-9.45,-0.258190,2.273328,8.050087,-2.078450,15.808608,0.494098,0
2,2499719,1,1631,14763,254.745027,100.80,32.64,attacking,center,7,...,21.086920,12.262251,4.20,0.034728,2.705084,6.386817,0.221805,4.414703,1.467310,1
3,2499719,1,1609,7868,425.824035,85.05,45.56,attacking,right,6,...,57.591081,22.443334,75.60,0.783021,2.161246,7.445480,5.829970,23.057235,0.300168,0
4,2499719,1,1609,7868,815.462015,78.75,47.60,attacking,right,1,...,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,29.563872,0.240030,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40456,2576338,2,3193,116269,1152.032980,99.75,37.40,attacking,center,9,...,25.291699,10.312188,28.35,0.174816,2.465061,7.309718,1.277859,6.254798,1.067542,0
40457,2576338,2,3193,3548,1251.730517,97.65,42.16,attacking,center,20,...,52.186433,19.477819,75.60,0.262006,2.510138,5.747561,1.505893,10.982172,0.512084,0
40458,2576338,2,3193,21177,2065.034482,94.50,36.72,attacking,center,1,...,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,10.846585,0.690619,1
40459,2576338,2,3193,349102,2367.252041,82.95,46.24,attacking,right,3,...,27.824216,2.604777,6.30,0.133757,2.367156,6.632500,0.887140,25.219439,0.277183,0


In [20]:
all_action_values

Unnamed: 0,original_event_id,player_id,action_id,game_id,start_x,start_y,end_x,end_y,type_name,result_name,Pscores,Pconcedes,offensive_value,defensive_value,vaep_value
0,177959171.0,25413,0,2499719,51.45,34.68,32.55,14.96,pass,success,0.003555,0.000560,0.000000,-0.000000,0.000000
1,177959172.0,370224,1,2499719,32.55,14.96,53.55,17.00,pass,success,0.004460,0.000536,0.000905,0.000024,0.000928
2,177959173.0,3319,2,2499719,53.55,17.00,36.75,19.72,pass,success,0.005223,0.000446,0.000764,0.000090,0.000854
3,177959174.0,120339,3,2499719,36.75,19.72,43.05,3.40,pass,success,0.002345,0.000363,-0.002879,0.000083,-0.002795
4,177959175.0,167145,4,2499719,43.05,3.40,75.60,8.16,pass,success,0.005549,0.000346,0.003204,0.000017,0.003221
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
389146,251206865.0,94831,1148,2517044,87.15,36.72,87.15,36.72,shot,fail,0.014503,0.009633,0.007984,-0.002125,0.005859
389147,251206783.0,14917,1149,2517044,17.85,31.96,23.10,27.20,interception,success,0.005530,0.017490,-0.004103,-0.002987,-0.007090
389148,,14804,1150,2517044,23.10,27.20,0.00,3.40,dribble,success,0.004298,0.008181,-0.001233,0.009310,0.008077
389149,251206790.0,14804,1151,2517044,0.00,3.40,5.25,4.76,pass,fail,0.002736,0.014004,-0.001561,-0.005823,-0.007385


In [21]:
shots_df_cp = shots_df.copy()
all_action_values_cp = all_action_values.copy()
all_action_values_cp.drop(["original_event_id", "result_name", "action_id", "type_name"], inplace=True, axis=1)

shots_df_cp = shots_df_cp.merge(all_action_values_cp, on=['game_id', 'player_id', 'start_x', 'start_y'], how='left')

# Exibir o dataframe resultante
shots_df_cp

Unnamed: 0,game_id,period_id,team_id,player_id,time_seconds,start_x,start_y,disc_start_x,disc_start_y,num_events,...,shot_distance_from_goal,shot_angle_from_goal,result_id,end_x,end_y,Pscores,Pconcedes,offensive_value,defensive_value,vaep_value
0,2499719,1,1609,25413,94.595788,92.40,40.12,attacking,center,7,...,14.007655,0.509981,1,105.0,37.4000,0.978135,0.002137,0.902766,-0.000387,0.902379
1,2499719,1,1631,26150,179.854785,89.25,32.64,attacking,center,2,...,15.808608,0.494098,0,105.0,40.8000,0.018184,0.007322,-0.020396,-0.003519,-0.023916
2,2499719,1,1631,14763,254.745027,100.80,32.64,attacking,center,7,...,4.414703,1.467310,1,105.0,34.0000,0.977107,0.002290,0.885530,0.000666,0.886196
3,2499719,1,1609,7868,425.824035,85.05,45.56,attacking,right,6,...,23.057235,0.300168,0,105.0,40.8000,0.021434,0.002819,-0.004685,-0.001744,-0.006429
4,2499719,1,1609,7868,815.462015,78.75,47.60,attacking,right,1,...,29.563872,0.240030,0,105.0,37.4000,0.017245,0.005117,-0.019283,-0.002159,-0.021442
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40778,2576338,2,3193,116269,1152.032980,99.75,37.40,attacking,center,9,...,6.254798,1.067542,0,105.0,40.8000,0.034776,0.009745,-0.081321,-0.004355,-0.085676
40779,2576338,2,3193,3548,1251.730517,97.65,42.16,attacking,center,20,...,10.982172,0.512084,0,105.0,40.8000,0.029877,0.010725,-0.052057,-0.005569,-0.057625
40780,2576338,2,3193,21177,2065.034482,94.50,36.72,attacking,center,1,...,10.846585,0.690619,1,105.0,34.0000,0.977372,0.007182,0.950002,-0.002297,0.947704
40781,2576338,2,3193,349102,2367.252041,82.95,46.24,attacking,right,3,...,25.219439,0.277183,0,105.0,40.8000,0.015293,0.006782,-0.020930,-0.003574,-0.024504


In [22]:
print(min(np.unique(shots_df_cp['Pscores'].tolist())), max(np.unique(shots_df_cp['Pscores'].tolist())))

0.0024156407453119755 0.9968808889389038


In [23]:
shots_df_cp.drop(["game_id", "period_id", "team_id", "player_id", "time_seconds", "result_id"], inplace=True, axis=1)

# Definir o alvo (target) da descoberta de subgrupos
# Aqui, vamos assumir que queremos encontrar subgrupos de chutes bem-sucedidos (result_name == 'Goal')
target = ps.NumericTarget('Pscores')

search_space = ps.create_selectors(shots_df_cp, ignore=["Pscores", "Pconcedes", "offensive_value", "defensive_value", "vaep_value"])

# Configurar a tarefa de descoberta de subgrupos
task = ps.SubgroupDiscoveryTask(shots_df_cp, target, search_space, result_set_size=10, depth=2, qf=ps.StandardQFNumeric(1.0))
# Configurar e executar o Beam Search
search_algorithm = ps.BeamSearch()

result = search_algorithm.execute(task)

# Exibir os resultados
for sg_result in result.to_dataframe().head(10).itertuples(index=False):
    print(f"Quality: {sg_result.quality}")
    print(f"Subgroup: {sg_result.subgroup}")
    # print(f"Size of Subgroup: {sg_result.size_sg}")
    # print(f"Size of Dataset: {sg_result.size_dataset}")
    # print(f"Positives in Subgroup: {sg_result.positives_sg}")
    # print(f"Positives in Dataset: {sg_result.positives_dataset}")
    # print(f"Size of Complement: {sg_result.size_complement}")
    # print(f"Relative Size of Subgroup: {sg_result.relative_size_sg}")
    # print(f"Relative Size of Complement: {sg_result.relative_size_complement}")
    # print(f"Coverage of Subgroup: {sg_result.coverage_sg}")
    # print(f"Coverage of Complement: {sg_result.coverage_complement}")
    # print(f"Target Share in Subgroup: {sg_result.target_share_sg}")
    # print(f"Target Share in Complement: {sg_result.target_share_complement}")
    # print(f"Target Share in Dataset: {sg_result.target_share_dataset}")
    # print(f"Lift: {sg_result.lift}")
    print("-" * 40)

print(result.to_dataframe().shape)

Quality: 1378.038871049881
Subgroup: disc_start_y=='center' AND end_x>=105.0
----------------------------------------
Quality: 1322.4417318552732
Subgroup: end_x>=105.0 AND shot_angle_from_goal>=0.60
----------------------------------------
Quality: 1317.8171427845955
Subgroup: disc_start_x=='attacking' AND end_x>=105.0
----------------------------------------
Quality: 1301.6050531119108
Subgroup: end_x>=105.0 AND shot_distance_from_goal<11.26
----------------------------------------
Quality: 1259.5690176784992
Subgroup: shot_angle_from_goal>=0.60
----------------------------------------
Quality: 1259.5690176784992
Subgroup: disc_start_y=='center' AND shot_angle_from_goal>=0.60
----------------------------------------
Quality: 1259.5690176784992
Subgroup: disc_start_x=='attacking' AND shot_angle_from_goal>=0.60
----------------------------------------
Quality: 1246.4595663398504
Subgroup: shot_distance_from_goal<11.26
----------------------------------------
Quality: 1246.4595663398504

In [24]:
result.to_dataframe()

Unnamed: 0,quality,subgroup,size_sg,size_dataset,mean_sg,mean_dataset,std_sg,std_dataset,median_sg,median_dataset,max_sg,max_dataset,min_sg,min_dataset,mean_lift,median_lift
0,1378.038871,disc_start_y=='center' AND end_x>=105.0,23139,40783,0.184786,0.125231,0.35357,0.292654,0.029709,0.026238,0.996881,0.996881,0.005334,0.002416,1.47556,1.132277
1,1322.441732,end_x>=105.0 AND shot_angle_from_goal>=0.60,7503,40783,0.301486,0.125231,0.426523,0.292654,0.036123,0.026238,0.995717,0.996881,0.01408,0.002416,2.407441,1.376754
2,1317.817143,disc_start_x=='attacking' AND end_x>=105.0,27422,40783,0.173288,0.125231,0.341678,0.292654,0.030061,0.026238,0.996881,0.996881,0.005468,0.002416,1.383747,1.145722
3,1301.605053,end_x>=105.0 AND shot_distance_from_goal<11.26,7447,40783,0.300013,0.125231,0.425722,0.292654,0.036167,0.026238,0.995717,0.996881,0.013894,0.002416,2.395682,1.37843
4,1259.569018,shot_angle_from_goal>=0.60,8242,40783,0.278054,0.125231,0.414101,0.292654,0.036029,0.026238,0.995717,0.996881,0.009808,0.002416,2.220332,1.373161
5,1259.569018,disc_start_y=='center' AND shot_angle_from_goa...,8242,40783,0.278054,0.125231,0.414101,0.292654,0.036029,0.026238,0.995717,0.996881,0.009808,0.002416,2.220332,1.373161
6,1259.569018,disc_start_x=='attacking' AND shot_angle_from_...,8242,40783,0.278054,0.125231,0.414101,0.292654,0.036029,0.026238,0.995717,0.996881,0.009808,0.002416,2.220332,1.373161
7,1246.459566,shot_distance_from_goal<11.26,8083,40783,0.279438,0.125231,0.414796,0.292654,0.036144,0.026238,0.995717,0.996881,0.012183,0.002416,2.231386,1.377538
8,1246.459566,disc_start_y=='center' AND shot_distance_from_...,8083,40783,0.279438,0.125231,0.414796,0.292654,0.036144,0.026238,0.995717,0.996881,0.012183,0.002416,2.231386,1.377538
9,1246.459566,disc_start_x=='attacking' AND shot_distance_fr...,8083,40783,0.279438,0.125231,0.414796,0.292654,0.036144,0.026238,0.995717,0.996881,0.012183,0.002416,2.231386,1.377538


In [25]:
def get_covered_indices(subgroup, df):
    # Verifica se a condição cobre algum registro
    condition = subgroup.covers(df)
    return df[condition].index

# Lista para armazenar todos os índices cobertos
covered_indices = set()

# Iterar sobre os subgrupos e adicionar os índices cobertos ao conjunto
for sg_result in result.to_dataframe().itertuples():
    subgroup = sg_result.subgroup
    indices = get_covered_indices(subgroup, shots_df_cp)
    covered_indices.update(indices)
    print(len(covered_indices) / len(shots_df_cp))

# Cobertura total (proporção de registros cobertos)
total_coverage = len(covered_indices) / len(shots_df_cp)

print(f"Total Coverage: {total_coverage:.4f}")

0.5673687565897555
0.5673687565897555
0.732658215432901
0.732658215432901
0.750778510653949
0.750778510653949
0.750778510653949
0.753990633352132
0.753990633352132
0.753990633352132
Total Coverage: 0.7540


In [26]:
result.to_dataframe().head()

Unnamed: 0,quality,subgroup,size_sg,size_dataset,mean_sg,mean_dataset,std_sg,std_dataset,median_sg,median_dataset,max_sg,max_dataset,min_sg,min_dataset,mean_lift,median_lift
0,1378.038871,disc_start_y=='center' AND end_x>=105.0,23139,40783,0.184786,0.125231,0.35357,0.292654,0.029709,0.026238,0.996881,0.996881,0.005334,0.002416,1.47556,1.132277
1,1322.441732,end_x>=105.0 AND shot_angle_from_goal>=0.60,7503,40783,0.301486,0.125231,0.426523,0.292654,0.036123,0.026238,0.995717,0.996881,0.01408,0.002416,2.407441,1.376754
2,1317.817143,disc_start_x=='attacking' AND end_x>=105.0,27422,40783,0.173288,0.125231,0.341678,0.292654,0.030061,0.026238,0.996881,0.996881,0.005468,0.002416,1.383747,1.145722
3,1301.605053,end_x>=105.0 AND shot_distance_from_goal<11.26,7447,40783,0.300013,0.125231,0.425722,0.292654,0.036167,0.026238,0.995717,0.996881,0.013894,0.002416,2.395682,1.37843
4,1259.569018,shot_angle_from_goal>=0.60,8242,40783,0.278054,0.125231,0.414101,0.292654,0.036029,0.026238,0.995717,0.996881,0.009808,0.002416,2.220332,1.373161


## Análise Uma liga VS Outra Liga

#### Pre-process

In [74]:
England_df = pd.read_csv(f"{PATH}England.csv", index_col=0)
England_df.drop(["original_event_id", "result_name","bodypart_id", "type_id"], inplace=True, axis=1)
England_shots_df = generate_shots_with_counts_events(England_df)


Spain_df = pd.read_csv(f"{PATH}Spain.csv", index_col=0)
Spain_df.drop(["original_event_id", "result_name","bodypart_id", "type_id"], inplace=True, axis=1)
Spain_shots_df = generate_shots_with_counts_events(Spain_df)

### Binário

In [77]:
England_shots_df_cp = England_shots_df.copy()
England_shots_df_cp.drop(["game_id", "period_id", "team_id", "player_id", "time_seconds"], inplace=True, axis=1)

# Definir o alvo (target) da descoberta de subgrupos
# Aqui, vamos assumir que queremos encontrar subgrupos de chutes bem-sucedidos (result_name == 'Goal')
target = ps.BinaryTarget('result_id', 1)

search_space = ps.create_selectors(England_shots_df_cp, ignore=['result_id'])

# Configurar a tarefa de descoberta de subgrupos
task = ps.SubgroupDiscoveryTask(England_shots_df_cp, target, search_space, result_set_size=10, depth=3, qf=ps.WRAccQF())

# Configurar e executar o Beam Search
search_algorithm = ps.BeamSearch()
result = search_algorithm.execute(task)

# # Exibir os resultados
for sg_result in result.to_dataframe().head(10).itertuples(index=False):
    print(f"Quality: {sg_result.quality}")
    print(f"Subgroup: {sg_result.subgroup}")
    print(f"Size of Subgroup: {sg_result.size_sg}")
#     print(f"Size of Dataset: {sg_result.size_dataset}")
#     print(f"Positives in Subgroup: {sg_result.positives_sg}")
#     print(f"Positives in Dataset: {sg_result.positives_dataset}")
#     print(f"Size of Complement: {sg_result.size_complement}")
#     print(f"Relative Size of Subgroup: {sg_result.relative_size_sg}")
#     print(f"Relative Size of Complement: {sg_result.relative_size_complement}")
#     print(f"Coverage of Subgroup: {sg_result.coverage_sg}")
#     print(f"Coverage of Complement: {sg_result.coverage_complement}")
#     print(f"Target Share in Subgroup: {sg_result.target_share_sg}")
#     print(f"Target Share in Complement: {sg_result.target_share_complement}")
#     print(f"Target Share in Dataset: {sg_result.target_share_dataset}")
#     print(f"Lift: {sg_result.lift}")
    print("-" * 40)

result.to_dataframe()

Quality: 0.03192391658395455
Subgroup: shot_angle_from_goal>=0.61
Size of Subgroup: 1694
----------------------------------------
Quality: 0.03192391658395455
Subgroup: disc_start_y=='center' AND shot_angle_from_goal>=0.61
Size of Subgroup: 1694
----------------------------------------
Quality: 0.03192391658395455
Subgroup: disc_start_x=='attacking' AND shot_angle_from_goal>=0.61
Size of Subgroup: 1694
----------------------------------------
Quality: 0.03192391658395455
Subgroup: disc_start_x=='attacking' AND disc_start_y=='center' AND shot_angle_from_goal>=0.61
Size of Subgroup: 1694
----------------------------------------
Quality: 0.03192076617948671
Subgroup: shot_distance_from_goal<11.26
Size of Subgroup: 1685
----------------------------------------
Quality: 0.03192076617948671
Subgroup: disc_start_y=='center' AND shot_distance_from_goal<11.26
Size of Subgroup: 1685
----------------------------------------
Quality: 0.03192076617948671
Subgroup: disc_start_x=='attacking' AND shot

Unnamed: 0,quality,subgroup,size_sg,size_dataset,positives_sg,positives_dataset,size_complement,relative_size_sg,relative_size_complement,coverage_sg,coverage_complement,target_share_sg,target_share_complement,target_share_dataset,lift
0,0.031924,shot_angle_from_goal>=0.61,1694,8451,453,914,6757,0.20045,0.79955,0.495624,0.504376,0.267414,0.068226,0.108153,2.472559
1,0.031924,disc_start_y=='center' AND shot_angle_from_goa...,1694,8451,453,914,6757,0.20045,0.79955,0.495624,0.504376,0.267414,0.068226,0.108153,2.472559
2,0.031924,disc_start_x=='attacking' AND shot_angle_from_...,1694,8451,453,914,6757,0.20045,0.79955,0.495624,0.504376,0.267414,0.068226,0.108153,2.472559
3,0.031924,disc_start_x=='attacking' AND disc_start_y=='c...,1694,8451,453,914,6757,0.20045,0.79955,0.495624,0.504376,0.267414,0.068226,0.108153,2.472559
4,0.031921,shot_distance_from_goal<11.26,1685,8451,452,914,6766,0.199385,0.800615,0.49453,0.50547,0.268249,0.068283,0.108153,2.480278
5,0.031921,disc_start_y=='center' AND shot_distance_from_...,1685,8451,452,914,6766,0.199385,0.800615,0.49453,0.50547,0.268249,0.068283,0.108153,2.480278
6,0.031921,disc_start_x=='attacking' AND shot_distance_fr...,1685,8451,452,914,6766,0.199385,0.800615,0.49453,0.50547,0.268249,0.068283,0.108153,2.480278
7,0.031921,disc_start_x=='attacking' AND disc_start_y=='c...,1685,8451,452,914,6766,0.199385,0.800615,0.49453,0.50547,0.268249,0.068283,0.108153,2.480278
8,0.030326,shot_angle_from_goal>=0.61 AND shot_distance_f...,1449,8451,413,914,7002,0.171459,0.828541,0.45186,0.54814,0.285024,0.071551,0.108153,2.635382
9,0.030326,disc_start_y=='center' AND shot_angle_from_goa...,1449,8451,413,914,7002,0.171459,0.828541,0.45186,0.54814,0.285024,0.071551,0.108153,2.635382


In [79]:
def show_coverage(result, df):# Lista para armazenar todos os índices cobertos
    covered_indices = set()

    # Iterar sobre os subgrupos e adicionar os índices cobertos ao conjunto
    for sg_result in result.to_dataframe().itertuples():
        subgroup = sg_result.subgroup  # Ajuste conforme a coluna correta
        indices = get_covered_indices(subgroup, df)
        covered_indices.update(indices)
        print(len(covered_indices) / len(df))

    # Cobertura total (proporção de registros cobertos)
    total_coverage = len(covered_indices) / len(df)

    print(f"Total Coverage: {total_coverage:.4f}")

show_coverage(result, England_shots_df_cp)

0.20044965092888414
0.20044965092888414
0.20044965092888414
0.20044965092888414
0.22837534019642647
0.22837534019642647
0.22837534019642647
0.22837534019642647
0.22837534019642647
0.22837534019642647
Total Coverage: 0.2284


In [80]:
Spain_shots_df_cp = Spain_shots_df.copy()
Spain_shots_df_cp.drop(["game_id", "period_id", "team_id", "player_id", "time_seconds"], inplace=True, axis=1)

# Definir o alvo (target) da descoberta de subgrupos
# Aqui, vamos assumir que queremos encontrar subgrupos de chutes bem-sucedidos (result_name == 'Goal')
target = ps.BinaryTarget('result_id', 1)

search_space = ps.create_selectors(Spain_shots_df_cp, ignore=['result_id'])

# Configurar a tarefa de descoberta de subgrupos
task = ps.SubgroupDiscoveryTask(Spain_shots_df_cp, target, search_space, result_set_size=10, depth=3, qf=ps.WRAccQF())

# Configurar e executar o Beam Search
search_algorithm = ps.BeamSearch()
result = search_algorithm.execute(task)

# # Exibir os resultados
for sg_result in result.to_dataframe().head(10).itertuples(index=False):
    print(f"Quality: {sg_result.quality}")
    print(f"Subgroup: {sg_result.subgroup}")
    print(f"Size of Subgroup: {sg_result.size_sg}")
#     print(f"Size of Dataset: {sg_result.size_dataset}")
#     print(f"Positives in Subgroup: {sg_result.positives_sg}")
#     print(f"Positives in Dataset: {sg_result.positives_dataset}")
#     print(f"Size of Complement: {sg_result.size_complement}")
#     print(f"Relative Size of Subgroup: {sg_result.relative_size_sg}")
#     print(f"Relative Size of Complement: {sg_result.relative_size_complement}")
#     print(f"Coverage of Subgroup: {sg_result.coverage_sg}")
#     print(f"Coverage of Complement: {sg_result.coverage_complement}")
#     print(f"Target Share in Subgroup: {sg_result.target_share_sg}")
#     print(f"Target Share in Complement: {sg_result.target_share_complement}")
#     print(f"Target Share in Dataset: {sg_result.target_share_dataset}")
#     print(f"Lift: {sg_result.lift}")
    print("-" * 40)

result.to_dataframe()

Quality: 0.030671627196098374
Subgroup: shot_distance_from_goal<11.04
Size of Subgroup: 1582
----------------------------------------
Quality: 0.030671627196098374
Subgroup: disc_start_y=='center' AND shot_distance_from_goal<11.04
Size of Subgroup: 1582
----------------------------------------
Quality: 0.030671627196098374
Subgroup: disc_start_x=='attacking' AND shot_distance_from_goal<11.04
Size of Subgroup: 1582
----------------------------------------
Quality: 0.030671627196098374
Subgroup: disc_start_x=='attacking' AND disc_start_y=='center' AND shot_distance_from_goal<11.04
Size of Subgroup: 1582
----------------------------------------
Quality: 0.030533135443692964
Subgroup: shot_angle_from_goal>=0.61
Size of Subgroup: 1601
----------------------------------------
Quality: 0.030533135443692964
Subgroup: disc_start_y=='center' AND shot_angle_from_goal>=0.61
Size of Subgroup: 1601
----------------------------------------
Quality: 0.030533135443692964
Subgroup: disc_start_x=='attack

Unnamed: 0,quality,subgroup,size_sg,size_dataset,positives_sg,positives_dataset,size_complement,relative_size_sg,relative_size_complement,coverage_sg,coverage_complement,target_share_sg,target_share_complement,target_share_dataset,lift
0,0.030672,shot_distance_from_goal<11.04,1582,7979,420,884,6397,0.19827,0.80173,0.475113,0.524887,0.265487,0.072534,0.110791,2.396288
1,0.030672,disc_start_y=='center' AND shot_distance_from_...,1582,7979,420,884,6397,0.19827,0.80173,0.475113,0.524887,0.265487,0.072534,0.110791,2.396288
2,0.030672,disc_start_x=='attacking' AND shot_distance_fr...,1582,7979,420,884,6397,0.19827,0.80173,0.475113,0.524887,0.265487,0.072534,0.110791,2.396288
3,0.030672,disc_start_x=='attacking' AND disc_start_y=='c...,1582,7979,420,884,6397,0.19827,0.80173,0.475113,0.524887,0.265487,0.072534,0.110791,2.396288
4,0.030533,shot_angle_from_goal>=0.61,1601,7979,421,884,6378,0.200652,0.799348,0.476244,0.523756,0.262961,0.072593,0.110791,2.373488
5,0.030533,disc_start_y=='center' AND shot_angle_from_goa...,1601,7979,421,884,6378,0.200652,0.799348,0.476244,0.523756,0.262961,0.072593,0.110791,2.373488
6,0.030533,disc_start_x=='attacking' AND shot_angle_from_...,1601,7979,421,884,6378,0.200652,0.799348,0.476244,0.523756,0.262961,0.072593,0.110791,2.373488
7,0.030533,disc_start_x=='attacking' AND disc_start_y=='c...,1601,7979,421,884,6378,0.200652,0.799348,0.476244,0.523756,0.262961,0.072593,0.110791,2.373488
8,0.029298,shot_angle_from_goal>=0.61 AND shot_distance_f...,1347,7979,383,884,6632,0.168818,0.831182,0.433258,0.566742,0.284336,0.075543,0.110791,2.566418
9,0.029298,disc_start_y=='center' AND shot_angle_from_goa...,1347,7979,383,884,6632,0.168818,0.831182,0.433258,0.566742,0.284336,0.075543,0.110791,2.566418


In [81]:
show_coverage(result, Spain_shots_df_cp)

0.19827045995738815
0.19827045995738815
0.19827045995738815
0.19827045995738815
0.2301040230605339
0.2301040230605339
0.2301040230605339
0.2301040230605339
0.2301040230605339
0.2301040230605339
Total Coverage: 0.2301


### xG

In [82]:
England_shots_df_cp = England_shots_df.copy()
X = England_shots_df_cp[["bodypart_name", "shot_distance_from_goal", "shot_angle_from_goal"]]
X["bodypart_name"] = X["bodypart_name"].apply(lambda val: 0 if val == "foot_right" else 1 if val == "foot_left" else 2)
England_shots_df_cp["xg"] = rfc.predict(X=X)

target = ps.BinaryTarget('xg', 1)

search_space = ps.create_selectors(England_shots_df_cp, ignore=['xg', 'result_id'])

# Configurar a tarefa de descoberta de subgrupos
task = ps.SubgroupDiscoveryTask(England_shots_df_cp, target, search_space, result_set_size=10, depth=2, qf=ps.WRAccQF())

# Configurar e executar o Beam Search
search_algorithm = ps.BeamSearch()
result = search_algorithm.execute(task)

# Exibir os resultados
for sg_result in result.to_dataframe().head(10).itertuples(index=False):
    print(f"Quality: {sg_result.quality}")
    print(f"Subgroup: {sg_result.subgroup}")
    print(f"Size of Subgroup: {sg_result.size_sg}")
    # print(f"Size of Dataset: {sg_result.size_dataset}")
    # print(f"Positives in Subgroup: {sg_result.positives_sg}")
    # print(f"Positives in Dataset: {sg_result.positives_dataset}")
    # print(f"Size of Complement: {sg_result.size_complement}")
    # print(f"Relative Size of Subgroup: {sg_result.relative_size_sg}")
    # print(f"Relative Size of Complement: {sg_result.relative_size_complement}")
    # print(f"Coverage of Subgroup: {sg_result.coverage_sg}")
    # print(f"Coverage of Complement: {sg_result.coverage_complement}")
    # print(f"Target Share in Subgroup: {sg_result.target_share_sg}")
    # print(f"Target Share in Complement: {sg_result.target_share_complement}")
    # print(f"Target Share in Dataset: {sg_result.target_share_dataset}")
    # print(f"Lift: {sg_result.lift}")
    print("-" * 40)

result.to_dataframe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["bodypart_name"] = X["bodypart_name"].apply(lambda val: 0 if val == "foot_right" else 1 if val == "foot_left" else 2)


Quality: 0.020278817516265642
Subgroup: shot_angle_from_goal>=0.61 AND start_x>=96.60
Size of Subgroup: 1096
----------------------------------------
Quality: 0.019811633536383207
Subgroup: shot_distance_from_goal<11.26 AND start_x>=96.60
Size of Subgroup: 1332
----------------------------------------
Quality: 0.01969935312114981
Subgroup: shot_angle_from_goal>=0.61 AND shot_distance_from_goal<11.26
Size of Subgroup: 1449
----------------------------------------
Quality: 0.019304628444027413
Subgroup: disc_start_y=='center' AND start_x>=96.60
Size of Subgroup: 1502
----------------------------------------
Quality: 0.01923216914126737
Subgroup: shot_distance_from_goal<11.26
Size of Subgroup: 1685
----------------------------------------
Quality: 0.01923216914126737
Subgroup: disc_start_y=='center' AND shot_distance_from_goal<11.26
Size of Subgroup: 1685
----------------------------------------
Quality: 0.01923216914126737
Subgroup: disc_start_x=='attacking' AND shot_distance_from_goal<1

Unnamed: 0,quality,subgroup,size_sg,size_dataset,positives_sg,positives_dataset,size_complement,relative_size_sg,relative_size_complement,coverage_sg,coverage_complement,target_share_sg,target_share_complement,target_share_dataset,lift
0,0.020279,shot_angle_from_goal>=0.61 AND start_x>=96.60,1096,8451,199,213,7355,0.129689,0.870311,0.934272,0.065728,0.181569,0.001903,0.025204,7.203955
1,0.019812,shot_distance_from_goal<11.26 AND start_x>=96.60,1332,8451,201,213,7119,0.157614,0.842386,0.943662,0.056338,0.150901,0.001686,0.025204,5.987153
2,0.019699,shot_angle_from_goal>=0.61 AND shot_distance_f...,1449,8451,203,213,7002,0.171459,0.828541,0.953052,0.046948,0.140097,0.001428,0.025204,5.558481
3,0.019305,disc_start_y=='center' AND start_x>=96.60,1502,8451,201,213,6949,0.17773,0.82227,0.943662,0.056338,0.133822,0.001727,0.025204,5.309512
4,0.019232,shot_distance_from_goal<11.26,1685,8451,205,213,6766,0.199385,0.800615,0.962441,0.037559,0.121662,0.001182,0.025204,4.827057
5,0.019232,disc_start_y=='center' AND shot_distance_from_...,1685,8451,205,213,6766,0.199385,0.800615,0.962441,0.037559,0.121662,0.001182,0.025204,4.827057
6,0.019232,disc_start_x=='attacking' AND shot_distance_fr...,1685,8451,205,213,6766,0.199385,0.800615,0.962441,0.037559,0.121662,0.001182,0.025204,4.827057
7,0.018969,shot_angle_from_goal>=0.61,1694,8451,203,213,6757,0.20045,0.79955,0.953052,0.046948,0.119835,0.00148,0.025204,4.754569
8,0.018969,disc_start_y=='center' AND shot_angle_from_goa...,1694,8451,203,213,6757,0.20045,0.79955,0.953052,0.046948,0.119835,0.00148,0.025204,4.754569
9,0.018969,disc_start_x=='attacking' AND shot_angle_from_...,1694,8451,203,213,6757,0.20045,0.79955,0.953052,0.046948,0.119835,0.00148,0.025204,4.754569


In [83]:
show_coverage(result, England_shots_df_cp)

0.12968879422553545
0.15761448349307774
0.19938468820257957
0.21950065081055498
0.21950065081055498
0.21950065081055498
0.21950065081055498
0.24849130280440185
0.24849130280440185
0.24849130280440185
Total Coverage: 0.2485


In [84]:
Spain_shots_df_cp = Spain_shots_df.copy()
X = Spain_shots_df_cp[["bodypart_name", "shot_distance_from_goal", "shot_angle_from_goal"]]
X["bodypart_name"] = X["bodypart_name"].apply(lambda val: 0 if val == "foot_right" else 1 if val == "foot_left" else 2)
Spain_shots_df_cp["xg"] = rfc.predict(X=X)

target = ps.BinaryTarget('xg', 1)

search_space = ps.create_selectors(Spain_shots_df_cp, ignore=['xg', 'result_id'])

# Configurar a tarefa de descoberta de subgrupos
task = ps.SubgroupDiscoveryTask(Spain_shots_df_cp, target, search_space, result_set_size=10, depth=2, qf=ps.WRAccQF())

# Configurar e executar o Beam Search
search_algorithm = ps.BeamSearch()
result = search_algorithm.execute(task)

# Exibir os resultados
for sg_result in result.to_dataframe().head(10).itertuples(index=False):
    print(f"Quality: {sg_result.quality}")
    print(f"Subgroup: {sg_result.subgroup}")
    print(f"Size of Subgroup: {sg_result.size_sg}")
    # print(f"Size of Dataset: {sg_result.size_dataset}")
    # print(f"Positives in Subgroup: {sg_result.positives_sg}")
    # print(f"Positives in Dataset: {sg_result.positives_dataset}")
    # print(f"Size of Complement: {sg_result.size_complement}")
    # print(f"Relative Size of Subgroup: {sg_result.relative_size_sg}")
    # print(f"Relative Size of Complement: {sg_result.relative_size_complement}")
    # print(f"Coverage of Subgroup: {sg_result.coverage_sg}")
    # print(f"Coverage of Complement: {sg_result.coverage_complement}")
    # print(f"Target Share in Subgroup: {sg_result.target_share_sg}")
    # print(f"Target Share in Complement: {sg_result.target_share_complement}")
    # print(f"Target Share in Dataset: {sg_result.target_share_dataset}")
    # print(f"Lift: {sg_result.lift}")
    print("-" * 40)

result.to_dataframe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["bodypart_name"] = X["bodypart_name"].apply(lambda val: 0 if val == "foot_right" else 1 if val == "foot_left" else 2)


Quality: 0.019539824436689862
Subgroup: shot_angle_from_goal>=0.61 AND start_x>=96.60
Size of Subgroup: 996
----------------------------------------
Quality: 0.019353063981194782
Subgroup: shot_angle_from_goal>=0.61 AND shot_distance_from_goal<11.04
Size of Subgroup: 1347
----------------------------------------
Quality: 0.019328733287707654
Subgroup: shot_distance_from_goal<11.04 AND start_x>=96.60
Size of Subgroup: 1231
----------------------------------------
Quality: 0.01914197283221257
Subgroup: shot_distance_from_goal<11.04
Size of Subgroup: 1582
----------------------------------------
Quality: 0.01914197283221257
Subgroup: disc_start_y=='center' AND shot_distance_from_goal<11.04
Size of Subgroup: 1582
----------------------------------------
Quality: 0.01914197283221257
Subgroup: disc_start_x=='attacking' AND shot_distance_from_goal<11.04
Size of Subgroup: 1582
----------------------------------------
Quality: 0.018728492409129927
Subgroup: disc_start_y=='center' AND start_x>=9

Unnamed: 0,quality,subgroup,size_sg,size_dataset,positives_sg,positives_dataset,size_complement,relative_size_sg,relative_size_complement,coverage_sg,coverage_complement,target_share_sg,target_share_complement,target_share_dataset,lift
0,0.01954,shot_angle_from_goal>=0.61 AND start_x>=96.60,996,7979,180,193,6983,0.124828,0.875172,0.932642,0.067358,0.180723,0.001862,0.024188,7.47144
1,0.019353,shot_angle_from_goal>=0.61 AND shot_distance_f...,1347,7979,187,193,6632,0.168818,0.831182,0.968912,0.031088,0.138827,0.000905,0.024188,5.739382
2,0.019329,shot_distance_from_goal<11.04 AND start_x>=96.60,1231,7979,184,193,6748,0.15428,0.84572,0.953368,0.046632,0.149472,0.001334,0.024188,6.179466
3,0.019142,shot_distance_from_goal<11.04,1582,7979,191,193,6397,0.19827,0.80173,0.989637,0.010363,0.120733,0.000313,0.024188,4.99135
4,0.019142,disc_start_y=='center' AND shot_distance_from_...,1582,7979,191,193,6397,0.19827,0.80173,0.989637,0.010363,0.120733,0.000313,0.024188,4.99135
5,0.019142,disc_start_x=='attacking' AND shot_distance_fr...,1582,7979,191,193,6397,0.19827,0.80173,0.989637,0.010363,0.120733,0.000313,0.024188,4.99135
6,0.018728,disc_start_y=='center' AND start_x>=96.60,1429,7979,184,193,6550,0.179095,0.820905,0.953368,0.046632,0.128761,0.001374,0.024188,5.323249
7,0.018583,shot_angle_from_goal>=0.61,1601,7979,187,193,6378,0.200652,0.799348,0.968912,0.031088,0.116802,0.000941,0.024188,4.828825
8,0.018583,disc_start_y=='center' AND shot_angle_from_goa...,1601,7979,187,193,6378,0.200652,0.799348,0.968912,0.031088,0.116802,0.000941,0.024188,4.828825
9,0.018583,disc_start_x=='attacking' AND shot_angle_from_...,1601,7979,187,193,6378,0.200652,0.799348,0.968912,0.031088,0.116802,0.000941,0.024188,4.828825


In [85]:
show_coverage(result, Spain_shots_df_cp)

0.12482767264068179
0.16881814763754857
0.19827045995738815
0.19827045995738815
0.19827045995738815
0.19827045995738815
0.22308559969921044
0.25491916280235616
0.25491916280235616
0.25491916280235616
Total Coverage: 0.2549


### VAEP

In [87]:
England_shots_df_cp = England_shots_df.copy()
Spain_shots_df_cp = Spain_shots_df.copy()
all_action_values_cp = all_action_values.copy()
all_action_values_cp.drop(["original_event_id", "result_name", "action_id", "type_name", "end_x", "end_y"], inplace=True, axis=1)

England_shots_df_cp = England_shots_df_cp.merge(all_action_values_cp, on=['game_id', 'player_id', 'start_x', 'start_y'], how='left')
Spain_shots_df_cp = Spain_shots_df_cp.merge(all_action_values_cp, on=['game_id', 'player_id', 'start_x', 'start_y'], how='left')

In [88]:
England_shots_df_cp.drop(["game_id", "period_id", "team_id", "player_id", "time_seconds", "result_id"], inplace=True, axis=1)

# Definir o alvo (target) da descoberta de subgrupos
# Aqui, vamos assumir que queremos encontrar subgrupos de chutes bem-sucedidos (result_name == 'Goal')
target = ps.NumericTarget('Pscores')

search_space = ps.create_selectors(England_shots_df_cp, ignore=["Pscores", "Pconcedes", "offensive_value", "defensive_value", "vaep_value"])

# Configurar a tarefa de descoberta de subgrupos
task = ps.SubgroupDiscoveryTask(England_shots_df_cp, target, search_space, result_set_size=10, depth=2, qf=ps.StandardQFNumeric(1.0))
# Configurar e executar o Beam Search
search_algorithm = ps.BeamSearch()

result = search_algorithm.execute(task)

# Exibir os resultados
for sg_result in result.to_dataframe().head(10).itertuples(index=False):
    print(f"Quality: {sg_result.quality}")
    print(f"Subgroup: {sg_result.subgroup}")
    # print(f"Size of Subgroup: {sg_result.size_sg}")
    # print(f"Size of Dataset: {sg_result.size_dataset}")
    # print(f"Positives in Subgroup: {sg_result.positives_sg}")
    # print(f"Positives in Dataset: {sg_result.positives_dataset}")
    # print(f"Size of Complement: {sg_result.size_complement}")
    # print(f"Relative Size of Subgroup: {sg_result.relative_size_sg}")
    # print(f"Relative Size of Complement: {sg_result.relative_size_complement}")
    # print(f"Coverage of Subgroup: {sg_result.coverage_sg}")
    # print(f"Coverage of Complement: {sg_result.coverage_complement}")
    # print(f"Target Share in Subgroup: {sg_result.target_share_sg}")
    # print(f"Target Share in Complement: {sg_result.target_share_complement}")
    # print(f"Target Share in Dataset: {sg_result.target_share_dataset}")
    # print(f"Lift: {sg_result.lift}")
    print("-" * 40)

result.to_dataframe()

Quality: 270.02617797255516
Subgroup: shot_distance_from_goal<11.26
----------------------------------------
Quality: 270.02617797255516
Subgroup: disc_start_y=='center' AND shot_distance_from_goal<11.26
----------------------------------------
Quality: 270.02617797255516
Subgroup: disc_start_x=='attacking' AND shot_distance_from_goal<11.26
----------------------------------------
Quality: 268.9564814865589
Subgroup: shot_angle_from_goal>=0.61
----------------------------------------
Quality: 268.9564814865589
Subgroup: disc_start_y=='center' AND shot_angle_from_goal>=0.61
----------------------------------------
Quality: 268.9564814865589
Subgroup: disc_start_x=='attacking' AND shot_angle_from_goal>=0.61
----------------------------------------
Quality: 254.3015831708908
Subgroup: shot_angle_from_goal>=0.61 AND shot_distance_from_goal<11.26
----------------------------------------
Quality: 241.8643211722374
Subgroup: disc_start_y=='center' AND start_x>=96.60
--------------------------

Unnamed: 0,quality,subgroup,size_sg,size_dataset,mean_sg,mean_dataset,std_sg,std_dataset,median_sg,median_dataset,max_sg,max_dataset,min_sg,min_dataset,mean_lift,median_lift
0,270.026178,shot_distance_from_goal<11.26,1687,8528,0.287464,0.127401,0.41932,0.295678,0.036383,0.026265,0.995717,0.996881,0.01408,0.002416,2.256369,1.385209
1,270.026178,disc_start_y=='center' AND shot_distance_from_...,1687,8528,0.287464,0.127401,0.41932,0.295678,0.036383,0.026265,0.995717,0.996881,0.01408,0.002416,2.256369,1.385209
2,270.026178,disc_start_x=='attacking' AND shot_distance_fr...,1687,8528,0.287464,0.127401,0.41932,0.295678,0.036383,0.026265,0.995717,0.996881,0.01408,0.002416,2.256369,1.385209
3,268.956481,shot_angle_from_goal>=0.61,1707,8528,0.284962,0.127401,0.418119,0.295678,0.035959,0.026265,0.995717,0.996881,0.01408,0.002416,2.23673,1.369085
4,268.956481,disc_start_y=='center' AND shot_angle_from_goa...,1707,8528,0.284962,0.127401,0.418119,0.295678,0.035959,0.026265,0.995717,0.996881,0.01408,0.002416,2.23673,1.369085
5,268.956481,disc_start_x=='attacking' AND shot_angle_from_...,1707,8528,0.284962,0.127401,0.418119,0.295678,0.035959,0.026265,0.995717,0.996881,0.01408,0.002416,2.23673,1.369085
6,254.301583,shot_angle_from_goal>=0.61 AND shot_distance_f...,1460,8528,0.30158,0.127401,0.426455,0.295678,0.036595,0.026265,0.995717,0.996881,0.01408,0.002416,2.36717,1.393301
7,241.864321,disc_start_y=='center' AND start_x>=96.60,1502,8528,0.288429,0.127401,0.419899,0.295678,0.036135,0.026265,0.995717,0.996881,0.01408,0.002416,2.263945,1.375764
8,240.865721,shot_distance_from_goal<11.26 AND start_x>=96.60,1332,8528,0.308231,0.127401,0.42949,0.295678,0.036623,0.026265,0.995717,0.996881,0.01408,0.002416,2.419375,1.394351
9,232.259416,start_x>=96.60,1813,8528,0.255509,0.127401,0.401293,0.295678,0.035342,0.026265,0.995717,0.996881,0.0119,0.002416,2.005546,1.345591


In [89]:
show_coverage(result, England_shots_df_cp)

0.1978189493433396
0.1978189493433396
0.1978189493433396
0.22678236397748594
0.22678236397748594
0.22678236397748594
0.22678236397748594
0.24671669793621012
0.24671669793621012
0.28318480300187615
Total Coverage: 0.2832


In [90]:
Spain_shots_df_cp.drop(["game_id", "period_id", "team_id", "player_id", "time_seconds", "result_id"], inplace=True, axis=1)

# Definir o alvo (target) da descoberta de subgrupos
# Aqui, vamos assumir que queremos encontrar subgrupos de chutes bem-sucedidos (result_name == 'Goal')
target = ps.NumericTarget('Pscores')

search_space = ps.create_selectors(Spain_shots_df_cp, ignore=["Pscores", "Pconcedes", "offensive_value", "defensive_value", "vaep_value"])

# Configurar a tarefa de descoberta de subgrupos
task = ps.SubgroupDiscoveryTask(Spain_shots_df_cp, target, search_space, result_set_size=10, depth=2, qf=ps.StandardQFNumeric(1.0))
# Configurar e executar o Beam Search
search_algorithm = ps.BeamSearch()

result = search_algorithm.execute(task)

# Exibir os resultados
for sg_result in result.to_dataframe().head(10).itertuples(index=False):
    print(f"Quality: {sg_result.quality}")
    print(f"Subgroup: {sg_result.subgroup}")
    # print(f"Size of Subgroup: {sg_result.size_sg}")
    # print(f"Size of Dataset: {sg_result.size_dataset}")
    # print(f"Positives in Subgroup: {sg_result.positives_sg}")
    # print(f"Positives in Dataset: {sg_result.positives_dataset}")
    # print(f"Size of Complement: {sg_result.size_complement}")
    # print(f"Relative Size of Subgroup: {sg_result.relative_size_sg}")
    # print(f"Relative Size of Complement: {sg_result.relative_size_complement}")
    # print(f"Coverage of Subgroup: {sg_result.coverage_sg}")
    # print(f"Coverage of Complement: {sg_result.coverage_complement}")
    # print(f"Target Share in Subgroup: {sg_result.target_share_sg}")
    # print(f"Target Share in Complement: {sg_result.target_share_complement}")
    # print(f"Target Share in Dataset: {sg_result.target_share_dataset}")
    # print(f"Lift: {sg_result.lift}")
    print("-" * 40)

result.to_dataframe()

Quality: 243.79056924581528
Subgroup: shot_distance_from_goal<11.04
----------------------------------------
Quality: 243.79056924581528
Subgroup: disc_start_y=='center' AND shot_distance_from_goal<11.04
----------------------------------------
Quality: 243.79056924581528
Subgroup: disc_start_x=='attacking' AND shot_distance_from_goal<11.04
----------------------------------------
Quality: 243.06272520124912
Subgroup: shot_angle_from_goal>=0.61
----------------------------------------
Quality: 243.06272520124912
Subgroup: disc_start_y=='center' AND shot_angle_from_goal>=0.61
----------------------------------------
Quality: 243.06272520124912
Subgroup: disc_start_x=='attacking' AND shot_angle_from_goal>=0.61
----------------------------------------
Quality: 231.45427523553371
Subgroup: shot_angle_from_goal>=0.61 AND shot_distance_from_goal<11.04
----------------------------------------
Quality: 226.88204964995384
Subgroup: shot_distance_from_goal<11.04 AND start_x>=96.60
--------------

Unnamed: 0,quality,subgroup,size_sg,size_dataset,mean_sg,mean_dataset,std_sg,std_dataset,median_sg,median_dataset,max_sg,max_dataset,min_sg,min_dataset,mean_lift,median_lift
0,243.790569,shot_distance_from_goal<11.04,1588,8043,0.28427,0.13075,0.417027,0.298985,0.036685,0.026685,0.995627,0.995627,0.015709,0.003558,2.174153,1.374755
1,243.790569,disc_start_y=='center' AND shot_distance_from_...,1588,8043,0.28427,0.13075,0.417027,0.298985,0.036685,0.026685,0.995627,0.995627,0.015709,0.003558,2.174153,1.374755
2,243.790569,disc_start_x=='attacking' AND shot_distance_fr...,1588,8043,0.28427,0.13075,0.417027,0.298985,0.036685,0.026685,0.995627,0.995627,0.015709,0.003558,2.174153,1.374755
3,243.062725,shot_angle_from_goal>=0.61,1611,8043,0.281627,0.13075,0.415756,0.298985,0.036599,0.026685,0.995627,0.995627,0.009808,0.003558,2.153935,1.371526
4,243.062725,disc_start_y=='center' AND shot_angle_from_goa...,1611,8043,0.281627,0.13075,0.415756,0.298985,0.036599,0.026685,0.995627,0.995627,0.009808,0.003558,2.153935,1.371526
5,243.062725,disc_start_x=='attacking' AND shot_angle_from_...,1611,8043,0.281627,0.13075,0.415756,0.298985,0.036599,0.026685,0.995627,0.995627,0.009808,0.003558,2.153935,1.371526
6,231.454275,shot_angle_from_goal>=0.61 AND shot_distance_f...,1353,8043,0.301817,0.13075,0.425935,0.298985,0.037228,0.026685,0.995627,0.995627,0.016668,0.003558,2.308356,1.395114
7,226.88205,shot_distance_from_goal<11.04 AND start_x>=96.60,1234,8043,0.314609,0.13075,0.431803,0.298985,0.037358,0.026685,0.995627,0.995627,0.015709,0.003558,2.406188,1.399988
8,221.689238,disc_start_y=='center' AND start_x>=96.60,1432,8043,0.285561,0.13075,0.417906,0.298985,0.036409,0.026685,0.995627,0.995627,0.015709,0.003558,2.184023,1.364411
9,214.545707,shot_angle_from_goal>=0.61 AND start_x>=96.60,999,8043,0.34551,0.13075,0.444272,0.298985,0.038502,0.026685,0.995627,0.995627,0.016668,0.003558,2.642528,1.442854


In [91]:
show_coverage(result, Spain_shots_df_cp)

0.19743876662936716
0.19743876662936716
0.19743876662936716
0.22951634962078826
0.22951634962078826
0.22951634962078826
0.22951634962078826
0.22951634962078826
0.25413402959094866
0.25413402959094866
Total Coverage: 0.2541


## Avaliação Times (Topo VS Meio VS Baixo da tabela)

### Pre-process

In [47]:
# 1º: Manchester City - 1625
# 10º: Newcastle - 1613
# 20º: West Bromwich - 1627

England_df = pd.read_csv(f"{PATH}England.csv", index_col=0)
England_df.drop(["original_event_id", "result_name","bodypart_id", "type_id"], inplace=True, axis=1)

MC_df = England_df[England_df["team_id"] == 1625]
MC_shots_df = generate_shots_with_counts_events(MC_df)

NC_df = England_df[England_df["team_id"] == 1613]
NC_shots_df = generate_shots_with_counts_events(NC_df)

WB_df = England_df[England_df["team_id"] == 1627]
WB_shots_df = generate_shots_with_counts_events(WB_df)

In [51]:
MC_shots_df

Unnamed: 0,game_id,period_id,team_id,player_id,time_seconds,start_x,start_y,disc_start_x,disc_start_y,num_events,...,play_mean_distance_to_the_goal,play_std_distance_to_the_goal,play_distance_towards_goal,ratio_distance,total_time_per_play,play_speed,play_speed_towards_goal,shot_distance_from_goal,shot_angle_from_goal,result_id
0,2499720,1,1625,340386,189.560864,93.45,42.84,attacking,center,47,...,53.868916,19.735366,42.00,0.044038,3.961749,5.122023,0.225561,14.544693,0.441421,0
1,2499720,1,1625,70083,534.526010,84.00,48.28,attacking,right,65,...,45.904966,13.137510,21.00,0.017086,5.201843,3.635010,0.062108,25.395244,0.261031,0
2,2499720,1,1625,105339,794.071176,93.45,27.88,attacking,center,53,...,51.809342,13.198073,55.65,0.057894,4.195384,4.322957,0.250275,13.071224,0.537975,0
3,2499720,1,1625,340386,1995.689167,96.60,29.92,attacking,center,145,...,50.788555,17.775042,52.50,0.017514,8.032031,2.573904,0.045078,9.338437,0.756460,0
4,2499720,1,1625,340386,1998.335772,97.65,39.44,attacking,center,1,...,0.000000,0.000000,0.00,0.000000,0.000000,0.000000,0.000000,9.144184,0.715772,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
598,2500095,2,1625,245364,896.992909,82.95,46.92,attacking,right,34,...,61.007314,24.876650,51.45,0.065115,7.045725,3.298353,0.214774,25.556387,0.270101,0
599,2500095,2,1625,11066,902.415248,92.40,27.88,attacking,center,2,...,14.007655,0.000000,7.35,0.803790,1.248029,3.663450,2.944643,14.007655,0.509981,0
600,2500095,2,1625,9380,926.967286,92.40,38.08,attacking,center,2,...,13.244108,0.000000,-12.60,-0.314133,1.021896,19.625494,-6.165011,13.244108,0.563864,0
601,2500095,2,1625,38021,978.686885,84.00,26.52,attacking,center,7,...,28.525122,6.073473,-21.00,-0.198078,3.090426,4.900801,-0.970740,22.292384,0.336059,0


### Binário

In [52]:
MC_shots_df_cp = MC_shots_df.copy()
MC_shots_df_cp.drop(["game_id", "period_id", "team_id", "player_id", "time_seconds"], inplace=True, axis=1)

# Definir o alvo (target) da descoberta de subgrupos
# Aqui, vamos assumir que queremos encontrar subgrupos de chutes bem-sucedidos (result_name == 'Goal')
target = ps.BinaryTarget('result_id', 1)

search_space = ps.create_selectors(MC_shots_df_cp, ignore=['result_id'])

# Configurar a tarefa de descoberta de subgrupos
task = ps.SubgroupDiscoveryTask(MC_shots_df_cp, target, search_space, result_set_size=10, depth=3, qf=ps.WRAccQF())

# Configurar e executar o Beam Search
search_algorithm = ps.BeamSearch()
result = search_algorithm.execute(task)

# Exibir os resultados
for sg_result in result.to_dataframe().head(10).itertuples(index=False):
    print(f"Quality: {sg_result.quality}")
    print(f"Subgroup: {sg_result.subgroup}")
    print(f"Size of Subgroup: {sg_result.size_sg}")
    # print(f"Size of Dataset: {sg_result.size_dataset}")
    # print(f"Positives in Subgroup: {sg_result.positives_sg}")
    # print(f"Positives in Dataset: {sg_result.positives_dataset}")
    # print(f"Size of Complement: {sg_result.size_complement}")
    # print(f"Relative Size of Subgroup: {sg_result.relative_size_sg}")
    # print(f"Relative Size of Complement: {sg_result.relative_size_complement}")
    # print(f"Coverage of Subgroup: {sg_result.coverage_sg}")
    # print(f"Coverage of Complement: {sg_result.coverage_complement}")
    # print(f"Target Share in Subgroup: {sg_result.target_share_sg}")
    # print(f"Target Share in Complement: {sg_result.target_share_complement}")
    # print(f"Target Share in Dataset: {sg_result.target_share_dataset}")
    # print(f"Lift: {sg_result.lift}")
    print("-" * 40)

result.to_dataframe()

Quality: 0.04832113616549646
Subgroup: shot_angle_from_goal>=0.62
Size of Subgroup: 121
----------------------------------------
Quality: 0.04832113616549646
Subgroup: disc_start_y=='center' AND shot_angle_from_goal>=0.62
Size of Subgroup: 121
----------------------------------------
Quality: 0.04832113616549646
Subgroup: disc_start_x=='attacking' AND shot_angle_from_goal>=0.62
Size of Subgroup: 121
----------------------------------------
Quality: 0.04832113616549646
Subgroup: disc_start_x=='attacking' AND disc_start_y=='center' AND shot_angle_from_goal>=0.62
Size of Subgroup: 121
----------------------------------------
Quality: 0.04743831973355995
Subgroup: disc_start_y=='center' AND start_x>=96.60
Size of Subgroup: 118
----------------------------------------
Quality: 0.04743831973355995
Subgroup: disc_start_x=='attacking' AND disc_start_y=='center' AND start_x>=96.60
Size of Subgroup: 118
----------------------------------------
Quality: 0.04717980027997106
Subgroup: shot_distance

Unnamed: 0,quality,subgroup,size_sg,size_dataset,positives_sg,positives_dataset,size_complement,relative_size_sg,relative_size_complement,coverage_sg,coverage_complement,target_share_sg,target_share_complement,target_share_dataset,lift
0,0.048321,shot_angle_from_goal>=0.62,121,603,48,94,482,0.200663,0.799337,0.510638,0.489362,0.396694,0.095436,0.155887,2.544751
1,0.048321,disc_start_y=='center' AND shot_angle_from_goa...,121,603,48,94,482,0.200663,0.799337,0.510638,0.489362,0.396694,0.095436,0.155887,2.544751
2,0.048321,disc_start_x=='attacking' AND shot_angle_from_...,121,603,48,94,482,0.200663,0.799337,0.510638,0.489362,0.396694,0.095436,0.155887,2.544751
3,0.048321,disc_start_x=='attacking' AND disc_start_y=='c...,121,603,48,94,482,0.200663,0.799337,0.510638,0.489362,0.396694,0.095436,0.155887,2.544751
4,0.047438,disc_start_y=='center' AND start_x>=96.60,118,603,47,94,485,0.195688,0.804312,0.5,0.5,0.398305,0.096907,0.155887,2.555085
5,0.047438,disc_start_x=='attacking' AND disc_start_y=='c...,118,603,47,94,485,0.195688,0.804312,0.5,0.5,0.398305,0.096907,0.155887,2.555085
6,0.04718,shot_distance_from_goal<10.90,119,603,47,94,484,0.197347,0.802653,0.5,0.5,0.394958,0.097107,0.155887,2.533613
7,0.04718,disc_start_y=='center' AND shot_distance_from_...,119,603,47,94,484,0.197347,0.802653,0.5,0.5,0.394958,0.097107,0.155887,2.533613
8,0.04718,disc_start_x=='attacking' AND shot_distance_fr...,119,603,47,94,484,0.197347,0.802653,0.5,0.5,0.394958,0.097107,0.155887,2.533613
9,0.04718,disc_start_x=='attacking' AND disc_start_y=='c...,119,603,47,94,484,0.197347,0.802653,0.5,0.5,0.394958,0.097107,0.155887,2.533613


In [53]:
show_coverage(result, MC_shots_df_cp)

0.20066334991708126
0.20066334991708126
0.20066334991708126
0.20066334991708126
0.2603648424543947
0.2603648424543947
0.2603648424543947
0.2603648424543947
0.2603648424543947
0.2603648424543947
Total Coverage: 0.2604


In [54]:
NC_shots_df_cp = NC_shots_df.copy()
NC_shots_df_cp.drop(["game_id", "period_id", "team_id", "player_id", "time_seconds"], inplace=True, axis=1)

# Definir o alvo (target) da descoberta de subgrupos
# Aqui, vamos assumir que queremos encontrar subgrupos de chutes bem-sucedidos (result_name == 'Goal')
target = ps.BinaryTarget('result_id', 1)

search_space = ps.create_selectors(NC_shots_df_cp, ignore=['result_id'])

# Configurar a tarefa de descoberta de subgrupos
task = ps.SubgroupDiscoveryTask(NC_shots_df_cp, target, search_space, result_set_size=10, depth=3, qf=ps.WRAccQF())

# Configurar e executar o Beam Search
search_algorithm = ps.BeamSearch()
result = search_algorithm.execute(task)

# Exibir os resultados
for sg_result in result.to_dataframe().head(10).itertuples(index=False):
    print(f"Quality: {sg_result.quality}")
    print(f"Subgroup: {sg_result.subgroup}")
    print(f"Size of Subgroup: {sg_result.size_sg}")
    # print(f"Size of Dataset: {sg_result.size_dataset}")
    # print(f"Positives in Subgroup: {sg_result.positives_sg}")
    # print(f"Positives in Dataset: {sg_result.positives_dataset}")
    # print(f"Size of Complement: {sg_result.size_complement}")
    # print(f"Relative Size of Subgroup: {sg_result.relative_size_sg}")
    # print(f"Relative Size of Complement: {sg_result.relative_size_complement}")
    # print(f"Coverage of Subgroup: {sg_result.coverage_sg}")
    # print(f"Coverage of Complement: {sg_result.coverage_complement}")
    # print(f"Target Share in Subgroup: {sg_result.target_share_sg}")
    # print(f"Target Share in Complement: {sg_result.target_share_complement}")
    # print(f"Target Share in Dataset: {sg_result.target_share_dataset}")
    # print(f"Lift: {sg_result.lift}")
    print("-" * 40)

result.to_dataframe()

Quality: 0.04048780487804878
Subgroup: shot_distance_from_goal<11.04
Size of Subgroup: 82
----------------------------------------
Quality: 0.04048780487804878
Subgroup: disc_start_y=='center' AND shot_distance_from_goal<11.04
Size of Subgroup: 82
----------------------------------------
Quality: 0.04048780487804878
Subgroup: disc_start_x=='attacking' AND shot_distance_from_goal<11.04
Size of Subgroup: 82
----------------------------------------
Quality: 0.04048780487804878
Subgroup: disc_start_x=='attacking' AND disc_start_y=='center' AND shot_distance_from_goal<11.04
Size of Subgroup: 82
----------------------------------------
Quality: 0.03980963712076145
Subgroup: disc_start_y=='center' AND start_x>=96.60
Size of Subgroup: 74
----------------------------------------
Quality: 0.03980963712076145
Subgroup: disc_start_x=='attacking' AND disc_start_y=='center' AND start_x>=96.60
Size of Subgroup: 74
----------------------------------------
Quality: 0.038048780487804884
Subgroup: start_

Unnamed: 0,quality,subgroup,size_sg,size_dataset,positives_sg,positives_dataset,size_complement,relative_size_sg,relative_size_complement,coverage_sg,coverage_complement,target_share_sg,target_share_complement,target_share_dataset,lift
0,0.040488,shot_distance_from_goal<11.04,82,410,24,37,328,0.2,0.8,0.648649,0.351351,0.292683,0.039634,0.090244,3.243243
1,0.040488,disc_start_y=='center' AND shot_distance_from_...,82,410,24,37,328,0.2,0.8,0.648649,0.351351,0.292683,0.039634,0.090244,3.243243
2,0.040488,disc_start_x=='attacking' AND shot_distance_fr...,82,410,24,37,328,0.2,0.8,0.648649,0.351351,0.292683,0.039634,0.090244,3.243243
3,0.040488,disc_start_x=='attacking' AND disc_start_y=='c...,82,410,24,37,328,0.2,0.8,0.648649,0.351351,0.292683,0.039634,0.090244,3.243243
4,0.03981,disc_start_y=='center' AND start_x>=96.60,74,410,23,37,336,0.180488,0.819512,0.621622,0.378378,0.310811,0.041667,0.090244,3.44412
5,0.03981,disc_start_x=='attacking' AND disc_start_y=='c...,74,410,23,37,336,0.180488,0.819512,0.621622,0.378378,0.310811,0.041667,0.090244,3.44412
6,0.038049,start_x>=96.60,82,410,23,37,328,0.2,0.8,0.621622,0.378378,0.280488,0.042683,0.090244,3.108108
7,0.038049,shot_angle_from_goal>=0.61,82,410,23,37,328,0.2,0.8,0.621622,0.378378,0.280488,0.042683,0.090244,3.108108
8,0.038049,disc_start_y=='center' AND shot_angle_from_goa...,82,410,23,37,328,0.2,0.8,0.621622,0.378378,0.280488,0.042683,0.090244,3.108108
9,0.038049,disc_start_x=='attacking' AND start_x>=96.60,82,410,23,37,328,0.2,0.8,0.621622,0.378378,0.280488,0.042683,0.090244,3.108108


In [55]:
show_coverage(result, NC_shots_df_cp)

0.2
0.2
0.2
0.2
0.21707317073170732
0.21707317073170732
0.23658536585365852
0.2634146341463415
0.2634146341463415
0.2634146341463415
Total Coverage: 0.2634


In [56]:
WB_shots_df_cp = WB_shots_df.copy()
WB_shots_df_cp.drop(["game_id", "period_id", "team_id", "player_id", "time_seconds"], inplace=True, axis=1)

# Definir o alvo (target) da descoberta de subgrupos
# Aqui, vamos assumir que queremos encontrar subgrupos de chutes bem-sucedidos (result_name == 'Goal')
target = ps.BinaryTarget('result_id', 1)

search_space = ps.create_selectors(WB_shots_df_cp, ignore=['result_id'])

# Configurar a tarefa de descoberta de subgrupos
task = ps.SubgroupDiscoveryTask(WB_shots_df_cp, target, search_space, result_set_size=10, depth=3, qf=ps.WRAccQF())

# Configurar e executar o Beam Search
search_algorithm = ps.BeamSearch()
result = search_algorithm.execute(task)

# Exibir os resultados
for sg_result in result.to_dataframe().head(10).itertuples(index=False):
    print(f"Quality: {sg_result.quality}")
    print(f"Subgroup: {sg_result.subgroup}")
    print(f"Size of Subgroup: {sg_result.size_sg}")
    # print(f"Size of Dataset: {sg_result.size_dataset}")
    # print(f"Positives in Subgroup: {sg_result.positives_sg}")
    # print(f"Positives in Dataset: {sg_result.positives_dataset}")
    # print(f"Size of Complement: {sg_result.size_complement}")
    # print(f"Relative Size of Subgroup: {sg_result.relative_size_sg}")
    # print(f"Relative Size of Complement: {sg_result.relative_size_complement}")
    # print(f"Coverage of Subgroup: {sg_result.coverage_sg}")
    # print(f"Coverage of Complement: {sg_result.coverage_complement}")
    # print(f"Target Share in Subgroup: {sg_result.target_share_sg}")
    # print(f"Target Share in Complement: {sg_result.target_share_complement}")
    # print(f"Target Share in Dataset: {sg_result.target_share_dataset}")
    # print(f"Lift: {sg_result.lift}")
    print("-" * 40)

result.to_dataframe()

Quality: 0.02957768199933442
Subgroup: disc_start_y=='center' AND start_x>=96.60
Size of Subgroup: 68
----------------------------------------
Quality: 0.02957768199933442
Subgroup: disc_start_x=='attacking' AND disc_start_y=='center' AND start_x>=96.60
Size of Subgroup: 68
----------------------------------------
Quality: 0.029317943847858376
Subgroup: shot_distance_from_goal<10.52 AND start_x>=96.60
Size of Subgroup: 57
----------------------------------------
Quality: 0.029317943847858376
Subgroup: disc_start_y=='center' AND shot_distance_from_goal<10.52 AND start_x>=96.60
Size of Subgroup: 57
----------------------------------------
Quality: 0.029317943847858376
Subgroup: disc_start_x=='attacking' AND shot_distance_from_goal<10.52 AND start_x>=96.60
Size of Subgroup: 57
----------------------------------------
Quality: 0.029106906599784092
Subgroup: shot_distance_from_goal<10.52
Size of Subgroup: 70
----------------------------------------
Quality: 0.029106906599784092
Subgroup: di

Unnamed: 0,quality,subgroup,size_sg,size_dataset,positives_sg,positives_dataset,size_complement,relative_size_sg,relative_size_complement,coverage_sg,coverage_complement,target_share_sg,target_share_complement,target_share_dataset,lift
0,0.029578,disc_start_y=='center' AND start_x>=96.60,68,351,16,29,283,0.193732,0.806268,0.551724,0.448276,0.235294,0.045936,0.082621,2.84787
1,0.029578,disc_start_x=='attacking' AND disc_start_y=='c...,68,351,16,29,283,0.193732,0.806268,0.551724,0.448276,0.235294,0.045936,0.082621,2.84787
2,0.029318,shot_distance_from_goal<10.52 AND start_x>=96.60,57,351,15,29,294,0.162393,0.837607,0.517241,0.482759,0.263158,0.047619,0.082621,3.185118
3,0.029318,disc_start_y=='center' AND shot_distance_from_...,57,351,15,29,294,0.162393,0.837607,0.517241,0.482759,0.263158,0.047619,0.082621,3.185118
4,0.029318,disc_start_x=='attacking' AND shot_distance_fr...,57,351,15,29,294,0.162393,0.837607,0.517241,0.482759,0.263158,0.047619,0.082621,3.185118
5,0.029107,shot_distance_from_goal<10.52,70,351,16,29,281,0.19943,0.80057,0.551724,0.448276,0.228571,0.046263,0.082621,2.766502
6,0.029107,disc_start_y=='center' AND shot_distance_from_...,70,351,16,29,281,0.19943,0.80057,0.551724,0.448276,0.228571,0.046263,0.082621,2.766502
7,0.029107,disc_start_x=='attacking' AND shot_distance_fr...,70,351,16,29,281,0.19943,0.80057,0.551724,0.448276,0.228571,0.046263,0.082621,2.766502
8,0.029107,disc_start_x=='attacking' AND disc_start_y=='c...,70,351,16,29,281,0.19943,0.80057,0.551724,0.448276,0.228571,0.046263,0.082621,2.766502
9,0.028872,shot_angle_from_goal>=0.63,71,351,16,29,280,0.202279,0.797721,0.551724,0.448276,0.225352,0.046429,0.082621,2.727538


In [57]:
show_coverage(result, WB_shots_df_cp)

0.19373219373219372
0.19373219373219372
0.19373219373219372
0.19373219373219372
0.19373219373219372
0.23076923076923078
0.23076923076923078
0.23076923076923078
0.23076923076923078
0.2621082621082621
Total Coverage: 0.2621


### xG

In [58]:
MC_shots_df_cp = MC_shots_df.copy()
X = MC_shots_df_cp[["bodypart_name", "shot_distance_from_goal", "shot_angle_from_goal"]]
X["bodypart_name"] = X["bodypart_name"].apply(lambda val: 0 if val == "foot_right" else 1 if val == "foot_left" else 2)
MC_shots_df_cp["xg"] = rfc.predict(X=X)

target = ps.BinaryTarget('xg', 1)

search_space = ps.create_selectors(MC_shots_df_cp, ignore=['xg', 'result_id'])

# Configurar a tarefa de descoberta de subgrupos
task = ps.SubgroupDiscoveryTask(MC_shots_df_cp, target, search_space, result_set_size=10, depth=2, qf=ps.WRAccQF())

# Configurar e executar o Beam Search
search_algorithm = ps.BeamSearch()
result = search_algorithm.execute(task)

# Exibir os resultados
for sg_result in result.to_dataframe().head(10).itertuples(index=False):
    print(f"Quality: {sg_result.quality}")
    print(f"Subgroup: {sg_result.subgroup}")
    print(f"Size of Subgroup: {sg_result.size_sg}")
    # print(f"Size of Dataset: {sg_result.size_dataset}")
    # print(f"Positives in Subgroup: {sg_result.positives_sg}")
    # print(f"Positives in Dataset: {sg_result.positives_dataset}")
    # print(f"Size of Complement: {sg_result.size_complement}")
    # print(f"Relative Size of Subgroup: {sg_result.relative_size_sg}")
    # print(f"Relative Size of Complement: {sg_result.relative_size_complement}")
    # print(f"Coverage of Subgroup: {sg_result.coverage_sg}")
    # print(f"Coverage of Complement: {sg_result.coverage_complement}")
    # print(f"Target Share in Subgroup: {sg_result.target_share_sg}")
    # print(f"Target Share in Complement: {sg_result.target_share_complement}")
    # print(f"Target Share in Dataset: {sg_result.target_share_dataset}")
    # print(f"Lift: {sg_result.lift}")
    print("-" * 40)

result.to_dataframe()

Quality: 0.03031278103677302
Subgroup: shot_distance_from_goal<10.90 AND start_x>=96.60
Size of Subgroup: 102
----------------------------------------
Quality: 0.029864497303422083
Subgroup: shot_angle_from_goal>=0.62 AND start_x>=96.60
Size of Subgroup: 82
----------------------------------------
Quality: 0.029344708189291243
Subgroup: disc_start_y=='center' AND start_x>=96.60
Size of Subgroup: 118
----------------------------------------
Quality: 0.029284203636323634
Subgroup: shot_distance_from_goal<10.90
Size of Subgroup: 119
----------------------------------------
Quality: 0.029284203636323634
Subgroup: shot_distance_from_goal<10.90 AND team_id==1625
Size of Subgroup: 119
----------------------------------------
Quality: 0.029284203636323634
Subgroup: disc_start_y=='center' AND shot_distance_from_goal<10.90
Size of Subgroup: 119
----------------------------------------
Quality: 0.029284203636323634
Subgroup: disc_start_x=='attacking' AND shot_distance_from_goal<10.90
Size of Subg

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["bodypart_name"] = X["bodypart_name"].apply(lambda val: 0 if val == "foot_right" else 1 if val == "foot_left" else 2)


Unnamed: 0,quality,subgroup,size_sg,size_dataset,positives_sg,positives_dataset,size_complement,relative_size_sg,relative_size_complement,coverage_sg,coverage_complement,target_share_sg,target_share_complement,target_share_dataset,lift
0,0.030313,shot_distance_from_goal<10.90 AND start_x>=96.60,102,603,22,22,501,0.169154,0.830846,1.0,0.0,0.215686,0.0,0.036484,5.911765
1,0.029864,shot_angle_from_goal>=0.62 AND start_x>=96.60,82,603,21,22,521,0.135987,0.864013,0.954545,0.045455,0.256098,0.001919,0.036484,7.019401
2,0.029345,disc_start_y=='center' AND start_x>=96.60,118,603,22,22,485,0.195688,0.804312,1.0,0.0,0.186441,0.0,0.036484,5.110169
3,0.029284,shot_distance_from_goal<10.90,119,603,22,22,484,0.197347,0.802653,1.0,0.0,0.184874,0.0,0.036484,5.067227
4,0.029284,shot_distance_from_goal<10.90 AND team_id==1625,119,603,22,22,484,0.197347,0.802653,1.0,0.0,0.184874,0.0,0.036484,5.067227
5,0.029284,disc_start_y=='center' AND shot_distance_from_...,119,603,22,22,484,0.197347,0.802653,1.0,0.0,0.184874,0.0,0.036484,5.067227
6,0.029284,disc_start_x=='attacking' AND shot_distance_fr...,119,603,22,22,484,0.197347,0.802653,1.0,0.0,0.184874,0.0,0.036484,5.067227
7,0.028836,shot_angle_from_goal>=0.62 AND shot_distance_f...,99,603,21,22,504,0.164179,0.835821,0.954545,0.045455,0.212121,0.001984,0.036484,5.81405
8,0.02753,start_x>=96.60,148,603,22,22,455,0.245439,0.754561,1.0,0.0,0.148649,0.0,0.036484,4.074324
9,0.02753,start_x>=96.60 AND team_id==1625,148,603,22,22,455,0.245439,0.754561,1.0,0.0,0.148649,0.0,0.036484,4.074324


In [59]:
show_coverage(result, MC_shots_df_cp)

0.1691542288557214
0.1691542288557214
0.1956882255389718
0.22388059701492538
0.22388059701492538
0.22388059701492538
0.22388059701492538
0.22388059701492538
0.2736318407960199
0.2736318407960199
Total Coverage: 0.2736


In [60]:
NC_shots_df_cp = NC_shots_df.copy()
X = NC_shots_df_cp[["bodypart_name", "shot_distance_from_goal", "shot_angle_from_goal"]]
X["bodypart_name"] = X["bodypart_name"].apply(lambda val: 0 if val == "foot_right" else 1 if val == "foot_left" else 2)
NC_shots_df_cp["xg"] = rfc.predict(X=X)

target = ps.BinaryTarget('xg', 1)

search_space = ps.create_selectors(NC_shots_df_cp, ignore=['xg', 'result_id'])

# Configurar a tarefa de descoberta de subgrupos
task = ps.SubgroupDiscoveryTask(NC_shots_df_cp, target, search_space, result_set_size=10, depth=2, qf=ps.WRAccQF())

# Configurar e executar o Beam Search
search_algorithm = ps.BeamSearch()
result = search_algorithm.execute(task)

# Exibir os resultados
for sg_result in result.to_dataframe().head(10).itertuples(index=False):
    print(f"Quality: {sg_result.quality}")
    print(f"Subgroup: {sg_result.subgroup}")
    print(f"Size of Subgroup: {sg_result.size_sg}")
    # print(f"Size of Dataset: {sg_result.size_dataset}")
    # print(f"Positives in Subgroup: {sg_result.positives_sg}")
    # print(f"Positives in Dataset: {sg_result.positives_dataset}")
    # print(f"Size of Complement: {sg_result.size_complement}")
    # print(f"Relative Size of Subgroup: {sg_result.relative_size_sg}")
    # print(f"Relative Size of Complement: {sg_result.relative_size_complement}")
    # print(f"Coverage of Subgroup: {sg_result.coverage_sg}")
    # print(f"Coverage of Complement: {sg_result.coverage_complement}")
    # print(f"Target Share in Subgroup: {sg_result.target_share_sg}")
    # print(f"Target Share in Complement: {sg_result.target_share_complement}")
    # print(f"Target Share in Dataset: {sg_result.target_share_dataset}")
    # print(f"Lift: {sg_result.lift}")
    print("-" * 40)

result.to_dataframe()

Quality: 0.02466983938132064
Subgroup: shot_distance_from_goal<11.04 AND start_y: [33.32:38.08[
Size of Subgroup: 33
----------------------------------------
Quality: 0.02421177870315289
Subgroup: shot_angle_from_goal>=0.61 AND start_y: [33.32:38.08[
Size of Subgroup: 40
----------------------------------------
Quality: 0.02275431290898275
Subgroup: start_x>=96.60 AND start_y: [33.32:38.08[
Size of Subgroup: 25
----------------------------------------
Quality: 0.0221832242712671
Subgroup: shot_angle_from_goal>=0.61 AND shot_distance_from_goal<11.04
Size of Subgroup: 71
----------------------------------------
Quality: 0.021659726353361095
Subgroup: disc_start_x=='attacking' AND start_y: [33.32:38.08[
Size of Subgroup: 79
----------------------------------------
Quality: 0.021463414634146347
Subgroup: shot_distance_from_goal<11.04
Size of Subgroup: 82
----------------------------------------
Quality: 0.021463414634146347
Subgroup: shot_distance_from_goal<11.04 AND team_id==1613
Size of 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["bodypart_name"] = X["bodypart_name"].apply(lambda val: 0 if val == "foot_right" else 1 if val == "foot_left" else 2)


Unnamed: 0,quality,subgroup,size_sg,size_dataset,positives_sg,positives_dataset,size_complement,relative_size_sg,relative_size_complement,coverage_sg,coverage_complement,target_share_sg,target_share_complement,target_share_dataset,lift
0,0.02467,shot_distance_from_goal<11.04 AND start_y: [33...,33,410,11,11,377,0.080488,0.919512,1.0,0.0,0.333333,0.0,0.026829,12.424242
1,0.024212,shot_angle_from_goal>=0.61 AND start_y: [33.32...,40,410,11,11,370,0.097561,0.902439,1.0,0.0,0.275,0.0,0.026829,10.25
2,0.022754,start_x>=96.60 AND start_y: [33.32:38.08[,25,410,10,11,385,0.060976,0.939024,0.909091,0.090909,0.4,0.002597,0.026829,14.909091
3,0.022183,shot_angle_from_goal>=0.61 AND shot_distance_f...,71,410,11,11,339,0.173171,0.826829,1.0,0.0,0.15493,0.0,0.026829,5.774648
4,0.02166,disc_start_x=='attacking' AND start_y: [33.32:...,79,410,11,11,331,0.192683,0.807317,1.0,0.0,0.139241,0.0,0.026829,5.189873
5,0.021463,shot_distance_from_goal<11.04,82,410,11,11,328,0.2,0.8,1.0,0.0,0.134146,0.0,0.026829,5.0
6,0.021463,shot_distance_from_goal<11.04 AND team_id==1613,82,410,11,11,328,0.2,0.8,1.0,0.0,0.134146,0.0,0.026829,5.0
7,0.021463,shot_angle_from_goal>=0.61,82,410,11,11,328,0.2,0.8,1.0,0.0,0.134146,0.0,0.026829,5.0
8,0.021463,shot_angle_from_goal>=0.61 AND team_id==1613,82,410,11,11,328,0.2,0.8,1.0,0.0,0.134146,0.0,0.026829,5.0
9,0.021463,disc_start_y=='center' AND shot_distance_from_...,82,410,11,11,328,0.2,0.8,1.0,0.0,0.134146,0.0,0.026829,5.0


In [61]:
show_coverage(result, NC_shots_df_cp)

0.08048780487804878
0.0975609756097561
0.0975609756097561
0.1902439024390244
0.28536585365853656
0.3121951219512195
0.3121951219512195
0.32195121951219513
0.32195121951219513
0.32195121951219513
Total Coverage: 0.3220


In [62]:
WB_shots_df_cp = WB_shots_df.copy()
X = WB_shots_df_cp[["bodypart_name", "shot_distance_from_goal", "shot_angle_from_goal"]]
X["bodypart_name"] = X["bodypart_name"].apply(lambda val: 0 if val == "foot_right" else 1 if val == "foot_left" else 2)
WB_shots_df_cp["xg"] = rfc.predict(X=X)

target = ps.BinaryTarget('xg', 1)

search_space = ps.create_selectors(WB_shots_df_cp, ignore=['xg', 'result_id'])

# Configurar a tarefa de descoberta de subgrupos
task = ps.SubgroupDiscoveryTask(WB_shots_df_cp, target, search_space, result_set_size=10, depth=2, qf=ps.WRAccQF())

# Configurar e executar o Beam Search
search_algorithm = ps.BeamSearch()
result = search_algorithm.execute(task)

# Exibir os resultados
for sg_result in result.to_dataframe().head(10).itertuples(index=False):
    print(f"Quality: {sg_result.quality}")
    print(f"Subgroup: {sg_result.subgroup}")
    print(f"Size of Subgroup: {sg_result.size_sg}")
    # print(f"Size of Dataset: {sg_result.size_dataset}")
    # print(f"Positives in Subgroup: {sg_result.positives_sg}")
    # print(f"Positives in Dataset: {sg_result.positives_dataset}")
    # print(f"Size of Complement: {sg_result.size_complement}")
    # print(f"Relative Size of Subgroup: {sg_result.relative_size_sg}")
    # print(f"Relative Size of Complement: {sg_result.relative_size_complement}")
    # print(f"Coverage of Subgroup: {sg_result.coverage_sg}")
    # print(f"Coverage of Complement: {sg_result.coverage_complement}")
    # print(f"Target Share in Subgroup: {sg_result.target_share_sg}")
    # print(f"Target Share in Complement: {sg_result.target_share_complement}")
    # print(f"Target Share in Dataset: {sg_result.target_share_dataset}")
    # print(f"Lift: {sg_result.lift}")
    print("-" * 40)

result.to_dataframe()

Quality: 0.030705919594808485
Subgroup: shot_angle_from_goal>=0.63 AND shot_distance_from_goal<10.52
Size of Subgroup: 60
----------------------------------------
Quality: 0.02965073335443706
Subgroup: shot_distance_from_goal<10.52
Size of Subgroup: 70
----------------------------------------
Quality: 0.02965073335443706
Subgroup: shot_distance_from_goal<10.52 AND team_id==1627
Size of Subgroup: 70
----------------------------------------
Quality: 0.02965073335443706
Subgroup: disc_start_y=='center' AND shot_distance_from_goal<10.52
Size of Subgroup: 70
----------------------------------------
Quality: 0.02965073335443706
Subgroup: disc_start_x=='attacking' AND shot_distance_from_goal<10.52
Size of Subgroup: 70
----------------------------------------
Quality: 0.029545214730399918
Subgroup: shot_angle_from_goal>=0.63
Size of Subgroup: 71
----------------------------------------
Quality: 0.029545214730399918
Subgroup: shot_angle_from_goal>=0.63 AND team_id==1627
Size of Subgroup: 71
---

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["bodypart_name"] = X["bodypart_name"].apply(lambda val: 0 if val == "foot_right" else 1 if val == "foot_left" else 2)


Unnamed: 0,quality,subgroup,size_sg,size_dataset,positives_sg,positives_dataset,size_complement,relative_size_sg,relative_size_complement,coverage_sg,coverage_complement,target_share_sg,target_share_complement,target_share_dataset,lift
0,0.030706,shot_angle_from_goal>=0.63 AND shot_distance_f...,60,351,13,13,291,0.17094,0.82906,1.0,0.0,0.216667,0.0,0.037037,5.85
1,0.029651,shot_distance_from_goal<10.52,70,351,13,13,281,0.19943,0.80057,1.0,0.0,0.185714,0.0,0.037037,5.014286
2,0.029651,shot_distance_from_goal<10.52 AND team_id==1627,70,351,13,13,281,0.19943,0.80057,1.0,0.0,0.185714,0.0,0.037037,5.014286
3,0.029651,disc_start_y=='center' AND shot_distance_from_...,70,351,13,13,281,0.19943,0.80057,1.0,0.0,0.185714,0.0,0.037037,5.014286
4,0.029651,disc_start_x=='attacking' AND shot_distance_fr...,70,351,13,13,281,0.19943,0.80057,1.0,0.0,0.185714,0.0,0.037037,5.014286
5,0.029545,shot_angle_from_goal>=0.63,71,351,13,13,280,0.202279,0.797721,1.0,0.0,0.183099,0.0,0.037037,4.943662
6,0.029545,shot_angle_from_goal>=0.63 AND team_id==1627,71,351,13,13,280,0.202279,0.797721,1.0,0.0,0.183099,0.0,0.037037,4.943662
7,0.029545,disc_start_y=='center' AND shot_angle_from_goa...,71,351,13,13,280,0.202279,0.797721,1.0,0.0,0.183099,0.0,0.037037,4.943662
8,0.029545,disc_start_x=='attacking' AND shot_angle_from_...,71,351,13,13,280,0.202279,0.797721,1.0,0.0,0.183099,0.0,0.037037,4.943662
9,0.029229,shot_angle_from_goal>=0.63 AND start_x>=96.60,47,351,12,13,304,0.133903,0.866097,0.923077,0.076923,0.255319,0.003289,0.037037,6.893617


In [63]:
show_coverage(result, WB_shots_df_cp)

0.17094017094017094
0.19943019943019943
0.19943019943019943
0.19943019943019943
0.19943019943019943
0.23076923076923078
0.23076923076923078
0.23076923076923078
0.23076923076923078
0.23076923076923078
Total Coverage: 0.2308


### VAEP

In [64]:
all_action_values_cp = all_action_values.copy()
all_action_values_cp.drop(["original_event_id", "result_name", "action_id", "type_name", "end_x", "end_y"], inplace=True, axis=1)

MC_shots_df_cp = MC_shots_df.copy()
NC_shots_df_cp = NC_shots_df.copy()
WB_shots_df_cp = WB_shots_df.copy()

MC_shots_df_cp = MC_shots_df_cp.merge(all_action_values_cp, on=['game_id', 'player_id', 'start_x', 'start_y'], how='left')
NC_shots_df_cp = NC_shots_df_cp.merge(all_action_values_cp, on=['game_id', 'player_id', 'start_x', 'start_y'], how='left')
WB_shots_df_cp = WB_shots_df_cp.merge(all_action_values_cp, on=['game_id', 'player_id', 'start_x', 'start_y'], how='left')


In [67]:
MC_shots_df_cp.drop(["game_id", "period_id", "team_id", "player_id", "time_seconds", "result_id"], inplace=True, axis=1)

# Definir o alvo (target) da descoberta de subgrupos
# Aqui, vamos assumir que queremos encontrar subgrupos de chutes bem-sucedidos (result_name == 'Goal')
target = ps.NumericTarget('Pscores')

search_space = ps.create_selectors(MC_shots_df_cp, ignore=["Pscores", "Pconcedes", "offensive_value", "defensive_value", "vaep_value"])

# Configurar a tarefa de descoberta de subgrupos
task = ps.SubgroupDiscoveryTask(MC_shots_df_cp, target, search_space, result_set_size=10, depth=2, qf=ps.StandardQFNumeric(1.0))
# Configurar e executar o Beam Search
search_algorithm = ps.BeamSearch()

result = search_algorithm.execute(task)

# Exibir os resultados
for sg_result in result.to_dataframe().head(10).itertuples(index=False):
    print(f"Quality: {sg_result.quality}")
    print(f"Subgroup: {sg_result.subgroup}")
    # print(f"Size of Subgroup: {sg_result.size_sg}")
    # print(f"Size of Dataset: {sg_result.size_dataset}")
    # print(f"Positives in Subgroup: {sg_result.positives_sg}")
    # print(f"Positives in Dataset: {sg_result.positives_dataset}")
    # print(f"Size of Complement: {sg_result.size_complement}")
    # print(f"Relative Size of Subgroup: {sg_result.relative_size_sg}")
    # print(f"Relative Size of Complement: {sg_result.relative_size_complement}")
    # print(f"Coverage of Subgroup: {sg_result.coverage_sg}")
    # print(f"Coverage of Complement: {sg_result.coverage_complement}")
    # print(f"Target Share in Subgroup: {sg_result.target_share_sg}")
    # print(f"Target Share in Complement: {sg_result.target_share_complement}")
    # print(f"Target Share in Dataset: {sg_result.target_share_dataset}")
    # print(f"Lift: {sg_result.lift}")
    print("-" * 40)

result.to_dataframe()

Quality: 29.6915140748024
Subgroup: shot_distance_from_goal<11.03
----------------------------------------
Quality: 29.6915140748024
Subgroup: disc_start_y=='center' AND shot_distance_from_goal<11.03
----------------------------------------
Quality: 29.6915140748024
Subgroup: disc_start_x=='attacking' AND shot_distance_from_goal<11.03
----------------------------------------
Quality: 29.60809600353241
Subgroup: shot_angle_from_goal>=0.61
----------------------------------------
Quality: 29.60809600353241
Subgroup: disc_start_y=='center' AND shot_angle_from_goal>=0.61
----------------------------------------
Quality: 29.60809600353241
Subgroup: disc_start_x=='attacking' AND shot_angle_from_goal>=0.61
----------------------------------------
Quality: 28.32775890827179
Subgroup: disc_start_y=='center' AND start_x>=96.60
----------------------------------------
Quality: 26.88896682858467
Subgroup: shot_angle_from_goal>=0.61 AND shot_distance_from_goal<11.03
--------------------------------

Unnamed: 0,quality,subgroup,size_sg,size_dataset,mean_sg,mean_dataset,std_sg,std_dataset,median_sg,median_dataset,max_sg,max_dataset,min_sg,min_dataset,mean_lift,median_lift
0,29.691514,shot_distance_from_goal<11.03,122,610,0.41765,0.174277,0.463023,0.344937,0.046805,0.028802,0.994042,0.994042,0.020629,0.00368,2.396475,1.625026
1,29.691514,disc_start_y=='center' AND shot_distance_from_...,122,610,0.41765,0.174277,0.463023,0.344937,0.046805,0.028802,0.994042,0.994042,0.020629,0.00368,2.396475,1.625026
2,29.691514,disc_start_x=='attacking' AND shot_distance_fr...,122,610,0.41765,0.174277,0.463023,0.344937,0.046805,0.028802,0.994042,0.994042,0.020629,0.00368,2.396475,1.625026
3,29.608096,shot_angle_from_goal>=0.61,122,610,0.416966,0.174277,0.463104,0.344937,0.046295,0.028802,0.994042,0.994042,0.020629,0.00368,2.392551,1.607331
4,29.608096,disc_start_y=='center' AND shot_angle_from_goa...,122,610,0.416966,0.174277,0.463104,0.344937,0.046295,0.028802,0.994042,0.994042,0.020629,0.00368,2.392551,1.607331
5,29.608096,disc_start_x=='attacking' AND shot_angle_from_...,122,610,0.416966,0.174277,0.463104,0.344937,0.046295,0.028802,0.994042,0.994042,0.020629,0.00368,2.392551,1.607331
6,28.327759,disc_start_y=='center' AND start_x>=96.60,118,610,0.414343,0.174277,0.46299,0.344937,0.046805,0.028802,0.994042,0.994042,0.018457,0.00368,2.377497,1.625026
7,26.888967,shot_angle_from_goal>=0.61 AND shot_distance_f...,101,610,0.440504,0.174277,0.466797,0.344937,0.048291,0.028802,0.994042,0.994042,0.020629,0.00368,2.527613,1.676618
8,26.594014,start_x>=96.60,151,610,0.350396,0.174277,0.44533,0.344937,0.041428,0.028802,0.994042,0.994042,0.018457,0.00368,2.010572,1.438346
9,26.594014,disc_start_x=='attacking' AND start_x>=96.60,151,610,0.350396,0.174277,0.44533,0.344937,0.041428,0.028802,0.994042,0.994042,0.018457,0.00368,2.010572,1.438346


In [68]:
show_coverage(result, MC_shots_df_cp)

0.2
0.2
0.2
0.23442622950819672
0.23442622950819672
0.23442622950819672
0.25901639344262295
0.25901639344262295
0.31311475409836065
0.31311475409836065
Total Coverage: 0.3131


In [70]:
NC_shots_df_cp.drop(["game_id", "period_id", "team_id", "player_id", "time_seconds", "result_id"], inplace=True, axis=1)

# Definir o alvo (target) da descoberta de subgrupos
# Aqui, vamos assumir que queremos encontrar subgrupos de chutes bem-sucedidos (result_name == 'Goal')
target = ps.NumericTarget('Pscores')

search_space = ps.create_selectors(NC_shots_df_cp, ignore=["Pscores", "Pconcedes", "offensive_value", "defensive_value", "vaep_value"])

# Configurar a tarefa de descoberta de subgrupos
task = ps.SubgroupDiscoveryTask(NC_shots_df_cp, target, search_space, result_set_size=10, depth=2, qf=ps.StandardQFNumeric(1.0))
# Configurar e executar o Beam Search
search_algorithm = ps.BeamSearch()

result = search_algorithm.execute(task)

# Exibir os resultados
for sg_result in result.to_dataframe().head(10).itertuples(index=False):
    print(f"Quality: {sg_result.quality}")
    print(f"Subgroup: {sg_result.subgroup}")
    # print(f"Size of Subgroup: {sg_result.size_sg}")
    # print(f"Size of Dataset: {sg_result.size_dataset}")
    # print(f"Positives in Subgroup: {sg_result.positives_sg}")
    # print(f"Positives in Dataset: {sg_result.positives_dataset}")
    # print(f"Size of Complement: {sg_result.size_complement}")
    # print(f"Relative Size of Subgroup: {sg_result.relative_size_sg}")
    # print(f"Relative Size of Complement: {sg_result.relative_size_complement}")
    # print(f"Coverage of Subgroup: {sg_result.coverage_sg}")
    # print(f"Coverage of Complement: {sg_result.coverage_complement}")
    # print(f"Target Share in Subgroup: {sg_result.target_share_sg}")
    # print(f"Target Share in Complement: {sg_result.target_share_complement}")
    # print(f"Target Share in Dataset: {sg_result.target_share_dataset}")
    # print(f"Lift: {sg_result.lift}")
    print("-" * 40)

result.to_dataframe()

Quality: 16.405627071857452
Subgroup: shot_distance_from_goal<11.04
----------------------------------------
Quality: 16.405627071857452
Subgroup: disc_start_y=='center' AND shot_distance_from_goal<11.04
----------------------------------------
Quality: 16.405627071857452
Subgroup: disc_start_x=='attacking' AND shot_distance_from_goal<11.04
----------------------------------------
Quality: 16.09186625480652
Subgroup: disc_start_y=='center' AND start_x>=96.60
----------------------------------------
Quality: 15.50608491897583
Subgroup: start_x>=96.60
----------------------------------------
Quality: 15.50608491897583
Subgroup: disc_start_x=='attacking' AND start_x>=96.60
----------------------------------------
Quality: 15.489017486572266
Subgroup: shot_angle_from_goal>=0.61
----------------------------------------
Quality: 15.489017486572266
Subgroup: disc_start_y=='center' AND shot_angle_from_goal>=0.61
----------------------------------------
Quality: 15.489017486572266
Subgroup: dis

Unnamed: 0,quality,subgroup,size_sg,size_dataset,mean_sg,mean_dataset,std_sg,std_dataset,median_sg,median_dataset,max_sg,max_dataset,min_sg,min_dataset,mean_lift,median_lift
0,16.405627,shot_distance_from_goal<11.04,82,410,0.310007,0.109939,0.431152,0.274172,0.034518,0.024189,0.988178,0.988178,0.023678,0.003785,2.819821,1.427021
1,16.405627,disc_start_y=='center' AND shot_distance_from_...,82,410,0.310007,0.109939,0.431152,0.274172,0.034518,0.024189,0.988178,0.988178,0.023678,0.003785,2.819821,1.427021
2,16.405627,disc_start_x=='attacking' AND shot_distance_fr...,82,410,0.310007,0.109939,0.431152,0.274172,0.034518,0.024189,0.988178,0.988178,0.023678,0.003785,2.819821,1.427021
3,16.091866,disc_start_y=='center' AND start_x>=96.60,74,410,0.327396,0.109939,0.438448,0.274172,0.035051,0.024189,0.988178,0.988178,0.023678,0.003785,2.977991,1.449084
4,15.506085,start_x>=96.60,82,410,0.299037,0.109939,0.425354,0.274172,0.035051,0.024189,0.988178,0.988178,0.023678,0.003785,2.720038,1.449084
5,15.506085,disc_start_x=='attacking' AND start_x>=96.60,82,410,0.299037,0.109939,0.425354,0.274172,0.035051,0.024189,0.988178,0.988178,0.023678,0.003785,2.720038,1.449084
6,15.489017,shot_angle_from_goal>=0.61,82,410,0.298829,0.109939,0.425491,0.274172,0.03489,0.024189,0.988178,0.988178,0.023678,0.003785,2.718144,1.442411
7,15.489017,disc_start_y=='center' AND shot_angle_from_goa...,82,410,0.298829,0.109939,0.425491,0.274172,0.03489,0.024189,0.988178,0.988178,0.023678,0.003785,2.718144,1.442411
8,15.489017,disc_start_x=='attacking' AND shot_angle_from_...,82,410,0.298829,0.109939,0.425491,0.274172,0.03489,0.024189,0.988178,0.988178,0.023678,0.003785,2.718144,1.442411
9,15.372849,shot_angle_from_goal>=0.61 AND shot_distance_f...,71,410,0.326458,0.109939,0.438107,0.274172,0.034755,0.024189,0.988178,0.988178,0.023678,0.003785,2.969453,1.436836


In [71]:
show_coverage(result, NC_shots_df_cp)

0.2
0.2
0.2
0.21707317073170732
0.23658536585365852
0.23658536585365852
0.2634146341463415
0.2634146341463415
0.2634146341463415
0.2634146341463415
Total Coverage: 0.2634


In [72]:
WB_shots_df_cp.drop(["game_id", "period_id", "team_id", "player_id", "time_seconds", "result_id"], inplace=True, axis=1)

# Definir o alvo (target) da descoberta de subgrupos
# Aqui, vamos assumir que queremos encontrar subgrupos de chutes bem-sucedidos (result_name == 'Goal')
target = ps.NumericTarget('Pscores')

search_space = ps.create_selectors(WB_shots_df_cp, ignore=["Pscores", "Pconcedes", "offensive_value", "defensive_value", "vaep_value"])

# Configurar a tarefa de descoberta de subgrupos
task = ps.SubgroupDiscoveryTask(WB_shots_df_cp, target, search_space, result_set_size=10, depth=2, qf=ps.StandardQFNumeric(1.0))
# Configurar e executar o Beam Search
search_algorithm = ps.BeamSearch()

result = search_algorithm.execute(task)

# Exibir os resultados
for sg_result in result.to_dataframe().head(10).itertuples(index=False):
    print(f"Quality: {sg_result.quality}")
    print(f"Subgroup: {sg_result.subgroup}")
    # print(f"Size of Subgroup: {sg_result.size_sg}")
    # print(f"Size of Dataset: {sg_result.size_dataset}")
    # print(f"Positives in Subgroup: {sg_result.positives_sg}")
    # print(f"Positives in Dataset: {sg_result.positives_dataset}")
    # print(f"Size of Complement: {sg_result.size_complement}")
    # print(f"Relative Size of Subgroup: {sg_result.relative_size_sg}")
    # print(f"Relative Size of Complement: {sg_result.relative_size_complement}")
    # print(f"Coverage of Subgroup: {sg_result.coverage_sg}")
    # print(f"Coverage of Complement: {sg_result.coverage_complement}")
    # print(f"Target Share in Subgroup: {sg_result.target_share_sg}")
    # print(f"Target Share in Complement: {sg_result.target_share_complement}")
    # print(f"Target Share in Dataset: {sg_result.target_share_dataset}")
    # print(f"Lift: {sg_result.lift}")
    print("-" * 40)

result.to_dataframe()

Quality: 10.38562297821045
Subgroup: disc_start_y=='center' AND start_x>=96.60
----------------------------------------
Quality: 10.211431980133057
Subgroup: shot_distance_from_goal<10.52
----------------------------------------
Quality: 10.211431980133057
Subgroup: disc_start_y=='center' AND shot_distance_from_goal<10.52
----------------------------------------
Quality: 10.211431980133057
Subgroup: disc_start_x=='attacking' AND shot_distance_from_goal<10.52
----------------------------------------
Quality: 10.207541853189468
Subgroup: shot_distance_from_goal<10.52 AND start_x>=96.60
----------------------------------------
Quality: 10.141963601112366
Subgroup: shot_angle_from_goal>=0.63
----------------------------------------
Quality: 10.141963601112366
Subgroup: disc_start_y=='center' AND shot_angle_from_goal>=0.63
----------------------------------------
Quality: 10.141963601112366
Subgroup: disc_start_x=='attacking' AND shot_angle_from_goal>=0.63
----------------------------------

Unnamed: 0,quality,subgroup,size_sg,size_dataset,mean_sg,mean_dataset,std_sg,std_dataset,median_sg,median_dataset,max_sg,max_dataset,min_sg,min_dataset,mean_lift,median_lift
0,10.385623,disc_start_y=='center' AND start_x>=96.60,68,354,0.255524,0.102794,0.401368,0.261647,0.033515,0.025747,0.983416,0.983416,0.019465,0.004961,2.485782,1.301677
1,10.211432,shot_distance_from_goal<10.52,70,354,0.248672,0.102794,0.397485,0.261647,0.033162,0.025747,0.983416,0.983416,0.019465,0.004961,2.419124,1.287981
2,10.211432,disc_start_y=='center' AND shot_distance_from_...,70,354,0.248672,0.102794,0.397485,0.261647,0.033162,0.025747,0.983416,0.983416,0.019465,0.004961,2.419124,1.287981
3,10.211432,disc_start_x=='attacking' AND shot_distance_fr...,70,354,0.248672,0.102794,0.397485,0.261647,0.033162,0.025747,0.983416,0.983416,0.019465,0.004961,2.419124,1.287981
4,10.207542,shot_distance_from_goal<10.52 AND start_x>=96.60,57,354,0.281874,0.102794,0.416593,0.261647,0.033802,0.025747,0.983416,0.983416,0.019465,0.004961,2.742119,1.312838
5,10.141964,shot_angle_from_goal>=0.63,71,354,0.245639,0.102794,0.395517,0.261647,0.033536,0.025747,0.983416,0.983416,0.019465,0.004961,2.389617,1.30248
6,10.141964,disc_start_y=='center' AND shot_angle_from_goa...,71,354,0.245639,0.102794,0.395517,0.261647,0.033536,0.025747,0.983416,0.983416,0.019465,0.004961,2.389617,1.30248
7,10.141964,disc_start_x=='attacking' AND shot_angle_from_...,71,354,0.245639,0.102794,0.395517,0.261647,0.033536,0.025747,0.983416,0.983416,0.019465,0.004961,2.389617,1.30248
8,9.412968,start_x>=96.60,81,354,0.219004,0.102794,0.37713,0.261647,0.03283,0.025747,0.983416,0.983416,0.015445,0.004961,2.130507,1.275088
9,9.412968,disc_start_x=='attacking' AND start_x>=96.60,81,354,0.219004,0.102794,0.37713,0.261647,0.03283,0.025747,0.983416,0.983416,0.015445,0.004961,2.130507,1.275088


In [73]:
show_coverage(result, WB_shots_df_cp)

0.192090395480226
0.2288135593220339
0.2288135593220339
0.2288135593220339
0.2288135593220339
0.2627118644067797
0.2627118644067797
0.2627118644067797
0.2994350282485876
0.2994350282485876
Total Coverage: 0.2994
