In [1]:
import pandas as pd
import numpy as np
from math import sqrt
import os
import pysubgroup as ps

## Tem que clonar o Repositório (dentro de projeto-ad)

https://github.com/HMProenca/SSDpp-numeric

In [2]:
# Folder holding one SPADL-formatted CSV per league, and the leagues to load.
PATH: str = "../data/spadl_format/"
LEAGUES: list[str] = ["England", "Spain", "France", "Italy", "Germany"]

# Wyscout reference tables (teams, players, per-match player rank scores).
teams_df = pd.read_json("../data/wyscout/teams/teams.json")
players_df = pd.read_json("../data/wyscout/players/players.json")
ranking_df = pd.read_json('../data/wyscout/playeranks/playeranks.json')

# Make sure the output folder for processed data exists.
if not os.path.exists("../data/processed/"):
    os.mkdir("../data/processed")

# One event frame per league, without the columns this notebook does not use.
df_dict = {}
for league in LEAGUES:
    df = pd.read_csv(f"{PATH}{league}.csv", index_col=0).drop(
        columns=["original_event_id", "result_name", "bodypart_id", "type_id"]
    )
    df_dict[league] = df

# Stack all leagues; each league keeps its own row index, so index values repeat.
all_df = pd.concat(list(df_dict.values()))
all_df

Unnamed: 0,game_id,period_id,time_seconds,team_id,player_id,start_x,start_y,end_x,end_y,result_id,action_id,type_name,bodypart_name,player_name
0,2499719,1,2.758649,1609,25413,51.45,34.68,32.55,14.96,1,0,pass,foot,A. Lacazette
1,2499719,1,4.946850,1609,370224,32.55,14.96,53.55,17.00,1,1,pass,foot,R. Holding
2,2499719,1,6.542188,1609,3319,53.55,17.00,36.75,19.72,1,2,pass,head,M. Özil
3,2499719,1,8.143395,1609,120339,36.75,19.72,43.05,3.40,1,3,pass,head,Mohamed Elneny
4,2499719,1,10.302366,1609,167145,43.05,3.40,75.60,8.16,1,4,pass,foot,Bellerín
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
389146,2517044,2,2817.761761,2463,94831,87.15,36.72,87.15,36.72,0,1148,shot,foot_right,Y. Ōsako
389147,2517044,2,2818.280436,2451,14917,17.85,31.96,23.10,27.20,1,1149,interception,foot,R. Knoche
389148,2517044,2,2823.180681,2451,14804,23.10,27.20,0.00,3.40,1,1150,dribble,foot,J. Błaszczykowski
389149,2517044,2,2828.080925,2451,14804,0.00,3.40,5.25,4.76,0,1151,pass,foot,J. Błaszczykowski


In [3]:
# List the distinct action types present in the combined event data.
print(all_df["type_name"].unique())

['pass' 'interception' 'dribble' 'take_on' 'tackle' 'foul'
 'freekick_short' 'cross' 'shot' 'clearance' 'throw_in' 'goalkick'
 'corner_short' 'corner_crossed' 'keeper_save' 'freekick_crossed'
 'shot_freekick' 'bad_touch' 'shot_penalty']


## Pre-process

In [4]:
# Reference points on the goal line (x = 105), in the same coordinates as
# start_x / start_y: the goal centre and the two ends of the goal mouth
# (named "crossbar" here) used to measure the shot angle.
GOAL_CENTER_X: int = 105
GOAL_CENTER_Y: int = 34

UPPER_CROSSBAR_X: int = 105
UPPER_CROSSBAR_Y: int = 38

LOWER_CROSSBAR_X: int = 105
LOWER_CROSSBAR_Y: int = 30


def get_shot_angle(shot_pos_x, shot_pos_y):
    """Return the angle (radians, in [0, pi]) subtended by the goal mouth at the shot location.

    The angle is measured between the two vectors that go from the shot
    position to (UPPER_CROSSBAR_X, UPPER_CROSSBAR_Y) and to
    (LOWER_CROSSBAR_X, LOWER_CROSSBAR_Y).

    Parameters
    ----------
    shot_pos_x, shot_pos_y : float
        Shot coordinates.

    Returns
    -------
    float
        The angle in radians. 0.0 is returned when the shot position coincides
        with one of the two reference points (the angle is undefined there and
        the plain formula would divide by zero). NaN inputs still yield NaN.
    """
    v1 = np.array([UPPER_CROSSBAR_X - shot_pos_x, UPPER_CROSSBAR_Y - shot_pos_y], dtype=float)
    v2 = np.array([LOWER_CROSSBAR_X - shot_pos_x, LOWER_CROSSBAR_Y - shot_pos_y], dtype=float)
    norm1 = np.linalg.norm(v1)
    norm2 = np.linalg.norm(v2)

    # Degenerate case: standing exactly on a reference point -> zero-length vector.
    if norm1 == 0 or norm2 == 0:
        return 0.0

    cos_angle = np.dot(v1 / norm1, v2 / norm2)
    # Rounding can push the cosine marginally outside [-1, 1], which would make
    # arccos return NaN; clip it back into the valid domain.
    return float(np.arccos(np.clip(cos_angle, -1.0, 1.0)))

def calcular_media_global():
    """Return the minutes-weighted mean of ``playerankScore`` over ``ranking_df``.

    Only rows of the module-level ``ranking_df`` whose ``playerankScore`` is
    not null take part; each score is weighted by that row's ``minutesPlayed``.
    """
    # Keep only the rows that carry a valid score.
    has_score = ranking_df['playerankScore'].notna()
    scored_rows = ranking_df[has_score]
    minutes = scored_rows['minutesPlayed']
    weighted_total = (scored_rows['playerankScore'] * minutes).sum()
    return weighted_total / minutes.sum()

# Global minutes-weighted rank, computed once; calcular_ranking_medio falls back to it
# for players that have no rows in ranking_df.
media_rank_global = calcular_media_global()

def calcular_ranking_medio(player_id, rankings=None, default=None):
    """Return a player's minutes-weighted mean ``playerankScore``.

    Parameters
    ----------
    player_id
        Value matched against the ``playerId`` column.
    rankings : pandas.DataFrame, optional
        Table with ``playerId``, ``playerankScore`` and ``minutesPlayed``
        columns. Defaults to the module-level ``ranking_df``.
    default : float, optional
        Value returned when no usable ranking exists for the player.
        Defaults to the module-level ``media_rank_global``.

    Returns
    -------
    float
        The weighted mean, or ``default`` when the player has no rows, only
        null scores, or no positive total of minutes (which would otherwise
        produce NaN / a division by zero).
    """
    if rankings is None:
        rankings = ranking_df

    # Ignore null scores, consistently with calcular_media_global; otherwise their
    # minutes would still count in the denominator and bias the mean downwards.
    is_player = rankings['playerId'] == player_id
    player_rows = rankings[is_player & rankings['playerankScore'].notna()]
    total_minutes = player_rows['minutesPlayed'].sum()

    if player_rows.empty or not total_minutes > 0:
        return media_rank_global if default is None else default

    weighted_total = (player_rows['playerankScore'] * player_rows['minutesPlayed']).sum()
    return weighted_total / total_minutes

def calculate_distance(x1, y1, x2, y2):
    """Return the Euclidean distance between points (x1, y1) and (x2, y2)."""
    delta_x = x2 - x1
    delta_y = y2 - y1
    return sqrt(delta_x ** 2 + delta_y ** 2)

def discretize_start_x(x):
    """Map an x coordinate to one of four zone labels.

    Zones, by exclusive upper bound: 'defensive' (< 26.25),
    'pre-defensive' (< 52.5), 'pre-attacking' (< 78.75); everything else,
    including values that compare false to every bound, is 'attacking'.
    """
    zone_upper_bounds = (
        (26.25, 'defensive'),
        (52.5, 'pre-defensive'),
        (78.75, 'pre-attacking'),
    )
    for upper_bound, zone in zone_upper_bounds:
        if x < upper_bound:
            return zone
    return 'attacking'

def discretize_start_y(y):
    """Map a y coordinate to 'left' (< 22.67), 'center' (< 45.33) or 'right' (otherwise)."""
    return 'left' if y < 22.67 else ('center' if y < 45.33 else 'right')

def generate_shots_with_counts_events(df):
    """Build one row per shot with features describing the play that led to it.

    Events are processed per (game_id, period_id) in ``time_seconds`` order.
    For every event whose ``type_name`` is exactly ``'shot'`` (so
    ``'shot_freekick'`` and ``'shot_penalty'`` are not matched), the *play* is
    the trailing run of consecutive events by the shooting team that ends with
    the shot. A play never reaches back past the previous shot of the period.

    Parameters
    ----------
    df : pandas.DataFrame
        Event table with at least the columns game_id, period_id,
        time_seconds, team_id, player_id, start_x, start_y, type_name,
        bodypart_name and result_id.

    Returns
    -------
    pandas.DataFrame
        One row per shot: identifiers and location of the shot, counts,
        duration, distance and speed features of the play, the shooter's rank
        (``calcular_ranking_medio``), ``shot_distance_from_goal``,
        ``shot_angle_from_goal`` and, last, the shot's ``result_id``.
        When ``df`` contains no shots an empty frame with the same columns is
        returned instead of raising ``KeyError``.
    """
    feature_columns = [
        'game_id', 'period_id', 'team_id', 'player_id', 'time_seconds',
        'start_x', 'start_y', 'num_events', 'num_passes', 'num_dribbles',
        'play_duration', 'player_rank', 'bodypart_name', 'play_distance',
        'play_mean_distance_to_the_goal', 'play_std_distance_to_the_goal',
        'play_distance_towards_goal', 'ratio_distance', 'total_time_per_play',
        'play_speed', 'play_speed_towards_goal',
    ]
    shot_data = []
    result_ids = []
    rank_cache = {}  # player_id -> rank, so each player is looked up only once

    for (game_id, period_id), group in df.groupby(['game_id', 'period_id']):
        group = group.sort_values(by='time_seconds').reset_index(drop=True)

        # Locate all shots once instead of rescanning the group after every shot.
        shot_positions = np.flatnonzero((group['type_name'] == 'shot').to_numpy())
        window_start = 0  # first event after the previous shot

        for shot_index in map(int, shot_positions):
            shot_row = group.loc[shot_index]

            # The play starts right after the last event by another team
            # inside the window (label slices on .loc are inclusive).
            window = group.loc[window_start:shot_index]
            other_team = (window['team_id'] != shot_row['team_id']).to_numpy()
            other_team_idx = window.index[other_team]
            start_index = int(other_team_idx[-1]) + 1 if len(other_team_idx) else window_start
            play_events = group.loc[start_index:shot_index]
            num_events = len(play_events)

            player_id = shot_row['player_id']
            if player_id not in rank_cache:
                rank_cache[player_id] = calcular_ranking_medio(player_id)
            player_rank = rank_cache[player_id]

            # Walk consecutive event start positions of the play.
            xs = play_events['start_x'].to_numpy()
            ys = play_events['start_y'].to_numpy()
            play_distance = 0
            play_distance_towards_goal = 0
            play_distances_to_goal = []
            for x1, y1, x2, y2 in zip(xs[:-1], ys[:-1], xs[1:], ys[1:]):
                play_distance += calculate_distance(x1, y1, x2, y2)
                play_distance_towards_goal += x2 - x1  # signed progress along x
                play_distances_to_goal.append(
                    calculate_distance(x2, y2, GOAL_CENTER_X, GOAL_CENTER_Y)
                )

            # The first event of the play is not part of the distance-to-goal
            # statistics; a single-event play therefore gets zeros.
            if play_distances_to_goal:
                play_mean_distance_to_the_goal = np.mean(play_distances_to_goal)
                play_std_distance_to_the_goal = np.std(play_distances_to_goal)
            else:
                play_mean_distance_to_the_goal = 0
                play_std_distance_to_the_goal = 0

            play_duration = play_events['time_seconds'].iloc[-1] - play_events['time_seconds'].iloc[0]
            ratio_distance = play_distance_towards_goal / play_distance if play_distance != 0 else 0
            total_time_per_play = play_duration / num_events if num_events != 0 else 0
            play_speed = play_distance / play_duration if play_duration != 0 else 0
            play_speed_towards_goal = play_distance_towards_goal / play_duration if play_duration != 0 else 0

            shot_data.append({
                'game_id': game_id,
                'period_id': period_id,
                'team_id': shot_row['team_id'],
                'player_id': player_id,
                'time_seconds': shot_row['time_seconds'],
                'start_x': shot_row['start_x'],
                'start_y': shot_row['start_y'],
                'num_events': num_events,
                'num_passes': (play_events['type_name'] == 'pass').sum(),
                'num_dribbles': (play_events['type_name'] == 'dribble').sum(),
                'play_duration': play_duration,
                'player_rank': player_rank,
                'bodypart_name': shot_row['bodypart_name'],
                'play_distance': play_distance,
                'play_mean_distance_to_the_goal': play_mean_distance_to_the_goal,
                'play_std_distance_to_the_goal': play_std_distance_to_the_goal,
                'play_distance_towards_goal': play_distance_towards_goal,
                'ratio_distance': ratio_distance,
                'total_time_per_play': total_time_per_play,
                'play_speed': play_speed,
                'play_speed_towards_goal': play_speed_towards_goal,
            })
            result_ids.append(shot_row['result_id'])
            window_start = shot_index + 1

    # Passing the column list keeps the schema stable even when no shot was found.
    shots_df = pd.DataFrame(shot_data, columns=feature_columns)
    shots_df["shot_distance_from_goal"] = [
        calculate_distance(x, y, GOAL_CENTER_X, GOAL_CENTER_Y)
        for x, y in zip(shots_df["start_x"], shots_df["start_y"])
    ]
    shots_df["shot_angle_from_goal"] = [
        get_shot_angle(x, y)
        for x, y in zip(shots_df["start_x"], shots_df["start_y"])
    ]
    shots_df["result_id"] = result_ids
    return shots_df

# Build the shot-level feature table from the events of all leagues.
shots_df = generate_shots_with_counts_events(all_df)

In [49]:
# Preview the first rows of the shot feature table.
shots_df.head()

Unnamed: 0,game_id,period_id,team_id,player_id,time_seconds,start_x,start_y,num_events,num_passes,num_dribbles,...,play_mean_distance_to_the_goal,play_std_distance_to_the_goal,play_distance_towards_goal,ratio_distance,total_time_per_play,play_speed,play_speed_towards_goal,shot_distance_from_goal,shot_angle_from_goal,result_id
0,2499719,1,1609,25413,94.595788,92.4,40.12,7,4,0,...,30.932864,8.413575,24.15,0.15487,2.83695,7.852377,1.216095,14.007655,0.509981,1
1,2499719,1,1631,26150,179.854785,89.25,32.64,2,0,0,...,15.808608,0.0,-9.45,-0.25819,2.273328,8.050087,-2.07845,15.808608,0.494098,0
2,2499719,1,1631,14763,254.745027,100.8,32.64,7,3,0,...,21.08692,12.262251,4.2,0.034728,2.705084,6.386817,0.221805,4.414703,1.46731,1
3,2499719,1,1609,7868,425.824035,85.05,45.56,6,3,1,...,57.591081,22.443334,75.6,0.783021,2.161246,7.44548,5.82997,23.057235,0.300168,0
4,2499719,1,1609,7868,815.462015,78.75,47.6,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,29.563872,0.24003,0


## Binary Goal/Not Goal

In [6]:
import os
import sys

# Locate the SSDpp-numeric checkout: it is expected one level above the
# notebook's current working directory.
notebook_dir = os.getcwd()
ssdpp_numeric_dir = os.path.abspath(os.path.join(notebook_dir, '..', 'SSDpp-numeric'))

# Put the checkout on sys.path so that its modules can be imported.
sys.path.append(ssdpp_numeric_dir)
from _classes import SSDC

# Working copy of the shot table without the identifier and time columns.
shots_df_cp = shots_df.drop(
    columns=["game_id", "period_id", "team_id", "player_id", "time_seconds"]
)

In [7]:
# Check the dtype of each remaining column.
shots_df_cp.dtypes

start_x                           float64
start_y                           float64
num_events                          int64
num_passes                          int64
num_dribbles                        int64
play_duration                     float64
player_rank                       float64
bodypart_name                      object
play_distance                     float64
play_mean_distance_to_the_goal    float64
play_std_distance_to_the_goal     float64
play_distance_towards_goal        float64
ratio_distance                    float64
total_time_per_play               float64
play_speed                        float64
play_speed_towards_goal           float64
shot_distance_from_goal           float64
shot_angle_from_goal              float64
result_id                           int64
dtype: object

In [9]:
# Store the string-valued body-part column as a pandas categorical.
shots_df_cp = shots_df_cp.astype({"bodypart_name": "category"})
shots_df_cp.dtypes

start_x                            float64
start_y                            float64
num_events                           int64
num_passes                           int64
num_dribbles                         int64
play_duration                      float64
player_rank                        float64
bodypart_name                     category
play_distance                      float64
play_mean_distance_to_the_goal     float64
play_std_distance_to_the_goal      float64
play_distance_towards_goal         float64
ratio_distance                     float64
total_time_per_play                float64
play_speed                         float64
play_speed_towards_goal            float64
shot_distance_from_goal            float64
shot_angle_from_goal               float64
result_id                            int64
dtype: object

In [10]:
# Subgroup discovery on the shot table with SSDC from the SSDpp-numeric checkout.
task_name = "discovery"
# NOTE(review): target_type is assigned but not passed to SSDC or fit() in this cell —
# confirm whether it is still needed.
target_type = "binary"

# load class and fit to data
# NOTE(review): fit() is not told which column is the target; presumably SSDC uses the
# last column (result_id here) — confirm against the SSDpp-numeric documentation.
# NOTE(review): max_rules=20, yet model.number_rules reports 107 further down —
# confirm how SSDC applies this limit.
model = SSDC(task = task_name,max_depth=3, beam_width=25,max_rules=20,n_cutpoints=3)
model.fit(shots_df_cp)

  df.loc[:,colname] = df.loc[:,colname].cat.codes #transform this column to codes


Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 7
Iteration: 8
Iteration: 9
Iteration: 10
Iteration: 11
Iteration: 12
Iteration: 13
Iteration: 14
Iteration: 15
Iteration: 16
Iteration: 17
Iteration: 18
Iteration: 19
Iteration: 20
Iteration: 21
Iteration: 22
Iteration: 23
Iteration: 24
Iteration: 25
Iteration: 26
Iteration: 27
Iteration: 28
Iteration: 29
Iteration: 30
Iteration: 31
Iteration: 32
Iteration: 33
Iteration: 34
Iteration: 35
Iteration: 36
Iteration: 37
Iteration: 38
Iteration: 39
Iteration: 40
Iteration: 41
Iteration: 42
Iteration: 43
Iteration: 44
Iteration: 45
Iteration: 46
Iteration: 47
Iteration: 48
Iteration: 49
Iteration: 50
Iteration: 51
Iteration: 52
Iteration: 53
Iteration: 54
Iteration: 55
Iteration: 56
Iteration: 57
Iteration: 58
Iteration: 59
Iteration: 60
Iteration: 61
Iteration: 62
Iteration: 63
Iteration: 64
Iteration: 65
Iteration: 66
Iteration: 67
Iteration: 68
Iteration: 69
Iteration: 70
Iteration: 71
Iteration: 72
I

<_classes.SSDC at 0x7f64f85238e0>

In [11]:
# Per-rule statistics reported by the fitted model, as a DataFrame.
model_estat = pd.DataFrame(model.statistic_rules)

In [12]:
# Antecedent description of each subgroup, in a single 'Subgrupros' column.
df_model = pd.DataFrame(model.antecedent_description, columns=['Subgrupros'])

In [13]:
# Put each subgroup description next to its statistics.
result_model = pd.concat([df_model, model_estat], axis=1)

# WRAcc-style score: (usage / usage_default) * (mean - RSS_default_pattern / usage_default).
# NOTE(review): textbook WRAcc is coverage * (subgroup mean - overall target mean); here the
# baseline term is built from RSS_default_pattern — confirm this is the intended definition.
result_model['WRAcc'] = result_model['usage'].div(result_model['usage_default']) * (
    result_model['mean'].sub(
        result_model['RSS_default_pattern'].div(result_model['usage_default'])
    )
)

In [14]:
# Show the subgroup table in the order the rules were found.
result_model

Unnamed: 0,Subgrupros,usage,mean,mean2,variance2,RSS2,variance,RSS_default_pattern,usage_default,RSS_default_uncovered,WRAcc
0,shot_distance_from_goal >= 24.89650778723795 A...,351,0.002849,0.0,0.00,0.022285,0.002841,4.699930,40110,3815.459977,0.000024
1,27.200000000000003 <= start_y <= 34.0 AND shot...,301,0.003322,0.0,0.00,0.022285,0.003311,4.142801,39809,3811.317176,0.000024
2,shot_distance_from_goal >= 24.89650778723795 A...,505,0.003960,0.0,0.00,0.022285,0.003945,7.204771,39304,3804.112405,0.000049
3,shot_distance_from_goal >= 24.89650778723795 A...,208,0.004808,0.0,0.00,0.022285,0.004785,3.106541,39096,3801.005864,0.000025
4,shot_distance_from_goal >= 24.89650778723795 A...,226,0.004425,0.0,0.00,0.022285,0.004405,3.307107,38870,3797.698757,0.000025
...,...,...,...,...,...,...,...,...,...,...,...
102,shot_angle_from_goal <= 0.5591113717173855 AND...,88,0.022727,0.0,0.00,0.022285,0.022211,2.558314,10971,1046.027425,0.000180
103,2.069372000000385 <= play_duration <= 16.30900...,195,0.035897,0.5,0.25,0.811168,0.034609,7.694986,10776,1038.332440,0.000637
104,start_x >= 91.35 AND num_dribbles >= 1.0 AND 0...,1171,0.145175,0.0,0.00,0.022285,0.124099,147.158097,9605,891.174343,0.015831
105,play_speed >= 6.866680189143164 AND 1.51258485...,50,0.020000,0.0,0.00,0.022285,0.019600,1.346012,9555,889.828331,0.000104


In [15]:
# Rank the subgroups by WRAcc (best first) and print every description in full,
# since the table view truncates them.
sorted_df = result_model.sort_values(by='WRAcc', ascending=False)
for sg in sorted_df["Subgrupros"]:
    print(sg)

shot_angle_from_goal >= 0.5591113717173857 AND bodypart_name = foot_right AND start_x >= 95.55
start_x >= 91.35 AND num_dribbles >= 1.0 AND 0.3590221107608225 <= shot_angle_from_goal <= 0.5591113717173855
shot_angle_from_goal >= 0.5591113717173857 AND bodypart_name = foot_left AND start_x >= 95.55
shot_angle_from_goal >= 0.5591113717173857 AND num_events <= 1.0 AND start_x >= 95.55
34.68 <= start_y <= 40.8 AND start_x >= 95.55 AND play_speed >= 8.975720268508816
start_x >= 91.35 AND 16.80000000000001 <= play_distance_towards_goal <= 46.199999999999996 AND 34.68 <= start_y <= 40.8
play_distance_towards_goal <= -1.0499999999999972 AND 0.9285019999999804 <= total_time_per_play <= 2.4040031818181804 AND play_mean_distance_to_the_goal <= 28.03256333340805
player_rank >= 0.016036157894736842 AND 2.0 <= num_passes <= 3.0 AND 0.3590221107608225 <= shot_angle_from_goal <= 0.5591113717173855
play_distance_towards_goal >= 46.2 AND 27.200000000000003 <= start_y <= 34.0 AND 12.24944488538154 <= sho

In [20]:
# Show the subgroup table ordered by WRAcc, best first.
sorted_df

Unnamed: 0,Subgrupros,usage,mean,mean2,variance2,RSS2,variance,RSS_default_pattern,usage_default,RSS_default_uncovered,WRAcc
16,shot_angle_from_goal >= 0.5591113717173857 AND...,1530,0.358824,0.5,0.25,0.811168,0.229885,449.356104,32221,2676.763813,0.016376
104,start_x >= 91.35 AND num_dribbles >= 1.0 AND 0...,1171,0.145175,0.0,0.00,0.022285,0.124099,147.158097,9605,891.174343,0.015831
13,shot_angle_from_goal >= 0.5591113717173857 AND...,957,0.385580,0.5,0.25,0.811168,0.236908,301.761325,35050,3154.005147,0.010293
5,shot_angle_from_goal >= 0.5591113717173857 AND...,878,0.431663,0.5,0.25,0.811168,0.245330,308.769892,37992,3488.928865,0.009788
96,34.68 <= start_y <= 40.8 AND start_x >= 95.55 ...,592,0.168919,0.0,0.00,0.022285,0.140385,85.484722,12623,1202.489521,0.007604
...,...,...,...,...,...,...,...,...,...,...,...
6,shot_distance_from_goal >= 24.89650778723795 A...,196,0.005102,0.0,0.00,0.022285,0.005076,2.972830,37796,3485.956035,0.000026
4,shot_distance_from_goal >= 24.89650778723795 A...,226,0.004425,0.0,0.00,0.022285,0.004405,3.307107,38870,3797.698757,0.000025
3,shot_distance_from_goal >= 24.89650778723795 A...,208,0.004808,0.0,0.00,0.022285,0.004785,3.106541,39096,3801.005864,0.000025
1,27.200000000000003 <= start_y <= 34.0 AND shot...,301,0.003322,0.0,0.00,0.022285,0.003311,4.142801,39809,3811.317176,0.000024


In [16]:
# Print the fitted rule list: IF / ELSE IF subgroups with their antecedent description
# and consequent statistics (mean, std).
print(model)


IF x in shot_distance_from_goal >= 24.89650778723795 AND 21.463913902175438 <= play_distance <= 51.69401161355543 AND 7.181128065994731 <= play_std_distance_to_the_goal <= 15.350716739940347 THEN mean = 0.002849002849002849; std = 0.05329996277455758 ,  
ELSE IF x in 27.200000000000003 <= start_y <= 34.0 AND shot_angle_from_goal <= 0.2746754310739218 AND play_speed <= 4.638509785023105 THEN mean = 0.0033222591362126247; std = 0.057543216198301564 ,  
ELSE IF x in shot_distance_from_goal >= 24.89650778723795 AND num_dribbles >= 1.0 AND 0.0 <= play_distance_towards_goal <= 16.800000000000004 THEN mean = 0.0039603960396039604; std = 0.06280693674120275 ,  
ELSE IF x in shot_distance_from_goal >= 24.89650778723795 AND play_speed >= 8.975720268508816 AND total_time_per_play <= 0.9284542000000329 THEN mean = 0.004807692307692308; std = 0.06917064697085654 ,  
ELSE IF x in shot_distance_from_goal >= 24.89650778723795 AND 2.0 <= num_passes <= 3.0 AND 82.95 <= start_x <= 90.3 THEN mean = 0.0044

In [17]:
# Number of subgroups in the fitted rule list.
model.number_rules


107

In [18]:
# For each subgroup, the list of row indices covered by its description
# (coverages of different subgroups may overlap).
# NOTE(review): this displays very long lists; consider showing lengths or a slice instead.
model.rule_sets


[[120,
  391,
  549,
  762,
  809,
  1045,
  1153,
  1517,
  1745,
  1749,
  1903,
  2090,
  2378,
  2433,
  2712,
  2891,
  3077,
  3705,
  3716,
  3844,
  4047,
  4117,
  4239,
  4391,
  4501,
  4615,
  4778,
  4969,
  5197,
  5292,
  5344,
  5377,
  5416,
  5562,
  5608,
  5754,
  5802,
  6020,
  6137,
  6276,
  6280,
  6292,
  6331,
  6468,
  6503,
  6531,
  6783,
  6804,
  6866,
  7107,
  7132,
  7163,
  7167,
  7497,
  8002,
  8040,
  8682,
  8869,
  8910,
  8969,
  9202,
  9269,
  9341,
  9342,
  9472,
  9487,
  9621,
  9646,
  9683,
  9716,
  9764,
  9884,
  10168,
  10204,
  10326,
  10471,
  10526,
  10718,
  10734,
  10918,
  10987,
  11010,
  11047,
  11467,
  12066,
  12071,
  12128,
  12359,
  12444,
  12661,
  12694,
  12742,
  12744,
  12776,
  12795,
  12818,
  12859,
  12891,
  13002,
  13052,
  13183,
  13190,
  13192,
  13236,
  13253,
  13266,
  13292,
  13305,
  13384,
  13686,
  13912,
  13956,
  14123,
  14238,
  14364,
  14471,
  14644,
  14655,
  14709,
  1475

In [19]:
# Dictionary of subgroup-discovery quality measures for the rule list and its subgroups.
model.measures

{'avg_supp': 684.1028037383178,
 'kl_supp': 0.7169719667748936,
 'wkl_supp': 432.6348434739755,
 'avg_usg': 296.3457943925234,
 'kl_usg': 0.6448379556807088,
 'wkl_usg': 175.89627054846014,
 'wacc_supp': 74.83138464708257,
 'wacc_usg': 30.653376148302037,
 'jacc_avg': 0.009275922562741885,
 'n_rules': 107,
 'avg_items': 3.0,
 'wkl_sum': 18820.90094868523,
 'std_rules': 0.20336728392326056,
 'top1_std': 0.05329996277455758,
 'length_orig': 13946.180983824217,
 'length_final': -2038.5245487765715,
 'length_ratio': -0.14617080841995372,
 'runtime': 884.4460203647614}

## xG

In [21]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from xgboost import XGBClassifier

In [25]:
# Constants
RANDOM_STATE: int = 123
TEST_SIZE: float  = 0.3

shots_df_cp = shots_df.copy()

# Random Forest Classifier
rfc = RandomForestClassifier(random_state=RANDOM_STATE)

# Take an explicit copy of the three feature columns: assigning into a bare column
# selection raises pandas' SettingWithCopyWarning and may not write where expected.
X = shots_df_cp[["bodypart_name", "shot_distance_from_goal", "shot_angle_from_goal"]].copy()
# Integer-encode the body part: 0 = foot_right, 1 = foot_left, 2 = anything else.
X["bodypart_name"] = X["bodypart_name"].apply(lambda val: 0 if val == "foot_right" else 1 if val == "foot_left" else 2)
y = shots_df_cp["result_id"]

# Hold out TEST_SIZE of the shots, fit on the rest and report per-class metrics.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE)
rfc.fit(X=X_train, y=y_train)
y_pred = rfc.predict(X=X_test)
classification_report(y_test, y_pred, output_dict=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["bodypart_name"] = X["bodypart_name"].apply(lambda val: 0 if val == "foot_right" else 1 if val == "foot_left" else 2)


{'0': {'precision': 0.9070434415858287,
  'recall': 0.9876010286554004,
  'f1-score': 0.9456096381304138,
  'support': 10888.0},
 '1': {'precision': 0.5246478873239436,
  'recall': 0.11910471622701839,
  'f1-score': 0.19413680781758957,
  'support': 1251.0},
 'accuracy': 0.8980970425899992,
 'macro avg': {'precision': 0.7158456644548862,
  'recall': 0.5533528724412095,
  'f1-score': 0.5698732229740017,
  'support': 12139.0},
 'weighted avg': {'precision': 0.8676351840372977,
  'recall': 0.8980970425899992,
  'f1-score': 0.8681656550410867,
  'support': 12139.0}}

In [26]:
# Append the classifier output as the "xg" column, make the body part categorical and
# drop identifiers plus the observed result, so that "xg" ends up as the last column.
# NOTE(review): rfc.predict returns hard class labels, not probabilities; if "xg" is meant
# to be an expected-goals probability, predict_proba would be needed — confirm intent.
# NOTE(review): the predictions cover all of X, including the rows the forest was trained on.
shots_df_cp = (
    shots_df_cp
    .assign(xg=rfc.predict(X=X))
    .astype({"bodypart_name": "category"})
    .drop(columns=["game_id", "period_id", "team_id", "player_id", "time_seconds", "result_id"])
)

task_name = "discovery"
target_type = "binary"

# Instantiate SSDC with the same settings as the first run and fit it on the new table.
model = SSDC(
    task=task_name,
    max_depth=3,
    beam_width=25,
    max_rules=20,
    n_cutpoints=3,
)
model.fit(shots_df_cp)



  df.loc[:,colname] = df.loc[:,colname].cat.codes #transform this column to codes


Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 7
Iteration: 8
Iteration: 9
Iteration: 10
Iteration: 11
Iteration: 12
Iteration: 13
Iteration: 14
Iteration: 15
Iteration: 16
Iteration: 17
Iteration: 18
Iteration: 19
Iteration: 20
Iteration: 21
Iteration: 22
Iteration: 23
Iteration: 24
Iteration: 25
Iteration: 26
Iteration: 27
Iteration: 28
Iteration: 29
Iteration: 30
Iteration: 31
Iteration: 32
Iteration: 33
Iteration: 34
Iteration: 35
Iteration: 36
Iteration: 37
Iteration: 38
Iteration: 39
Iteration: 40
Iteration: 41
Iteration: 42
Iteration: 43
Iteration: 44
Iteration: 45
Iteration: 46
Iteration: 47
Iteration: 48
Iteration: 49
Iteration: 50
Iteration: 51
Iteration: 52
Iteration: 53
Iteration: 54
Iteration: 55
Iteration: 56
Iteration: 57
Iteration: 58
Iteration: 59
Iteration: 60
Iteration: 61
Iteration: 62
Iteration: 63
Iteration: 64
Iteration: 65
Iteration: 66
Iteration: 67
Iteration: 68
Iteration: 69
Iteration: 70
Iteration: 71
Iteration: 72
I

<_classes.SSDC at 0x7f64f7d87970>

In [27]:
# Rebuild the subgroup table (description + statistics + WRAcc-style score) for the
# model currently bound to `model`.
model_estat = pd.DataFrame(model.statistic_rules)
df_model = pd.DataFrame(model.antecedent_description, columns=['Subgrupros'])
result_model = pd.concat([df_model, model_estat], axis=1)

# Score: (usage / usage_default) * (mean - RSS_default_pattern / usage_default).
# NOTE(review): the baseline term differs from textbook WRAcc (overall target mean) — confirm.
result_model['WRAcc'] = result_model['usage'].div(result_model['usage_default']) * (
    result_model['mean'].sub(
        result_model['RSS_default_pattern'].div(result_model['usage_default'])
    )
)
result_model

Unnamed: 0,Subgrupros,usage,mean,mean2,variance2,RSS2,variance,RSS_default_pattern,usage_default,RSS_default_uncovered,WRAcc
0,play_mean_distance_to_the_goal <= 12.248561691...,13,0.615385,0.0,0.00,0.001068,0.236686,7.637203,40448,905.756188,0.000198
1,play_mean_distance_to_the_goal <= 12.248561691...,17,0.411765,0.5,0.25,0.954851,0.242215,6.685557,40431,899.070631,0.000173
2,shot_angle_from_goal >= 0.5591113717173857 AND...,878,0.275626,0.5,0.25,0.954851,0.199656,231.284264,39553,667.786367,0.005989
3,start_x <= 94.5 AND shot_angle_from_goal >= 0....,20775,0.000048,0.0,0.00,0.001068,0.000048,12.047856,18778,655.738511,-0.000657
4,shot_angle_from_goal >= 0.5591113717173857 AND...,173,0.317919,0.0,0.00,0.001068,0.216847,52.550430,18605,603.188081,0.002930
...,...,...,...,...,...,...,...,...,...,...,...
70,28.03464899379168 <= play_mean_distance_to_the...,3,0.333333,0.0,0.00,0.001068,0.222222,0.955385,629,16.550198,0.001583
71,27.200000000000003 <= start_y <= 34.0 AND 4.63...,14,0.142857,0.0,0.00,0.001068,0.122449,1.915041,615,14.635156,0.003181
72,play_distance_towards_goal >= 0.0 AND num_even...,196,0.005102,0.0,0.00,0.001068,0.005076,1.058449,419,13.576708,0.001205
73,0.004430909090909091 <= player_rank <= 0.016025,186,0.037634,0.0,0.00,0.001068,0.036218,6.775805,233,6.800903,0.006828


In [28]:
# Rank the subgroups by WRAcc (best first) and print every description in full,
# since the table view truncates them.
sorted_df = result_model.sort_values(by='WRAcc', ascending=False)
for sg in sorted_df["Subgrupros"]:
    print(sg)

start_y <= 34.0 AND player_rank <= 0.004408223684210527
play_speed >= 8.975720268508816 AND play_duration >= 2.069372000000385 AND 0.004430909090909091 <= player_rank <= 0.016025
0.004430909090909091 <= player_rank <= 0.016025
shot_angle_from_goal >= 0.5591113717173857 AND num_events <= 1.0 AND start_x >= 95.55
shot_angle_from_goal >= 0.5591113717173857 AND bodypart_name = foot_right AND play_distance >= 113.10223961707167
shot_angle_from_goal >= 0.5591113717173857 AND bodypart_name = foot_left AND 34.68 <= start_y <= 40.8
shot_angle_from_goal >= 0.5591113717173857 AND bodypart_name = foot_right AND 0.9285019999999804 <= total_time_per_play <= 1.7608733333333173
27.200000000000003 <= start_y <= 34.0 AND 4.639406895108031 <= play_speed <= 6.866582612367187 AND num_dribbles >= 1.0
shot_angle_from_goal >= 0.5591113717173857 AND bodypart_name = foot_right AND play_distance_towards_goal <= -1.0499999999999972
shot_angle_from_goal >= 0.5591113717173857 AND bodypart_name = foot_right AND rati

In [29]:
# Show the subgroup table ordered by WRAcc, best first.
sorted_df

Unnamed: 0,Subgrupros,usage,mean,mean2,variance2,RSS2,variance,RSS_default_pattern,usage_default,RSS_default_uncovered,WRAcc
74,start_y <= 34.0 AND player_rank <= 0.004408223...,57,0.052632,0.0,0.00,0.001068,0.049861,2.891787,176,3.909117,0.011724
68,play_speed >= 8.975720268508816 AND play_durat...,460,0.041304,0.0,0.00,0.001068,0.039598,18.367515,793,19.499124,0.010524
73,0.004430909090909091 <= player_rank <= 0.016025,186,0.037634,0.0,0.00,0.001068,0.036218,6.775805,233,6.800903,0.006828
2,shot_angle_from_goal >= 0.5591113717173857 AND...,878,0.275626,0.5,0.25,0.954851,0.199656,231.284264,39553,667.786367,0.005989
16,shot_angle_from_goal >= 0.5591113717173857 AND...,259,0.185328,0.0,0.00,0.001068,0.150982,45.919876,12268,173.463208,0.003834
...,...,...,...,...,...,...,...,...,...,...,...
22,start_y <= 27.2 AND start_x >= 82.95,1276,0.000784,0.0,0.00,0.001068,0.000783,1.635180,7322,155.561469,0.000098
20,start_y >= 41.48 AND 0.004430909090909091 <= p...,1217,0.000822,0.0,0.00,0.001068,0.000821,1.603674,8645,165.805792,0.000090
18,start_x <= 81.9 AND 12.24944488538154 <= play_...,2395,0.000418,0.0,0.00,0.001068,0.000417,2.232738,9865,168.364850,0.000046
14,12.24944488538154 <= shot_distance_from_goal <...,4448,0.000225,0.0,0.00,0.001068,0.000225,3.329062,12564,227.033104,-0.000014


In [30]:
# Print the fitted rule list: IF / ELSE IF subgroups with their antecedent description
# and consequent statistics (mean, std).
print(model)


IF x in play_mean_distance_to_the_goal <= 12.248561691273391 AND num_dribbles >= 1.0 AND 16.80000000000001 <= play_distance_towards_goal <= 46.199999999999996 THEN mean = 0.6153846153846154; std = 0.48650425541051995 ,  
ELSE IF x in play_mean_distance_to_the_goal <= 12.248561691273391 AND 4.0 <= num_events <= 6.0 AND 0.010288819875776397 <= player_rank <= 0.016025 THEN mean = 0.4117647058823529; std = 0.49215295678475035 ,  
ELSE IF x in shot_angle_from_goal >= 0.5591113717173857 AND num_events <= 1.0 AND start_x >= 95.55 THEN mean = 0.275626423690205; std = 0.4468293838300615 ,  
ELSE IF x in start_x <= 94.5 AND shot_angle_from_goal >= 0.2746754310739222 THEN mean = 4.8134777376654636e-05; std = 0.006937756151657537 ,  
ELSE IF x in shot_angle_from_goal >= 0.5591113717173857 AND bodypart_name = foot_left AND 0.0 <= play_speed_towards_goal <= 1.512578105623291 THEN mean = 0.3179190751445087; std = 0.4656678395635336 ,  
ELSE IF x in shot_angle_from_goal >= 0.5591113717173857 AND bodyp

In [31]:
# Dictionary of subgroup-discovery quality measures for the rule list and its subgroups.
model.measures


{'avg_supp': 3682.4266666666667,
 'kl_supp': 1.966790483388603,
 'wkl_supp': 4617.175197078464,
 'avg_usg': 537.1333333333333,
 'kl_usg': 3.7084867126734693,
 'wkl_usg': 1639.4285734560622,
 'wacc_supp': 73.65881779161828,
 'wacc_usg': 21.421526902449273,
 'jacc_avg': 0.027238584748455048,
 'n_rules': 75,
 'avg_items': 2.68,
 'wkl_sum': 122957.14300920462,
 'std_rules': 0.29333622266514464,
 'top1_std': 0.48650425541051995,
 'length_orig': -27816.141328954418,
 'length_final': -149145.16585375989,
 'length_ratio': 0.18650380768109345,
 'runtime': 203.72930884361267}

## VAEP

In [32]:
from tqdm import tqdm
import socceraction.spadl as spd
from socceraction.vaep import features as ft
import socceraction.vaep.labels as lab
import socceraction.vaep.formula as fm
import xgboost as xgb
import sklearn.metrics as mt

In [33]:
def features_transform(spadl):
    """Compute the per-action feature matrix for every game in ``spadl``.

    Side effect: ``spadl`` is modified in place before the features are built.
    Rows with ``result_id`` 2 or 3 are set to 0, and ``result_name`` values
    "offside" / "owngoal" are set to "fail" (presumably the same outcomes,
    by id and by name — confirm against the SPADL vocabulary).

    Returns a single DataFrame with the features of all games, concatenated
    in ascending ``game_id`` order and re-indexed from 0.
    """
    # In-place recoding of the action results (see docstring).
    spadl.loc[spadl.result_id.isin([2, 3]), ["result_id"]] = 0
    spadl.loc[spadl.result_name.isin(["offside", "owngoal"]), ["result_name"]] = "fail"

    # Feature generators from socceraction.vaep.features, applied to each game's states.
    xfns = [
        ft.actiontype_onehot,
        ft.bodypart_onehot,
        ft.result_onehot,
        ft.goalscore,
        ft.startlocation,
        ft.endlocation,
        ft.team,
        ft.time,
        ft.time_delta
    ]

    def _game_features(game):
        # Features of one game, computed on a freshly indexed slice of its actions.
        match_actions = spadl.loc[spadl.game_id == game].reset_index(drop=True)
        match_states = ft.gamestates(actions=match_actions)
        return pd.concat([fn(match_states) for fn in xfns], axis=1)

    game_ids = np.unique(spadl.game_id).tolist()
    per_game = [_game_features(game) for game in tqdm(game_ids)]
    return pd.concat(per_game).reset_index(drop=True)

def labels_transform(spadl):
    """Compute the ``lab.scores`` and ``lab.concedes`` labels for every game in ``spadl``.

    Each game's actions are re-indexed from 0 before both label functions are
    applied; the per-game results are concatenated in ascending ``game_id``
    order and re-indexed from 0.
    """
    yfns = [lab.scores, lab.concedes]

    def _game_labels(game):
        # Both label columns for one game, side by side.
        match_actions = spadl.loc[spadl.game_id == game].reset_index(drop=True)
        return pd.concat([fn(actions=match_actions) for fn in yfns], axis=1)

    game_ids = np.unique(spadl.game_id).tolist()
    per_game = [_game_labels(game) for game in tqdm(game_ids)]
    return pd.concat(per_game).reset_index(drop=True)

def train_vaep(X_train, y_train, X_test, y_test):
    """Fit one XGBoost classifier per label ("scores", "concedes") and print its scores.

    For each label the model is fitted on the training data, then the
    normalised Brier score (NBS) is printed for the training and the test set.
    NBS is the model's Brier loss divided by the Brier loss of always
    predicting that set's positive rate.

    Returns a dict mapping each label name to its fitted classifier.
    """
    def _normalised_brier(clf, X, y):
        # Baseline: predict the positive rate of `y` for every row.
        p = sum(y) / len(y)
        base = [p] * len(y)
        predicted = clf.predict_proba(X)[:, 1]
        return mt.brier_score_loss(y, predicted) / mt.brier_score_loss(y, base)

    models = {}
    for m in ("scores", "concedes"):
        clf = xgb.XGBClassifier(random_state=0, n_estimators=50, max_depth=3)
        models[m] = clf

        print("training " + m + " model")
        clf.fit(X_train, y_train[m])

        evaluations = (
            (" Train NBS: ", X_train, y_train[m]),
            (" Test NBS: ", X_test, y_test[m]),
        )
        for caption, X_part, y_part in evaluations:
            print(m + caption + str(_normalised_brier(clf, X_part, y_part)))
            print()

        print("----------------------------------------")

    return models

def generate_predictions(features, models):
    """Predict the ``scores`` and ``concedes`` probabilities for every feature row.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature matrix passed unchanged to each model's ``predict_proba``.
    models : dict
        Holds a classifier under the keys ``"scores"`` and ``"concedes"``.

    Returns
    -------
    pandas.DataFrame
        Columns ``scores`` and ``concedes``, each holding column 1 of the
        corresponding model's ``predict_proba`` output.
    """
    target_names = ("scores", "concedes")
    return pd.DataFrame(
        {name: models[name].predict_proba(features)[:, 1] for name in target_names}
    )

def calculate_action_values(spadl, predictions):
    """Combine identifying action columns, predicted probabilities and action values.

    Parameters
    ----------
    spadl : pandas.DataFrame
        Actions table; must contain the columns listed in ``kept_columns``.
    predictions : pandas.DataFrame
        Frame with ``scores`` and ``concedes`` columns.

    Returns
    -------
    pandas.DataFrame
        The selected action columns, the predictions renamed to
        ``Pscores``/``Pconcedes`` and the output of ``fm.value``, side by side.
    """
    kept_columns = [
        "original_event_id", "player_id", "action_id", "game_id",
        "start_x", "start_y", "end_x", "end_y", "type_name", "result_name",
    ]
    probabilities = predictions.rename(columns={"scores": "Pscores", "concedes": "Pconcedes"})
    values = fm.value(
        actions=spadl,
        Pscores=predictions["scores"],
        Pconcedes=predictions["concedes"],
    )

    # axis=1 lines the three frames up on their row index.
    return pd.concat([spadl[kept_columns], probabilities, values], axis=1)


In [34]:
# Read the SPADL action table of every league.
spadl = {league: pd.read_csv(f"../data/spadl_format/{league}.csv") for league in LEAGUES}

# Derive model inputs and targets league by league.
features = {league: features_transform(spadl[league]) for league in LEAGUES}
labels = {league: labels_transform(spadl[league]) for league in LEAGUES}

# Fit on the England data and report the scores on the Spain data.
models = train_vaep(
    X_train=features["England"],
    y_train=labels["England"],
    X_test=features["Spain"],
    y_test=labels["Spain"],
)


100%|██████████| 380/380 [00:05<00:00, 63.72it/s]
100%|██████████| 380/380 [00:05<00:00, 65.33it/s]
100%|██████████| 380/380 [00:05<00:00, 63.93it/s]
100%|██████████| 380/380 [00:05<00:00, 65.12it/s]
100%|██████████| 306/306 [00:04<00:00, 65.35it/s]
100%|██████████| 380/380 [00:06<00:00, 56.57it/s]
100%|██████████| 380/380 [00:06<00:00, 55.28it/s]
100%|██████████| 380/380 [00:06<00:00, 55.63it/s]
100%|██████████| 380/380 [00:07<00:00, 52.48it/s]
100%|██████████| 306/306 [00:05<00:00, 53.90it/s]


training scores model
scores Train NBS: 0.8452471194228581

scores Test NBS: 0.8503677630926355

----------------------------------------
training concedes model
concedes Train NBS: 0.9660641623881886

concedes Test NBS: 0.9766251611701147

----------------------------------------


In [35]:
# Score every league with the trained models and turn the probabilities
# into per-action values.
preds, action_values = {}, {}
for league in LEAGUES:
    league_preds = generate_predictions(features=features[league], models=models)
    preds[league] = league_preds
    action_values[league] = calculate_action_values(spadl=spadl[league], predictions=league_preds)

# Stack the leagues into one table; each league keeps its own row labels.
all_action_values = pd.concat(list(action_values.values()))

In [40]:
# Keep only the join keys, end location, probabilities and values of the
# action table; ``drop`` returns a new frame, so ``all_action_values`` is untouched.
all_action_values_cp = all_action_values.drop(
    columns=["original_event_id", "result_name", "action_id", "type_name"]
)

# Attach those columns to each shot. ``merge`` returns a new frame, so
# ``shots_df`` itself is left as it was.
# NOTE(review): the keys are game, player and start coordinates, and
# all_action_values_cp is built from every action (type_name is dropped, not
# filtered) -- confirm a shot can never match more than one action row.
shots_df_cp = shots_df.merge(
    all_action_values_cp,
    on=['game_id', 'player_id', 'start_x', 'start_y'],
    how='left',
)


In [41]:
# Display the shots table after the action-value columns were merged in.
shots_df_cp

Unnamed: 0,game_id,period_id,team_id,player_id,time_seconds,start_x,start_y,num_events,num_passes,num_dribbles,...,shot_distance_from_goal,shot_angle_from_goal,result_id,end_x,end_y,Pscores,Pconcedes,offensive_value,defensive_value,vaep_value
0,2499719,1,1609,25413,94.595788,92.40,40.12,7,4,0,...,14.007655,0.509981,1,105.0,37.4000,0.978135,0.002137,0.902766,-0.000387,0.902379
1,2499719,1,1631,26150,179.854785,89.25,32.64,2,0,0,...,15.808608,0.494098,0,105.0,40.8000,0.018184,0.007322,-0.020396,-0.003519,-0.023916
2,2499719,1,1631,14763,254.745027,100.80,32.64,7,3,0,...,4.414703,1.467310,1,105.0,34.0000,0.977107,0.002290,0.885530,0.000666,0.886196
3,2499719,1,1609,7868,425.824035,85.05,45.56,6,3,1,...,23.057235,0.300168,0,105.0,40.8000,0.021434,0.002819,-0.004685,-0.001744,-0.006429
4,2499719,1,1609,7868,815.462015,78.75,47.60,1,0,0,...,29.563872,0.240030,0,105.0,37.4000,0.017245,0.005117,-0.019283,-0.002159,-0.021442
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40778,2576338,2,3193,116269,1152.032980,99.75,37.40,9,5,2,...,6.254798,1.067542,0,105.0,40.8000,0.034776,0.009745,-0.081321,-0.004355,-0.085676
40779,2576338,2,3193,3548,1251.730517,97.65,42.16,20,16,2,...,10.982172,0.512084,0,105.0,40.8000,0.029877,0.010725,-0.052057,-0.005569,-0.057625
40780,2576338,2,3193,21177,2065.034482,94.50,36.72,1,0,0,...,10.846585,0.690619,1,105.0,34.0000,0.977372,0.007182,0.950002,-0.002297,0.947704
40781,2576338,2,3193,349102,2367.252041,82.95,46.24,3,2,0,...,25.219439,0.277183,0,105.0,40.8000,0.015293,0.006782,-0.020930,-0.003574,-0.024504


In [42]:
task_name = "discovery"
# NOTE(review): target_type is assigned but not passed to SSDC in this cell.
target_type = "numeric"

# Of the merged VAEP columns only Pscores stays in the frame.
# NOTE(review): presumably SSDC takes its numeric target from the frame it is
# fitted on -- TODO confirm which column it uses.
# NOTE(review): identifier columns (e.g. team_id, player_id) remain and show
# up in the discovered rules -- confirm they are meant to be search attributes.
shots_df_cp = shots_df_cp.drop(
    columns=["Pconcedes", "offensive_value", "defensive_value", "vaep_value"]
)
shots_df_cp["bodypart_name"] = pd.Categorical(shots_df_cp["bodypart_name"])


# Configure the subgroup-discovery model and fit it to the shots table.
model = SSDC(
    task=task_name,
    max_depth=3,
    beam_width=25,
    max_rules=20,
    n_cutpoints=3,
)
model.fit(shots_df_cp)


  df.loc[:,colname] = df.loc[:,colname].cat.codes #transform this column to codes


Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4
Iteration: 5
Iteration: 6
Iteration: 7
Iteration: 8
Iteration: 9
Iteration: 10
Iteration: 11
Iteration: 12
Iteration: 13
Iteration: 14
Iteration: 15
Iteration: 16
Iteration: 17
Iteration: 18
Iteration: 19
Iteration: 20
Iteration: 21
Iteration: 22
Iteration: 23
Iteration: 24
Iteration: 25
Iteration: 26
Iteration: 27
Iteration: 28
Iteration: 29
Iteration: 30
Iteration: 31
Iteration: 32
Iteration: 33
Iteration: 34
Iteration: 35
Iteration: 36
Iteration: 37
Iteration: 38
Iteration: 39
Iteration: 40
Iteration: 41
Iteration: 42
Iteration: 43
Iteration: 44
Iteration: 45
Iteration: 46
Iteration: 47
Iteration: 48
Iteration: 49
Iteration: 50
Iteration: 51
Iteration: 52
Iteration: 53
Iteration: 54
Iteration: 55
Iteration: 56
Iteration: 57
Iteration: 58
Iteration: 59
Iteration: 60
Iteration: 61
Iteration: 62
Iteration: 63
Iteration: 64
Iteration: 65
Iteration: 66
Iteration: 67
Iteration: 68
Iteration: 69
Iteration: 70
Iteration: 71
Iteration: 72
I

<_classes.SSDC at 0x7f649122e2c0>

In [43]:
# Per-rule statistics and rule descriptions exposed by the fitted model.
model_estat = pd.DataFrame(model.statistic_rules)
df_model = pd.DataFrame(model.antecedent_description, columns=['Subgrupros'])

# One row per rule: description followed by its statistics.
result_model = pd.concat([df_model, model_estat], axis=1)

# WRAcc column: share of usage times (rule mean minus a reference term).
# NOTE(review): the reference term is RSS_default_pattern / usage_default --
# confirm this is the intended baseline for WRAcc.
wracc_coverage = result_model['usage'] / result_model['usage_default']
wracc_reference = result_model['RSS_default_pattern'] / result_model['usage_default']
result_model['WRAcc'] = wracc_coverage * (result_model['mean'] - wracc_reference)
result_model

Unnamed: 0,Subgrupros,usage,mean,mean2,variance2,RSS2,variance,RSS_default_pattern,usage_default,RSS_default_uncovered,WRAcc
0,end_y <= 29.239999999999995 AND start_x <= 81....,716,0.013988,0.030975,1.061433e-05,0.017790,0.000012,8.862114,40067,3484.055966,0.000246
1,start_x <= 81.9 AND end_y <= 29.23999999999999...,1222,0.013944,0.051348,5.951375e-05,0.011036,0.000018,15.138608,38845,3468.903382,0.000426
2,start_x <= 81.9 AND end_y <= 29.23999999999999...,499,0.009593,0.024150,3.780655e-07,0.020436,0.000012,6.670680,38346,3462.225170,0.000123
3,start_x <= 81.9 AND end_y <= 29.23999999999999...,139,0.008912,0.018739,3.398901e-06,0.022688,0.000011,1.878819,38207,3460.343108,0.000032
4,start_x <= 81.9 AND end_y >= 39.44 AND shot_di...,258,0.015881,0.031003,3.367771e-06,0.017765,0.000016,3.081948,37949,3457.254533,0.000107
...,...,...,...,...,...,...,...,...,...,...,...
145,num_events >= 2.0 AND 17.1236941107928 <= shot...,14,0.029111,0.038588,3.702610e-07,0.015015,0.000035,0.124626,4509,393.156005,0.000090
146,play_distance >= 21.494334137162753 AND 17.123...,531,0.181512,0.071505,7.100745e-05,0.005915,0.123994,67.531617,3978,325.612830,0.021963
147,play_speed <= 8.97107615188608 AND 0.0 <= rati...,25,0.028119,0.040871,2.148104e-05,0.014276,0.000065,0.230470,3953,325.375679,0.000177
148,play_speed <= 6.864853427180291 AND 0.23464605...,42,0.033156,0.066238,1.064336e-05,0.006982,0.000159,0.350094,3911,325.015047,0.000355


In [44]:
# Rank the rules from highest to lowest WRAcc and list their descriptions.
sorted_df = result_model.sort_values("WRAcc", ascending=False)
for description in sorted_df["Subgrupros"]:
    print(description)

shot_angle_from_goal >= 0.5591113717173855 AND bodypart_name = foot_right AND start_x >= 95.55
play_distance >= 21.494334137162753 AND 17.1236941107928 <= shot_distance_from_goal <= 24.886333598985605 AND 762.8228059999999 <= time_seconds <= 2181.608798
shot_angle_from_goal >= 0.5591113717173855 AND bodypart_name = foot_left AND start_x >= 95.55
start_x >= 95.55 AND time_seconds <= 1480.9770090000002 AND start_y <= 40.8
end_y >= 34.0 AND shot_distance_from_goal >= 17.1236941107928 AND start_y >= 41.48
34.0 <= end_y <= 38.76 AND start_x <= 81.9 AND period_id <= 1.0
0.3587982134468687 <= shot_angle_from_goal <= 0.5565136053620864 AND start_y <= 34.0 AND 762.8228059999999 <= time_seconds <= 2181.608798
shot_angle_from_goal >= 0.5591113717173855 AND end_y <= 33.32 AND bodypart_name = foot_right
start_x >= 91.35 AND play_mean_distance_to_the_goal >= 28.095041593726133 AND 25715.0 <= player_id <= 171283.0
12.24944488538154 <= shot_distance_from_goal <= 17.09952046111235 AND bodypart_name = f

In [45]:
# Display the rule table ordered by descending WRAcc.
sorted_df

Unnamed: 0,Subgrupros,usage,mean,mean2,variance2,RSS2,variance,RSS_default_pattern,usage_default,RSS_default_uncovered,WRAcc
89,shot_angle_from_goal >= 0.5591113717173855 AND...,800,0.500984,0.103559,1.761895e-05,0.000975,0.220907,288.358355,14285,1934.954154,0.026926
146,play_distance >= 21.494334137162753 AND 17.123...,531,0.181512,0.071505,7.100745e-05,0.005915,0.123994,67.531617,3978,325.612830,0.021963
84,shot_angle_from_goal >= 0.5591113717173855 AND...,611,0.537476,0.094112,6.290593e-05,0.002063,0.220582,237.926644,15808,2502.738825,0.020192
124,start_x >= 95.55 AND time_seconds <= 1480.9770...,561,0.234499,0.059211,1.218662e-05,0.008742,0.147922,89.485877,7587,653.615722,0.016467
142,end_y >= 34.0 AND shot_distance_from_goal >= 1...,900,0.088564,0.076750,8.897083e-07,0.004703,0.051643,47.681490,4733,427.413565,0.014925
...,...,...,...,...,...,...,...,...,...,...,...
100,play_mean_distance_to_the_goal <= 12.362830431...,8,0.039317,0.047615,5.621465e-06,0.012060,0.000050,0.055599,13029,1563.730074,0.000024
45,end_y <= 29.239999999999995 AND player_rank <=...,52,0.009483,0.014764,4.031650e-06,0.024414,0.000009,0.696310,20773,3273.944933,0.000024
64,end_y >= 39.44 AND start_x <= 81.9 AND team_id...,27,0.015346,0.026245,4.699199e-05,0.019690,0.000020,0.325792,18205,3249.542863,0.000023
77,play_duration <= 2.076061999999638 AND play_me...,13,0.015087,0.023972,7.090026e-05,0.020649,0.000050,0.153678,17331,3128.855573,0.000011


In [46]:
# Print the fitted rule list as an IF / ELSE IF chain; each rule shows its
# antecedent description and the mean and std of its consequent.
print(model)

IF x in end_y <= 29.239999999999995 AND start_x <= 81.9 AND start_y >= 41.48 THEN mean = 0.013987624245779016; std = 0.003447272205695281 ,  
ELSE IF x in start_x <= 81.9 AND end_y <= 29.239999999999995 AND end_x >= 105.0 THEN mean = 0.013944327045893587; std = 0.004184912634021281 ,  
ELSE IF x in start_x <= 81.9 AND end_y <= 29.239999999999995 AND total_time_per_play <= 1.7631443333333057 THEN mean = 0.009592970867710326; std = 0.0034481484878677846 ,  
ELSE IF x in start_x <= 81.9 AND end_y <= 29.239999999999995 AND 2455.0 <= team_id <= 3194.0 THEN mean = 0.00891196829260146; std = 0.003303159924273393 ,  
ELSE IF x in start_x <= 81.9 AND end_y >= 39.44 AND shot_distance_from_goal <= 24.886333598985605 THEN mean = 0.01588101418465087; std = 0.003948041795809163 ,  
ELSE IF x in end_y >= 39.44 AND start_x <= 81.9 AND time_seconds <= 762.5967270000001 THEN mean = 0.01237717237725376; std = 0.004406549640620871 ,  
ELSE IF x in start_x <= 81.9 AND end_y >= 39.44 AND player_rank <= 0.00

In [47]:
# Summary measures of the fitted rule list, e.g. 'n_rules' and 'runtime'.
model.measures # returns a dictionary of subgroup discovery measure results for the rule list and subgroups

{'avg_supp': 1350.4866666666667,
 'kl_supp': 3.002195996663631,
 'wkl_supp': 4236.805477021408,
 'avg_usg': 249.90666666666667,
 'kl_usg': 3.5945423564140317,
 'wkl_usg': 884.0826978054753,
 'wacc_supp': 156.84626908602192,
 'wacc_usg': 35.3442937294906,
 'jacc_avg': 0.017302622210155112,
 'n_rules': 150,
 'avg_items': 2.933333333333333,
 'wkl_sum': 132612.4046708213,
 'std_rules': 0.12856491943258444,
 'top1_std': 0.003447272205695281,
 'length_orig': 11189.475277420263,
 'length_final': -113372.70818430537,
 'length_ratio': -10.132084425181686,
 'runtime': 1264.0900700092316}