In [3]:
import pandas as pd
import os
import json
import numpy as np
pd.set_option('display.max_columns', None)
import soccerdata as sd

# Load Data

In [4]:
data_directory = '../../data/raw_data/soccer_match'
events_directory = os.path.join(data_directory, 'events')
matches_directory = os.path.join(data_directory, 'matches')
other_directory = os.path.join(data_directory, 'other')
tags_directory = os.path.join(data_directory, 'tags')

events_file_names = os.listdir(events_directory)
matches_file_names = os.listdir(matches_directory)
other_file_names = os.listdir(other_directory)
tags_file_names = os.listdir(tags_directory)

In [5]:
events_dfs = {}
for file_name in events_file_names:
    events_dfs[f"{file_name.replace('.json','')}"] = pd.read_json(os.path.join(events_directory,file_name))

In [6]:
events_dfs.keys()

dict_keys(['events_European_Championship', 'events_France', 'events_Italy', 'events_Spain', 'events_Germany', 'events_World_Cup', 'events_England'])

In [7]:
matches_dfs = {}
for file_name in matches_file_names:
    matches_dfs[f"{file_name.replace('.json','')}"] = pd.read_json(os.path.join(matches_directory,file_name))

In [8]:
matches_dfs.keys()

dict_keys(['matches_Italy', 'matches_European_Championship', 'matches_Germany', 'matches_France', 'matches_Spain', 'matches_World_Cup', 'matches_England'])

In [9]:
other_dfs = {}
for file_name in other_file_names:
    other_dfs[f"{file_name.replace('.json','')}"] = pd.read_json(os.path.join(other_directory,file_name))

In [10]:
other_dfs.keys()

dict_keys(['playerank', 'referees', 'competitions', 'coaches', 'teams', 'players'])

In [11]:
tags_dfs = {}
for file_name in tags_file_names:
    tags_dfs[f"{file_name.replace('.csv','')}"] = pd.read_csv(os.path.join(tags_directory,file_name))

In [12]:
tags_dfs.keys()

dict_keys(['eventid2name', 'tags2name'])

# Analyze data

## Events Dataset

In [13]:
ev_it = events_dfs['events_Italy'].copy()

In [14]:
ev_it.head(3)

Unnamed: 0,eventId,subEventName,tags,playerId,positions,matchId,eventName,teamId,matchPeriod,eventSec,subEventId,id
0,8,Simple pass,[{'id': 1801}],8327,"[{'y': 52, 'x': 49}, {'y': 44, 'x': 43}]",2575959,Pass,3158,1H,2.530536,85,180423957
1,8,Simple pass,[{'id': 1801}],20438,"[{'y': 44, 'x': 43}, {'y': 17, 'x': 36}]",2575959,Pass,3158,1H,3.768418,85,180423958
2,7,Touch,[],8306,"[{'y': 17, 'x': 36}, {'y': 56, 'x': 78}]",2575959,Others on the ball,3158,1H,4.868265,72,180423959


In [15]:
tags_dfs['eventid2name'][tags_dfs['eventid2name']['event'].isin([8,10])].head(3)

Unnamed: 0,event,subevent,event_label,subevent_label
26,8,80,Pass,Cross
27,8,81,Pass,Hand pass
28,8,82,Pass,Head pass


## Playerank dataset

In [16]:
other_dfs['playerank'].head(3)

Unnamed: 0,goalScored,playerankScore,matchId,playerId,roleCluster,minutesPlayed
0,0,0.0053,2057991,10014,right CB,90
1,0,0.0009,2057992,10014,right CB,41
2,0,-0.0013,2057998,100140,central MF,90


In [17]:
other_dfs['playerank'][other_dfs['playerank']['playerId']==263591].head(3)

Unnamed: 0,goalScored,playerankScore,matchId,playerId,roleCluster,minutesPlayed
22039,0,0.0125,2576025,263591,central MF,34
22040,0,-0.0085,2576126,263591,right MF,19
22041,0,-0.0102,2576164,263591,right FW,28


## Competitions dataset

In [18]:
other_dfs['competitions'][['name','wyId', 'format', 'type']].head(3)

Unnamed: 0,name,wyId,format,type
0,Italian first division,524,Domestic league,club
1,English first division,364,Domestic league,club
2,Spanish first division,795,Domestic league,club


## Teams dataset

In [19]:
teams = other_dfs['teams']#[['name', 'wyId']]
teams.head(2)

Unnamed: 0,city,name,wyId,officialName,area,type
0,Newcastle upon Tyne,Newcastle United,1613,Newcastle United FC,"{'name': 'England', 'id': '0', 'alpha3code': '...",club
1,Vigo,Celta de Vigo,692,Real Club Celta de Vigo,"{'name': 'Spain', 'id': '724', 'alpha3code': '...",club


## Players dataset

In [20]:
other_dfs['players'][['firstName', 'lastName', 'currentTeamId', 'birthDate', 'role', 'wyId', 'currentNationalTeamId']].head(3)

Unnamed: 0,firstName,lastName,currentTeamId,birthDate,role,wyId,currentNationalTeamId
0,Harun,Tekin,4502,1989-06-17,"{'code2': 'GK', 'code3': 'GKP', 'name': 'Goalk...",32777,4687.0
1,Malang,Sarr,3775,1999-01-23,"{'code2': 'DF', 'code3': 'DEF', 'name': 'Defen...",393228,4423.0
2,Over,Mandanda,3772,1998-10-26,"{'code2': 'GK', 'code3': 'GKP', 'name': 'Goalk...",393230,


## Elo dataset

In [21]:
elo = sd.ClubElo()

In [22]:
elo.read_team_history('Inter').head(2)

  return pd.read_csv(


Unnamed: 0_level_0,rank,team,country,level,elo,to
from,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1946-07-07,,Inter,ITA,1,1722.613037,1946-09-22
1946-09-23,,Inter,ITA,1,1725.414429,1946-09-29


## Matches dataset

In [23]:
match_features = ['teamsData','seasonId', 'dateutc', 'winner', 'wyId', 'competitionId']

In [24]:
pd.DataFrame(matches_dfs['matches_Italy']).head(3)

Unnamed: 0,status,roundId,gameweek,teamsData,seasonId,dateutc,winner,venue,wyId,label,date,referees,duration,competitionId
0,Played,4406278,38,"{'3162': {'scoreET': 0, 'coachId': 251025, 'si...",181248,2018-05-20 18:45:00,3161,,2576335,"Lazio - Internazionale, 2 - 3","May 20, 2018 at 8:45:00 PM GMT+2","[{'refereeId': 377206, 'role': 'referee'}, {'r...",Regular,524
1,Played,4406278,38,"{'3158': {'scoreET': 0, 'coachId': 210119, 'si...",181248,2018-05-20 18:45:00,3158,MAPEI Stadium - Citt\u00e0 del Tricolore,2576336,"Sassuolo - Roma, 0 - 1","May 20, 2018 at 8:45:00 PM GMT+2","[{'refereeId': 377255, 'role': 'referee'}, {'r...",Regular,524
2,Played,4406278,38,"{'3173': {'scoreET': 0, 'coachId': 251044, 'si...",181248,2018-05-20 16:00:00,3173,,2576329,"Cagliari - Atalanta, 1 - 0","May 20, 2018 at 6:00:00 PM GMT+2","[{'refereeId': 377247, 'role': 'referee'}, {'r...",Regular,524


In [25]:
italy_games = pd.DataFrame(matches_dfs['matches_Italy'])[match_features]
italy_games.rename(columns = {'wyId':'matchId'}, inplace = True)
italy_games.head(3)

Unnamed: 0,teamsData,seasonId,dateutc,winner,matchId,competitionId
0,"{'3162': {'scoreET': 0, 'coachId': 251025, 'si...",181248,2018-05-20 18:45:00,3161,2576335,524
1,"{'3158': {'scoreET': 0, 'coachId': 210119, 'si...",181248,2018-05-20 18:45:00,3158,2576336,524
2,"{'3173': {'scoreET': 0, 'coachId': 251044, 'si...",181248,2018-05-20 16:00:00,3173,2576329,524


# Preprocessing functions

In [26]:
def accurate_not_accurate(events):
    events_df = events[events['eventId'].isin([8,10])].reset_index().copy()
    accurate = []
    for index, row in events_df.iterrows():
        list = [events_df['tags'].iloc[index][i]['id'] for i in range(len(events_df['tags'].iloc[index]))]
        if 1801 in list:
            accurate.append(1)
        elif 1802 in list:
            accurate.append(0)
        else:
            accurate.append(None)
    events_df['accurate'] = accurate
    events_df = events_df.drop(columns = ['tags','index'])
    return events_df

In [27]:
def matches_events(matches,events):
    home_passes = []
    home_accurate_passes = []
    home_shots = []
    home_accurate_shots = []

    away_passes = []
    away_accurate_passes = []
    away_shots = []
    away_accurate_shots = []

    events = events.dropna(subset = ['accurate'])
    events.loc[:,'accurate'] = events['accurate'].astype(int)
    
    condition_shot = (events['eventId']==10)
    condition_pass = (events['eventId']==8)
    condition_accurate = (events['accurate']==1)

    for index,row in matches.iterrows():
        match_id = matches['matchId'].loc[index]
        home_id = matches['homeId'].loc[index]
        away_id = matches['awayId'].loc[index]

        condition_match = (events['matchId']==match_id)
        condition_home_team = (events['teamId'] == home_id)
        condition_away_team = (events['teamId'] == away_id)
        try:
            home_passes.append(events[condition_match & condition_pass & condition_home_team].groupby('eventId').count()['id'].iloc[0])
        except:
            home_passes.append(0)
        try:
            home_accurate_passes.append(events[condition_match & condition_pass & condition_accurate & condition_home_team].groupby('eventId').count()['accurate'].iloc[0])
        except:
            home_accurate_passes.append(0)
        try:
            home_shots.append(events[condition_match & condition_shot & condition_home_team].groupby('eventId').count()['id'].iloc[0])
        except:
            home_shots.append(0)
        try:
            home_accurate_shots.append(events[condition_match & condition_shot & condition_accurate & condition_home_team].groupby('eventId').count()['accurate'].iloc[0])  
        except:
            home_accurate_shots.append(0)

        try:
            away_passes.append(events[condition_match & condition_pass & condition_away_team].groupby('eventId').count()['id'].iloc[0])
        except:
            away_passes.append(0)
        try:
            away_accurate_passes.append(events[condition_match & condition_pass & condition_accurate & condition_away_team].groupby('eventId').count()['accurate'].iloc[0])   
        except:
            away_accurate_passes.append(0)
        try:
            away_shots.append(events[condition_match & condition_shot & condition_away_team].groupby('eventId').count()['id'].iloc[0])
        except:
            away_shots.append(0)
        try:
            away_accurate_shots.append(events[condition_match & condition_shot & condition_accurate & condition_away_team].groupby('eventId').count()['accurate'].iloc[0])
        except:
            away_accurate_shots.append(0)
        
    matches['totalHomePasses'], matches['accurateHomePasses'], matches['totalHomeShots'], matches['accurateHomeShots'] = home_passes, home_accurate_passes, home_shots, home_accurate_shots
    matches['totalAwayPasses'], matches['accurateAwayPasses'], matches['totalAwayShots'], matches['accurateAwayShots'] = away_passes, away_accurate_passes, away_shots, away_accurate_shots
    return matches
        

In [28]:
def create_home_away_cols(matches):
    home = []
    away = []
    for index, row in matches.iterrows():
        team0 = list(matches.iloc[index].teamsData.keys())[0]
        team1 = list(matches.iloc[index].teamsData.keys())[1]
        if matches.iloc[index].teamsData[team0]['side'] == 'home':
            home.append(team0)
            away.append(team1)
        else:
            home.append(team1)
            away.append(team0)
    matches['homeId'] = home
    matches['homeId'] = matches['homeId'].astype(int)
    matches['awayId'] = away
    matches['awayId'] = matches['awayId'].astype(int)
    return matches

In [29]:
def get_home_away_names(matches, teams):
    teams = teams[['name', 'wyId']].copy()
    matches = matches.merge(teams, left_on = 'homeId', right_on = 'wyId')
    matches = matches.rename(columns = {'name' : 'homeTeam'}).drop(columns = 'wyId')
    matches = matches.merge(teams, left_on = 'awayId', right_on = 'wyId')
    matches = matches.rename(columns = {'name' : 'awayTeam'}).drop(columns = 'wyId')
    return matches.sort_values(by = 'dateutc', ascending = False)

In [30]:
def get_goals(matches):
    home_goals = []
    away_goals = []
    for index, row in matches.iterrows():
        team0 = list(matches.loc[index].teamsData.keys())[0]
        team1 = list(matches.loc[index].teamsData.keys())[1]
        goals0 = matches.loc[index].teamsData[team0]['score']
        goals1 = matches.loc[index].teamsData[team1]['score']
        if matches.loc[index].teamsData[team0]['side'] == 'home':
            home_goals.append(goals0)
            away_goals.append(goals1)
        else:
            home_goals.append(goals1)
            away_goals.append(goals0)
    matches['homeScore'] = home_goals
    matches['awayScore'] = away_goals
    return matches

In [31]:
def get_avg_playerank(matches, playerank):
    home_avg_ranks = []
    away_avg_ranks = []
    def calc_avg_scores(players, index):
        scores = []
        for player in players:           
            condition_player = playerank['playerId'] == player
            condition_match = playerank['matchId'] == matches.matchId.iloc[index]
            try:
                player_score = playerank[condition_player & condition_match]['playerankScore'].values[0]
                scores.append(player_score)
            except:
                None
        return np.mean(scores)

    for index, row in matches.iterrows():
        for team in list(matches.teamsData.iloc[index].keys()):
            players = []
            for dic in matches.teamsData.iloc[index][team]['formation']['lineup']:
                    players.append(dic['playerId'])
            if matches.iloc[index].teamsData[team]['side'] == 'home':
                home_avg_ranks.append(calc_avg_scores(players, index))
            else:
                away_avg_ranks.append(calc_avg_scores(players, index))
    matches['homeTeam_matchRank'] = home_avg_ranks
    matches['awayTeam_matchRank'] = away_avg_ranks
    matches = matches.drop(columns = 'teamsData')
    return matches
    
        

In [32]:
def accuracy_features(matches):
    matches['homePassAccuracy'] = matches['accurateHomePasses']/matches['totalHomePasses']
    matches['homeShotAccuracy'] = matches['accurateHomeShots']/matches['totalHomeShots']

    matches['awayPassAccuracy'] = matches['accurateAwayPasses']/matches['totalAwayPasses']
    matches['awayShotAccuracy'] = matches['accurateAwayShots']/matches['totalAwayShots']

    return matches

In [33]:
def matches_target(matches):
    def condition(row):
        if row['winner'] == row['homeId']:
            return 1
        elif row['winner'] == row['awayId']:
            return -1
        elif row['winner'] == 0:
            return 0
        else:
            return None
    matches['homeWins'] = matches.apply(condition, axis = 1)
    return matches

In [34]:
def get_last_matchranks(matches):
    
    matches = matches.sort_values(by = 'dateutc', ascending = False).reset_index()

    home_avg_matchrank = []
    away_avg_matchrank = []

    #matches_last_10_games_of_teams
    for index,row in matches.iterrows():
        #Teams and date
        home_id = matches['homeId'].iloc[index]
        away_id = matches['awayId'].iloc[index]
        date = matches['dateutc'].iloc[index]

        #Conditions
        condition_home_home = matches['homeId']==home_id
        condition_home_away = matches['awayId']==home_id
        condition_away_home = matches['homeId']==away_id
        condition_away_away = matches['awayId']==away_id
        condition_date = matches['dateutc']<date

        #Logic
        #home
        if (matches[(condition_home_home | condition_home_away) & condition_date].head(10).shape[0] < 11) & (matches[(condition_home_home | condition_home_away) & condition_date].head(10).shape[0] > 1):
            try:
                home_as_home_rank = matches[(condition_home_home | condition_home_away)  & condition_date].head(10).groupby('homeId').agg({'homeTeam_matchRank':'sum'}).loc[home_id,'homeTeam_matchRank']
                home_as_away_rank = matches[(condition_home_home | condition_home_away)  & condition_date].head(10).groupby('awayId').agg({'awayTeam_matchRank':'sum'}).loc[home_id,'awayTeam_matchRank']
                home_rank = home_as_home_rank + home_as_away_rank
                avg_home_rank = (home_rank / matches[(condition_home_home | condition_home_away) & condition_date].head(10).shape[0])
                home_avg_matchrank.append(avg_home_rank)
            except:
                home_avg_matchrank.append(None)
        else:
            home_avg_matchrank.append(None)
        #away
        if (matches[(condition_away_home | condition_away_away)  & condition_date].head(10).shape[0] < 11) & (matches[(condition_away_home | condition_away_away)  & condition_date].head(10).shape[0] > 1):
            try:
                away_as_home_rank = matches[(condition_away_home | condition_away_away)  & condition_date].head(10).groupby('homeId').agg({'homeTeam_matchRank':'sum'}).loc[away_id,'homeTeam_matchRank']
                away_as_away_rank = matches[(condition_away_home | condition_away_away)  & condition_date].head(10).groupby('awayId').agg({'awayTeam_matchRank':'sum'}).loc[away_id,'awayTeam_matchRank']
                away_rank = away_as_home_rank + away_as_away_rank
                avg_away_rank = (away_rank / matches[(condition_away_home | condition_away_away) & condition_date].head(10).shape[0])
                away_avg_matchrank.append(avg_away_rank)
            except:
                away_avg_matchrank.append(None)
        else:
            away_avg_matchrank.append(None)
            
    matches['homeLast10AvgRank'], matches['awayLast10AvgRank'] = home_avg_matchrank, away_avg_matchrank
    matches = matches.drop(columns = 'index')
    return matches
                

In [35]:
def get_wr(matches):
    matches = matches.sort_values(by = 'dateutc', ascending = False).reset_index()
    home_wins = matches['homeWins'] == 1
    home_loses = matches['homeWins'] == -1
    
    home_wr =[]
    away_wr =[]
    
    #matches_last_10_games_of_teams
    for index,row in matches.iterrows():
        home_id = matches['homeId'].iloc[index]
        away_id = matches['awayId'].iloc[index]
        date = matches['dateutc'].iloc[index]
        
        condition_home_home = matches['homeId']==home_id
        condition_home_away = matches['awayId']==home_id
        condition_away_home = matches['homeId']==away_id
        condition_away_away = matches['awayId']==away_id
        condition_date = matches['dateutc']<date
        #home_10_games_wr
        if (matches[(condition_home_home | condition_home_away) & condition_date].head(10).shape[0] < 11) & (matches[(condition_home_home | condition_home_away) & condition_date].head(10).shape[0] > 1):
            try:
                home_wins = matches[(condition_home_home | condition_home_away)  & condition_date].head(10).groupby('winner').count().loc[home_id,'homeWins']#home last 10 games wins
                home_wr.append(home_wins/10)
            except:
                home_wr.append(0)
        else:
            home_wr.append(None)

        #away_10_gams_wr
        if (matches[(condition_away_home | condition_away_away)  & condition_date].head(10).shape[0] < 11) & (matches[(condition_away_home | condition_away_away)  & condition_date].head(10).shape[0] > 1):
            try:
                away_wins = matches[(condition_away_home | condition_away_away) & condition_date].head(10).groupby('winner').count().loc[away_id,'homeWins'] #away last 10 games wins
                away_wr.append(away_wins/10)
            except:
                away_wr.append(0)
        else:
            away_wr.append(None)
    matches['homeWRlast10Games'] = home_wr
    matches['awayWRlast10Games'] = away_wr
    matches = matches.drop(columns = 'index')

    return matches


In [36]:
def get_performance(matches):

    
    home_pass_accu = []
    home_shot_accu = []
    
    away_pass_accu = []
    away_shot_accu = []

    #matches_last_10_games_of_teams
    for index,row in matches.iterrows():
        home_id = matches['homeId'].iloc[index]
        away_id = matches['awayId'].iloc[index]
        date = matches['dateutc'].iloc[index]
        
        condition_home_home = matches['homeId']==home_id
        condition_home_away = matches['awayId']==home_id
        condition_away_home = matches['homeId']==away_id
        condition_away_away = matches['awayId']==away_id
        condition_date = matches['dateutc']<date

        if (matches[(condition_home_home | condition_home_away) & condition_date].head(10).shape[0] < 11) & (matches[(condition_home_home | condition_home_away) & condition_date].head(10).shape[0] > 1):
            try:
                pass_accu_home_as_home = matches[(condition_home_home | condition_home_away) & condition_date][['homeId','homePassAccuracy']].head(10).groupby('homeId').mean().loc[home_id,'homePassAccuracy']
                shot_accu_home_as_home = matches[(condition_home_home | condition_home_away) & condition_date][['homeId','homeShotAccuracy']].head(10).groupby('homeId').mean().loc[home_id,'homeShotAccuracy']
            except:
                pass_accu_home_as_home = None
                shot_accu_home_as_home = None
            try:
                pass_accu_home_as_away = matches[(condition_home_home | condition_home_away) & condition_date][['awayId','homePassAccuracy']].head(10).groupby('awayId').mean().loc[home_id,'homePassAccuracy']
                shot_accu_home_as_away = matches[(condition_home_home | condition_home_away) & condition_date][['awayId','homeShotAccuracy']].head(10).groupby('awayId').mean().loc[home_id,'homeShotAccuracy']
            
            except:
                pass_accu_home_as_away = None
                shot_accu_home_as_away = None
            try:
                avg_pass_accu_home = (pass_accu_home_as_home + pass_accu_home_as_away)/2
                avg_shot_accu_home = (shot_accu_home_as_home + shot_accu_home_as_away)/2
                
                home_pass_accu.append(avg_pass_accu_home)
                home_shot_accu.append(avg_shot_accu_home)
            except:
                home_pass_accu.append(None)
                home_shot_accu.append(None)
        else:
            home_pass_accu.append(None)
            home_shot_accu.append(None)

        if (matches[(condition_away_home | condition_away_away)  & condition_date].head(10).shape[0] < 11) & (matches[(condition_away_home | condition_away_away)  & condition_date].head(10).shape[0] > 1):
            try:
                pass_accu_away_as_home = matches[(condition_away_home | condition_away_away) & condition_date][['homeId','homePassAccuracy']].head(10).groupby('homeId').mean().loc[away_id,'homePassAccuracy']
                shot_accu_away_as_home = matches[(condition_away_home | condition_away_away) & condition_date][['homeId','homeShotAccuracy']].head(10).groupby('homeId').mean().loc[away_id,'homeShotAccuracy']
            except:
                pass_accu_away_as_home = None
                shot_accu_away_as_home = None
            try:
                pass_accu_away_as_away = matches[(condition_away_home | condition_away_away) & condition_date][['awayId','homePassAccuracy']].head(10).groupby('awayId').mean().loc[away_id,'homePassAccuracy']
                shot_accu_away_as_away = matches[(condition_away_home | condition_away_away) & condition_date][['awayId','homeShotAccuracy']].head(10).groupby('awayId').mean().loc[away_id,'homeShotAccuracy']
            except:
                pass_accu_away_as_away = None
                shot_accu_away_as_away = None
                
            try:
                avg_pass_accu_away = (pass_accu_away_as_home + pass_accu_away_as_away)/2
                avg_shot_accu_away = (shot_accu_away_as_home + shot_accu_away_as_away)/2

                away_pass_accu.append(avg_pass_accu_away)
                away_shot_accu.append(avg_shot_accu_away)
            except:
                away_pass_accu.append(None)
                away_shot_accu.append(None)
        else:
            away_pass_accu.append(None)
            away_shot_accu.append(None)
    matches['avgHomePassAccuLast10Games'],matches['avgHomeShotAccuLast10Games'] = home_pass_accu, home_shot_accu
    matches['avgAwayPassAccuLast10Games'],matches['avgAwayShotAccuLast10Games'] = away_pass_accu, away_shot_accu
    return matches
    

# Dataset with all leagues


In [37]:
events_dfs.keys()

dict_keys(['events_European_Championship', 'events_France', 'events_Italy', 'events_Spain', 'events_Germany', 'events_World_Cup', 'events_England'])

In [38]:
matches_dfs.keys()

dict_keys(['matches_Italy', 'matches_European_Championship', 'matches_Germany', 'matches_France', 'matches_Spain', 'matches_World_Cup', 'matches_England'])

In [39]:
#other_dfs['events_Italy']

### Events

Getting events from all of the matches of the league, and then getting only the passes and shots with a new column indicating if it was accurate or not.

In [40]:
#Getting all the passes and shots, and retrieving if it was an accurate pass or not
ev_it = events_dfs['events_Italy'].copy()

In [41]:
ev_it = accurate_not_accurate(ev_it)

In [42]:
ev_it.head(5)

Unnamed: 0,eventId,subEventName,playerId,positions,matchId,eventName,teamId,matchPeriod,eventSec,subEventId,id,accurate
0,8,Simple pass,8327,"[{'y': 52, 'x': 49}, {'y': 44, 'x': 43}]",2575959,Pass,3158,1H,2.530536,85,180423957,1
1,8,Simple pass,20438,"[{'y': 44, 'x': 43}, {'y': 17, 'x': 36}]",2575959,Pass,3158,1H,3.768418,85,180423958,1
2,8,Simple pass,8306,"[{'y': 15, 'x': 72}, {'y': 25, 'x': 72}]",2575959,Pass,3158,1H,10.376066,85,180423962,1
3,8,Simple pass,86366,"[{'y': 82, 'x': 38}, {'y': 88, 'x': 43}]",2575959,Pass,3172,1H,16.240563,85,180423979,1
4,8,Simple pass,8306,"[{'y': 15, 'x': 61}, {'y': 15, 'x': 37}]",2575959,Pass,3158,1H,19.153235,85,180423968,1


### Matches

Getting matches from a league and relevant features

In [43]:
#Relevant features
match_features = ['teamsData','seasonId', 'dateutc', 'winner', 'wyId', 'competitionId']

In [44]:
#retrieving italy matches
italy_games = pd.DataFrame(matches_dfs['matches_Italy'])[match_features].copy()
italy_games.rename(columns = {'wyId':'matchId'}, inplace = True)

In [45]:
italy_games.head(3)

Unnamed: 0,teamsData,seasonId,dateutc,winner,matchId,competitionId
0,"{'3162': {'scoreET': 0, 'coachId': 251025, 'si...",181248,2018-05-20 18:45:00,3161,2576335,524
1,"{'3158': {'scoreET': 0, 'coachId': 210119, 'si...",181248,2018-05-20 18:45:00,3158,2576336,524
2,"{'3173': {'scoreET': 0, 'coachId': 251044, 'si...",181248,2018-05-20 16:00:00,3173,2576329,524


## Processing data and feature engineering

Using built functions to process data and obtain new features

### Columns for calculation of new features

#### homeId and awayId columns

Extracting homeId and awayId from match data in teamsData column

In [46]:
#Creating homeId and awayId columns
italy_matches_prep = create_home_away_cols(italy_games).copy()

In [47]:
italy_matches_prep.head(3)

Unnamed: 0,teamsData,seasonId,dateutc,winner,matchId,competitionId,homeId,awayId
0,"{'3162': {'scoreET': 0, 'coachId': 251025, 'si...",181248,2018-05-20 18:45:00,3161,2576335,524,3162,3161
1,"{'3158': {'scoreET': 0, 'coachId': 210119, 'si...",181248,2018-05-20 18:45:00,3158,2576336,524,3315,3158
2,"{'3173': {'scoreET': 0, 'coachId': 251044, 'si...",181248,2018-05-20 16:00:00,3173,2576329,524,3173,3172


#### homeTeam, awayTeam, homeScore and awayScore columns

Retrieving score and homeTeam and awayTeam namesfrom teams dataset

In [48]:
#Getting teams names from homeId and awayId and extracting from teams dataset
italy_matches_prep = get_home_away_names(italy_matches_prep, other_dfs['teams'])

In [49]:
#Getting scores from teamsData column
italy_matches_prep = get_goals(italy_matches_prep)

In [50]:
italy_matches_prep.head(3)

Unnamed: 0,teamsData,seasonId,dateutc,winner,matchId,competitionId,homeId,awayId,homeTeam,awayTeam,homeScore,awayScore
0,"{'3162': {'scoreET': 0, 'coachId': 251025, 'si...",181248,2018-05-20 18:45:00,3161,2576335,524,3162,3161,Lazio,Internazionale,2,3
58,"{'3158': {'scoreET': 0, 'coachId': 210119, 'si...",181248,2018-05-20 18:45:00,3158,2576336,524,3315,3158,Sassuolo,Roma,0,1
45,"{'3164': {'scoreET': 0, 'coachId': 210121, 'si...",181248,2018-05-20 16:00:00,3204,2576337,524,3204,3164,SPAL,Sampdoria,3,1


#### homeTeam_matchRank	& awayTeam_matchRank columns

Calculating the average rank of the team based on the performance of lineup players of the match in each team, for the current match

In [51]:
#Getting matchrank as average of playerranks of each team in current match
italy_matches_prep = get_avg_playerank(italy_matches_prep, other_dfs['playerank'])

In [52]:
italy_matches_prep.head(3)

Unnamed: 0,seasonId,dateutc,winner,matchId,competitionId,homeId,awayId,homeTeam,awayTeam,homeScore,awayScore,homeTeam_matchRank,awayTeam_matchRank
0,181248,2018-05-20 18:45:00,3161,2576335,524,3162,3161,Lazio,Internazionale,2,3,0.01185,0.00595
58,181248,2018-05-20 18:45:00,3158,2576336,524,3315,3158,Sassuolo,Roma,0,1,0.007,0.02564
45,181248,2018-05-20 16:00:00,3204,2576337,524,3204,3164,SPAL,Sampdoria,3,1,0.0132,0.00801


#### totalHomePasses, accurateHomePasses, totalHomeShots, accurateHomeShots, totalAwayPasses, accurateAwayPasses, totalAwayShots & accurateAwayShots columns

Building columns in order to get accuracy measures, retrieving from events_df with passes and shots with accuracy categorization from events section

In [53]:
italy_matches_prep = matches_events(italy_matches_prep,ev_it)

In [54]:
italy_matches_prep.head(1)

Unnamed: 0,seasonId,dateutc,winner,matchId,competitionId,homeId,awayId,homeTeam,awayTeam,homeScore,awayScore,homeTeam_matchRank,awayTeam_matchRank,totalHomePasses,accurateHomePasses,totalHomeShots,accurateHomeShots,totalAwayPasses,accurateAwayPasses,totalAwayShots,accurateAwayShots
0,181248,2018-05-20 18:45:00,3161,2576335,524,3162,3161,Lazio,Internazionale,2,3,0.01185,0.00595,302,249,14,4,492,417,14,6


#### homePassAccuracy, homeShotAccuracy, awayPassAccuracy, awayShotAccuracy

In [55]:
italy_matches_prep = accuracy_features(italy_matches_prep)

In [56]:
italy_matches_prep.head(1)

Unnamed: 0,seasonId,dateutc,winner,matchId,competitionId,homeId,awayId,homeTeam,awayTeam,homeScore,awayScore,homeTeam_matchRank,awayTeam_matchRank,totalHomePasses,accurateHomePasses,totalHomeShots,accurateHomeShots,totalAwayPasses,accurateAwayPasses,totalAwayShots,accurateAwayShots,homePassAccuracy,homeShotAccuracy,awayPassAccuracy,awayShotAccuracy
0,181248,2018-05-20 18:45:00,3161,2576335,524,3162,3161,Lazio,Internazionale,2,3,0.01185,0.00595,302,249,14,4,492,417,14,6,0.824503,0.285714,0.847561,0.428571


### Target

Target is needed to build the feature WR of last 10 games so is setup before features:
- If home wins: 1
- If draw: 0
- If away wins: -1

In [57]:
italy_matches_prep = matches_target(italy_matches_prep)

In [58]:
italy_matches_prep.head(1)

Unnamed: 0,seasonId,dateutc,winner,matchId,competitionId,homeId,awayId,homeTeam,awayTeam,homeScore,awayScore,homeTeam_matchRank,awayTeam_matchRank,totalHomePasses,accurateHomePasses,totalHomeShots,accurateHomeShots,totalAwayPasses,accurateAwayPasses,totalAwayShots,accurateAwayShots,homePassAccuracy,homeShotAccuracy,awayPassAccuracy,awayShotAccuracy,homeWins
0,181248,2018-05-20 18:45:00,3161,2576335,524,3162,3161,Lazio,Internazionale,2,3,0.01185,0.00595,302,249,14,4,492,417,14,6,0.824503,0.285714,0.847561,0.428571,-1


### Feature engineering

In this section we engineer some relevant features for analysis
- Avg Shot and pass accuracy of last 10 games for home and away team
- Win ratio of last 10 games for home and away team
- And avg match rank for last 10 games for home and away team

#### avgHomePassAccuLast10Games, avgHomeShotAccuLast10Games, avgAwayPassAccuLast10Games,	avgAwayShotAccuLast10Games

We calculate the avg of accuracy of last 10 games for each team

In [59]:
italy_matches_prep = get_performance(italy_matches_prep)

In [60]:
italy_matches_prep.head(1)

Unnamed: 0,seasonId,dateutc,winner,matchId,competitionId,homeId,awayId,homeTeam,awayTeam,homeScore,awayScore,homeTeam_matchRank,awayTeam_matchRank,totalHomePasses,accurateHomePasses,totalHomeShots,accurateHomeShots,totalAwayPasses,accurateAwayPasses,totalAwayShots,accurateAwayShots,homePassAccuracy,homeShotAccuracy,awayPassAccuracy,awayShotAccuracy,homeWins,avgHomePassAccuLast10Games,avgHomeShotAccuLast10Games,avgAwayPassAccuLast10Games,avgAwayShotAccuLast10Games
0,181248,2018-05-20 18:45:00,3161,2576335,524,3162,3161,Lazio,Internazionale,2,3,0.01185,0.00595,302,249,14,4,492,417,14,6,0.824503,0.285714,0.847561,0.428571,-1,0.799875,0.460678,0.851845,0.345496


#### homeLast10AvgRank,	awayLast10AvgRank

We calculate the avg rank of last 10 matches for each team

In [61]:
italy_matches_prep = get_last_matchranks(italy_matches_prep)

In [62]:
italy_matches_prep.head(1)

Unnamed: 0,seasonId,dateutc,winner,matchId,competitionId,homeId,awayId,homeTeam,awayTeam,homeScore,awayScore,homeTeam_matchRank,awayTeam_matchRank,totalHomePasses,accurateHomePasses,totalHomeShots,accurateHomeShots,totalAwayPasses,accurateAwayPasses,totalAwayShots,accurateAwayShots,homePassAccuracy,homeShotAccuracy,awayPassAccuracy,awayShotAccuracy,homeWins,avgHomePassAccuLast10Games,avgHomeShotAccuLast10Games,avgAwayPassAccuLast10Games,avgAwayShotAccuLast10Games,homeLast10AvgRank,awayLast10AvgRank
0,181248,2018-05-20 18:45:00,3161,2576335,524,3162,3161,Lazio,Internazionale,2,3,0.01185,0.00595,302,249,14,4,492,417,14,6,0.824503,0.285714,0.847561,0.428571,-1,0.799875,0.460678,0.851845,0.345496,0.004582,0.007148


#### homeWRlast10Games,	awayWRlast10Games

We calculate the WR of the last 10 games for each team

In [63]:
italy_matches_prep = get_wr(italy_matches_prep)

In [64]:
italy_matches_prep.head(3)

Unnamed: 0,seasonId,dateutc,winner,matchId,competitionId,homeId,awayId,homeTeam,awayTeam,homeScore,awayScore,homeTeam_matchRank,awayTeam_matchRank,totalHomePasses,accurateHomePasses,totalHomeShots,accurateHomeShots,totalAwayPasses,accurateAwayPasses,totalAwayShots,accurateAwayShots,homePassAccuracy,homeShotAccuracy,awayPassAccuracy,awayShotAccuracy,homeWins,avgHomePassAccuLast10Games,avgHomeShotAccuLast10Games,avgAwayPassAccuLast10Games,avgAwayShotAccuLast10Games,homeLast10AvgRank,awayLast10AvgRank,homeWRlast10Games,awayWRlast10Games
0,181248,2018-05-20 18:45:00,3161,2576335,524,3162,3161,Lazio,Internazionale,2,3,0.01185,0.00595,302,249,14,4,492,417,14,6,0.824503,0.285714,0.847561,0.428571,-1,0.799875,0.460678,0.851845,0.345496,0.004582,0.007148,0.5,0.5
1,181248,2018-05-20 18:45:00,3158,2576336,524,3315,3158,Sassuolo,Roma,0,1,0.007,0.02564,337,262,14,3,500,423,15,6,0.777448,0.214286,0.846,0.4,-1,0.846916,0.446443,0.806129,0.254985,0.004757,0.007135,0.5,0.6
2,181248,2018-05-20 16:00:00,3204,2576337,524,3204,3164,SPAL,Sampdoria,3,1,0.0132,0.00801,363,315,14,5,446,386,9,3,0.867769,0.357143,0.865471,0.333333,1,0.82153,0.350605,0.796329,0.502669,0.010732,0.006874,0.2,0.3


### Dataset cleaning

Droping Nan rows and index column

In [65]:
italy_matches_prep_cleaned = italy_matches_prep.dropna(subset = [
    'avgHomePassAccuLast10Games','avgHomeShotAccuLast10Games',
    'avgAwayPassAccuLast10Games','avgAwayShotAccuLast10Games',
    'homeWRlast10Games', 'awayWRlast10Games',
    'homeLast10AvgRank', 'awayLast10AvgRank'
])

In [66]:
italy_matches_prep_cleaned.head(1)

Unnamed: 0,seasonId,dateutc,winner,matchId,competitionId,homeId,awayId,homeTeam,awayTeam,homeScore,awayScore,homeTeam_matchRank,awayTeam_matchRank,totalHomePasses,accurateHomePasses,totalHomeShots,accurateHomeShots,totalAwayPasses,accurateAwayPasses,totalAwayShots,accurateAwayShots,homePassAccuracy,homeShotAccuracy,awayPassAccuracy,awayShotAccuracy,homeWins,avgHomePassAccuLast10Games,avgHomeShotAccuLast10Games,avgAwayPassAccuLast10Games,avgAwayShotAccuLast10Games,homeLast10AvgRank,awayLast10AvgRank,homeWRlast10Games,awayWRlast10Games
0,181248,2018-05-20 18:45:00,3161,2576335,524,3162,3161,Lazio,Internazionale,2,3,0.01185,0.00595,302,249,14,4,492,417,14,6,0.824503,0.285714,0.847561,0.428571,-1,0.799875,0.460678,0.851845,0.345496,0.004582,0.007148,0.5,0.5


# Model training

### Defining variables

In [67]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping

2023-09-07 16:47:44.787961: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-09-07 16:47:45.138059: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-09-07 16:47:45.139535: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [68]:
X = italy_matches_prep_cleaned[['avgHomePassAccuLast10Games','avgHomeShotAccuLast10Games',
                     'avgAwayPassAccuLast10Games','avgAwayShotAccuLast10Games',
                     'homeWRlast10Games', 'awayWRlast10Games',
                    'homeLast10AvgRank', 'awayLast10AvgRank'
                    ]]
y = italy_matches_prep_cleaned['homeWins']

In [69]:
y = pd.get_dummies(y,prefix = 'Class', dtype = int)

In [70]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3)

In [71]:
def initialize_model():
    model = Sequential()
    model.add(layers.Dense(80, input_dim = 8, activation = 'relu'))

    model.add(layers.Dense(30, activation = 'relu'))
    
    model.add(layers.Dense(16, activation = 'relu'))

    model.add(layers.Dense(6, activation = 'relu'))
    model.add(layers.Dense(6, activation = 'relu'))
    model.add(layers.Dropout(0.2))
    model.add(layers.Dense(4, activation = 'relu'))
    #model.add(layers.Dropout(0.1))
    model.add(layers.Dense(3, activation = 'softmax'))

    model.compile(
        loss = 'categorical_crossentropy',
        optimizer = 'adam',
        metrics = 'accuracy'
    )

    return model

In [72]:
es = EarlyStopping(patience = 30, restore_best_weights=True)
model = initialize_model()
history = model.fit(
    X_train,
    y_train,
    validation_split = 0.3,
    shuffle = True,
    epochs = 200,
    batch_size =  64,
    verbose = 0
)

In [73]:
model.evaluate(X_test, y_test, verbose = 0)[1]

0.4901960790157318

# Building all leagues dataframe

We build the whole dataset for all leagues

In [74]:
def prep(matches, events):
    #Using relevant features for preprocessing
    match_features = ['teamsData','seasonId', 'dateutc', 'winner', 'wyId', 'competitionId']
    games = matches[match_features].copy()
    games.rename(columns = {'wyId':'matchId'}, inplace = True)

    #Processing events
    events_prep = accurate_not_accurate(events)

    #Getting current game info
    #Creating homeId and awayId columns
    games = create_home_away_cols(games).copy()
    #Getting teams names from homeId and awayId and extracting from teams dataset
    games = get_home_away_names(games, other_dfs['teams'])
    #Getting scores from teamsData column
    games = get_goals(games)
    #Getting matchrank as average of playerranks of each team in current match
    games = get_avg_playerank(games, other_dfs['playerank']) 
    #Getting events from the match
    games = matches_events(games,events_prep)
    #Getting accuracies
    games = accuracy_features(games)
    #Getting target
    games = matches_target(games) #este si

    #Features
    #Performance
    games = get_performance(games)
    #Avgmatchranks
    games = get_last_matchranks(games)
    #WRates
    games = get_wr(games)

    #Cleaning
    games = games.dropna(subset = [
    'avgHomePassAccuLast10Games','avgHomeShotAccuLast10Games',
    'avgAwayPassAccuLast10Games','avgAwayShotAccuLast10Games',
    'homeWRlast10Games', 'awayWRlast10Games',
    'homeLast10AvgRank', 'awayLast10AvgRank'
    ])

    return games
    
    

In [75]:
events_dfs.keys()

dict_keys(['events_European_Championship', 'events_France', 'events_Italy', 'events_Spain', 'events_Germany', 'events_World_Cup', 'events_England'])

In [76]:
matches_dfs.keys()

dict_keys(['matches_Italy', 'matches_European_Championship', 'matches_Germany', 'matches_France', 'matches_Spain', 'matches_World_Cup', 'matches_England'])

In [84]:
sorted(matches_dfs.keys())

['matches_England',
 'matches_European_Championship',
 'matches_France',
 'matches_Germany',
 'matches_Italy',
 'matches_Spain',
 'matches_World_Cup']

In [85]:
sorted(events_dfs.keys())

['events_England',
 'events_European_Championship',
 'events_France',
 'events_Germany',
 'events_Italy',
 'events_Spain',
 'events_World_Cup']

## Cleaning data for each league

We apply the previous function to clean the data from every league

In [82]:
european_ch_df_cleaned = prep(matches_dfs['matches_European_Championship'], events_dfs['events_European_Championship'])

In [87]:
france_df_cleaned = prep(matches_dfs['matches_France'], events_dfs['events_France'])

In [88]:
italy_df_cleaned = prep(matches_dfs['matches_Italy'], events_dfs['events_Italy'])

In [89]:
germany_df_cleaned = prep(matches_dfs['matches_Germany'], events_dfs['events_Germany'])

In [90]:
spain_df_cleaned = prep(matches_dfs['matches_Spain'], events_dfs['events_Spain'])

In [91]:
england_df_cleaned = prep(matches_dfs['matches_England'], events_dfs['events_England'])

In [92]:
worldcup_df_cleaned = prep(matches_dfs['matches_World_Cup'], events_dfs['events_World_Cup'])

### Stacking all the leagues together

We stack all the leagues one over each other

In [93]:
all_leagues_cleaned = pd.concat([european_ch_df_cleaned, 
                                 france_df_cleaned,
                                 italy_df_cleaned,
                                 germany_df_cleaned,
                                 spain_df_cleaned,
                                 england_df_cleaned,
                                 worldcup_df_cleaned
                                ],
                               axis = 0)

In [97]:
competitions = {524 : 'Serie A',
                364 : 'Premier League',
                795 : 'LaLiga',
                412 : 'Ligue 1',
                426 : 'Bundesliga',
                102 : 'UEFA championship',
                28 : 'World Cup'
               }

Unnamed: 0,name,wyId,format,area,type
0,Italian first division,524,Domestic league,"{'name': 'Italy', 'id': '380', 'alpha3code': '...",club
1,English first division,364,Domestic league,"{'name': 'England', 'id': '0', 'alpha3code': '...",club
2,Spanish first division,795,Domestic league,"{'name': 'Spain', 'id': '724', 'alpha3code': '...",club
3,French first division,412,Domestic league,"{'name': 'France', 'id': '250', 'alpha3code': ...",club
4,German first division,426,Domestic league,"{'name': 'Germany', 'id': '276', 'alpha3code':...",club
5,European Championship,102,International cup,"{'name': '', 'id': 0, 'alpha3code': 'XEU', 'al...",international
6,World Cup,28,International cup,"{'name': '', 'id': 0, 'alpha3code': 'XWO', 'al...",international


In [96]:
all_leagues_cleaned.competitionId.value_counts()

competitionId
412    341
524    339
364    335
795    331
426    271
102     13
28       8
Name: count, dtype: int64

In [859]:
all_leagues_cleaned.to_csv('../../processed_data/soccer_match/soccer_match_all_leagues_cleaned.csv', header = True, index = False)

## Model for all leagues

We select the features and targuet for the whole dataset and train a new model

#### Selecting features X and target y

We set X and y and encode y

In [839]:
X_all = all_leagues_cleaned[['avgHomePassAccuLast10Games','avgHomeShotAccuLast10Games',
                     'avgAwayPassAccuLast10Games','avgAwayShotAccuLast10Games',
                     'homeWRlast10Games', 'awayWRlast10Games',
                    'homeLast10AvgRank', 'awayLast10AvgRank'
                    ]]
y_all = all_leagues_cleaned['homeWins']


In [840]:
y_all = pd.get_dummies(y_all,prefix = 'Class', dtype = int)

#### Splitting the dataset intro train and test

We then split the dataset into train and test

In [841]:
X_train_all, X_test_all, y_train_all, y_test_all = train_test_split(X_all, y_all, test_size=0.3)

### Model1: Neural network

We define our model

In [711]:
import xgboost as xgb
from sklearn.metrics import accuracy_score

In [1]:
def initialize_model1():
    model = Sequential()
    model.add(layers.Dense(80, input_dim = 8, activation = 'relu'))
    
    model.add(layers.Dense(16, activation = 'relu'))

    model.add(layers.Dense(6, activation = 'relu'))
    model.add(layers.Dense(6, activation = 'relu'))
    model.add(layers.Dropout(0.2))
    model.add(layers.Dense(4, activation = 'relu'))
    #model.add(layers.Dropout(0.1))
    model.add(layers.Dense(3, activation = 'softmax'))

    model.compile(
        loss = 'categorical_crossentropy',
        optimizer = 'adam',
        metrics = 'accuracy'
    )

    return model

#### Initialization and evaluation

We initialize and train the model and evaluate on test data

In [2]:
es = EarlyStopping(patience = 30, restore_best_weights=True)
model = initialize_model()
history = model.fit(
    X_train_all,
    y_train_all,
    validation_split = 0.3,
    shuffle = True,
    epochs = 200,
    batch_size =  64,
    verbose = 0
)

NameError: name 'EarlyStopping' is not defined

In [None]:
model.evaluate(X_test_all, y_test_all, verbose = 0)[1]

## Model2: XGBOOST

In [854]:
y_all_1 = all_leagues_cleaned['homeWins']

In [855]:
y_all_1 = y_all_1.map({-1: 0, 0: 1, 1: 2})

In [856]:
X_train_all_1, X_test_all_1, y_train_all_1, y_test_all_1 = train_test_split(X_all, y_all_1, test_size=0.3)
X_test_all_1, X_val, y_test_all_1, y_val = train_test_split(
    X_test_all_1, y_test_all_1, test_size = 0.5 # TEST = 15%
)

In [857]:
model2 = xgb.XGBClassifier(objective='multi:softmax', random_state=42, learning_rate = 0.01)
model2.fit(X_train_all_1, y_train_all_1,
           eval_set=[(X_train_all_1, y_train_all_1), (X_val, y_val)],
           early_stopping_rounds=5,
           verbose = 0)

  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):


In [858]:
y_pred_all = model2.predict(X_test_all_1)
accuracy_score(y_test_all_1, y_pred_all)

  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


0.43089430894308944