In [166]:
import pandas as pd
import os
import json
import numpy as np
pd.set_option('display.max_columns', None)

# Load Data

In [2]:
data_directory = '../../raw_data/soccer_match-paper_dataset'
events_directory = os.path.join(data_directory, 'events')
matches_directory = os.path.join(data_directory, 'matches')
other_directory = os.path.join(data_directory, 'other')
tags_directory = os.path.join(data_directory, 'tags')

events_file_names = os.listdir(events_directory)
matches_file_names = os.listdir(matches_directory)
other_file_names = os.listdir(other_directory)
tags_file_names = os.listdir(tags_directory)

In [3]:
events_dfs = {}
for file_name in events_file_names:
    events_dfs[f"{file_name.replace('.json','')}"] = pd.read_json(os.path.join(events_directory,file_name))

In [4]:
events_dfs.keys()

dict_keys(['events_European_Championship', 'events_France', 'events_Italy', 'events_Spain', 'events_Germany', 'events_World_Cup', 'events_England'])

In [5]:
matches_dfs = {}
for file_name in matches_file_names:
    matches_dfs[f"{file_name.replace('.json','')}"] = pd.read_json(os.path.join(matches_directory,file_name))

In [6]:
matches_dfs.keys()

dict_keys(['matches_Italy', 'matches_European_Championship', 'matches_Germany', 'matches_France', 'matches_Spain', 'matches_World_Cup', 'matches_England'])

In [7]:
other_dfs = {}
for file_name in other_file_names:
    other_dfs[f"{file_name.replace('.json','')}"] = pd.read_json(os.path.join(other_directory,file_name))

In [8]:
other_dfs.keys()

dict_keys(['playerank', 'referees', 'competitions', 'coaches', 'teams', 'players'])

In [9]:
tags_dfs = {}
for file_name in tags_file_names:
    tags_dfs[f"{file_name.replace('.csv','')}"] = pd.read_csv(os.path.join(tags_directory,file_name))

In [10]:
tags_dfs.keys()

dict_keys(['eventid2name', 'tags2name'])

# Analyze data

## Events Dataset

In [11]:
events = events_dfs['events_Italy']

In [12]:
def accurate_not_accurate(events):
    accurate = []
    for index, row in events.iterrows():
        list = [events['tags'][index][i]['id'] for i in range(len(events['tags'][index]))]
        if 1801 in list:
            accurate.append(1)
        elif 1802 in list:
            accurate.append(0)
        else:
            accurate.append(None)
    events['accurate'] = accurate
    return events

In [13]:
events = accurate_not_accurate(events)

In [14]:
def matches_events(matches):
    home_passes = []
    home_accurate_passes = []
    home_shots = []
    home_accurate_shots = []

    away_passes = []
    away_accurate_passes = []
    away_shots = []
    away_accurate_shots = []
    
    condition_shot = (events['eventId']==10)
    condition_pass = (events['eventId']==8)
    condition_accurate = (events['accurate']==1)

    for index,row in matches.iterrows():
        match_id = matches['matchId'].loc[index]
        home_id = matches['homeId'].loc[index]
        away_id = matches['awayId'].loc[index]

        condition_match = (events['matchId']==match_id)
        condition_home_team = (events['teamId'] == home_id)
        condition_away_team = (events['teamId'] == away_id)
        try:
            home_passes.append(events[condition_match & condition_pass & condition_home_team].groupby('eventId').count()['id'].iloc[0])
        except:
            home_passes.append(0)
        try:
            home_accurate_passes.append(events[condition_match & condition_pass & condition_accurate & condition_home_team].groupby('eventId').count()['accurate'].iloc[0])
        except:
            home_accurate_passes.append(0)
        try:
            home_shots.append(events[condition_match & condition_shot & condition_home_team].groupby('eventId').count()['id'].iloc[0])
        except:
            home_shots.append(0)
        try:
            home_accurate_shots.append(events[condition_match & condition_shot & condition_accurate & condition_home_team].groupby('eventId').count()['accurate'].iloc[0])  
        except:
            home_accurate_shots.append(0)

        try:
            away_passes.append(events[condition_match & condition_pass & condition_away_team].groupby('eventId').count()['id'].iloc[0])
        except:
            away_passes.append(0)
        try:
            away_accurate_passes.append(events[condition_match & condition_pass & condition_accurate & condition_away_team].groupby('eventId').count()['accurate'].iloc[0])   
        except:
            away_accurate_passes.append(0)
        try:
            away_shots.append(events[condition_match & condition_shot & condition_away_team].groupby('eventId').count()['id'].iloc[0])
        except:
            away_shots.append(0)
        try:
            away_accurate_shots.append(events[condition_match & condition_shot & condition_accurate & condition_away_team].groupby('eventId').count()['accurate'].iloc[0])
        except:
            away_accurate_shots.append(0)
        
    matches['totalHomePasses'], matches['accurateHomePasses'], matches['totalHomeShots'], matches['accurateHomeShots'] = home_passes, home_accurate_passes, home_shots, home_accurate_shots
    matches['totalAwayPasses'], matches['accurateAwayPasses'], matches['totalAwayShots'], matches['accurateAwayShots'] = away_passes, away_accurate_passes, away_shots, away_accurate_shots
    return matches
        

In [16]:
tags_dfs['eventid2name'][tags_dfs['eventid2name']['event'].isin([3,8,10])].head(3)

Unnamed: 0,event,subevent,event_label,subevent_label
12,3,30,Free Kick,Corner
13,3,31,Free Kick,Free Kick
14,3,32,Free Kick,Free kick cross


In [17]:
tags_dfs['tags2name'].head(3)

Unnamed: 0,Tag,Label,Description
0,101,Goal,Goal
1,102,own_goal,Own goal
2,301,assist,Assist


## Playerank dataset

In [18]:
other_dfs['playerank'].head(3)

Unnamed: 0,goalScored,playerankScore,matchId,playerId,roleCluster,minutesPlayed
0,0,0.0053,2057991,10014,right CB,90
1,0,0.0009,2057992,10014,right CB,41
2,0,-0.0013,2057998,100140,central MF,90


In [19]:
other_dfs['playerank'][other_dfs['playerank']['playerId']==263591].head(3)

Unnamed: 0,goalScored,playerankScore,matchId,playerId,roleCluster,minutesPlayed
22039,0,0.0125,2576025,263591,central MF,34
22040,0,-0.0085,2576126,263591,right MF,19
22041,0,-0.0102,2576164,263591,right FW,28


## Competitions dataset

In [20]:
other_dfs['competitions'][['name','wyId', 'format', 'type']].head(3)

Unnamed: 0,name,wyId,format,type
0,Italian first division,524,Domestic league,club
1,English first division,364,Domestic league,club
2,Spanish first division,795,Domestic league,club


## Teams dataset

In [62]:
teams = other_dfs['teams'][['name', 'wyId']]
teams.head(100)

Unnamed: 0,name,wyId
0,Newcastle United,1613
1,Celta de Vigo,692
2,Espanyol,691
3,Deportivo Alav\u00e9s,696
4,Levante,695
...,...,...
95,Sevilla,680
96,M\u00e1laga,683
97,Villarreal,682
98,Korea Republic,14855


## Players dataset

In [22]:
other_dfs['players'][['firstName', 'lastName', 'currentTeamId', 'birthDate', 'role', 'wyId', 'currentNationalTeamId']].head(3)

Unnamed: 0,firstName,lastName,currentTeamId,birthDate,role,wyId,currentNationalTeamId
0,Harun,Tekin,4502,1989-06-17,"{'code2': 'GK', 'code3': 'GKP', 'name': 'Goalk...",32777,4687.0
1,Malang,Sarr,3775,1999-01-23,"{'code2': 'DF', 'code3': 'DEF', 'name': 'Defen...",393228,4423.0
2,Over,Mandanda,3772,1998-10-26,"{'code2': 'GK', 'code3': 'GKP', 'name': 'Goalk...",393230,


## Matches dataset

In [24]:
pd.DataFrame(matches_dfs['matches_Italy']).head(3)

Unnamed: 0,status,roundId,gameweek,teamsData,seasonId,dateutc,winner,venue,wyId,label,date,referees,duration,competitionId
0,Played,4406278,38,"{'3162': {'scoreET': 0, 'coachId': 251025, 'si...",181248,2018-05-20 18:45:00,3161,,2576335,"Lazio - Internazionale, 2 - 3","May 20, 2018 at 8:45:00 PM GMT+2","[{'refereeId': 377206, 'role': 'referee'}, {'r...",Regular,524
1,Played,4406278,38,"{'3158': {'scoreET': 0, 'coachId': 210119, 'si...",181248,2018-05-20 18:45:00,3158,MAPEI Stadium - Citt\u00e0 del Tricolore,2576336,"Sassuolo - Roma, 0 - 1","May 20, 2018 at 8:45:00 PM GMT+2","[{'refereeId': 377255, 'role': 'referee'}, {'r...",Regular,524
2,Played,4406278,38,"{'3173': {'scoreET': 0, 'coachId': 251044, 'si...",181248,2018-05-20 16:00:00,3173,,2576329,"Cagliari - Atalanta, 1 - 0","May 20, 2018 at 6:00:00 PM GMT+2","[{'refereeId': 377247, 'role': 'referee'}, {'r...",Regular,524


In [205]:
match_features = ['teamsData','seasonId', 'dateutc', 'winner', 'wyId', 'competitionId']

In [206]:
italy_games = pd.DataFrame(matches_dfs['matches_Italy'])[match_features]
italy_games.rename(columns = {'wyId':'matchId'}, inplace = True)
italy_games.head(3)

Unnamed: 0,teamsData,seasonId,dateutc,winner,matchId,competitionId
0,"{'3162': {'scoreET': 0, 'coachId': 251025, 'si...",181248,2018-05-20 18:45:00,3161,2576335,524
1,"{'3158': {'scoreET': 0, 'coachId': 210119, 'si...",181248,2018-05-20 18:45:00,3158,2576336,524
2,"{'3173': {'scoreET': 0, 'coachId': 251044, 'si...",181248,2018-05-20 16:00:00,3173,2576329,524


In [26]:
def create_home_away_cols(matches):
    home = []
    away = []
    for index, row in matches.iterrows():
        team0 = list(matches.loc[index].teamsData.keys())[0]
        team1 = list(matches.loc[index].teamsData.keys())[1]
        if matches.loc[index].teamsData[team0]['side'] == 'home':
            home.append(team0)
            away.append(team1)
        else:
            home.append(team1)
            away.append(team0)
    matches['homeId'] = home
    matches['homeId'] = matches['homeId'].astype(int)
    matches['awayId'] = away
    matches['awayId'] = matches['awayId'].astype(int)
    return matches

In [27]:
def get_home_away_names(matches):
    matches = matches.merge(teams, left_on = 'homeId', right_on = 'wyId')
    matches = matches.rename(columns = {'name' : 'homeTeam'}).drop(columns = 'wyId')
    matches = matches.merge(teams, left_on = 'awayId', right_on = 'wyId')
    matches = matches.rename(columns = {'name' : 'awayTeam'}).drop(columns = 'wyId')
    return matches.sort_values(by = 'dateutc', ascending = False)

In [28]:
def get_goals(matches):
    home_goals = []
    away_goals = []
    for index, row in matches.iterrows():
        team0 = list(matches.loc[index].teamsData.keys())[0]
        team1 = list(matches.loc[index].teamsData.keys())[1]
        goals0 = matches.loc[index].teamsData[team0]['score']
        goals1 = matches.loc[index].teamsData[team1]['score']
        if matches.loc[index].teamsData[team0]['side'] == 'home':
            home_goals.append(goals0)
            away_goals.append(goals1)
        else:
            home_goals.append(goals1)
            away_goals.append(goals0)
    matches['homeScore'] = home_goals
    matches['awayScore'] = away_goals
    return matches
        
        

In [29]:
def get_avg_playerank(matches):
    home_avg_ranks = []
    away_avg_ranks = []
    def calc_avg_scores(players, index):
        scores = []
        for player in players:           
            condition_player = other_dfs['playerank']['playerId'] == player
            condition_match = other_dfs['playerank']['matchId'] == matches.matchId.loc[index]
            try:
                player_score = other_dfs['playerank'][condition_player & condition_match]['playerankScore'].values[0]
                scores.append(player_score)
            except:
                None
        return np.mean(scores)

    for index, row in matches.iterrows():
        for team in list(matches.teamsData.loc[index].keys()):
            players = []
            for dic in matches.teamsData.loc[index][team]['formation']['lineup']:
                    players.append(dic['playerId'])
            if matches.loc[index].teamsData[team]['side'] == 'home':
                home_avg_ranks.append(calc_avg_scores(players, index))
            else:
                away_avg_ranks.append(calc_avg_scores(players, index))
    matches['homeTeam_matchRank'] = home_avg_ranks
    matches['awayTeam_matchRank'] = away_avg_ranks
    return matches
    
        

In [36]:
def accuracy_features(matches):
    matches['homePassAccuracy'] = matches['accurateHomePasses']/matches['totalHomePasses']
    matches['homeShotAccuracy'] = matches['accurateHomeShots']/matches['totalHomeShots']

    matches['awayPassAccuracy'] = matches['accurateAwayPasses']/matches['totalAwayPasses']
    matches['awayShotAccuracy'] = matches['accurateAwayShots']/matches['totalAwayShots']

    return matches

In [43]:
def matches_target(matches):
    def condition(row):
        if row['winner'] == row['homeId']:
            return 1
        elif row['winner'] == row['awayId']:
            return -1
        elif row['winner'] == 0:
            return 0
        else:
            return None
    matches['homeWins'] = matches.apply(condition, axis = 1)
    return matches

In [265]:
def get_wr(matches):
    matches = matches.sort_values(by = 'dateutc', ascending = False).reset_index()
    home_wins = matches['homeWins'] == 1
    home_loses = matches['homeWins'] == -1
    
    home_wr =[]
    away_wr =[]
    
    #matches_last_10_games_of_teams
    for index,row in matches.iterrows():
        home_id = matches['homeId'].loc[index]
        away_id = matches['awayId'].loc[index]
        date = matches['dateutc'].loc[index]
        
        condition_home_home = matches['homeId']==home_id
        condition_home_away = matches['awayId']==home_id
        condition_away_home = matches['homeId']==away_id
        condition_away_away = matches['awayId']==away_id
        condition_date = matches['dateutc']<date
        #home_10_games_wr
        if (matches[(condition_home_home | condition_home_away) & condition_date].head(10).shape[0] < 11) & (matches[(condition_home_home | condition_home_away) & condition_date].head(10).shape[0] > 1):
            try:
                home_wins = matches[(condition_home_home | condition_home_away)  & condition_date].groupby('winner').count().loc[home_id,'homeWins']#home last 10 games wins
                home_wr.append(home_wins/10)
            except:
                home_wr.append(0)
        else:
            home_wr.append(None)

        #away_10_gams_wr
        if (matches[(condition_away_home | condition_away_away)  & condition_date].head(10).shape[0] < 11) & (matches[(condition_away_home | condition_away_away)  & condition_date].head(10).shape[0] > 1):
            try:
                away_wins = matches[(condition_away_home | condition_away_away) & condition_date].head(10).groupby('winner').count().loc[away_id,'homeWins'] #away last 10 games wins
                away_wr.append(away_wins/10)
            except:
                away_wr.append(0)
        else:
            away_wr.append(None)
    matches['homeWRlast10Games'] = home_wr
    matches['awayWRlast10Games'] = away_wr

    return matches


In [739]:
def get_performance(matches):
    def get_curr_performance(matches):
        pass_weight = 0.2
        shot_weight = 0.5
        win_weight = 0.7
    
        matches['homePerformance'] = (matches['accurateHomePasses']*pass_weight) + \
                                    (matches['accurateHomeShots']*shot_weight) + \
                                    (matches['homeWins']*win_weight)
        matches['awayPerformance'] = (matches['accurateAwayPasses']*pass_weight) + \
                                    (matches['accurateAwayShots']*shot_weight) + \
                                    (matches['homeWins']*(-1)*win_weight)
        return matches

    matches = get_curr_performance(matches)
    matches = matches.sort_values(by = 'dateutc', ascending = False).reset_index()
    
    home_wins = matches['homeWins'] == 1
    home_loses = matches['homeWins'] == -1
    
    home_last_10_performance =[]
    away_last_10_performance =[]
    home_performance = []
    home_pass_accu = []
    home_shot_accu = []
    
    away_performance = []
    away_pass_accu = []
    away_shot_accu = []

    #matches_last_10_games_of_teams
    for index,row in matches.iterrows():
        home_id = matches['homeId'].loc[index]
        away_id = matches['awayId'].loc[index]
        date = matches['dateutc'].loc[index]
        
        condition_home_home = matches['homeId']==home_id
        condition_home_away = matches['awayId']==home_id
        condition_away_home = matches['homeId']==away_id
        condition_away_away = matches['awayId']==away_id
        condition_date = matches['dateutc']<date

        if (matches[(condition_home_home | condition_home_away) & condition_date].head(10).shape[0] < 11) & (matches[(condition_home_home | condition_home_away) & condition_date].head(10).shape[0] > 1):
            try:
                performance_home_as_home = matches[(condition_home_home | condition_home_away) & condition_date][['homeId','homePerformance']].head(10).groupby('homeId').mean().loc[home_id,'homePerformance']
                pass_accu_home_as_home = matches[(condition_home_home | condition_home_away) & condition_date][['homeId','homePassAccuracy']].head(10).groupby('homeId').mean().loc[home_id,'homePassAccuracy']
                shot_accu_home_as_home = matches[(condition_home_home | condition_home_away) & condition_date][['homeId','homeShotAccuracy']].head(10).groupby('homeId').mean().loc[home_id,'homeShotAccuracy']
            except:
                performance_home_as_home = None
                pass_accu_home_as_home = None
                shot_accu_home_as_home = None
            try:
                performance_home_as_away = matches[(condition_home_home | condition_home_away) & condition_date][['awayId','homePerformance']].head(10).groupby('awayId').mean().loc[home_id,'homePerformance']
                pass_accu_home_as_away = matches[(condition_home_home | condition_home_away) & condition_date][['awayId','homePassAccuracy']].head(10).groupby('awayId').mean().loc[home_id,'homePassAccuracy']
                shot_accu_home_as_away = matches[(condition_home_home | condition_home_away) & condition_date][['awayId','homeShotAccuracy']].head(10).groupby('awayId').mean().loc[home_id,'homeShotAccuracy']
            
            except:
                performance_home_as_away = None
                pass_accu_home_as_away = None
                shot_accu_home_as_away = None
            try:
                avg_performance_home = (performance_home_as_home + performance_home_as_away)/2
                avg_pass_accu_home = (pass_accu_home_as_home + pass_accu_home_as_away)/2
                avg_shot_accu_home = (shot_accu_home_as_home + shot_accu_home_as_away)/2
                
                home_performance.append(avg_performance_home)
                home_pass_accu.append(avg_pass_accu_home)
                home_shot_accu.append(avg_shot_accu_home)
            except:
                home_performance.append(None)
                home_pass_accu.append(None)
                home_shot_accu.append(None)
        else:
            home_performance.append(None)
            home_pass_accu.append(None)
            home_shot_accu.append(None)

        if (matches[(condition_away_home | condition_away_away)  & condition_date].head(10).shape[0] < 11) & (matches[(condition_away_home | condition_away_away)  & condition_date].head(10).shape[0] > 1):
            try:
                performance_away_as_home = matches[(condition_away_home | condition_away_away)  & condition_date][['homeId','homePerformance']].head(10).groupby('homeId').mean().loc[away_id,'homePerformance']
                pass_accu_away_as_home = matches[(condition_away_home | condition_away_away) & condition_date][['homeId','homePassAccuracy']].head(10).groupby('homeId').mean().loc[away_id,'homePassAccuracy']
                shot_accu_away_as_home = matches[(condition_away_home | condition_away_away) & condition_date][['homeId','homeShotAccuracy']].head(10).groupby('homeId').mean().loc[away_id,'homeShotAccuracy']
            except:
                performance_away_as_home = None
                pass_accu_away_as_home = None
                shot_accu_away_as_home = None
            try:
                performance_away_as_away = matches[(condition_away_home | condition_away_away)  & condition_date][['awayId','homePerformance']].head(10).groupby('awayId').mean().loc[away_id,'homePerformance']
                pass_accu_away_as_away = matches[(condition_away_home | condition_away_away) & condition_date][['awayId','homePassAccuracy']].head(10).groupby('awayId').mean().loc[away_id,'homePassAccuracy']
                shot_accu_away_as_away = matches[(condition_away_home | condition_away_away) & condition_date][['awayId','homeShotAccuracy']].head(10).groupby('awayId').mean().loc[away_id,'homeShotAccuracy']
            except:
                performance_away_as_away = None
                pass_accu_away_as_away = None
                shot_accu_away_as_away = None
                
            try:
                avg_performance_away = (performance_away_as_home + performance_away_as_away)/2
                avg_pass_accu_away = (pass_accu_away_as_home + pass_accu_away_as_away)/2
                avg_shot_accu_away = (shot_accu_away_as_home + shot_accu_away_as_away)/2

                
                away_performance.append(avg_performance_away)
                away_pass_accu.append(avg_pass_accu_away)
                away_shot_accu.append(avg_shot_accu_away)
            except:
                away_performance.append(None)
                away_pass_accu.append(None)
                away_shot_accu.append(None)
        else:
            away_performance.append(None)
            away_pass_accu.append(None)
            away_shot_accu.append(None)
    matches['avgHomePerformanceLast10Games'],matches['avgAwayPerformanceLast10Games'] = home_performance, away_performance
    matches['avgHomePassAccuLast10Games'],matches['avgHomeShotAccuLast10Games'] = home_pass_accu, home_shot_accu
    matches['avgAwayPassAccuLast10Games'],matches['avgAwayShotAccuLast10Games'] = away_pass_accu, away_shot_accu
    matches = matches.drop(columns = ['homePerformance', 'awayPerformance'])
    return matches
    

In [740]:
matches.keys()

Index(['level_0', 'seasonId', 'dateutc', 'winner', 'matchId', 'competitionId',
       'homeId', 'awayId', 'homeTeam', 'awayTeam', 'homeScore', 'awayScore',
       'homeTeam_matchRank', 'awayTeam_matchRank', 'totalHomePasses',
       'accurateHomePasses', 'totalHomeShots', 'accurateHomeShots',
       'totalAwayPasses', 'accurateAwayPasses', 'totalAwayShots',
       'accurateAwayShots', 'homePassAccuracy', 'homeShotAccuracy',
       'awayPassAccuracy', 'awayShotAccuracy', 'homeWins',
       'avgHomePerformanceLast10Games', 'avgAwayPerformanceLast10Games',
       'homeWRlast10Games', 'awayWRlast10Games'],
      dtype='object')

In [741]:
matches = create_home_away_cols(italy_games)

In [742]:
matches = get_home_away_names(matches)
matches = get_goals(matches)

In [743]:
matches = get_avg_playerank(matches)

In [744]:
matches = matches_events(matches)

In [745]:
matches = accuracy_features(matches)

In [746]:
matches = matches_target(matches)

In [747]:
matches = get_performance(matches)

In [748]:
matches = get_wr(matches)

In [749]:
matches = matches.drop(columns = ['teamsData', 'index'])

In [750]:
matches_cleaned = matches.dropna()

In [751]:
matches_cleaned.head(5)

Unnamed: 0,level_0,seasonId,dateutc,winner,matchId,competitionId,homeId,awayId,homeTeam,awayTeam,homeScore,awayScore,homeTeam_matchRank,awayTeam_matchRank,totalHomePasses,accurateHomePasses,totalHomeShots,accurateHomeShots,totalAwayPasses,accurateAwayPasses,totalAwayShots,accurateAwayShots,homePassAccuracy,homeShotAccuracy,awayPassAccuracy,awayShotAccuracy,homeWins,avgHomePerformanceLast10Games,avgAwayPerformanceLast10Games,avgHomePassAccuLast10Games,avgHomeShotAccuLast10Games,avgAwayPassAccuLast10Games,avgAwayShotAccuLast10Games,homeWRlast10Games,awayWRlast10Games
0,0,181248,2018-05-20 18:45:00,3161,2576335,524,3162,3161,Lazio,Internazionale,2,3,0.01185,0.00595,302,249,14,4,492,417,14,6,0.824503,0.285714,0.847561,0.428571,-1,64.42,88.95,0.799875,0.460678,0.851845,0.345496,2.1,0.5
1,1,181248,2018-05-20 18:45:00,3158,2576336,524,3315,3158,Sassuolo,Roma,0,1,0.00609,0.0101,337,262,14,3,500,423,15,6,0.777448,0.214286,0.846,0.4,-1,69.2875,66.67,0.776408,0.415245,0.785921,0.346328,1.1,0.6
2,2,181248,2018-05-20 16:00:00,3204,2576337,524,3204,3164,SPAL,Sampdoria,3,1,0.01765,-2e-05,363,315,14,5,446,386,9,3,0.867769,0.357143,0.865471,0.333333,1,58.71,77.14,0.793445,0.32702,0.830629,0.321833,0.7,0.3
3,3,181248,2018-05-20 16:00:00,3163,2576331,524,3163,3166,Udinese,Bologna,1,0,0.0152,0.00765,327,275,17,5,518,446,8,4,0.840979,0.294118,0.861004,0.5,1,75.11,68.59,0.818307,0.319709,0.806424,0.340947,1.1,0.1
4,4,181248,2018-05-20 16:00:00,3173,2576329,524,3173,3172,Cagliari,Atalanta,1,0,0.0058,0.004,232,173,6,4,587,513,11,4,0.74569,0.666667,0.873935,0.363636,1,71.6875,73.620833,0.792047,0.268286,0.822814,0.245637,1.0,0.5


# Model training

### Defining variables

In [905]:
X = matches_cleaned[['avgHomePassAccuLast10Games','avgHomeShotAccuLast10Games',
                     'avgAwayPassAccuLast10Games','avgAwayShotAccuLast10Games',
                     'homeWRlast10Games', 'awayWRlast10Games']]
y = matches_cleaned['homeWins']

In [906]:
y = pd.get_dummies(y,prefix = 'Class', dtype = int)

In [907]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping

In [908]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3)

In [915]:
def initialize_model():
    model = Sequential()
    model.add(layers.Dense(80, input_dim = 6, activation = 'relu'))
    
    model.add(layers.Dense(20, activation = 'relu'))
    
    model.add(layers.Dense(16, activation = 'relu'))
    model.add(layers.Dense(6, activation = 'relu'))
    model.add(layers.Dropout(0.2))
    model.add(layers.Dense(4, activation = 'relu'))
    #model.add(layers.Dropout(0.1))
    model.add(layers.Dense(3, activation = 'softmax'))

    model.compile(
        loss = 'categorical_crossentropy',
        optimizer = 'adam',
        metrics = 'accuracy'
    )

    return model

In [916]:
es = EarlyStopping(patience = 30, restore_best_weights=True)
model = initialize_model()
history = model.fit(
    X_train,
    y_train,
    validation_split = 0.3,
    shuffle = True,
    epochs = 200,
    batch_size =  64,
    verbose = 0
)

In [917]:
model.evaluate(X_test, y_test, verbose = 0)[1]

0.5092592835426331