In [None]:
import json
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder


### Datos de cada BPI

Defino las rutinas para capturar las métricas,

In [52]:
def direct_play(x):

    maxi,pos=0,[]

    for i in range(len(x)-1):

        e1,e2 = x[i],x[i+1]


        dt = (e2["eventSec"]-e1["eventSec"])

        if dt>2:

            if e1["eventName"]=="Pass" or e1["eventName"]=="Free Kick":

                dist_vertical = (e2['positions'][0]['x']- e1['positions'][0]['x'])/dt

                if dist_vertical>maxi:
                    maxi=dist_vertical
                


    return maxi


def counterattack(x):

    x0, xf = x[0]["positions"][0]["x"], x[-1]["positions"][0]["x"]

    if x0<40 and xf>60:

        dt = x[-1]["eventSec"]-x[0]["eventSec"]

        if dt>0:

            v = (xf-x0)/dt
            return v
        
    
    return 0


def T_maintenance(x):
    
    dt = x[-1]['eventSec'] - x[0]['eventSec']

    if dt>2:

        if np.mean([ e['positions'][0]['x'] for e in x])<40:
            return dt
        
        else:
            return 0.0
    
    else:
        return 0.0
    

def T_build_up(x):

    dt = x[-1]['eventSec'] - x[0]['eventSec']

    if dt>2:

        if np.mean([ e['positions'][0]['x'] for e in x])>60:
            return dt
        
        else:
            return 0.0
    
    else:
        return 0.0


def T_zona_media(x):

    dt = x[-1]['eventSec'] - x[0]['eventSec']

    if dt>2:

        xm = np.mean([ e['positions'][0]['x'] for e in x])

        if xm<=60 and xm>=40:
            return dt
        else:
            return 0.0
    
    else:
        return 0.0


def flow_rate(x):

    R,T=[],[]

    for i in range(len(x)-1):

        e1,e2 = x[i], x[i+1]
        dt = e2["eventSec"]-e1["eventSec"] 

        if dt>0:
            T.append(dt)
            R.append( e1['positions'][0]['x'] )

    if len(T) and len(R):

        dtmean = np.mean(T)

        if np.mean(R)>50:# and dtmean<5:
        
            return 1.0/dtmean
   

    return 0.0
    

def crossing(x):

    for i in range(len(x)):

        e1= x[i]

        if e1["subEventName"]=="Cross":
            return 1

    return 0


def pressure_point(x, Tant):

    if Tant>2:

        xp = x[0]['positions'][0]['x']

        return xp 
    else:
        return 0.0


def pressure_loss(x,Tant):

    if Tant>2 and x[0]['positions'][0]['x']>40:

        return Tant 
    
    else:
        return 0
    

def shots(x):

    if np.mean([ e['positions'][0]['x'] for e in x])>50:

        for e in x:
            if e["eventName"]=="Shot":
                return 1
    
    return 0


tomo los datos,

In [53]:
Data={
    "direct_play":[],
    "counterattack":[],
    "T_maintenance":[],
    "T_build_up":[],
    "T_zona_media":[],
    "flow_rate":[],
    "crossing":[],
    "pressure_point":[],
    "pressure_loss":[],
    "shots":[],

    "match":[],
    "team1":[],
    "liga":[]
}

ligas = ["England","France","Germany", "Italy", "Spain"]

for liga in ligas:

    print (liga)

    eventos = json.load(open("/home/chacoma/Lineas/Futbol/raw_data/pappalardo/events/events_%s.json"%liga,'r'))
    N=len(eventos)

    count=0
    posesion = []
    Tpos_ant=0
    team_ant =''
    eant={"eventName":""}

    for i in range(N-1):

        e1 = eventos[i]
        e2 = eventos[i+1]

        if e1['matchId']==e2['matchId']:
			
            if e1['matchPeriod']==e2['matchPeriod']:

                if e1['teamId']==e2['teamId']:
                    posesion.append(e1)
                
                else:
                    posesion.append(e1)

                    if len(posesion)>3:

                        team1 = e1['teamId']
                        match= e1["matchId"]

                        d1 = direct_play(posesion)
                        d2 = counterattack(posesion)
                        d3 = T_maintenance(posesion)
                        d4 = T_build_up(posesion)
                        d5 = T_zona_media(posesion)
                        d6 = flow_rate(posesion) 
                        d7 = crossing(posesion)
                        d8 = pressure_point(posesion, Tpos_ant)
                        d9 = pressure_loss(posesion,Tpos_ant)
                        d10 = shots(posesion)
                        

                        if (d1+d2+d3+d4+d5+d6+d7+d8+d9+d10) >0.1:
                            
                            Data["direct_play"].append(d1)
                            Data["counterattack"].append(d2)
                            Data["T_maintenance"].append(d3)
                            Data["T_build_up"].append(d4)
                            Data["T_zona_media"].append(d5)
                            Data["flow_rate"].append(d6)
                            Data["crossing"].append(d7)
                            Data["pressure_point"].append(d8)
                            
                            if team1!=team_ant and team1!='':
                                Data["pressure_loss"].append(d9)
                            else:
                                Data["pressure_loss"].append(0)
                            
                            Data["shots"].append(d10)

                            Data["match"].append(match)
                            Data["team1"].append(team1)
                            Data["liga"].append(liga)    

                    # ----------
                    
                    if len(posesion):
                        if e1['matchId']==e2['matchId']:
                            Tpos_ant= e2["eventSec"] - posesion[0]["eventSec"]
                            team_ant = team1
                        else:
                            Tpos_ant=0
                            team_ant =''
                    else:
                        Tpos_ant=0
                        team_ant =''
                        
                    posesion = []

            else:
                posesion = []
        
        else:
                posesion = []


df0 = pd.DataFrame.from_dict(Data)

df0["liga"]= df0['liga'].astype('category')

df0.head()


England
France
Germany
Italy
Spain


Unnamed: 0,direct_play,counterattack,T_maintenance,T_build_up,T_zona_media,flow_rate,crossing,pressure_point,pressure_loss,shots,match,team1,liga
0,13.798826,0.0,0.0,0.0,9.790285,0.0,0,0.0,0.0,0,2499719,1609,England
1,0.0,0.0,0.0,0.0,5.661206,0.529922,0,0.0,0.0,0,2499719,1609,England
2,0.0,0.0,6.901369,0.0,0.0,0.0,0,24.0,0.0,0,2499719,1631,England
3,3.621163,0.0,0.0,7.822331,0.0,0.511357,1,0.0,0.0,1,2499719,1609,England
4,0.0,0.0,0.0,11.391057,0.0,0.263364,1,100.0,2.566969,0,2499719,1631,England


Como las columnas siguen aproximadamente una distribución lognormal, transformo a log(1+x) para trabajar con distribuciones normales

In [54]:
exclude_columns = ['liga', 'match', 'team1', "crossing", "shots"]

cols = df0.columns.difference(exclude_columns)

df1 = df0.copy(deep=True)

df1[cols] = np.log1p(df1[cols])                 # transormo features a log(1+x)

print (df1.shape)
df1.head()

(215681, 13)


Unnamed: 0,direct_play,counterattack,T_maintenance,T_build_up,T_zona_media,flow_rate,crossing,pressure_point,pressure_loss,shots,match,team1,liga
0,2.694548,0.0,0.0,0.0,2.378646,0.0,0,0.0,0.0,0,2499719,1609,England
1,0.0,0.0,0.0,0.0,1.896301,0.425217,0,0.0,0.0,0,2499719,1609,England
2,0.0,0.0,2.067036,0.0,0.0,0.0,0,3.218876,0.0,0,2499719,1631,England
3,1.530646,0.0,0.0,2.177286,0.0,0.413008,1,0.0,0.0,1,2499719,1609,England
4,0.0,0.0,0.0,2.516975,0.0,0.233778,1,4.615121,1.271716,0,2499719,1631,England


### Data agregada por partido y por equipo

In [55]:
def f(x):
    
    if x.name=="liga":
        return list(set(x))[0]

    
    elif x.name=="crossing" or x.name=="shots":  
        return np.sum(x)

    else:
        return np.sum(x)


df2 = df1.groupby(["match","team1" ]).agg( lambda x: f(x)).reset_index()#.drop("match", axis=1)


# una vex contados todos los centros y tiros del partido transformo a log
df2["crossing"]= np.log1p(df2["crossing"])
df2["shots"]= np.log1p(df2["shots"])



# Escaleo de las variables ------------
exclude_columns = ['liga', 'team1', 'match', "rank"]                                             # Columnas a excluir del escalado

columns_to_scale = df2.columns.difference(exclude_columns)

scaler = StandardScaler()                                                               # Crear un StandardScaler

df2[columns_to_scale] = scaler.fit_transform(df2[columns_to_scale])                     # Seleccionar solo las columnas que queremos escalar

print (df2.shape)

df2.tail()

(3652, 13)


Unnamed: 0,match,team1,direct_play,counterattack,T_maintenance,T_build_up,T_zona_media,flow_rate,crossing,pressure_point,pressure_loss,shots,liga
3647,2576336,3315,-0.125349,0.528804,-0.151589,0.667163,-0.846297,0.034914,0.264017,-0.223444,1.001169,1.109736,Italy
3648,2576337,3164,0.422704,1.373767,-0.1169,-0.085524,1.728833,0.350729,0.023985,0.780674,-0.582299,-2.056307,Italy
3649,2576337,3204,-1.16514,-1.009447,-1.215307,-0.366693,-0.615251,-0.352902,-0.98196,-1.18284,0.110486,0.393279,Italy
3650,2576338,3185,-0.792808,-0.891117,1.345147,-1.299499,-1.108084,-1.269483,-1.499088,-0.412187,-0.707438,-0.831514,Italy
3651,2576338,3193,1.367505,1.005242,0.923894,2.800833,0.219256,1.8902,0.836458,1.611067,0.530269,0.393279,Italy


In [56]:
df2.to_csv("metrics.csv",index=False)

### Rankings

Uso los puntos que obtuvieron en el campeonato para asignar un ranking

In [57]:
wyid2ptos= { 

    "Spain":{
        674:73,675:76,676:93,677:29,678:43,679:79,680:58,682:61,
        683:20,684:60,687:49,691:49,692:49,695:46,696:47,698:55,
        701:51,712:43,714:22,756:51
    },

    "England":{
        1609:63, 1610:70, 1611:81, 1612:75, 1613:44, 1619:36, 
        1623:49, 1624:77, 1625:100, 1627:31, 1628:44, 1631:47, 
        1633:42, 1639:33, 1644:41, 1646:54, 1651:40, 1659:44, 
        1673:37, 10531:33
    },
    "Italy":{
        3157:64, 3158:77, 3159:95, 3161:72, 3162:72, 3163:40, 3164:54, 
        3165:40, 3166:39, 3172:60, 3173:39, 3176:57, 3185:54, 3187:91, 
        3193:41, 3194:25, 3197:35, 3204:38, 3219:21, 3315:43
    },
    "France":{
        3766:78, 3767:93, 3770:52, 3771:77, 3772:55, 3774:58, 
        3775:54, 3776:38, 3777:26, 3779:38, 3780:37, 3782:55, 
        3783:38, 3785:47, 3787:51, 3789:45, 3795:33, 3799:41, 
        3804:48, 19830:80
    },
    "Germany":{
        2443:42, 2444:84, 2445:51, 2446:54, 2447:54.5, 2449:63, 
        2450:31, 2451:33, 2453:36, 2454:47, 2455:39, 2457:43, 
        2460:36, 2462:49, 2463:22, 2481:41, 2482:55, 2975:53 
    }

}

ligaEquipo2rank={ liga:{} for liga in wyid2ptos.keys() }

for liga in wyid2ptos.keys():

    rank=1

    for team, ptos in sorted(wyid2ptos[liga].items(), key=lambda a:a[1], reverse=True):

        ligaEquipo2rank[liga][team]=rank

        rank+=1


json.dump( ligaEquipo2rank, open("ligue2team2rank.json","w") )

### Metadata de matches

Busco info de los nombre de los equipos 

In [58]:
data = {
    'match':[],
    'home':[],
    'away':[],
    'winner':[],
    'score_home':[],
    'score_away':[],
    'label':[],
    'liga':[],
}

for liga in ligas:

    print (liga)

    arx = f'/home/chacoma/Lineas/Futbol/raw_data/pappalardo/matches/matches_{liga}.json'

    matches = json.load(open(arx,'r'))

    for i in range(len(matches)):

        data['liga'].append(liga)

        match = matches[i]['wyId']
        winner = matches[i]['winner']
        data['match'].append(match)
        data['winner'].append(winner)
        data['label'].append(matches[i]['label'])

        eq1, eq2 = list(matches[i]['teamsData'].keys())
        deq1 = matches[i]['teamsData'][eq1]
        deq2 = matches[i]['teamsData'][eq2]

        score1,side1 = deq1['score'], deq1['side']
        score2,side2 = deq2['score'], deq2['side']

        if side1=='home':
            data['home'].append(eq1)
            data['away'].append(eq2)
            data['score_home'].append(score1)
            data['score_away'].append(score2)
        else:
            data['home'].append(eq2)
            data['away'].append(eq1)
            data['score_home'].append(score2)
            data['score_away'].append(score1)


df_matches = pd.DataFrame(data)
  
df_matches.head()

England
France
Germany
Italy
Spain


Unnamed: 0,match,home,away,winner,score_home,score_away,label,liga
0,2500089,1646,1659,1659,1,2,"Burnley - AFC Bournemouth, 1 - 2",England
1,2500090,1628,1627,1628,2,0,"Crystal Palace - West Bromwich Albion, 2 - 0",England
2,2500091,1673,1609,1609,0,1,"Huddersfield Town - Arsenal, 0 - 1",England
3,2500092,1612,1651,1612,4,0,"Liverpool - Brighton & Hove Albion, 4 - 0",England
4,2500093,1611,1644,1611,1,0,"Manchester United - Watford, 1 - 0",England


In [59]:
df_matches.to_csv("matches.csv",index=False)