In [1]:
import json
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder


### Datos de cada BPI

Defino las rutinas para capturar las mÃ©tricas,

In [2]:
def direct_play(x):

    maxi,pos=0,[]

    for i in range(len(x)-1):

        e1,e2 = x[i],x[i+1]


        dt = (e2["eventSec"]-e1["eventSec"])

        if dt>2:

            if e1["eventName"]=="Pass" or e1["eventName"]=="Free Kick":

                dist_vertical = (e2['positions'][0]['x']- e1['positions'][0]['x'])/dt

                if dist_vertical>maxi:
                    maxi=dist_vertical
                


    return maxi


def counterattack(x):

    x0, xf = x[0]["positions"][0]["x"], x[-1]["positions"][0]["x"]

    if x0<40 and xf>60:

        dt = x[-1]["eventSec"]-x[0]["eventSec"]

        if dt>0:

            v = (xf-x0)/dt
            return v
        
    
    return 0


def T_maintenance(x):
    
    dt = x[-1]['eventSec'] - x[0]['eventSec']

    if dt>2:

        if np.mean([ e['positions'][0]['x'] for e in x])<40:
            return dt
        
        else:
            return 0.0
    
    else:
        return 0.0
    

def T_build_up(x):

    dt = x[-1]['eventSec'] - x[0]['eventSec']

    if dt>2:

        if np.mean([ e['positions'][0]['x'] for e in x])>60:
            return dt
        
        else:
            return 0.0
    
    else:
        return 0.0


def T_zona_media(x):

    dt = x[-1]['eventSec'] - x[0]['eventSec']

    if dt>2:

        xm = np.mean([ e['positions'][0]['x'] for e in x])

        if xm<=60 and xm>=40:
            return dt
        else:
            return 0.0
    
    else:
        return 0.0


def flow_rate(x):

    R,T=[],[]

    for i in range(len(x)-1):

        e1,e2 = x[i], x[i+1]
        dt = e2["eventSec"]-e1["eventSec"] 

        if dt>0:
            T.append(dt)
            R.append( e1['positions'][0]['x'] )

    if len(T) and len(R):

        dtmean = np.mean(T)

        if np.mean(R)>50:# and dtmean<5:
        
            return 1.0/dtmean
   

    return 0.0
    

def crossing(x):

    for i in range(len(x)):

        e1= x[i]

        if e1["subEventName"]=="Cross":
            return 1

    return 0


def pressure_point(x, Tant):

    if Tant>2:

        xp = x[0]['positions'][0]['x']

        return xp 
    else:
        return 0.0


def pressure_loss(x,Tant):

    if Tant>2 and x[0]['positions'][0]['x']>40:

        return Tant 
    
    else:
        return 0
    

def shots(x):

    if np.mean([ e['positions'][0]['x'] for e in x])>50:

        for e in x:
            if e["eventName"]=="Shot":
                return 1
    
    return 0


tomo los datos,

In [3]:
Data={
    "direct_play":[],
    "counterattack":[],
    "T_maintenance":[],
    "T_build_up":[],
    "T_zona_media":[],
    "flow_rate":[],
    "crossing":[],
    "pressure_point":[],
    "pressure_loss":[],
    "shots":[],

    "match":[],
    "team1":[],
    "liga":[]
}

ligas = ["England","France","Germany", "Italy", "Spain"]

for liga in ligas:

    print (liga)

    eventos = json.load(open("/home/chacoma/Lineas/Futbol/raw_data/pappalardo/events/events_%s.json"%liga,'r'))
    N=len(eventos)

    count=0
    posesion = []
    Tpos_ant=0
    team_ant =''
    eant={"eventName":""}

    for i in range(N-1):

        e1 = eventos[i]
        e2 = eventos[i+1]

        if e1['matchId']==e2['matchId']:
			
            if e1['matchPeriod']==e2['matchPeriod']:

                if e1['teamId']==e2['teamId']:
                    posesion.append(e1)
                
                else:
                    posesion.append(e1)

                    if len(posesion)>3:

                        team1 = e1['teamId']
                        match= e1["matchId"]

                        d1 = direct_play(posesion)
                        d2 = counterattack(posesion)
                        d3 = T_maintenance(posesion)
                        d4 = T_build_up(posesion)
                        d5 = T_zona_media(posesion)
                        d6 = flow_rate(posesion) 
                        d7 = crossing(posesion)
                        d8 = pressure_point(posesion, Tpos_ant)
                        d9 = pressure_loss(posesion,Tpos_ant)
                        d10 = shots(posesion)
                        

                        if (d1+d2+d3+d4+d5+d6+d7+d8+d9+d10) >0.1:
                            
                            Data["direct_play"].append(d1)
                            Data["counterattack"].append(d2)
                            Data["T_maintenance"].append(d3)
                            Data["T_build_up"].append(d4)
                            Data["T_zona_media"].append(d5)
                            Data["flow_rate"].append(d6)
                            Data["crossing"].append(d7)
                            Data["pressure_point"].append(d8)
                            
                            if team1!=team_ant and team1!='':
                                Data["pressure_loss"].append(d9)
                            else:
                                Data["pressure_loss"].append(0)
                            
                            Data["shots"].append(d10)

                            Data["match"].append(match)
                            Data["team1"].append(team1)
                            Data["liga"].append(liga)    

                    # ----------
                    
                    if len(posesion):
                        if e1['matchId']==e2['matchId']:
                            Tpos_ant= e2["eventSec"] - posesion[0]["eventSec"]
                            team_ant = team1
                        else:
                            Tpos_ant=0
                            team_ant =''
                    else:
                        Tpos_ant=0
                        team_ant =''
                        
                    posesion = []

            else:
                posesion = []
        
        else:
                posesion = []


df0 = pd.DataFrame.from_dict(Data)

df0["liga"]= df0['liga'].astype('category')

print (df0.shape)
df0.head()


England
France
Germany
Italy
Spain
(215681, 13)


Unnamed: 0,direct_play,counterattack,T_maintenance,T_build_up,T_zona_media,flow_rate,crossing,pressure_point,pressure_loss,shots,match,team1,liga
0,13.798826,0.0,0.0,0.0,9.790285,0.0,0,0.0,0.0,0,2499719,1609,England
1,0.0,0.0,0.0,0.0,5.661206,0.529922,0,0.0,0.0,0,2499719,1609,England
2,0.0,0.0,6.901369,0.0,0.0,0.0,0,24.0,0.0,0,2499719,1631,England
3,3.621163,0.0,0.0,7.822331,0.0,0.511357,1,0.0,0.0,1,2499719,1609,England
4,0.0,0.0,0.0,11.391057,0.0,0.263364,1,100.0,2.566969,0,2499719,1631,England


### Data agregada por partido y por equipo

In [4]:
def f(x):
    
    if x.name=="liga":
        return list(set(x))[0]

    else:
        return np.sum(x)


df2 = df0.groupby(["match","team1" ]).agg( lambda x: f(x)).reset_index()#.drop("match", axis=1)

print(df2.shape)

df2.head()



(3652, 13)


Unnamed: 0,match,team1,direct_play,counterattack,T_maintenance,T_build_up,T_zona_media,flow_rate,crossing,pressure_point,pressure_loss,shots,liga
0,2499719,1609,356.903763,38.475388,253.640098,448.696552,716.029417,20.86818,8,2086.0,50.528341,9,England
1,2499719,1631,151.371756,14.92058,116.388608,432.76302,211.277036,7.828017,3,1135.0,31.902325,0,England
2,2499720,1625,395.131011,37.841597,190.94816,650.679142,1158.997713,18.684612,11,3026.0,198.09106,4,England
3,2499720,1651,126.913376,25.463668,105.211156,109.993449,143.042525,4.997537,2,825.0,30.294175,1,England
4,2499721,1610,361.440581,35.106563,499.289176,314.879369,642.552792,11.853229,8,1412.0,99.727621,3,England


In [5]:
metricas=["direct_play", "counterattack","T_build_up","T_zona_media","flow_rate","crossing","pressure_point","shots"]

ligas = ["England","France","Germany", "Italy", "Spain"]

Data = { liga:{ metrica:{} for metrica in metricas } for liga in ligas }



for index,row in df2.iterrows():

    team = row["team1"]
    liga = row["liga"]

    for met in metricas:

        v = row[met]

        try:
            Data[liga][met][team]+=v
        except:
            Data[liga][met][team]=v





In [6]:
arx = '/home/chacoma/Lineas/Futbol/Ranking/ranking-futbol/data/league2team2rank.json'
l2t2r = json.load( open(arx,'r') )

In [7]:
for liga in ligas:

    for met in metricas:

        x = {'team':[],"value":[]}

        for team in Data[liga][met]:

            x['team'].append( l2t2r[liga][str(team)] )
            x['value'].append( Data[liga][met][team] )

        df = pd.DataFrame( data=x).sort_values(by='team', ascending=True).reset_index(drop=True)

        arx = f'data/{liga}_{met}.csv'
        df.to_csv(arx, index=False)


