In [173]:
# imports and options
import pandas as pd
import numpy as np
# Show every column when a DataFrame is rendered (None disables the column limit).
pd.options.display.max_columns = None

In [174]:
# read timelines and match ids and tournament ids
timelines_df = pd.read_csv('../data/match_timelines.csv')
team_names_df = pd.read_csv('../data/match_ids.csv')
tournaments_df = pd.read_csv('../data/tournament_ids.csv')

# Keep only the tournaments analysed in this notebook. The explicit .copy()
# makes the result an independent frame, so adding columns to it later does
# not raise pandas' SettingWithCopyWarning.
selected_tournament_ids = [
    57, 58, 59, 60, 61, 62,
    102, 103, 104, 105, 106, 107,
    152, 153, 154, 155, 156, 157,
]
tournaments_df = tournaments_df[
    tournaments_df['tournament_id'].isin(selected_tournament_ids)
].copy()

In [175]:
# Add the age-group/gender label and the tournament year.
# NOTE(review): the labels are matched to tournaments purely by row position,
# so this list must stay in the same order as the rows of tournaments_df --
# confirm against the CSV if the file or the selected ids ever change.
age_labels = ['T12', 'P12', 'T13', 'T14', 'P13', 'P14',
              'P13', 'T13', 'T12', 'P12', 'T14', 'P14',
              'P13', 'T12', 'T13', 'T14', 'P14', 'P12']

# .assign returns a new frame instead of writing into a frame that may be a
# filtered slice (avoids SettingWithCopyWarning). expand=False makes
# str.extract return a Series holding the first 4-digit run found in the name.
tournaments_df = tournaments_df.assign(
    age=age_labels,
    year=tournaments_df['name'].str.extract(r'(\d{4})', expand=False),
)

In [176]:
# Name fragments identifying international teams: a team is flagged as
# international when its name contains any of these substrings.
# NOTE(review): the fragments look deliberately partial (presumably to avoid
# special characters in the full club names) -- confirm.
int_teams = [
    'Nords',
    'jylland',
    'Shamrock',
    'Hamma',
    'ndby',
    'Vitesse',
    'Bromma',
    'Liding',
]

In [177]:
# create boolean variables for international teams
def _is_international(team_name):
    """Return 1 if ``team_name`` contains any fragment from ``int_teams``, else 0.

    Non-string values (e.g. NaN for a missing team name) are treated as not
    international instead of raising ``TypeError`` on the substring check.
    """
    return int(isinstance(team_name, str) and any(sub in team_name for sub in int_teams))


# The same rule applies to both sides of a match, so one helper serves both columns.
for side in ('home_team', 'away_team'):
    team_names_df[f'{side}_int'] = team_names_df[side].apply(_is_international)

In [178]:
# add home and away team names to main dataframe
# validate='many_to_one' makes pandas raise MergeError if the right-hand keys
# are not unique, instead of silently duplicating timeline rows (which would
# inflate every downstream count and statistic).
timelines_df = timelines_df.merge(
    team_names_df,
    on=['tournament_id', 'match_id'],
    how='left',
    validate='many_to_one',
)

# add year and age group to main dataframe
timelines_df = timelines_df.merge(
    tournaments_df.drop(columns='name'),
    on=['tournament_id'],
    how='left',
    validate='many_to_one',
)

# convert times to datetime objects; format='mixed' infers the format of each
# value individually
for time_col in ('start_time', 'end_time'):
    timelines_df[time_col] = pd.to_datetime(timelines_df[time_col], format='mixed')

In [179]:
# Function to calculate Euclidean distance
def euclidean_distance(x1, y1, x2, y2):
    """Return the straight-line distance between (x1, y1) and (x2, y2).

    Operates element-wise, so the arguments may be scalars, NumPy arrays or
    pandas Series (a Series input yields a Series result).

    ``np.hypot`` is used instead of ``sqrt(dx**2 + dy**2)`` because it avoids
    overflow/underflow in the intermediate squares.
    """
    return np.hypot(x2 - x1, y2 - y1)

# Apply distance calculation for "Pass" and "Dribble" events
is_movement = timelines_df['event_type'].isin(['Pass', 'Dribble'])
timelines_df['distance'] = np.where(
    is_movement,
    euclidean_distance(
        timelines_df['start_position_x'], timelines_df['start_position_y'],
        timelines_df['end_position_x'], timelines_df['end_position_y'],
    ),
    np.nan,  # Assign NaN for other events
)

# calculate duration of each event in seconds
timelines_df['duration'] = (
    timelines_df['end_time'] - timelines_df['start_time']
).dt.total_seconds()

# calculate speed (distance units per second) for possible events.
# Speed is only defined for a positive duration: zero or negative durations
# are masked to NaN so they cannot produce inf or negative speeds that would
# distort the summary statistics.
positive_duration = timelines_df['duration'].where(timelines_df['duration'] > 0)
timelines_df['speed'] = timelines_df['distance'] / positive_duration

# create column of international level (0=f-f, 1=i-f, 2=i-i)
timelines_df['int_level'] = timelines_df['home_team_int'] + timelines_df['away_team_int']

In [181]:
# Summary statistics of "speed" for every (age, int_level) combination
group_keys = ["age", "int_level"]
speed_stats = timelines_df.groupby(group_keys)["speed"].describe()
speed_stats

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
age,int_level,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
P12,0,6536.0,8.386466,3.187714,0.032197,6.553304,8.337476,10.22556,34.983379
P12,1,9894.0,8.33923,3.195743,0.048379,6.469891,8.296135,10.287914,57.672094
P12,2,1701.0,8.610807,3.275831,0.128897,6.705848,8.534817,10.695031,28.339725
P13,0,4857.0,8.826911,3.31949,0.107981,6.81193,8.746309,10.850972,29.916053
P13,1,8869.0,8.849077,3.365715,0.043011,6.755605,8.750902,10.867723,53.133373
P13,2,1759.0,8.804571,3.314532,0.060606,6.802764,8.784886,10.835265,18.838903
P14,0,9507.0,8.972046,3.634402,0.077784,6.878772,8.90594,11.007321,119.385814
P14,1,9032.0,9.153743,3.585598,0.004947,6.956367,9.040405,11.358998,35.054925
P14,2,1502.0,9.230459,3.762354,0.079672,6.898512,9.188117,11.598764,22.221114
T12,0,3939.0,7.654549,2.724556,0.139159,6.151501,7.666611,9.226173,21.992262
