In [None]:
import os
import pickle
import random
import warnings
from pathlib import Path

import matplotsoccer
import pandas as pd
import tqdm

import socceraction.spadl as spadl
import socceraction.atomic.spadl as atomicspadl
import socceraction.atomic.vaep.features as fs
import socceraction.atomic.vaep.labels as lab
import socceraction.atomic.vaep.formula as vaepformula
from socceraction.spadl.wyscout import convert_to_actions
from socceraction.data.wyscout import PublicWyscoutLoader

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression as logistic
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.metrics import brier_score_loss, roc_auc_score, log_loss
from catboost import CatBoostClassifier
from xgboost import XGBClassifier as XGBc


# Preprocessing

In [None]:
# Root folder of the raw public Wyscout dump. The WYSCOUT_ROOT environment
# variable overrides the author's local default, so the notebook can run on
# another machine without editing this cell.
wyscout_root = os.environ.get("WYSCOUT_ROOT", '/Users/leonardoyang/Desktop/Tesi/Wyscout/raw/data')
# download=False: presumably reuses the files already under the root folder
# instead of fetching them again — confirm against the loader documentation.
wyscout = PublicWyscoutLoader(root = wyscout_root, download=False)

In [None]:
# Table of the competitions exposed by the Wyscout loader, followed by a
# summary of its columns.
competitions = wyscout.competitions()
competitions.info()

In [None]:
# Competition names used to build the `matches_<name>.json` file names
comp = ['European_Championship', 'World_Cup']
# Metadata rows of the competitions whose country is 'International'
selected_competitions = competitions.loc[competitions['country_name'] == 'International']
selected_competitions

### Retrieve the scoreline of a game

In [None]:
# Read the raw match list of every selected competition.
# NOTE(review): this folder is spelled 'Data' while the loader root uses 'data';
# that only resolves on a case-insensitive file system — confirm the real name.
dfs_matches = [
    pd.read_json('/Users/leonardoyang/Desktop/Tesi/Wyscout/raw/Data/' + f'matches_{competition_name}.json')
    for competition_name in comp
]

# Stack all competitions into one frame with a fresh 0..n-1 index
df_matches = pd.concat(dfs_matches, ignore_index=True)
df_matches.head()

In [None]:
#Extrapolate the scoreline of the game
# Each `teamsData` entry is a dict keyed by the two team ids; every value holds
# that team's 'side' together with its 'score' and 'scoreET' counters.
home_score, away_score = [], []
for teams_data, duration in zip(df_matches['teamsData'], df_matches['duration']):
    home_id, away_id = teams_data.keys()
    # The key order says nothing about home/away: swap when the first key is the away side.
    if teams_data[home_id]['side'] == 'away':
        home_id, away_id = away_id, home_id
    home, away = teams_data[home_id], teams_data[away_id]
    if duration != 'Regular':
        # presumably 'scoreET' already holds the full score after extra time — TODO confirm
        home_score.append(home['scoreET'])
        away_score.append(away['scoreET'])
    else:
        home_score.append(home['score'] + home['scoreET'])
        away_score.append(away['score'] + away['scoreET'])

# Build the slim table in one chain: no column assignment or in-place rename on
# a slice of the original frame, hence no SettingWithCopyWarning.
df_matches = (
    df_matches
    .assign(home_score=home_score, away_score=away_score)
    [['seasonId', 'wyId', 'label', 'competitionId', 'home_score', 'away_score']]
    .rename(columns={'seasonId': 'season_id', 'wyId': 'game_id', 'competitionId': 'competition_id'})
)

df_matches.head()

In [None]:
# Games of every selected competition/season, as returned by the loader
games = pd.concat(
    [
        wyscout.games(comp_row.competition_id, comp_row.season_id)
        for comp_row in selected_competitions.itertuples()
    ]
)

# Attach the scoreline computed from the raw match files; the textual match
# label is not needed afterwards.
match_goals = (
    games
    .merge(df_matches, on=['game_id', 'competition_id', 'season_id'])
    .drop(columns=['label'])
)
games = match_goals

games[["home_team_id", "away_team_id", "game_date","home_score", "away_score"]]

In [None]:
# Print a summary of the merged games table (games plus scoreline columns).
games.info()

# SPADL

In [None]:
#Store everything in the HDF
# Folder holding the HDF5 files read and written by this notebook. The
# VAEP_DATA_DIR environment variable overrides the author's local default.
datafolder = os.environ.get("VAEP_DATA_DIR", "/Users/leonardoyang/Desktop/data-fifa/training")

In [None]:
# Path of the HDF5 store with the atomic-SPADL data; the cells below read the
# keys 'games', 'teams', 'players', 'competitions', 'player_games' and
# 'actions/game_<id>' from it.
spadl_h5 = os.path.join(datafolder, "atomic_spadl-wyscout.h5")

In [None]:
# Distinct team names in the store. The context manager closes the HDF5 file
# afterwards instead of leaving an open handle behind.
with pd.HDFStore(spadl_h5) as spadlstore:
    team_names = spadlstore['teams'].reset_index().drop('index', axis =1)['team_name'].unique()
team_names

In [None]:
#Here an example of SPADL action
with pd.HDFStore(spadl_h5) as spadlstore:
    # Games enriched with competition metadata and home/away team names
    games = (
        spadlstore["games"]
        .merge(spadlstore["competitions"], how='left')
        .merge(spadlstore["teams"].add_prefix('home_'), how='left')
        .merge(spadlstore["teams"].add_prefix('away_'), how='left'))
    # Select Portugal vs France game at Euro 2016
    game = games[(games.home_team_name_short == "Portugal") & (games.away_team_name_short== "France")]
    if game.empty:
        # Fail with a readable message instead of an IndexError on `.values[0]`
        raise ValueError("No Portugal (home) vs France (away) game found in the store")
    game_id = game.game_id.values[0]
    #Get actions for the selected match
    atomic_actions = spadlstore[f"actions/game_{game_id}"]
    # Attach readable action-type, body-part, player and team names
    atomic_actions = (
        atomic_actions
        .merge(atomicspadl.actiontypes_df(), how="left")
        .merge(spadl.bodyparts_df(), how="left")
        .merge(spadlstore["players"], how="left")
        .merge(spadlstore["teams"], how="left")
    )

# use nickname if available else use full name
# (label-based access: integer keys on a labelled Series are deprecated in pandas)
atomic_actions["player_name"] = atomic_actions[["nickname", "player_name"]].apply(
    lambda row: row["nickname"] if row["nickname"] else row["player_name"], axis=1
)
del atomic_actions['nickname']
# A small window of consecutive actions as an example
atomic_actions[2930:2938]

In [None]:
#Plot
def _match_minute(period_id, time_seconds):
    """Return the running match minute for an action.

    Periods 1 and 2 start at minute 0 and 45; every period above 2 adds 15
    minutes on top of 90.
    Assumes `time_seconds` is counted from the start of the period — TODO confirm.
    """
    if period_id > 2:
        return int(90 + (period_id - 3) * 15 + time_seconds // 60)
    return int((period_id - 1) * 45 + time_seconds // 60)


def nice_time(row):
    """Format the time of one action row as '<minute>m<second>s'."""
    # Use the row's own period and clock, so each action in the window gets its
    # own minute instead of the minute of the first action.
    minute = _match_minute(row.period_id, row.time_seconds)
    second = int(row.time_seconds % 60)
    return f"{minute}m{second}s"


g = game.iloc[0]
for shot in list(atomic_actions[(atomic_actions.type_name == "goal")].index):
    # The goal plus up to 7 preceding actions. The start is clamped at 0 because
    # a negative start would be read as an offset from the end of the frame.
    a = atomic_actions[max(shot - 7, 0):shot + 1].copy()

    # Atomic actions store a location (x, y) and a displacement (dx, dy)
    a["start_x"] = a.x
    a["start_y"] = a.y
    a["end_x"] = a.x + a.dx
    a["end_y"] = a.y + a.dy

    # The headline reports the scorer, so take the minute from the goal row
    # (last row of the window) rather than from the first action.
    goal = a.iloc[-1]
    minute = _match_minute(goal["period_id"], goal["time_seconds"])

    game_info = f"{g.game_date}, {g.home_team_name} - {g.away_team_name} : {g.home_score}-{g.away_score} , {goal['player_name']} {minute + 1}'"
    print(game_info)

    a["nice_time"] = a.apply(nice_time,axis=1)
    labels = a[["nice_time", "type_name", "player_name", "team_name"]]

    matplotsoccer.actions(
        location=a[["start_x", "start_y", "end_x", "end_y"]],
        action_type=a.type_name,
        team= a.team_name,
        label=labels,
        labeltitle=["time", "actiontype", "player_name", "team"],
        zoom=False,
        figsize=6
    )

# Compute features and labels

In [None]:
# HDF5 files that receive the per-game feature and label frames
# (one key `game_<id>` per game in each file).
features_h5 = os.path.join(datafolder, "atomic_features.h5")
labels_h5 = os.path.join(datafolder, "atomic_labels.h5")

In [None]:
#Total games
# Read the games table and close the HDF5 file again right away
with pd.HDFStore(spadl_h5) as spadlstore:
    games = spadlstore['games']
print("nb of games:", len(games))

In [None]:
#Get features
# Feature generators applied to every game state
xfns = [
    fs.actiontype,
    fs.actiontype_onehot,
    fs.bodypart,
    fs.bodypart_onehot,
    fs.goalscore,
    fs.location,
    fs.movement_polar,
    fs.polar,
    fs.direction,
    fs.team,
    fs.time,
    fs.time_delta,
]

with pd.HDFStore(spadl_h5) as spadlstore, pd.HDFStore(features_h5) as featurestore:
    progress = tqdm.tqdm(list(games.itertuples()), desc=f"Generating and storing features in {features_h5}")
    for game in progress:
        game_key = f"game_{game.game_id}"
        # Actions of this game, with readable type/body-part names attached
        actions = spadlstore[f"actions/{game_key}"]
        named_actions = atomicspadl.add_names(actions)
        # Game state = current action plus the 2 previous ones (3 in total),
        # passed through play_left_to_right with the home team id
        gamestates = fs.play_left_to_right(fs.gamestates(named_actions, 3), game.home_team_id)

        # One column block per feature generator, stored under the game's key
        X = pd.concat([fn(gamestates) for fn in xfns], axis=1)
        featurestore.put(game_key, X)

In [None]:
#Get labels
# Label generators: one output block per function
yfns = [lab.scores, lab.concedes, lab.goal_from_shot]

with pd.HDFStore(spadl_h5) as spadlstore, pd.HDFStore(labels_h5) as labelstore:
    for game in tqdm.tqdm(list(games.itertuples()), desc=f"Computing and storing labels in {labels_h5}"):
        actions = spadlstore[f"actions/game_{game.game_id}"]
        # Attach the names once per game instead of once per label function
        named_actions = atomicspadl.add_names(actions)
        Y = pd.concat([fn(named_actions) for fn in yfns], axis=1)
        labelstore[f"game_{game.game_id}"] = Y

In [None]:
#Example of final framework
# Show the first rows of the stored features and labels for one game; both
# stores are closed again once the frame has been read.
print('Features')
with pd.HDFStore(features_h5) as featurestore:
    display(featurestore["game_2057954"].head()) #With lags -1,-2.
print('Labels')
with pd.HDFStore(labels_h5) as labelstore:
    display(labelstore["game_2057954"].head())

In [None]:
#Labels
#goal = True if the action is a shot that results in a goal. It does not seem to be used later on.
#scores = True if a goal follows within the next 10 actions
#concedes = True if a goal follows within the next 10 actions
#`scores` refers to the team in possession; if an opponent intervenes during the action, concedes = True.
#goal is True only for shots; scores and concedes are available regardless.

# Model training

In [None]:
# HDF5 file that receives the predicted probabilities, one key `game_<id>` per test game.
predictions_h5 = os.path.join(datafolder, "atomic-predictions-one-action.h5")

In [None]:
# Split at game level (70% train / 30% test, shuffled with a fixed seed), so
# all actions of one game end up on the same side of the split.
traingames, testgames = train_test_split(games, test_size=0.3, random_state=42, shuffle=True)
traingames.head()

In [None]:
# 1. Select feature set X
# Feature generators whose columns are fed to the models.
xfns = [
    fs.actiontype_onehot,
    fs.bodypart_onehot,
    fs.goalscore,
    fs.location,
    fs.polar,
    fs.movement_polar,
    fs.direction,
    fs.team,
    fs.time,
    fs.time_delta
]
# Number of actions per game state; the stored features were generated with
# fs.gamestates(..., 3), so this value has to stay in sync with that call.
nb_prev_actions = 3
# Column names produced by the selected generators for that many actions
Xcols = fs.feature_column_names(xfns, nb_prev_actions)

#Function to select features and label of games
def getXY(games, Xcols):
    """Load the stored features and labels of the given games.

    Parameters
    ----------
    games : DataFrame with a `game_id` column; one stored frame is read per id.
    Xcols : list of feature column names to keep.

    Returns
    -------
    (X, Y) : the per-game frames concatenated in the order of `games.game_id`,
        each with a fresh 0..n-1 index. Y holds the columns "scores" and "concedes".

    Raises
    ------
    KeyError if a game has no stored features/labels; ValueError (from
    pd.concat) if `games` is empty.
    """
    # 1. Select features X. Each store is opened once and closed on exit,
    # instead of opening a new, never-closed handle for every game.
    X = []
    with pd.HDFStore(features_h5) as featurestore:
        for game_id in tqdm.tqdm(games.game_id, desc="Selecting features"):
            X.append(featurestore[f"game_{game_id}"][Xcols])
    X = pd.concat(X).reset_index(drop=True)

    # 2. Select label Y
    Ycols = ["scores", "concedes"]
    Y = []
    with pd.HDFStore(labels_h5) as labelstore:
        for game_id in tqdm.tqdm(games.game_id, desc="Selecting label"):
            Y.append(labelstore[f"game_{game_id}"][Ycols])
    Y = pd.concat(Y).reset_index(drop=True)
    return X, Y

In [None]:
#Function to evaluate predictions
def evaluate(models, X=None, Y=None):
    """Print Brier score, log loss and ROC AUC for every label column.

    Parameters
    ----------
    models : dict mapping a label column name to a fitted classifier exposing
        `predict_proba`.
    X, Y : optional features and labels to evaluate on. When omitted, the
        notebook-level `testX` / `testY` are used, as before.

    For Brier score and log loss the value in parentheses is the ratio to a
    constant baseline that always predicts the positive rate of `Y[col]`.
    """
    if X is None:
        X = testX
    if Y is None:
        Y = testY
    for col in Y.columns:
        # Column 1 of predict_proba is the probability of the True class
        y_hat = models[col].predict_proba(X)[:, 1]
        print(f"### Y: {col} ###")
        # Baseline: the observed positive rate, predicted for every row
        p = sum(Y[col]) / len(Y[col])
        base = [p] * len(Y[col])
        brier = brier_score_loss(Y[col], y_hat)
        print("  Brier score: %.5f (%.5f)" % (brier, brier / brier_score_loss(Y[col], base)))
        ll = log_loss(Y[col], y_hat)
        print("  log loss score: %.5f (%.5f)" % (ll, ll / log_loss(Y[col], base)))
        print("  ROC AUC: %.5f" % roc_auc_score(Y[col], y_hat))

In [None]:
#Retrieve features and labels for both train and test games.
print('Train games')
X,Y = getXY(traingames, Xcols)
X = X.fillna(0)
#test
print('Test games')
testX, testY = getXY(testgames,Xcols)
# Apply the same missing-value handling as for the training features, so the
# models are not handed NaN values at prediction time.
testX = testX.fillna(0)

In [None]:
#'goalscore_team','goalscore_opponent' = goals scored so far, seen from the team that performs the latest action.

In [None]:
%%time
#Logistic regression
log = {}
for col in list(Y.columns):
    model = logistic(random_state = 42)
    model.fit(X, Y[col])
    log[col] = model

print('Logistic Regression:')
evaluate(log)

In [None]:
%%time
# 3. train classifiers F(X) = Y

Y_hat = pd.DataFrame()
boosting = {}
for col in list(Y.columns):
    model = XGBc(n_estimators=100, max_depth=5,random_state = 42)
    model.fit(X, Y[col])
    boosting[col] = model

print('XGBoosting:')
evaluate(boosting)

In [None]:
%%time
# 3. train classifiers F(X) = Y

Y_hat = pd.DataFrame()
forest = {}
for col in list(Y.columns):
    model = rfc(n_estimators=100, random_state = 42)
    model.fit(X, Y[col])
    forest[col] = model

print('Random forest:')
evaluate(forest)

In [None]:
%%time
#4. Catboost
cat = {}
for col in list(Y.columns):
    model = CatBoostClassifier(n_estimators=100, max_depth=5, random_state = 42, verbose = 0)
    model.fit(X, Y[col])
    cat[col] = model

print('CatBoost:')
evaluate(cat)

In [None]:
# The aim is not to predict when a goal will occur. The model predicts the probabilities before the goal --> how dangerous an action is.

In [None]:
#Predict probabilities using Catboost and save predictions
# Column 1 of predict_proba is the probability of the True class
Y_hat = pd.DataFrame({col: cat[col].predict_proba(testX)[:, 1] for col in testY.columns})

# Get rows with game id per action. The games are read in the same order
# (testgames.game_id) as the one used to build testX, so rows line up by position.
A = []
with pd.HDFStore(spadl_h5) as spadlstore:
    for game_id in tqdm.tqdm(testgames.game_id, desc="Loading actions of each game"):
        A.append(spadlstore[f"actions/game_{game_id}"][["game_id"]])
A = pd.concat(A).reset_index(drop=True)

# The concat below pairs rows purely by position; a length mismatch would
# silently attach predictions to the wrong actions.
assert len(A) == len(Y_hat), "number of actions and number of predictions differ"

# Concatenate action game id rows with predictions and save per game
grouped_predictions = pd.concat([A, Y_hat], axis=1).groupby("game_id")
for k,df in tqdm.tqdm(grouped_predictions, desc="Saving predictions per game"):
    df = df.reset_index(drop=True)
    # `key` is passed by keyword: positional use is deprecated in recent pandas
    df[Y_hat.columns].to_hdf(predictions_h5, key=f"game_{int(k)}")

# VAEP

In [None]:
# Reload the lookup tables needed to attach names to the valued actions
with pd.HDFStore(spadl_h5) as spadlstore:
    players = spadlstore["players"]
    teams = spadlstore["teams"]
    # Games enriched with competition metadata and home/away team names
    games = spadlstore["games"].merge(spadlstore["competitions"], how='left')
    games = games.merge(teams.add_prefix('home_'), how='left')
    games = games.merge(teams.add_prefix('away_'), how='left')
print("nb of games:", len(testgames))

In [None]:
#For each game in the test set, append the predictions and compute vaep
A = []
# Both stores are opened once and closed on exit, instead of leaving two open
# handles behind for every game.
with pd.HDFStore(spadl_h5) as spadlstore, pd.HDFStore(predictions_h5) as predictionstore:
    for game in tqdm.tqdm(list(testgames.itertuples()), desc="Loading actions"):
        actions = spadlstore[f"actions/game_{game.game_id}"]
        actions = (
            atomicspadl.add_names(actions)
            .merge(players, how="left")
            .merge(teams, how="left",)
            .sort_values(["game_id", "period_id", "action_id"])
            .reset_index(drop=True)
        )
        preds = predictionstore[f"game_{game.game_id}"]
        # Value each action from its predicted scoring / conceding probabilities
        values = vaepformula.value(actions, preds.scores, preds.concedes)
        # Row-wise pairing by position: actions, predictions and values share a 0..n-1 index
        A.append(pd.concat([actions, preds, values], axis=1))
A = pd.concat(A).sort_values(["game_id", "period_id", "time_seconds"]).reset_index(drop=True)
A.columns

In [None]:
#Retrieve roles for each player
# Built in one chain, so no column is assigned to (or dropped in place from) a
# slice of the raw frame, which would raise SettingWithCopyWarning.
df_players = (
    pd.read_json('/Users/leonardoyang/Desktop/Tesi/Wyscout/raw/players.json')
    [['wyId','shortName','role','currentTeamId']]
    # `role` is a dict per player; keep only its 'name' entry as a flat column
    .assign(Role=lambda players_raw: players_raw['role'].map(lambda role: role['name']))
    .drop(columns='role')
)

In [None]:
# Helper column so that the group-wise sum also yields the number of actions
A["count"] = 1

# Compute each player's number of actions and total VAEP values
playersR = (
    A[["player_id","team_name", "vaep_value", "offensive_value", "defensive_value", "count"]]
    .groupby(["team_name","player_id"])
    .sum()
    .reset_index()
)
# Add player names
playersR = playersR.merge(players[["player_id", "nickname", "player_name"]], how="left")
# Add the role from the raw Wyscout player file (matched on the Wyscout id)
playersR = pd.merge(playersR, df_players, left_on ='player_id', right_on = 'wyId', how = 'left').drop(['wyId','shortName'], axis = 1)
# Use the nickname if available, else the full name
# (label-based access: integer keys on a labelled Series are deprecated in pandas)
playersR["player_name"] = playersR[["nickname", "player_name"]].apply(
    lambda row: row["nickname"] if row["nickname"] else row["player_name"], axis=1
)
# Show results
playersR = playersR[["player_id", "player_name","Role", "team_name","vaep_value", "offensive_value", "defensive_value", "count"]]
playersR.sort_values("vaep_value", ascending=False)[:10]

In [None]:
# Normalize for minutes played
with pd.HDFStore(spadl_h5) as spadlstore:
    pg = spadlstore["player_games"]
# The VAEP totals in playersR come from the test games only, so the minutes
# must be restricted to those same games; summing minutes over all games would
# deflate every per-90 rating.
pg = pg[pg.game_id.isin(testgames.game_id)]
mp = pg[["player_id", "minutes_played"]].groupby("player_id").sum().reset_index()

stats = playersR.merge(mp)
stats = stats[stats.minutes_played > 270] # at least 3 full games played
# Values per 90 minutes played
stats["vaep_rating"] = stats.vaep_value * 90 / stats.minutes_played
stats["offensive_rating"] = stats.offensive_value * 90 / stats.minutes_played
stats["defensive_rating"] = stats.defensive_value * 90 / stats.minutes_played
stats.sort_values("vaep_rating",ascending=False)[:10]

# Train model on whole dataset

In [None]:
# Features and labels of all games (this overwrites the training-split X and Y).
# NOTE(review): unlike the training split, no fillna(0) is applied here —
# confirm that the final model is meant to be fitted on features with NaN.
X,Y = getXY(games, Xcols)

In [None]:
%%time
# 3. train classifiers F(X) = Y
Y_hat = pd.DataFrame()
models = {}
for col in list(Y.columns):
    model = CatBoostClassifier(n_estimators=100, max_depth=5, random_state = 42, verbose = False)
    model.fit(X, Y[col])
    models[col] = model

In [None]:
# Persist the dict of fitted models. The context manager flushes and closes
# the file, which a bare open() inside the call never did.
filename = 'trained_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(models, model_file)