In [1]:
import requests
import pandas as pd
import re
from bs4 import BeautifulSoup
import time
import datetime
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler

### Wczytanie danych

In [61]:
matches = pd.read_csv("data/matches_with_rolling_stats_pl.csv")

### Metryka formacji - jedyny sensowny pomysł wiąże się ze sprawdzeniem czy drużyny grają z wahadłowymi czy bez

In [109]:
matches["formation_back_line"] = matches["formation_home"].apply(lambda text: int(text[0] == "4"))
# 1 - grają bez wahadłowych
# 0 - grają z wahadłowymi

In [41]:
# Count occurrences of outcomes for each formation match-up
outcome_counts = matches.groupby(
    ["formation_home", "formation_away", "outcome"]
).size().unstack(fill_value=0)

# Rename columns for clarity
outcome_counts.columns = ['draws', 'wins', 'losses']

# Reset index for easier manipulation
outcome_counts = outcome_counts.reset_index()
# Calculate win rate
outcome_counts['total_games'] = outcome_counts['wins'] + outcome_counts['losses'] + outcome_counts['draws']
outcome_counts['win_rate'] = outcome_counts['wins'] / outcome_counts['total_games']
outcome_counts['loss_rate'] = outcome_counts['losses'] / outcome_counts['total_games']
outcome_counts['draw_rate'] = outcome_counts['draws'] / outcome_counts['total_games']


In [51]:
outcome_counts[outcome_counts["total_games"] > 4].sort_values("loss_rate", ascending=False)

Unnamed: 0,formation_home,formation_away,draws,wins,losses,total_games,win_rate,loss_rate,draw_rate
192,5-3-2,4-2-3-1,0,0,7,7,0.000000,1.000000,0.000000
194,5-3-2,4-3-3,1,0,8,9,0.000000,0.888889,0.111111
184,4-5-1,4-3-3,0,2,9,11,0.181818,0.818182,0.000000
203,5-4-1,4-2-3-1,1,1,6,8,0.125000,0.750000,0.125000
17,3-4-1-2,4-2-3-1,1,1,4,6,0.166667,0.666667,0.166667
...,...,...,...,...,...,...,...,...,...
166,4-4-2,3-5-2,3,7,0,10,0.700000,0.000000,0.300000
150,4-3-3,5-4-1,2,11,0,13,0.846154,0.000000,0.153846
144,4-3-3,4-3-2-1,0,5,0,5,1.000000,0.000000,0.000000
139,4-3-3,4-1-2-1-2,0,5,0,5,1.000000,0.000000,0.000000


### Analiza danych - korelacje

In [11]:
matches.describe()

Unnamed: 0,round,attendance_value,home_minutes,home_goals,home_assists,home_pens_made,home_pens_att,home_shots,home_shots_on_target,home_cards_yellow,...,away_last30_fouled,away_last30_offsides,away_last30_pens_won,away_last30_pens_conceded,away_last30_own_goals,away_last30_ball_recoveries,away_last30_aerials_won,away_last30_aerials_lost,away_last30_aerials_won_pct,away_last30_points
count,2100.0,1689.0,2100.0,2100.0,2100.0,2100.0,2100.0,2100.0,2100.0,2100.0,...,2100.0,2100.0,2100.0,2100.0,2100.0,2100.0,2100.0,2100.0,2100.0,2100.0
mean,21.0,38529.040853,988.268571,1.562381,1.05381,0.131429,0.152857,13.76619,4.549048,1.675238,...,10.28698,1.811502,0.115381,0.143164,0.053718,52.390467,16.100583,16.083826,50.03187,1.38729
std,10.10191,17002.670205,9.65233,1.334275,1.097643,0.359798,0.390412,5.73814,2.532302,1.291765,...,1.628433,0.490567,0.112875,0.115526,0.060351,3.574435,3.837091,3.89282,3.856789,0.573937
min,4.0,2000.0,888.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.333333,0.0,0.0,0.0,0.0,40.333333,6.666667,5.666667,35.133333,0.0
25%,12.0,25541.0,990.0,1.0,0.0,0.0,0.0,10.0,3.0,1.0,...,9.25,1.5,0.033333,0.066667,0.0,49.914773,13.326923,13.4,47.593333,1.0
50%,21.0,32485.0,990.0,1.0,1.0,0.0,0.0,13.0,4.0,2.0,...,10.25,1.8,0.1,0.133333,0.04,52.631481,15.5,15.783333,49.965,1.333333
75%,30.0,53177.0,990.0,2.0,2.0,0.0,0.0,17.0,6.0,2.0,...,11.333333,2.133333,0.166667,0.2,0.083333,54.8,18.346154,18.56558,52.295833,1.75
max,38.0,81332.0,990.0,9.0,6.0,2.0,2.0,36.0,15.0,7.0,...,16.333333,4.333333,1.0,1.5,0.5,64.8,31.714286,28.6,66.66,3.0


In [20]:
correlation_matrix

Unnamed: 0,round,home_minutes,home_goals,home_assists,home_pens_made,home_shots,home_shots_on_target,home_cards_yellow,home_cards_red,home_touches,...,away_last30_dispossessed,away_last30_cards_yellow_red,away_last30_fouls,away_last30_fouled,away_last30_pens_conceded,away_last30_own_goals,away_last30_ball_recoveries,away_last30_aerials_won,away_last30_aerials_lost,away_last30_aerials_won_pct
round,1.000000,0.042948,0.016542,0.011644,0.004457,0.026005,0.027768,0.079298,0.035434,0.017346,...,0.073717,0.031764,0.041523,0.013117,0.081546,0.053684,0.144046,0.063732,0.070358,0.010550
home_minutes,0.042948,1.000000,0.077492,0.074944,0.025533,0.142598,0.107345,0.077444,0.793479,0.145989,...,0.003159,0.009131,0.011592,0.034862,0.002842,0.024540,0.021630,0.013151,0.009246,0.008226
home_goals,0.016542,0.077492,1.000000,0.842668,0.269715,0.326321,0.586933,0.127829,0.075525,0.215000,...,0.008363,0.055140,0.095035,0.045726,0.032899,0.051786,0.015717,0.042265,0.095037,0.095867
home_assists,0.011644,0.074944,0.842668,1.000000,0.007417,0.284970,0.515050,0.122742,0.080856,0.203817,...,0.004178,0.047453,0.063423,0.022811,0.013900,0.053717,0.014888,0.055408,0.096347,0.077119
home_pens_made,0.004457,0.025533,0.269715,0.007417,1.000000,0.033352,0.041551,0.001675,0.028041,0.052503,...,0.032062,0.025801,0.040293,0.022927,0.003390,0.048793,0.016364,0.015137,0.016096,0.056161
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
away_last30_own_goals,0.053684,0.024540,0.051786,0.053717,0.048793,0.064478,0.036903,0.007667,0.021853,0.061447,...,0.027462,0.015030,0.084912,0.100025,0.028700,1.000000,0.098885,0.070028,0.006841,0.106338
away_last30_ball_recoveries,0.144046,0.021630,0.015717,0.014888,0.016364,0.087723,0.054817,0.056782,0.029861,0.090250,...,0.253518,0.053074,0.056676,0.219093,0.147282,0.098885,1.000000,0.113837,0.070911,0.060310
away_last30_aerials_won,0.063732,0.013151,0.042265,0.055408,0.015137,0.107746,0.049731,0.140583,0.020041,0.258234,...,0.047123,0.041200,0.130899,0.140386,0.062386,0.070028,0.113837,1.000000,0.834647,0.210424
away_last30_aerials_lost,0.070358,0.009246,0.095037,0.096347,0.016096,0.147913,0.111128,0.146697,0.021013,0.280585,...,0.050882,0.023409,0.138205,0.079919,0.021851,0.006841,0.070911,0.834647,1.000000,0.332556


### Regresja logistyczna do feature selection

In [25]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [30]:
matches = pd.read_csv("matches_with_rolling_stats_pl.csv")
matches = matches[matches['round'] > 7]
matches.head(3)
rolling_stats = matches[[col for col in matches.columns if 'last' in col or 'outcome' in col]]

In [41]:
X = rolling_stats.drop(columns=["outcome"])  # Features
y = rolling_stats["outcome"]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)
log_reg = LogisticRegression(penalty='l1', solver='liblinear', multi_class='ovr')
log_reg.fit(X_train, y_train)
model = SelectFromModel(log_reg, prefit=True)
selected_features = X.columns[model.get_support()]
print("Selected Features based on Logistic Regression:")
print(selected_features)
X_selected = model.transform(X_scaled)

Selected Features based on Logistic Regression:
Index(['home_last5_wins', 'home_last5_draws', 'home_last5_defeats',
       'home_last5_goals_conceded', 'home_last5_minutes', 'home_last5_goals',
       'home_last5_assists', 'home_last5_pens_made', 'home_last5_pens_att',
       'home_last5_shots',
       ...
       'away_last30_fouls', 'away_last30_fouled', 'away_last30_offsides',
       'away_last30_pens_won', 'away_last30_pens_conceded',
       'away_last30_own_goals', 'away_last30_ball_recoveries',
       'away_last30_aerials_won', 'away_last30_aerials_lost',
       'away_last30_aerials_won_pct'],
      dtype='object', length=339)


In [40]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# Assuming `X_selected` contains only the selected features, and `y` is the target variable
# Re-split the data if necessary to align with selected features
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)

# Initialize and train the logistic regression model on selected features
log_reg_model = LogisticRegression(multi_class='ovr', solver='liblinear')
log_reg_model.fit(X_selected, y)

# Make predictions on the test set
y_pred = log_reg_model.predict(X_selected)

# Evaluate the model
accuracy = accuracy_score(y, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Detailed classification report
print("\nClassification Report:")
print(classification_report(y, y_pred, target_names=["Draw", "Win", "Lose"]))

# Confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y, y_pred))

Accuracy: 0.66

Classification Report:
              precision    recall  f1-score   support

        Draw       0.59      0.36      0.45       404
         Win       0.68      0.79      0.73       841
        Lose       0.66      0.67      0.66       615

    accuracy                           0.66      1860
   macro avg       0.64      0.61      0.61      1860
weighted avg       0.65      0.66      0.65      1860


Confusion Matrix:
[[147 169  88]
 [ 49 667 125]
 [ 52 152 411]]


In [44]:
# Get predicted probabilities for the test set
y_pred_proba = log_reg_model.predict_proba(X_selected)

# Example of printing probabilities for the first few predictions
for i, probs in enumerate(y_pred_proba[:5]):
    print(f"Sample {i + 1} - Draw: {probs[0]:.2f}, Win: {probs[1]:.2f}, Lose: {probs[2]:.2f}")


Sample 1 - Draw: 0.24, Win: 0.28, Lose: 0.48
Sample 2 - Draw: 0.07, Win: 0.66, Lose: 0.27
Sample 3 - Draw: 0.03, Win: 0.76, Lose: 0.22
Sample 4 - Draw: 0.03, Win: 0.16, Lose: 0.81
Sample 5 - Draw: 0.12, Win: 0.83, Lose: 0.06


In [46]:
matches.head(2)

Unnamed: 0,season,date,time,round,attendance_value,referee,formation_home,formation_away,home_team,away_team,home_minutes,home_goals,home_assists,home_pens_made,home_pens_att,home_shots,home_shots_on_target,home_cards_yellow,home_cards_red,home_touches,home_tackles,home_interceptions,home_blocks,home_xg,home_npxg,home_xg_assist,home_sca,home_gca,home_passes_completed,home_passes,home_passes_pct,home_progressive_passes,home_carries,home_progressive_carries,home_take_ons,home_take_ons_won,home_passes_total_distance,home_passes_progressive_distance,home_passes_completed_short,home_passes_short,home_passes_pct_short,home_passes_completed_medium,home_passes_medium,home_passes_pct_medium,home_passes_completed_long,home_passes_long,home_passes_pct_long,home_pass_xa,home_assisted_shots,home_passes_into_final_third,home_passes_into_penalty_area,home_crosses_into_penalty_area,home_passes_live,home_passes_dead,home_passes_free_kicks,home_through_balls,home_passes_switches,home_crosses,home_throw_ins,home_corner_kicks,home_corner_kicks_in,home_corner_kicks_out,home_corner_kicks_straight,home_passes_offsides,home_passes_blocked,home_tackles_won,home_tackles_def_3rd,home_tackles_mid_3rd,home_tackles_att_3rd,home_challenge_tackles,home_challenges,home_challenge_tackles_pct,home_challenges_lost,home_blocked_shots,home_blocked_passes,home_tackles_interceptions,home_clearances,home_errors,home_touches_def_pen_area,home_touches_def_3rd,home_touches_mid_3rd,home_touches_att_3rd,home_touches_att_pen_area,home_touches_live_ball,home_take_ons_won_pct,home_take_ons_tackled,home_take_ons_tackled_pct,home_carries_distance,home_carries_progressive_distance,home_carries_into_final_third,home_carries_into_penalty_area,home_miscontrols,home_dispossessed,home_passes_received,home_progressive_passes_received,home_cards_yellow_red,home_fouls,home_fouled,home_offsides,home_pens_won,home_pens_conceded,home_own_goals,home_ball_recoveries,home_aerials_won,home_aerials_lost,home_aerials_won_pct,away_minutes,away_goals,away_assists,away_pens_made,away_pens_att,away_shots,away_shots_on_target,away_cards_yellow,away_cards_red,away_touches,away_tackles,away_interceptions,away_blocks,away_xg,away_npxg,away_xg_assist,away_sca,away_gca,away_passes_completed,away_passes,away_passes_pct,away_progressive_passes,away_carries,away_progressive_carries,away_take_ons,away_take_ons_won,away_passes_total_distance,away_passes_progressive_distance,away_passes_completed_short,away_passes_short,away_passes_pct_short,away_passes_completed_medium,away_passes_medium,away_passes_pct_medium,away_passes_completed_long,away_passes_long,away_passes_pct_long,away_pass_xa,away_assisted_shots,away_passes_into_final_third,away_passes_into_penalty_area,away_crosses_into_penalty_area,away_passes_live,away_passes_dead,away_passes_free_kicks,away_through_balls,away_passes_switches,away_crosses,away_throw_ins,away_corner_kicks,away_corner_kicks_in,away_corner_kicks_out,away_corner_kicks_straight,away_passes_offsides,away_passes_blocked,away_tackles_won,away_tackles_def_3rd,away_tackles_mid_3rd,away_tackles_att_3rd,away_challenge_tackles,away_challenges,away_challenge_tackles_pct,away_challenges_lost,away_blocked_shots,away_blocked_passes,away_tackles_interceptions,away_clearances,away_errors,away_touches_def_pen_area,away_touches_def_3rd,away_touches_mid_3rd,away_touches_att_3rd,away_touches_att_pen_area,away_touches_live_ball,away_take_ons_won_pct,away_take_ons_tackled,away_take_ons_tackled_pct,away_carries_distance,away_carries_progressive_distance,away_carries_into_final_third,away_carries_into_penalty_area,away_miscontrols,away_dispossessed,away_passes_received,away_progressive_passes_received,away_cards_yellow_red,away_fouls,away_fouled,away_offsides,away_pens_won,away_pens_conceded,away_own_goals,away_ball_recoveries,away_aerials_won,away_aerials_lost,away_aerials_won_pct,total_goals,Stadium,Club,Location,Capacity,Pitch length m,Pitch width m,latitude,longitude,outcome,home_points,away_points,home_last5_matches_played,home_last5_wins,home_last5_draws,home_last5_defeats,home_last5_goal_difference,home_last5_goals_conceded,home_last5_minutes,home_last5_goals,home_last5_assists,home_last5_pens_made,home_last5_pens_att,home_last5_shots,home_last5_shots_on_target,home_last5_cards_yellow,home_last5_cards_red,home_last5_touches,home_last5_tackles,home_last5_interceptions,home_last5_blocks,home_last5_xg,home_last5_npxg,home_last5_xg_assist,home_last5_sca,home_last5_gca,home_last5_passes_completed,home_last5_passes,home_last5_passes_pct,home_last5_progressive_passes,home_last5_carries,home_last5_progressive_carries,home_last5_take_ons,home_last5_take_ons_won,home_last5_passes_total_distance,home_last5_passes_progressive_distance,home_last5_passes_completed_short,home_last5_passes_short,home_last5_passes_pct_short,home_last5_passes_completed_medium,home_last5_passes_medium,home_last5_passes_pct_medium,home_last5_passes_completed_long,home_last5_passes_long,home_last5_passes_pct_long,home_last5_pass_xa,home_last5_assisted_shots,home_last5_passes_into_final_third,home_last5_passes_into_penalty_area,home_last5_crosses_into_penalty_area,home_last5_passes_live,home_last5_passes_dead,home_last5_passes_free_kicks,home_last5_through_balls,home_last5_passes_switches,home_last5_crosses,home_last5_throw_ins,home_last5_corner_kicks,home_last5_corner_kicks_in,home_last5_corner_kicks_out,home_last5_corner_kicks_straight,home_last5_passes_offsides,home_last5_passes_blocked,home_last5_tackles_won,home_last5_tackles_def_3rd,home_last5_tackles_mid_3rd,home_last5_tackles_att_3rd,home_last5_challenge_tackles,home_last5_challenges,home_last5_challenge_tackles_pct,home_last5_challenges_lost,home_last5_blocked_shots,home_last5_blocked_passes,home_last5_tackles_interceptions,home_last5_clearances,home_last5_errors,home_last5_touches_def_pen_area,home_last5_touches_def_3rd,home_last5_touches_mid_3rd,home_last5_touches_att_3rd,home_last5_touches_att_pen_area,home_last5_touches_live_ball,home_last5_take_ons_won_pct,home_last5_take_ons_tackled,home_last5_take_ons_tackled_pct,home_last5_carries_distance,home_last5_carries_progressive_distance,home_last5_carries_into_final_third,home_last5_carries_into_penalty_area,home_last5_miscontrols,home_last5_dispossessed,home_last5_passes_received,home_last5_progressive_passes_received,home_last5_cards_yellow_red,home_last5_fouls,home_last5_fouled,home_last5_offsides,home_last5_pens_won,home_last5_pens_conceded,home_last5_own_goals,home_last5_ball_recoveries,home_last5_aerials_won,home_last5_aerials_lost,home_last5_aerials_won_pct,home_last5_points,away_last5_matches_played,away_last5_wins,away_last5_draws,away_last5_defeats,away_last5_goal_difference,away_last5_goals_conceded,away_last5_minutes,away_last5_goals,away_last5_assists,away_last5_pens_made,away_last5_pens_att,away_last5_shots,away_last5_shots_on_target,away_last5_cards_yellow,away_last5_cards_red,away_last5_touches,away_last5_tackles,away_last5_interceptions,away_last5_blocks,away_last5_xg,away_last5_npxg,away_last5_xg_assist,away_last5_sca,away_last5_gca,away_last5_passes_completed,away_last5_passes,away_last5_passes_pct,away_last5_progressive_passes,away_last5_carries,away_last5_progressive_carries,away_last5_take_ons,away_last5_take_ons_won,away_last5_passes_total_distance,away_last5_passes_progressive_distance,away_last5_passes_completed_short,away_last5_passes_short,away_last5_passes_pct_short,away_last5_passes_completed_medium,away_last5_passes_medium,away_last5_passes_pct_medium,away_last5_passes_completed_long,away_last5_passes_long,away_last5_passes_pct_long,away_last5_pass_xa,away_last5_assisted_shots,away_last5_passes_into_final_third,away_last5_passes_into_penalty_area,away_last5_crosses_into_penalty_area,away_last5_passes_live,away_last5_passes_dead,away_last5_passes_free_kicks,away_last5_through_balls,away_last5_passes_switches,away_last5_crosses,away_last5_throw_ins,away_last5_corner_kicks,away_last5_corner_kicks_in,away_last5_corner_kicks_out,away_last5_corner_kicks_straight,away_last5_passes_offsides,away_last5_passes_blocked,away_last5_tackles_won,away_last5_tackles_def_3rd,away_last5_tackles_mid_3rd,away_last5_tackles_att_3rd,away_last5_challenge_tackles,away_last5_challenges,away_last5_challenge_tackles_pct,away_last5_challenges_lost,away_last5_blocked_shots,away_last5_blocked_passes,away_last5_tackles_interceptions,away_last5_clearances,away_last5_errors,away_last5_touches_def_pen_area,away_last5_touches_def_3rd,away_last5_touches_mid_3rd,away_last5_touches_att_3rd,away_last5_touches_att_pen_area,away_last5_touches_live_ball,away_last5_take_ons_won_pct,away_last5_take_ons_tackled,away_last5_take_ons_tackled_pct,away_last5_carries_distance,away_last5_carries_progressive_distance,away_last5_carries_into_final_third,away_last5_carries_into_penalty_area,away_last5_miscontrols,away_last5_dispossessed,away_last5_passes_received,away_last5_progressive_passes_received,away_last5_cards_yellow_red,away_last5_fouls,away_last5_fouled,away_last5_offsides,away_last5_pens_won,away_last5_pens_conceded,away_last5_own_goals,away_last5_ball_recoveries,away_last5_aerials_won,away_last5_aerials_lost,away_last5_aerials_won_pct,away_last5_points,home_last30_matches_played,home_last30_wins,home_last30_draws,home_last30_defeats,home_last30_goal_difference,home_last30_goals_conceded,home_last30_minutes,home_last30_goals,home_last30_assists,home_last30_pens_made,home_last30_pens_att,home_last30_shots,home_last30_shots_on_target,home_last30_cards_yellow,home_last30_cards_red,home_last30_touches,home_last30_tackles,home_last30_interceptions,home_last30_blocks,home_last30_xg,home_last30_npxg,home_last30_xg_assist,home_last30_sca,home_last30_gca,home_last30_passes_completed,home_last30_passes,home_last30_passes_pct,home_last30_progressive_passes,home_last30_carries,home_last30_progressive_carries,home_last30_take_ons,home_last30_take_ons_won,home_last30_passes_total_distance,home_last30_passes_progressive_distance,home_last30_passes_completed_short,home_last30_passes_short,home_last30_passes_pct_short,home_last30_passes_completed_medium,home_last30_passes_medium,home_last30_passes_pct_medium,home_last30_passes_completed_long,home_last30_passes_long,home_last30_passes_pct_long,home_last30_pass_xa,home_last30_assisted_shots,home_last30_passes_into_final_third,home_last30_passes_into_penalty_area,home_last30_crosses_into_penalty_area,home_last30_passes_live,home_last30_passes_dead,home_last30_passes_free_kicks,home_last30_through_balls,home_last30_passes_switches,home_last30_crosses,home_last30_throw_ins,home_last30_corner_kicks,home_last30_corner_kicks_in,home_last30_corner_kicks_out,home_last30_corner_kicks_straight,home_last30_passes_offsides,home_last30_passes_blocked,home_last30_tackles_won,home_last30_tackles_def_3rd,home_last30_tackles_mid_3rd,home_last30_tackles_att_3rd,home_last30_challenge_tackles,home_last30_challenges,home_last30_challenge_tackles_pct,home_last30_challenges_lost,home_last30_blocked_shots,home_last30_blocked_passes,home_last30_tackles_interceptions,home_last30_clearances,home_last30_errors,home_last30_touches_def_pen_area,home_last30_touches_def_3rd,home_last30_touches_mid_3rd,home_last30_touches_att_3rd,home_last30_touches_att_pen_area,home_last30_touches_live_ball,home_last30_take_ons_won_pct,home_last30_take_ons_tackled,home_last30_take_ons_tackled_pct,home_last30_carries_distance,home_last30_carries_progressive_distance,home_last30_carries_into_final_third,home_last30_carries_into_penalty_area,home_last30_miscontrols,home_last30_dispossessed,home_last30_passes_received,home_last30_progressive_passes_received,home_last30_cards_yellow_red,home_last30_fouls,home_last30_fouled,home_last30_offsides,home_last30_pens_won,home_last30_pens_conceded,home_last30_own_goals,home_last30_ball_recoveries,home_last30_aerials_won,home_last30_aerials_lost,home_last30_aerials_won_pct,home_last30_points,away_last30_matches_played,away_last30_wins,away_last30_draws,away_last30_defeats,away_last30_goal_difference,away_last30_goals_conceded,away_last30_minutes,away_last30_goals,away_last30_assists,away_last30_pens_made,away_last30_pens_att,away_last30_shots,away_last30_shots_on_target,away_last30_cards_yellow,away_last30_cards_red,away_last30_touches,away_last30_tackles,away_last30_interceptions,away_last30_blocks,away_last30_xg,away_last30_npxg,away_last30_xg_assist,away_last30_sca,away_last30_gca,away_last30_passes_completed,away_last30_passes,away_last30_passes_pct,away_last30_progressive_passes,away_last30_carries,away_last30_progressive_carries,away_last30_take_ons,away_last30_take_ons_won,away_last30_passes_total_distance,away_last30_passes_progressive_distance,away_last30_passes_completed_short,away_last30_passes_short,away_last30_passes_pct_short,away_last30_passes_completed_medium,away_last30_passes_medium,away_last30_passes_pct_medium,away_last30_passes_completed_long,away_last30_passes_long,away_last30_passes_pct_long,away_last30_pass_xa,away_last30_assisted_shots,away_last30_passes_into_final_third,away_last30_passes_into_penalty_area,away_last30_crosses_into_penalty_area,away_last30_passes_live,away_last30_passes_dead,away_last30_passes_free_kicks,away_last30_through_balls,away_last30_passes_switches,away_last30_crosses,away_last30_throw_ins,away_last30_corner_kicks,away_last30_corner_kicks_in,away_last30_corner_kicks_out,away_last30_corner_kicks_straight,away_last30_passes_offsides,away_last30_passes_blocked,away_last30_tackles_won,away_last30_tackles_def_3rd,away_last30_tackles_mid_3rd,away_last30_tackles_att_3rd,away_last30_challenge_tackles,away_last30_challenges,away_last30_challenge_tackles_pct,away_last30_challenges_lost,away_last30_blocked_shots,away_last30_blocked_passes,away_last30_tackles_interceptions,away_last30_clearances,away_last30_errors,away_last30_touches_def_pen_area,away_last30_touches_def_3rd,away_last30_touches_mid_3rd,away_last30_touches_att_3rd,away_last30_touches_att_pen_area,away_last30_touches_live_ball,away_last30_take_ons_won_pct,away_last30_take_ons_tackled,away_last30_take_ons_tackled_pct,away_last30_carries_distance,away_last30_carries_progressive_distance,away_last30_carries_into_final_third,away_last30_carries_into_penalty_area,away_last30_miscontrols,away_last30_dispossessed,away_last30_passes_received,away_last30_progressive_passes_received,away_last30_cards_yellow_red,away_last30_fouls,away_last30_fouled,away_last30_offsides,away_last30_pens_won,away_last30_pens_conceded,away_last30_own_goals,away_last30_ball_recoveries,away_last30_aerials_won,away_last30_aerials_lost,away_last30_aerials_won_pct,away_last30_points
40,2023-2024,2023-10-07,12:30,8,10290.0,John Brooks,4-4-2,4-2-3-1,Luton Town,Tottenham Hotspur,990,0,0,0,0,12,2,2,0,440,26,12,19,0.8,0.8,0.6,20,0,195,306,63.7,23,168,13,19,8,4241,2198,79,105,75.2,73,99,73.7,38,78,48.7,1.0,10,24,6,4,253,49,11,0,6,26,21,5,2,3,0,4,11,15,16,6,4,15,24,62.5,9,2,17,38,25,1,66,167,130,149,29,440,42.1,11,57.9,988,458,6,8,17,9,195,23,0,16,5,4,0,0,0,54,9,15,37.5,944,1,1,0,0,15,4,3,1,740,20,9,12,2.0,2.0,1.8,25,2,502,602,83.4,57,425,19,27,9,7908,3002,252,271,93.0,199,218,91.3,29,65,44.6,2.0,11,38,13,2,534,64,19,4,2,13,29,6,3,1,0,4,18,14,8,8,4,11,19,57.9,8,4,8,29,29,0,82,260,305,187,37,740,33.3,15,55.6,2625,1220,15,5,22,11,499,56,1,7,16,4,0,0,0,50,15,9,62.5,1,Kenilworth Road,Luton Town,Luton,10265.0,100.6,65.8,51°53′03″N,0°25′54″W,2,0,3,1.0,0.142857,0.142857,0.714286,-1.142857,2.0,990.0,1.0,0.6,0.2,0.2,13.8,2.0,1.6,0.0,511.8,18.4,8.8,11.2,1.38,1.22,0.96,25.6,1.6,259.8,387.6,66.24,32.8,231.6,15.6,17.6,7.0,4916.0,2167.0,116.8,141.6,81.24,96.0,127.6,74.02,37.6,86.8,42.3,0.96,11.2,23.6,8.6,3.2,332.4,53.6,12.0,0.0,3.2,26.0,23.0,6.8,3.8,2.6,0.4,1.6,13.0,11.2,10.2,6.8,1.4,9.8,17.4,56.22,7.6,3.0,8.2,27.2,18.8,0.2,55.0,164.6,198.0,154.4,27.0,511.6,40.42,8.8,48.82,1332.0,659.8,12.0,5.0,16.2,9.4,256.8,32.0,0.0,11.6,9.4,1.6,0.0,0.0,0.0,61.6,23.4,20.8,53.34,0.8,1.0,0.714286,0.285714,0.0,1.285714,1.142857,990.0,2.6,2.2,0.0,0.0,20.6,8.0,4.2,0.0,728.0,21.8,9.6,11.4,2.04,2.04,1.68,37.4,4.4,523.0,608.2,85.9,61.4,473.6,30.6,25.4,11.4,8207.0,2875.4,281.0,302.0,93.08,196.0,221.2,88.64,31.0,53.4,58.32,1.52,16.4,38.2,15.2,1.4,555.6,50.6,17.2,3.2,1.2,17.6,17.6,7.2,2.2,2.0,0.0,2.0,10.8,13.6,11.0,7.2,3.6,10.6,19.2,54.98,8.6,2.8,8.6,31.4,18.4,0.4,74.8,228.4,294.0,214.8,43.4,728.0,44.2,10.8,43.4,2592.0,1315.0,18.8,12.6,12.6,10.2,519.8,61.2,0.0,12.6,14.2,2.0,0.0,0.2,0.2,56.8,9.6,12.8,43.62,2.6,1.0,0.142857,0.142857,0.714286,-1.142857,2.0,990.0,0.857143,0.428571,0.285714,0.285714,12.571429,1.857143,1.857143,0.0,483.0,18.142857,8.285714,10.428571,1.257143,1.028571,0.814286,23.428571,1.285714,249.142857,364.571429,67.914286,31.571429,229.428571,14.142857,15.857143,6.142857,4767.714286,2080.285714,110.428571,133.285714,81.857143,95.571429,122.142857,77.528571,35.571429,82.285714,42.2,0.914286,9.857143,23.0,7.285714,2.714286,312.571429,50.428571,12.142857,0.0,3.571429,24.285714,19.428571,6.428571,3.142857,3.0,0.285714,1.571429,11.142857,11.142857,10.142857,6.428571,1.571429,9.285714,16.571429,55.871429,7.285714,3.571429,6.857143,26.428571,20.142857,0.428571,59.857143,162.285714,187.142857,139.0,24.428571,482.714286,38.5,7.857143,48.714286,1341.0,658.428571,11.0,4.857143,14.571429,8.428571,246.285714,31.0,0.0,11.714286,10.142857,1.571429,0.0,0.142857,0.0,56.571429,20.571429,18.571429,51.471429,0.571429,1.0,0.714286,0.285714,0.0,1.285714,1.142857,990.0,2.428571,1.857143,0.0,0.0,19.714286,7.428571,3.714286,0.0,739.857143,20.0,9.0,11.857143,1.885714,1.885714,1.542857,34.857143,4.0,533.285714,620.571429,85.8,57.571429,492.857143,29.857143,26.285714,12.428571,8300.0,2817.714286,286.714286,307.285714,93.185714,199.571429,226.142857,88.285714,29.857143,52.142857,56.885714,1.542857,15.142857,42.571429,13.571429,1.428571,570.428571,48.571429,15.857143,3.142857,1.714286,18.142857,17.428571,6.714286,2.428571,2.142857,0.0,1.571429,11.571429,12.428571,9.714286,6.857143,3.428571,9.571429,17.857143,52.757143,8.285714,3.714286,8.142857,29.0,18.571429,0.285714,77.714286,225.714286,296.714286,226.285714,40.857143,739.857143,46.628571,11.0,42.5,2622.142857,1342.428571,19.857143,11.571429,12.714286,9.571429,529.428571,57.285714,0.0,12.142857,12.857143,1.571429,0.0,0.285714,0.142857,54.714286,8.857143,12.714286,40.5,2.428571
41,2023-2024,2023-10-07,15:00,8,38815.0,David Coote,4-4-1-1,4-2-3-1,Everton,Bournemouth,990,3,1,0,0,25,8,1,0,590,28,5,15,2.5,2.5,1.5,40,3,332,443,74.9,31,299,23,22,12,6271,2924,156,178,87.6,114,138,82.6,49,84,58.3,1.6,13,23,8,1,386,55,15,5,1,25,23,8,8,0,0,2,12,12,10,11,7,5,17,29.4,12,5,10,33,18,0,58,173,234,191,38,590,54.5,7,31.8,1733,891,15,10,19,13,330,31,0,14,15,2,0,0,0,55,23,16,59.0,990,0,0,0,0,11,3,2,0,632,20,6,15,0.7,0.7,0.7,21,0,386,488,79.1,34,332,23,19,12,7200,2717,151,171,88.3,175,197,88.8,46,89,51.7,1.3,10,20,11,5,426,62,16,2,1,18,24,7,2,0,1,0,10,6,11,7,2,7,19,36.8,12,5,10,26,29,2,101,272,224,141,27,632,63.2,5,26.3,1917,1019,9,4,23,23,382,34,0,15,13,0,0,0,0,53,16,23,41.0,3,Goodison Park,Everton,Liverpool,39414.0,100.48,68.0,53°26′20″N,002°57′59″W,1,3,0,1.0,0.142857,0.142857,0.714286,-0.857143,1.714286,990.0,1.2,0.8,0.0,0.0,16.0,4.8,1.6,0.0,587.2,21.4,7.8,15.2,1.98,1.98,1.44,26.6,2.0,356.8,471.0,74.84,34.2,329.6,17.0,16.0,6.6,6149.0,2488.4,177.2,200.8,87.12,131.2,160.6,81.84,38.0,81.0,45.84,1.16,11.6,28.6,9.6,2.8,421.0,48.6,12.2,1.4,2.2,21.6,23.8,4.2,3.2,0.8,0.2,1.4,9.6,13.8,10.6,7.4,3.4,10.4,17.2,60.14,6.8,3.8,11.4,29.2,15.8,0.2,56.2,177.0,251.4,162.0,23.2,587.2,41.22,8.2,51.0,1660.6,850.8,14.2,4.4,13.0,9.2,352.6,34.2,0.0,11.0,10.2,1.4,0.0,0.0,0.2,59.2,20.2,15.6,56.32,0.8,1.0,0.0,0.428571,0.571429,-1.428571,2.142857,990.0,0.6,0.6,0.0,0.0,11.2,3.2,1.4,0.0,500.6,18.4,9.0,12.0,1.1,1.1,0.82,19.8,0.8,290.0,383.8,75.36,28.4,283.2,17.8,26.8,12.8,5218.8,1910.8,128.2,144.4,88.34,123.2,145.4,84.94,31.0,71.0,43.76,0.72,8.0,25.2,7.6,1.2,336.6,45.6,12.2,1.2,2.8,14.8,18.6,3.8,3.2,0.2,0.2,1.6,8.4,9.4,8.4,7.2,2.8,9.2,18.2,50.4,9.0,3.8,8.2,27.4,17.4,0.8,62.6,166.2,202.8,140.2,27.0,500.6,48.62,11.6,41.74,1713.6,886.4,14.2,5.4,13.6,9.4,287.8,28.0,0.0,14.8,11.2,1.6,0.0,0.4,0.2,50.2,8.0,11.0,40.66,0.4,1.0,0.142857,0.142857,0.714286,-0.857143,1.714286,990.0,0.857143,0.571429,0.0,0.0,15.428571,5.0,1.714286,0.0,563.714286,19.142857,8.285714,13.857143,1.885714,1.885714,1.457143,26.142857,1.428571,344.428571,454.285714,75.185714,32.0,318.0,16.428571,15.285714,7.0,6053.857143,2370.857143,163.428571,184.714286,87.728571,132.142857,159.0,83.242857,38.428571,82.142857,46.014286,1.042857,11.285714,28.0,8.571429,2.285714,403.571429,48.714286,11.285714,1.285714,2.714286,21.714286,23.0,5.285714,3.285714,1.714286,0.142857,2.0,9.428571,12.142857,9.428571,6.285714,3.428571,9.428571,17.0,54.885714,7.571429,3.142857,10.714286,27.428571,15.0,0.428571,53.571429,166.285714,243.428571,157.857143,22.714286,563.714286,46.471429,7.142857,45.771429,1592.428571,815.142857,13.428571,4.285714,13.428571,8.0,340.285714,31.857143,0.0,11.285714,9.285714,2.0,0.0,0.142857,0.142857,56.857143,16.857143,14.428571,52.542857,0.571429,1.0,0.0,0.428571,0.571429,-1.428571,2.142857,990.0,0.714286,0.571429,0.0,0.0,11.857143,3.714286,1.571429,0.0,528.0,18.142857,9.428571,13.714286,1.185714,1.185714,0.742857,20.714286,1.0,308.428571,406.857143,75.371429,31.285714,296.0,17.0,26.142857,12.428571,5546.857143,1978.714286,135.428571,153.428571,87.957143,132.428571,155.857143,84.628571,32.714286,73.428571,44.142857,0.657143,8.142857,26.142857,8.0,1.285714,358.142857,47.142857,12.142857,1.285714,2.285714,15.285714,19.285714,4.428571,3.285714,0.142857,0.285714,1.571429,9.285714,9.142857,8.0,7.285714,2.857143,9.428571,19.714286,48.385714,10.285714,4.571429,9.142857,27.571429,18.142857,0.714286,64.285714,177.428571,219.142857,139.714286,25.142857,528.0,48.114286,11.285714,42.128571,1763.857143,932.714286,12.857143,5.142857,14.285714,10.142857,305.0,30.857143,0.0,13.714286,11.142857,1.571429,0.0,0.428571,0.142857,53.285714,9.571429,14.714286,39.371429,0.428571


## Metryka zmęczenia zależna od ilości dni od ostatniego meczu + Metryka nowego menadżera
Trzeba dodać rozgrywki europejskie
Można ustawić od kiedy są menadżerowie na początku pierwszego sezonu danych, ale nie powinno to nic specjalnie zmienić

In [62]:
def calculate_tiredness_factor(days_since_last_match, decay_rate=0.1):
    # Higher decay rate means quicker recovery. Adjust as needed.
    return np.exp(-decay_rate * days_since_last_match)

In [63]:
matches['last_match_date'] = pd.NaT
matches['date'] = pd.to_datetime(matches['date'])
matches = matches.sort_values(by='date', ignore_index=True)
# Iterate through each match row
for i, row in matches.iterrows():
    # Find the home team and away team
    home_team = row['home_team']
    away_team = row['away_team']
    
    # Initialize last match date to NaT (Not a Time) for both teams
    last_home_date = pd.NaT
    last_away_date = pd.NaT

    home_team_last_manager = None
    away_team_last_manager = None
    
    # For home team: Search for the last match where the team played (either home or away)
    for j in range(i-1, -1, -1):  # Iterate backward from the current match
        if matches.iloc[j]['home_team'] == home_team:
            last_home_date = matches.iloc[j]['date']
            home_team_last_manager = matches.iloc[j]["home_manager"]
            break  # Stop once the match is found
        if matches.iloc[j]['away_team'] == home_team:
            last_home_date = matches.iloc[j]['date']
            home_team_last_manager = matches.iloc[j]["away_manager"]
            break
    
    # For away team: Search for the last match where the team played (either home or away)
    for j in range(i-1, -1, -1):  # Iterate backward from the current match
        if matches.iloc[j]['home_team'] == away_team:
            last_away_date = matches.iloc[j]['date']
            away_team_last_manager = matches.iloc[j]["home_manager"]
            break  # Stop once the match is found
        if matches.iloc[j]['away_team'] == away_team:
            last_away_date = matches.iloc[j]['date']
            away_team_last_manager = matches.iloc[j]["away_manager"]
            break

    home_matches_since_last_manager = None
    away_matches_since_last_manager = None
    if home_team_last_manager == None:
        home_team_last_manager = row["home_manager"]
        home_matches_since_last_manager = 20
    if away_team_last_manager == None:
        away_team_last_manager = row["away_manager"]
        away_matches_since_last_manager = 20

    if home_team_last_manager == row["home_manager"]:
        new_home_manager = False
    else:
        new_home_manager = True

    if away_team_last_manager and away_team_last_manager == row["away_manager"]:
        new_away_manager = False
    else:
        new_away_manager = True

    # Assign the last match date for home and away teams
    matches.at[i, 'last_match_date_home'] = last_home_date
    matches.at[i, 'last_match_date_away'] = last_away_date
    matches.at[i, 'last_home_manager'] = home_team_last_manager
    matches.at[i, 'last_away_manager'] = away_team_last_manager
    matches.at[i, 'new_home_manager'] = new_home_manager
    matches.at[i, 'new_away_manager'] = new_away_manager
    matches.at[i, 'matches_since_new_home_manager'] = home_matches_since_last_manager
    matches.at[i, 'matches_since_new_away_manager'] = away_matches_since_last_manager    

matches['days_since_last_home'] = (matches['date'] - matches['last_match_date_home']).dt.days
matches['days_since_last_away'] = (matches['date'] - matches['last_match_date_away']).dt.days
matches['days_since_last_home'] = matches['days_since_last_home'].fillna(7)
matches['days_since_last_away'] = matches['days_since_last_away'].fillna(7)
matches['home_tiredness'] = calculate_tiredness_factor(matches['days_since_last_home'])
matches['away_tiredness'] = calculate_tiredness_factor(matches['days_since_last_away'])


matches[(matches['home_team'] == "Burnley") | (matches['away_team'] == "Burnley")][["home_team", "away_team", "date", "last_match_date_home", "last_match_date_away", "days_since_last_away", "days_since_last_home", "home_tiredness", "away_tiredness"]].head(6)

Unnamed: 0,home_team,away_team,date,last_match_date_home,last_match_date_away,days_since_last_away,days_since_last_home,home_tiredness,away_tiredness
9,Burnley,Manchester United,2018-09-02,NaT,NaT,7.0,7.0,0.496585,0.496585
17,Wolverhampton Wanderers,Burnley,2018-09-16,2018-09-01,2018-09-02,14.0,15.0,0.22313,0.246597
25,Burnley,Bournemouth,2018-09-22,2018-09-16,2018-09-15,7.0,6.0,0.548812,0.496585
38,Cardiff City,Burnley,2018-09-30,2018-09-22,2018-09-22,8.0,8.0,0.449329,0.449329
46,Burnley,Huddersfield Town,2018-10-06,2018-09-30,2018-09-29,7.0,6.0,0.548812,0.496585
53,Manchester City,Burnley,2018-10-20,2018-10-07,2018-10-06,14.0,13.0,0.272532,0.246597


In [None]:
matches

Unnamed: 0,season,date,time,round,attendance_value,referee,home_manager,away_manager,home_captain,away_captain,...,last_home_manager,last_away_manager,new_home_manager,new_away_manager,matches_since_new_home_manager,matches_since_new_away_manager,days_since_last_home,days_since_last_away,home_tiredness,away_tiredness
0,2018-2019,2018-09-01,12:30,4,32149.0,Paul Tierney,Claude Puel,Jürgen Klopp,Wes Morgan,Jordan Henderson,...,Claude Puel,Jürgen Klopp,False,False,20.0,20.0,7.0,7.0,0.496585,0.496585
1,2018-2019,2018-09-01,15:00,4,25495.0,Martin Atkinson,Roy Hodgson,Mark Hughes,Luka Milivojević,Ryan Bertrand,...,Roy Hodgson,Mark Hughes,False,False,20.0,20.0,7.0,7.0,0.496585,0.496585


In [69]:
for i in range (len(matches)):
    # Find the home team and away team
    home_team = matches.iloc[i]['home_team']
    away_team = matches.iloc[i]['away_team']

    new_home_manager = matches.iloc[i]["new_home_manager"]
    new_away_manager = matches.iloc[i]["new_away_manager"]

    if new_home_manager:
        matches.at[i, 'matches_since_new_home_manager'] = 0
    if new_away_manager:
        matches.at[i, 'matches_since_new_away_manager'] = 0

    for j in range(i+1, len(matches)):
        if matches.iloc[j]['home_team'] == home_team:
            matches.at[j, 'matches_since_new_home_manager'] = matches.iloc[i]["matches_since_new_home_manager"] + 1
            break
        if matches.iloc[j]['away_team'] == home_team:
            matches.at[j, 'matches_since_new_away_manager'] = matches.iloc[i]["matches_since_new_home_manager"] + 1
            break

    for j in range(i+1, len(matches)):
        if matches.iloc[j]['home_team'] == away_team:
            matches.at[j, 'matches_since_new_home_manager'] = matches.iloc[i]["matches_since_new_away_manager"] + 1
            break
        if matches.iloc[j]['away_team'] == away_team:
            matches.at[j, 'matches_since_new_away_manager'] = matches.iloc[i]["matches_since_new_away_manager"] + 1
            break
    
    

In [78]:
matches[230:].head(2)

Unnamed: 0,season,date,time,round,attendance_value,referee,home_manager,away_manager,home_captain,away_captain,...,last_home_manager,last_away_manager,new_home_manager,new_away_manager,matches_since_new_home_manager,matches_since_new_away_manager,days_since_last_home,days_since_last_away,home_tiredness,away_tiredness
230,2018-2019,2019-02-11,20:00,26,30687.0,Graham Scott,Nuno Espírito Santo,Rafael Benítez,Conor Coady,Jamaal Lascelles,...,Nuno Espírito Santo,Rafael Benítez,False,False,42.0,42.0,9.0,9.0,0.40657,0.40657
231,2018-2019,2019-02-22,19:45,27,59950.0,Lee Mason,Manuel Pellegrini,Claudio Ranieri,Mark Noble,Tom Cairney,...,Manuel Pellegrini,Claudio Ranieri,False,False,43.0,14.0,13.0,13.0,0.272532,0.272532


## Metryki H2H

In [3]:
matches[(matches['away_team'] == "Everton") & (matches["home_team"] == "Sheffield United")][["home_team", "outcome"]]

Unnamed: 0,home_team,outcome
1,Sheffield United,0
1163,Sheffield United,2
1734,Sheffield United,2


In [4]:
def get_h2h_metrics(row, df, num_matches=6):
    past_h2h = df[((df['home_team'] == row['home_team']) & (df['away_team'] == row['away_team'])) |
                  ((df['home_team'] == row['away_team']) & (df['away_team'] == row['home_team']))]
    past_h2h = past_h2h[past_h2h['date'] < row['date']].sort_values(by='date', ascending=False)
    
    past_h2h = past_h2h.head(num_matches)
    num_past_h2h = len(past_h2h)

    past_h2h_home_home = past_h2h[past_h2h["home_team"] == row["home_team"]]
    past_h2h_home_away = past_h2h[past_h2h["home_team"] == row["away_team"]]

    if not past_h2h.empty:
        metrics = {
            'h2h_win_ratio': ((past_h2h_home_home['outcome'] == 1).sum() + (past_h2h_home_away['outcome'] == 2).sum()) / num_past_h2h,
            'h2h_draw_ratio': (past_h2h['outcome'] == 0).sum() / num_past_h2h,
            'h2h_avg_goals_scored_home_team': (past_h2h_home_home['home_goals'].sum() + past_h2h_home_away['away_goals'].sum()) / num_past_h2h,
            'h2h_avg_goals_scored_away_team': (past_h2h_home_home['away_goals'].sum() + past_h2h_home_away['home_goals'].sum()) / num_past_h2h,
            'h2h_avg_xG_home_team': (past_h2h_home_home['home_xg'].sum() + past_h2h_home_away['away_xg'].sum()) / num_past_h2h,
            'h2h_avg_xG_away_team': (past_h2h_home_home['away_xg'].sum() + past_h2h_home_away['home_xg'].sum()) / num_past_h2h,
            # Add other metrics as needed
        }
    else:
        metrics = {
            'h2h_win_ratio': None,
            'h2h_draw_ratio': None,
            'h2h_avg_goals_scored_home_team': None,
            'h2h_avg_goals_scored_away_team': None,
            'h2h_avg_xG_home_team': None,
            'h2h_avg_xG_away_team': None,
        }

    return pd.Series(metrics)

matches_with_h2h = matches.join(matches.apply(lambda row: get_h2h_metrics(row, matches), axis=1))

In [5]:
matches_with_h2h.head(4)

Unnamed: 0,season,date,time,round,attendance_value,referee,home_manager,away_manager,home_captain,away_captain,...,away_last5_aerials_won,away_last5_aerials_lost,away_last5_aerials_won_pct,away_last5_points,h2h_win_ratio,h2h_draw_ratio,h2h_avg_goals_scored_home_team,h2h_avg_goals_scored_away_team,h2h_avg_xG_home_team,h2h_avg_xG_away_team
0,2023-2024,2023-09-01,20:00,4,10802.0,Paul Tierney,Rob Edwards,David Moyes,Carlton Morris,Kurt Zouma,...,17.666667,16.0,50.366667,2.333333,,,,,,
1,2023-2024,2023-09-02,12:30,4,31124.0,Andy Madley,Paul Heckingbottom,Sean Dyche,John Egan,James Tarkowski,...,10.333333,10.666667,49.033333,0.0,0.5,0.0,0.75,0.5,0.9,0.925
2,2023-2024,2023-09-02,15:00,4,39820.0,Tim Robinson,Mauricio Pochettino,Steve Cooper,Ben Chilwell,Joe Worrall,...,23.666667,14.333333,62.7,1.0,0.0,1.0,1.5,1.5,1.05,1.35
3,2023-2024,2023-09-02,15:00,4,52899.0,Michael Oliver,Pep Guardiola,Marco Silva,Kyle Walker,Tom Cairney,...,13.333333,10.0,57.266667,1.333333,1.0,0.0,2.333333,0.333333,2.75,0.45


## Zawodnicy

In [126]:
players = pd.read_csv("data/players_pl_17-18_fbref.csv")
fifa = pd.read_csv("data/sofifa_players_17_18.csv")

In [127]:
players.head(2)

Unnamed: 0,team,where,date,round,season,player,shirtnumber,nationality,position,age,...,fouls,fouled,offsides,pens_won,pens_conceded,own_goals,ball_recoveries,aerials_won,aerials_lost,aerials_won_pct
0,Arsenal,home,2017-08-11,1,2017-2018,Alexandre Lacazette,9,FRA,"FW,LW,LM",26-075,...,2,2,2,0,0,0,2,1,2,33.3
1,Arsenal,home,2017-08-11,1,2017-2018,Danny Welbeck,23,ENG,"AM,FW",26-258,...,1,1,1,0,0,0,2,1,1,50.0


In [145]:
def longest_two_substrings(s1, s2):
    # Initialize the matrix for storing common substring lengths
    matrix = [[0] * (len(s2) + 1) for _ in range(len(s1) + 1)]
    longest_substrings = []
    
    # Populate the matrix
    for i in range(1, len(s1) + 1):
        for j in range(1, len(s2) + 1):
            if s1[i - 1] == s2[j - 1]:
                matrix[i][j] = matrix[i - 1][j - 1] + 1
                if matrix[i][j] > 1:  # Only consider substrings longer than 1
                    longest_substrings.append(s1[i - matrix[i][j]:i])
            else:
                matrix[i][j] = 0

    # Sort substrings by length and pick the top 2 longest ones
    unique_substrings = sorted(set(longest_substrings), key=len, reverse=True)
    return unique_substrings[:2]
def similarity_score(name1, name2):
    substrings = longest_two_substrings(name1, name2)
    return sum(len(sub) for sub in substrings)

In [146]:
def find_best_match_with_longest_substrings(name, choices):
    scores = {choice: similarity_score(name, choice) for choice in choices}
    best_match = max(scores, key=scores.get)  # Find the match with the highest score
    return best_match

In [161]:
fifa_names = fifa["name"].unique()

# find_best_match_with_longest_substrings(players["player"][1], fifa_names)
players["player_name"] = players["player"].apply(lambda name: find_best_match_with_longest_substrings(name, fifa_names))

players[["player_name", "player"]]

Unnamed: 0,player_name,player
0,Alexandre Lacazette,Alexandre Lacazette
1,Daniel Nii Tackie Mensah Welbeck,Danny Welbeck
2,Theo Walcott,Theo Walcott
3,Mesut Özil,Mesut Özil
4,Granit Xhaka,Granit Xhaka
...,...,...
10443,Jacob Harry Maguire,Harry Maguire
10444,Wes Morgan,Wes Morgan
10445,Danny Simpson,Danny Simpson
10446,Hamza Choudhury,Hamza Choudhury


In [169]:
find_best_match_with_longest_substrings(players["player"][10349], fifa_names)

'Martin Braithwaite Christensen'