In [1]:
# Mount Google Drive at /content/drive; the CSV inputs read below and the Excel
# output written at the end of the notebook all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load both CSV inputs and cast the round number ('Wk') to int while reading.
all_seasons = pd.read_csv(
    "/content/drive/MyDrive/bakalauraDarbs/Premier_League_All_Seasons_xG.csv"
).astype({'Wk': int})
fixtures_df = pd.read_csv(
    "/content/drive/MyDrive/bakalauraDarbs/Premier_League_Fixtures_Summary.csv"
).astype({'Wk': int})

# Parse match dates; values that cannot be parsed become NaT instead of raising.
all_seasons['Date'] = pd.to_datetime(all_seasons['Date'], errors='coerce')


# =============================================================================
# Computes, for every team, a weighted xG (attacking potential), a weighted xGc
# (expected goals conceded) and an overall team "strength" by blending three groups of matches:
#   - the team's 10 most recent matches at that venue (weight 0.40),
#   - all of its older matches at that venue (weight 0.55; the code does not limit these to the current season),
#   - its matches played outside the calendar year of current_date (weight 0.05).
# The blend is computed separately for attack and defence, as home side and as away side.
# The home/away values are averaged into 'xG' (attack) and 'xGc' (defence), and
# "TeamStrength" is the team's deviation from the league median:
# (xG - median xG) - (xGc - median xGc), so conceding less than the median raises the strength.
# current_date is used only for its calendar year, which decides which matches count as "historical".
# =============================================================================
def _venue_periods(df, venue_col, team, current_year):
    """Split one team's matches at one venue into the three weighting groups.

    Returns ``(last_10, older, historical)`` where ``last_10`` are the 10 rows
    with the latest ``Date``, ``older`` are all remaining rows, and
    ``historical`` are the rows whose ``Date`` year differs from
    ``current_year`` (rows with a missing date also land here).
    """
    games = df[df[venue_col] == team]
    newest_first = games.sort_values("Date", ascending=False)
    historical = games[~games["Date"].dt.year.isin([current_year])]
    return newest_first[:10], newest_first[10:], historical


def _blend_periods(periods, column, weights):
    """Return the weighted sum of the per-group means of ``column``.

    An empty group has a NaN mean, which makes the whole sum NaN.
    """
    last_10, older, historical = periods
    last_10_weight, older_weight, historical_weight = weights
    return (
        last_10[column].mean() * last_10_weight +
        older[column].mean() * older_weight +
        historical[column].mean() * historical_weight
    )


def calculate_weighted_xG_and_strength(df, current_date):
    """Build per-team weighted xG / xGc figures and a relative strength score.

    Args:
        df: match table with the columns ``Home``, ``Away``, ``Date``
            (datetime), ``xG_Home`` and ``xG_Away``.
        current_date: only its ``.year`` is used, to decide which matches
            count as "historical".

    Returns:
        A DataFrame indexed by team with the columns ``xG``, ``xGAway``,
        ``xGHome``, ``xGc``, ``xGcAway``, ``xGcHome`` and ``TeamStrength``.
        ``TeamStrength`` is ``(xG - median xG) - (xGc - median xGc)``; teams
        whose value is NaN receive the lowest strength found among the other
        teams. An empty ``df`` yields an empty frame with the same columns.

    NOTE(review): a team with 10 or fewer home (or away) matches, or with no
    match outside ``current_date.year``, has an empty group, so its figures
    are NaN and it falls back to the weakest strength. The "older" group is
    also not restricted to the current season and overlaps with the
    "historical" group. Confirm both are intended.
    """
    # Weights for (10 most recent, older, historical) matches.
    weights = (0.40, 0.55, 0.05)
    stat_columns = ["xG", "xGAway", "xGHome", "xGc", "xGcAway", "xGcHome"]

    weighted_xG = {}
    teams = set(df["Home"].unique()).union(set(df["Away"].unique()))
    for team in teams:
        home_periods = _venue_periods(df, "Home", team, current_date.year)
        away_periods = _venue_periods(df, "Away", team, current_date.year)

        # Attack: the xG the team created itself at each venue.
        xG_Home = _blend_periods(home_periods, "xG_Home", weights)
        xG_Away = _blend_periods(away_periods, "xG_Away", weights)
        # Defence: the xG the opponent created against the team.
        xG_Conceded_Home = _blend_periods(home_periods, "xG_Away", weights)
        xG_Conceded_Away = _blend_periods(away_periods, "xG_Home", weights)

        weighted_xG[team] = {
            "xG": (xG_Home + xG_Away) / 2,
            "xGAway": xG_Away,
            "xGHome": xG_Home,
            "xGc": (xG_Conceded_Home + xG_Conceded_Away) / 2,
            "xGcAway": xG_Conceded_Away,
            "xGcHome": xG_Conceded_Home,
        }

    if not weighted_xG:
        # No matches at all: return a well-formed empty table instead of
        # failing with KeyError on the missing "xG" column below.
        return pd.DataFrame(columns=stat_columns + ["TeamStrength"], dtype=float)

    ranking_df = pd.DataFrame.from_dict(weighted_xG, orient="index")
    median_offense = ranking_df["xG"].median()
    median_defense = ranking_df["xGc"].median()
    # Above-median attack and below-median xG conceded both raise the strength.
    ranking_df["TeamStrength"] = (
        (ranking_df["xG"] - median_offense) - (ranking_df["xGc"] - median_defense)
    )
    # Teams without enough data (NaN) are treated as the weakest team.
    weakest_strength = ranking_df["TeamStrength"].min()
    ranking_df["TeamStrength"] = ranking_df["TeamStrength"].fillna(weakest_strength)
    return ranking_df

# Every team that appears as home or away side, de-duplicated and sorted.
teams = sorted({*all_seasons['Home'], *all_seasons['Away']})
# Result grid filled in by predict_xg: one row per team, one column per round 1..38.
xg_table = pd.DataFrame(index=teams, columns=range(1, 39))


# =============================================================================
# Predicts xG for a single match (home and away team).
# Each team's strength value (team_strengths) is turned into two weighted features:
# (1 + attacking side's strength) / (1 - defending side's strength), once per side.
# These features are transformed with an already fitted scaler and passed to the trained XGBoost model.
# The model returns the predicted xG for both teams, which is written into the results table (xg_table) at the team's row and the round's column.
# =============================================================================
def predict_xg(home_team, away_team, scaler, model, week, team_strengths):
    """Predict both sides' xG for one fixture and store them in ``xg_table``.

    Args:
        home_team: key of the home side in ``team_strengths`` and ``xg_table``.
        away_team: key of the away side in ``team_strengths`` and ``xg_table``.
        scaler: fitted transformer exposing ``transform``.
        model: fitted regressor whose ``predict`` output flattens to
            ``[home_xg, away_xg]`` for a single input row.
        week: column label of ``xg_table`` to write into.
        team_strengths: mapping team -> strength; unknown teams count as 0.

    Returns:
        None. The predictions are written to the module-level ``xg_table``.
    """
    strength_home = team_strengths.get(home_team, 0)
    strength_away = team_strengths.get(away_team, 0)

    # Each feature is the attacking side's (1 + strength) divided by the
    # defending side's (1 - strength).
    features = pd.DataFrame({
        'xG_Home_Weighted': [(1 + strength_home) / (1 - strength_away)],
        'xG_Away_Weighted': [(1 + strength_away) / (1 - strength_home)]
    })

    predicted = model.predict(scaler.transform(features)).flatten()
    home_xg = float(predicted[0])
    away_xg = float(predicted[1])

    xg_table.at[home_team, week] = home_xg
    xg_table.at[away_team, week] = away_xg

# ===================================================
# Main loop over every week/round in the fixture calendar
# ===================================================
# Season being predicted and the date from which round dates are simulated.
CURRENT_SEASON = '2024-2025'
SEASON_START = pd.Timestamp("2024-08-01")

for week in fixtures_df['Wk'].unique():  # one iteration per distinct round in the fixtures CSV
    # All fixtures of the round being predicted.
    current_week_fixtures = fixtures_df[fixtures_df['Wk'] == week]

    # Training data: every match from an earlier season, plus every match
    # (of any season) whose round number is below the current round.
    past_data = all_seasons[
        (all_seasons['Wk'] < week) | (all_seasons['Season'] < CURRENT_SEASON)
    ].copy()

    # Skip the round when there is nothing to learn from. This check must come
    # before the strength calculation, which cannot handle an empty table.
    if past_data.empty:
        continue

    # Simulated date of this round; the strength calculation uses its year.
    current_date = SEASON_START + pd.DateOffset(weeks=int(week))

    # Weighted xG/xGc figures and strength index per team.
    ranking_df = calculate_weighted_xG_and_strength(past_data, current_date)
    team_strengths = ranking_df['TeamStrength'].to_dict()

    # Build the same two features predict_xg uses, for every past match:
    # (1 + attacking side's strength) / (1 - defending side's strength).
    home_strength = past_data['Home'].map(lambda team: team_strengths.get(team, 0))
    away_strength = past_data['Away'].map(lambda team: team_strengths.get(team, 0))
    past_data['xG_Home_Weighted'] = (1 + home_strength) / (1 - away_strength)
    past_data['xG_Away_Weighted'] = (1 + away_strength) / (1 - home_strength)

    # Features and targets for training.
    X_past = past_data[['xG_Home_Weighted', 'xG_Away_Weighted']]
    y_past = past_data[['xG_Home', 'xG_Away']]

    # Train/hold-out split and standard scaling fitted on the training part only.
    # NOTE(review): X_test / y_test are not used anywhere in this cell, so 20%
    # of the data is neither trained on nor evaluated here - confirm whether a
    # later cell relies on them.
    X_train, X_test, y_train, y_test = train_test_split(
        X_past, y_past, test_size=0.2, random_state=42
    )
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)

    # Fit the XGBoost regressor on the scaled features with the two-column
    # target ('xG_Home', 'xG_Away').
    model = xgb.XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
    model.fit(X_train_scaled, y_train)

    # Predict each fixture of the round; predict_xg writes into xg_table.
    for _, row in current_week_fixtures.iterrows():
        try:
            predict_xg(row['Home'], row['Away'], scaler, model, int(week), team_strengths)
        except ValueError as e:
            print(f"Kļūda prognozējot {row['Home']} vs {row['Away']}: {e}")

# Save the xG prediction table to an Excel file.
xg_table.to_excel("/content/drive/MyDrive/bakalauraDarbs/Premier_League_xG_Predictions.xlsx")