In [None]:
import pandas as pd
import numpy as np
from datetime import datetime as dt

# --- 1. LOAD DATA (CONSOLIDATED) ---

# Paths to your files
path_prefix = '/content/drive/MyDrive/archive-2/Data_with_90_minutes/'
file_paths = {
    '19-20': '2019-20_weather_90_full.csv',
    '20-21': '2020-21_weather_90.csv',
    '21-22': '2021-22_weather_90.csv',
}

# Columns we need
columns_req = [
    'Date', 'Time',
    'HomeTeam', 'AwayTeam', 'FTR',
    'FTHG', 'FTAG', 'HS', 'AS', 'HST', 'AST',
    'HC', 'AC', 'HF', 'AF', 'HY', 'AY', 'HR', 'AR',
    'Start_Temp_C', 'Start_Wind_kmh', 'Start_Wind_Degree',
    'Start_Humidity', 'Start_Precip_mm', 'Start_Conditions'
]

data_frames = []

# Load and prepare all CSVs
for key, file_path in file_paths.items():
    try:
        df = pd.read_csv(f"{path_prefix}{file_path}")
        # Independent copy: the column assignment below must not target a slice of the raw frame.
        df = df[columns_req].copy()

        # Correct/unify date format (day-first; unparseable values become NaT).
        # Plain column assignment replaces the column, so it becomes datetime64.
        # `df.loc[:, 'Date'] = ...` writes in place and leaves the column as object dtype.
        df['Date'] = pd.to_datetime(df['Date'], errors='coerce', dayfirst=True)

        data_frames.append(df)
    except Exception as e:
        # Best effort: a season that fails to load is reported and skipped.
        print(f"Error loading {file_path}: {e}")

# --- 2. CREATE SINGLE MASTER DATAFRAME ---

all_seasons_df = pd.concat(data_frames, ignore_index=True)
print(f'all seasons befor dropna: {all_seasons_df.shape}')
all_seasons_df = all_seasons_df.dropna(subset=['Date']) # Remove rows with invalid dates
# Stable sort: matches on the same date keep their file order, so match_id is reproducible.
all_seasons_df = all_seasons_df.sort_values(by='Date', kind='stable').reset_index(drop=True)
all_seasons_df['match_id'] = all_seasons_df.index

print(f"Data loaded and processed. Total shape: {all_seasons_df.shape}")
all_seasons_df.to_csv("all_seasons.csv", index=False)

# Preview last: only the final expression of a cell is rendered.
all_seasons_df.head(10)


all seasons befor dropna: (1140, 25)
Data loaded and processed. Total shape: (1140, 26)


# One-hot encoding for weather data

In [None]:
# Work on a copy: this cell assigns a helper column to df_work in place,
# and with a plain alias that column would also appear in all_seasons_df.
df_work = all_seasons_df.copy()

print("Starting weather-classification for DataFrame...")

def map_weather_description(description):
    """
    Map a free-text weather description to one coarse weather category.

    The text is lower-cased and checked against keyword groups in order of
    severity (thunderstorm, snow/ice, heavy rain, moderate rain, rain,
    reduced visibility, clouds, clear sky). The first group with a matching
    keyword decides the category.

    Returns 'Unknown' for missing values and 'else' when no keyword matches.
    """
    if pd.isna(description):
        return 'Unknown'

    text = str(description).lower()

    # (category, keywords) pairs, most severe first; order matters because
    # e.g. "heavy snow" must become 'snow', not 'heavy_rain'.
    rules = (
        ('thunderstorm', ('thunderstorm',)),
        ('snow', ('snow', 'sleet', 'freezing')),
        ('heavy_rain', ('heavy', 'extreme', 'ragged')),
        ('moderate_rain', ('moderate', 'shower')),
        ('rain', ('rain', 'drizzle')),
        ('fog', ('mist', 'fog', 'haze')),
        ('cloudy', ('clouds',)),
        ('clear_sky', ('clear', 'sun')),
    )

    for category, keywords in rules:
        if any(keyword in text for keyword in keywords):
            return category

    # Nothing matched
    return 'else'

column_name = 'Start_Conditions'

if column_name not in df_work.columns:
    # Nothing to encode: report which columns are available instead
    print(f"Warning: column '{column_name}' not found!")
    print("Verfügbare Spalten:", df_work.columns.tolist())
else:
    print(f"Column '{column_name}' found. Start categorization")

    # Step A: collapse each raw description into one coarse category
    df_work['Weather_categories'] = df_work[column_name].apply(map_weather_description)

    print("Distribution of weather categories:")
    print(df_work['Weather_categories'].value_counts())

    # Step B: one indicator column per category, named 'Weather_<category>'
    print("Apply One-Hot Encoding ...")
    weather_features = pd.get_dummies(df_work['Weather_categories'], prefix='Weather', dtype=int)

    # Steps C + D: append the indicators, then drop the raw text and the helper column
    df_work = pd.concat([df_work, weather_features], axis=1).drop(
        columns=[column_name, 'Weather_categories']
    )

    print("New Column:", weather_features.columns.tolist())

Starting weather-classification for DataFrame...
Column 'Start_Conditions' found. Start categorization
Distribution of weather categories:
Weather_categories
cloudy           585
rain             234
clear_sky        208
moderate_rain     69
fog               34
snow               6
heavy_rain         3
thunderstorm       1
Name: count, dtype: int64
Apply One-Hot Encoding ...
New Column: ['Weather_clear_sky', 'Weather_cloudy', 'Weather_fog', 'Weather_heavy_rain', 'Weather_moderate_rain', 'Weather_rain', 'Weather_snow', 'Weather_thunderstorm']


# Feature-Functions

In [None]:
def classify_kickoff_time(df):
    """
    Add a binary 'TMKO' column derived from the kickoff time in 'Time'.

    - 0: kickoff hour before 17 (afternoon)
    - 1: kickoff hour 17 or later (evening)

    Missing times are treated as '00:00' and therefore end up as 0.
    Existing columns whose name starts with 'TMKO' are removed first.

    Returns a new DataFrame. If 'Time' cannot be parsed as 'HH:MM', a
    diagnostic is printed and the original `df` is returned unchanged.
    """
    result = df.copy()

    # Remove earlier TMKO results (TMKO, TMKO_x, TMKO_y, ...) so re-runs do not pile up columns
    stale = [name for name in result.columns if name.startswith('TMKO')]
    if stale:
        result = result.drop(columns=stale)

    try:
        # Strict 'HH:MM' parse; e.g. "16:30" yields hour 16
        kickoff_hour = pd.to_datetime(
            result['Time'].fillna('00:00'), format='%H:%M'
        ).dt.hour
    except ValueError as e:
        print(f"Error converting the 'Time' column: {e}")
        print("Ensure all times are in 'HH:MM' format (e.g., 16:30).")
        print("Problem data (examples):")
        unparseable = pd.to_datetime(result['Time'], format='%H:%M', errors='coerce').isna()
        print(result[unparseable]['Time'].head())
        return df

    result['TMKO'] = np.where(kickoff_hour >= 17, 1, 0)
    return result

def get_strict_rolling_avg(df, stats_map, window_size=3):
    """
    Add per-team rolling averages over the previous `window_size` matches.

    `stats_map` maps a home-side stat column to its away-side counterpart
    (e.g. 'HST' -> 'AST'). For every pair two columns are added:
    'A<home_stat>' for the home team and 'A<away_stat>' for the away team.
    Each value is the mean of that stat over the team's preceding
    `window_size` matches, home or away; the current match is excluded.
    The value is NaN until the team has `window_size` earlier matches.

    Returns a new DataFrame; `df` is not modified.
    """
    home_names = list(stats_map.keys())
    away_names = list(stats_map.values())

    # Long format: one row per (match, team). Away stats are relabelled with
    # the home-side names so each team has a single series per stat.
    home_long = (
        df[['match_id', 'Date', 'HomeTeam'] + home_names]
        .rename(columns={'HomeTeam': 'Team'})
        .assign(is_home_match=1)
    )
    away_long = (
        df[['match_id', 'Date', 'AwayTeam'] + away_names]
        .rename(columns={'AwayTeam': 'Team'})
        .rename(columns=dict(zip(away_names, home_names)))
        .assign(is_home_match=0)
    )

    long_df = pd.concat([home_long, away_long], ignore_index=True)
    long_df = long_df.sort_values(by=['Team', 'Date'])

    # Row selectors stay valid in the loop: only columns are added to long_df
    is_home = long_df['is_home_match'] == 1
    is_away = long_df['is_home_match'] == 0

    # Accumulators keyed by match_id, one per side
    home_wide = long_df.loc[is_home, ['match_id', 'Team']].rename(columns={'Team': 'HomeTeam'})
    away_wide = long_df.loc[is_away, ['match_id', 'Team']].rename(columns={'Team': 'AwayTeam'})

    for home_stat, away_stat in stats_map.items():
        # Rolling mean that still includes the current match ...
        windowed = long_df.groupby('Team')[home_stat].rolling(window=window_size).mean()
        long_df['TEMP_AVG'] = windowed.reset_index(level=0, drop=True)

        # ... shifted one match back within each team, so the current match drops out
        long_df['PREV_AVG'] = long_df.groupby('Team')['TEMP_AVG'].shift(1)

        home_values = long_df.loc[is_home, ['match_id', 'PREV_AVG']].rename(
            columns={'PREV_AVG': f'A{home_stat}'}
        )
        home_wide = home_wide.merge(home_values, on='match_id', how='left')

        away_values = long_df.loc[is_away, ['match_id', 'PREV_AVG']].rename(
            columns={'PREV_AVG': f'A{away_stat}'}
        )
        away_wide = away_wide.merge(away_values, on='match_id', how='left')

    # Attach all home-side columns first, then all away-side columns
    return (
        df.merge(home_wide, on=['match_id', 'HomeTeam'], how='left')
          .merge(away_wide, on=['match_id', 'AwayTeam'], how='left')
    )


def get_previous_game_form(df):
    """
    Add the outcome of each team's previous match (home or away).

    New columns: 'PHFR_Won' / 'PHFR_NotWin' for the home team and
    'PAFR_Won' / 'PAFR_NotWin' for the away team. Each is 1 or 0, or NaN
    when the team has no earlier match in `df`.

    Returns a new DataFrame; `df` is not modified.
    """

    def _team_view(team_col, winning_result, home_flag):
        # One row per match from this side's perspective, with win / not-win flags
        view = df[['match_id', 'Date', team_col, 'FTR']].rename(columns={team_col: 'Team'})
        return view.assign(
            is_home_match=home_flag,
            Won=(view['FTR'] == winning_result).astype(int),
            NotWin=(view['FTR'] != winning_result).astype(int),
        )

    # Chronological match list per team
    long_df = pd.concat(
        [_team_view('HomeTeam', 'H', 1), _team_view('AwayTeam', 'A', 0)],
        ignore_index=True,
    ).sort_values(by=['Team', 'Date'])

    # Outcome of the match before the current one, per team
    long_df['P_Won'] = long_df.groupby('Team')['Won'].shift(1)
    long_df['P_NotWin'] = long_df.groupby('Team')['NotWin'].shift(1)

    # Back to one row per match: home side first, then away side
    result = df.copy()
    for team_col, home_flag, prefix in (('HomeTeam', 1, 'PHFR'), ('AwayTeam', 0, 'PAFR')):
        won_col, not_win_col = f'{prefix}_Won', f'{prefix}_NotWin'
        side = long_df[long_df['is_home_match'] == home_flag].rename(
            columns={'Team': team_col, 'P_Won': won_col, 'P_NotWin': not_win_col}
        )
        result = result.merge(
            side[['match_id', team_col, won_col, not_win_col]],
            on=['match_id', team_col],
            how='left',
        )

    return result


def get_last_match_overall(df, stats_map):
    """
    Add each team's stats from its previous match, whatever the venue.

    `stats_map` maps a home-side stat column to its away-side counterpart
    (e.g. 'HST' -> 'AST'). For every pair the home team receives
    'P<home_stat>' and the away team 'P<away_stat>'. Values are NaN when
    the team has no earlier match in `df`.

    Returns a new DataFrame; `df` is not modified.
    """
    home_names = list(stats_map.keys())
    away_names = list(stats_map.values())

    # Long format: one row per (match, team), away stats relabelled to the home-side names
    home_long = (
        df[['match_id', 'Date', 'HomeTeam'] + home_names]
        .rename(columns={'HomeTeam': 'Team'})
        .assign(is_home_match=1)
    )
    away_long = (
        df[['match_id', 'Date', 'AwayTeam'] + away_names]
        .rename(columns={'AwayTeam': 'Team'})
        .rename(columns=dict(zip(away_names, home_names)))
        .assign(is_home_match=0)
    )

    long_df = pd.concat([home_long, away_long], ignore_index=True)
    long_df = long_df.sort_values(by=['Team', 'Date'])

    # Previous-match value of every stat, per team
    shifted = pd.DataFrame(
        {f'P_{stat}': long_df.groupby('Team')[stat].shift(1) for stat in home_names}
    )
    long_df = pd.concat([long_df, shifted], axis=1)

    # Target names per side: 'P_HST' -> 'PHST' (home) and 'P_HST' -> 'PAST' (away)
    home_targets = {f'P_{stat}': f'P{stat}' for stat in home_names}
    away_targets = {f'P_{h_stat}': f'P{a_stat}' for h_stat, a_stat in stats_map.items()}

    result = df.copy()
    for team_col, home_flag, targets in (
        ('HomeTeam', 1, home_targets),
        ('AwayTeam', 0, away_targets),
    ):
        side = (
            long_df[long_df['is_home_match'] == home_flag]
            .rename(columns={'Team': team_col})
            .rename(columns=targets)
        )
        wanted = ['match_id', team_col] + list(targets.values())
        result = result.merge(side[wanted], on=['match_id', team_col], how='left')

    return result



def get_time_between_games(df):
    """
    Add the number of days since each team's previous match.

    New columns: 'TBGH' (home team) and 'TBGA' (away team). The value is
    NaN when the team has no earlier match in `df`. Existing 'TBGH'/'TBGA'
    columns, including '_x'/'_y' merge-suffixed variants, are dropped first.

    Returns a new DataFrame; `df` is not modified.
    """
    result = df.copy()

    # Step 0: remove results of an earlier run
    targets = ['TBGH', 'TBGA']
    stale = [
        name for name in result.columns
        if name.split('_x')[0].split('_y')[0] in targets
    ]
    if stale:
        result = result.drop(columns=list(set(stale)))

    def _side(team_col, home_flag):
        # One row per match for the given side, with a neutral 'Team' column
        return (
            result[['match_id', 'Date', team_col]]
            .rename(columns={team_col: 'Team'})
            .assign(is_home_match=home_flag)
        )

    # Steps 1 + 2: chronological match list per team
    long_df = pd.concat([_side('HomeTeam', 1), _side('AwayTeam', 0)], ignore_index=True)
    long_df = long_df.sort_values(by=['Team', 'Date'])

    # Step 3: date of the previous match, then the gap in whole days
    long_df['Prev_Date'] = long_df.groupby('Team')['Date'].shift(1)
    for date_col in ('Date', 'Prev_Date'):
        # Both operands must be datetimes before subtracting
        long_df[date_col] = pd.to_datetime(long_df[date_col], errors='coerce')
    long_df['TBG'] = (long_df['Date'] - long_df['Prev_Date']).dt.days

    long_df = long_df.sort_values(by='Date')

    # Step 4: back to one row per match
    for team_col, home_flag, out_col in (('HomeTeam', 1, 'TBGH'), ('AwayTeam', 0, 'TBGA')):
        side = long_df[long_df['is_home_match'] == home_flag].rename(
            columns={'Team': team_col, 'TBG': out_col}
        )
        result = result.merge(
            side[['match_id', team_col, out_col]],
            on=['match_id', team_col],
            how='left',
        )

    return result


def get_number_of_wins(df, window_size=3):
    """
    Add the number of wins in each team's previous `window_size` matches.

    New columns: 'NOWH' (home team) and 'NOWA' (away team). Matches count
    regardless of venue and the current match is excluded. The value is NaN
    until the team has `window_size` earlier matches. Existing 'NOWH'/'NOWA'
    columns, including '_x'/'_y' merge-suffixed variants, are dropped first.

    Returns a new DataFrame; `df` is not modified.
    """
    result = df.copy()

    # Remove results of an earlier run
    targets = ['NOWH', 'NOWA']
    stale = [
        name for name in result.columns
        if name.split('_x')[0].split('_y')[0] in targets
    ]
    if stale:
        result = result.drop(columns=list(set(stale)))

    def _side(team_col, winning_result, home_flag):
        # One row per match for the given side; Won is 1 for a win of that side, else 0
        view = result[['match_id', 'Date', team_col, 'FTR']].rename(columns={team_col: 'Team'})
        return view.assign(
            is_home_match=home_flag,
            Won=(view['FTR'] == winning_result).astype(int),
        )

    # Chronological match list per team
    long_df = pd.concat(
        [_side('HomeTeam', 'H', 1), _side('AwayTeam', 'A', 0)],
        ignore_index=True,
    )
    long_df = long_df.sort_values(by=['Team', 'Date'])

    # Rolling win count including the current match ...
    win_window = long_df.groupby('Team')['Won'].rolling(window=window_size).sum()
    long_df['TEMP_NOW'] = win_window.reset_index(level=0, drop=True)

    # ... shifted one match back so only earlier matches are counted
    long_df['NOW'] = long_df.groupby('Team')['TEMP_NOW'].shift(1)

    long_df = long_df.sort_values(by='Date')

    # Back to one row per match
    for team_col, home_flag, out_col in (('HomeTeam', 1, 'NOWH'), ('AwayTeam', 0, 'NOWA')):
        side = long_df[long_df['is_home_match'] == home_flag].rename(
            columns={'Team': team_col, 'NOW': out_col}
        )
        result = result.merge(
            side[['match_id', team_col, out_col]],
            on=['match_id', team_col],
            how='left',
        )

    # Safety net against merge-suffixed leftovers
    return result.drop(columns=['NOWH_x', 'NOWA_x', 'NOWH_y', 'NOWA_y'], errors='ignore')


def get_avg_time_between_games(df, window_size=3, first_match_gap_days=110):
    """
    Add the average rest time (in days) over each team's previous matches.

    For every team the gap in days to its previous match is computed.
    'ATBGH' (home team) and 'ATBGA' (away team) hold the mean of these gaps
    over the team's preceding `window_size` matches; the current match's own
    gap is excluded. The value is NaN until the team has `window_size`
    earlier matches.

    Parameters
    ----------
    df : DataFrame with 'match_id', 'Date', 'HomeTeam' and 'AwayTeam'.
    window_size : number of previous matches averaged.
    first_match_gap_days : gap assigned where no gap can be computed, i.e.
        for a team's first match in `df` (also for unparseable dates).
        Defaults to 110, the value that used to be hard-coded.

    Returns a new DataFrame; `df` is not modified.
    """

    df_out = df.copy()

    # Remove results of an earlier run, including '_x'/'_y' merge-suffixed variants
    base_cols = ('ATBGH', 'ATBGA')
    stale_cols = [
        col for col in df_out.columns
        if col.split('_x')[0].split('_y')[0] in base_cols
    ]
    if stale_cols:
        df_out = df_out.drop(columns=stale_cols)

    # 1. One row per (match, team)
    home_df = df_out[['match_id', 'Date', 'HomeTeam']].rename(columns={'HomeTeam': 'Team'})
    home_df = home_df.assign(is_home_match=1)

    away_df = df_out[['match_id', 'Date', 'AwayTeam']].rename(columns={'AwayTeam': 'Team'})
    away_df = away_df.assign(is_home_match=0)

    # 2. Chronological match list per team
    all_stats_df = pd.concat([home_df, away_df], ignore_index=True)
    all_stats_df = all_stats_df.sort_values(by=['Team', 'Date'])

    # 3. Gap to the previous match; missing gaps get the configured default
    all_stats_df['Prev_Date'] = all_stats_df.groupby('Team')['Date'].shift(1)

    all_stats_df['Date'] = pd.to_datetime(all_stats_df['Date'], errors='coerce')
    all_stats_df['Prev_Date'] = pd.to_datetime(all_stats_df['Prev_Date'], errors='coerce')

    all_stats_df['TBG'] = (
        (all_stats_df['Date'] - all_stats_df['Prev_Date']).dt.days.fillna(first_match_gap_days)
    )

    # 4. Rolling mean including the current match, then shifted one match back
    rolling_avg_series = all_stats_df.groupby('Team')['TBG'] \
                                     .rolling(window=window_size) \
                                     .mean()
    rolling_avg_series = rolling_avg_series.reset_index(level=0, drop=True)

    all_stats_df['TEMP_ATBG'] = rolling_avg_series
    all_stats_df['ATBG'] = all_stats_df.groupby('Team')['TEMP_ATBG'].shift(1)

    # 5. Restore chronological order
    all_stats_df = all_stats_df.sort_values(by='Date')

    # 6. Back to one row per match
    home_prev_data = all_stats_df[all_stats_df['is_home_match'] == 1].rename(
        columns={'Team': 'HomeTeam', 'ATBG': 'ATBGH'}
    )
    df_out = df_out.merge(
        home_prev_data[['match_id', 'HomeTeam', 'ATBGH']],
        on=['match_id', 'HomeTeam'],
        how='left'
    )

    away_prev_data = all_stats_df[all_stats_df['is_home_match'] == 0].rename(
        columns={'Team': 'AwayTeam', 'ATBG': 'ATBGA'}
    )
    df_out = df_out.merge(
        away_prev_data[['match_id', 'AwayTeam', 'ATBGA']],
        on=['match_id', 'AwayTeam'],
        how='left'
    )

    # Kept from the original: strips merge-suffixed TBGH/TBGA leftovers if present
    df_out = df_out.drop(columns=['TBGH_x', 'TBGA_x', 'TBGH_y', 'TBGA_y'], errors='ignore')

    return df_out

Pipeline to create the DataFrame

In [None]:
def feature_engineering_pipeline(input_df, window_size=3):
    """
    Run all feature engineering steps in sequence on a copy of `input_df`.

    Steps, in order: kickoff-time class (TMKO), rolling stat averages,
    previous-match form, previous-match stats, days since the last match
    (TBGH/TBGA), wins in the last `window_size` matches (NOWH/NOWA) and
    average days between matches (ATBGH/ATBGA).

    `window_size` is forwarded to the rolling average, win count and
    average rest time steps.

    Returns the DataFrame produced by the last step.
    """

    print(f"Starting pipeline with {input_df.shape[0]} matches...")
    df = input_df.copy()

    # --- 1. Define Stat-Mapping ---
    # (Home Stat : Away Stat)
    avg_stats_map = {
        'FTHG': 'FTAG',
        'HS': 'AS',
        'HST': 'AST',
        'HC': 'AC',
        'HF': 'AF',
        'HY': 'AY',
        'HR': 'AR'
    }

    # --- 2. Call functions sequentially ---

    print("Classifying kickoff time (TMKO)...")
    df = classify_kickoff_time(df)

    print("Calculating rolling average...")
    df = get_strict_rolling_avg(df, avg_stats_map, window_size=window_size)

    print("Calculating form (last match)...")
    df = get_previous_game_form(df)

    print("Calculating last match stats...")
    df = get_last_match_overall(df, avg_stats_map)

    print("Calculating time between games (TBGH, TBGA)...")
    df = get_time_between_games(df)

    # Report the window actually used instead of a hard-coded "3"
    print(f"Calculating number of wins in the last {window_size} games (NOWH, NOWA)")
    df = get_number_of_wins(df, window_size=window_size)

    print("claculation avarage time between games")
    df = get_avg_time_between_games(df, window_size=window_size)

    print("Pipeline finished.")
    return df

In [None]:
# --- 4. EXECUTE PIPELINE ---

# Run every feature step on the weather-encoded frame
features_df = feature_engineering_pipeline(df_work, window_size=3)

# --- 5. CHECK RESULT ---

print("\n--- Result DataFrame Info ---")
features_df.info()

# One or two columns from each feature step, next to the raw inputs they derive from
example_cols = (
    ['Time', 'TMKO']               # classify_kickoff_time
    + ['HomeTeam', 'AwayTeam']
    + ['HST', 'AST']               # original stats
    + ['AHST', 'AAST']             # get_strict_rolling_avg
    + ['PHFR_Won', 'PAFR_Won']     # get_previous_game_form
    + ['PHST', 'PAST']             # get_last_match_overall
)

# Rows 100-104, far enough in for the rolling windows to be filled
print("\n--- Example Data (Mid-Season) ---")
mid_season_sample = features_df[example_cols].iloc[100:105]
print(mid_season_sample)

Starting pipeline with 1140 matches...
Classifying kickoff time (TMKO)...
Calculating rolling average...
Calculating form (last match)...
Calculating last match stats...
Calculating time between games (TBGH, TBGA)...
Calculating number of wins in the last 3 games (NOWH, NOWA)
claculation avarage time between games
Pipeline finished.

--- Result DataFrame Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1140 entries, 0 to 1139
Data columns (total 72 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Date                   1140 non-null   object 
 1   Time                   1140 non-null   object 
 2   HomeTeam               1140 non-null   object 
 3   AwayTeam               1140 non-null   object 
 4   FTR                    1140 non-null   object 
 5   FTHG                   1140 non-null   int64  
 6   FTAG                   1140 non-null   int64  
 7   HS                     1140 non-null   int64  
 8   AS

In [None]:
# List every column of the engineered frame
features_df.columns

Index(['Date', 'Time', 'HomeTeam', 'AwayTeam', 'FTR', 'FTHG', 'FTAG', 'HS',
       'AS', 'HST', 'AST', 'HC', 'AC', 'HF', 'AF', 'HY', 'AY', 'HR', 'AR',
       'Start_Temp_C', 'Start_Wind_kmh', 'Start_Wind_Degree', 'Start_Humidity',
       'Start_Precip_mm', 'match_id', 'Weather_clear_sky', 'Weather_cloudy',
       'Weather_fog', 'Weather_heavy_rain', 'Weather_moderate_rain',
       'Weather_rain', 'Weather_snow', 'Weather_thunderstorm', 'TMKO', 'AFTHG',
       'AHS', 'AHST', 'AHC', 'AHF', 'AHY', 'AHR', 'AFTAG', 'AAS', 'AAST',
       'AAC', 'AAF', 'AAY', 'AAR', 'PHFR_Won', 'PHFR_NotWin', 'PAFR_Won',
       'PAFR_NotWin', 'PFTHG', 'PHS', 'PHST', 'PHC', 'PHF', 'PHY', 'PHR',
       'PFTAG', 'PAS', 'PAST', 'PAC', 'PAF', 'PAY', 'PAR', 'TBGH', 'TBGA',
       'NOWH', 'NOWA', 'ATBGH', 'ATBGA'],
      dtype='object')

# Using features:
'Start_Temp_C', 'Start_Wind_kmh', 'Start_Wind_Degree', 'Start_Humidity',
       'Start_Precip_mm', 'match_id', 'Weather_clear_sky', 'Weather_cloudy',
       'Weather_fog', 'Weather_heavy_rain', 'Weather_moderate_rain',
       'Weather_rain', 'Weather_snow', 'Weather_thunderstorm',
            'TMKO', 'PHFR_Won', 'PAFR_Won', 'PFTHG', 'PFTAG', 'PHS', 'PAS', 'PHST', 'PAST',
            'PHC', 'PAC', 'PHF', 'PAF', 'PHY', 'PAY', 'PHR', 'PAR', 'TBGH', 'TBGA', 'ATBGH', 'ATBGA',
            'AFTHG', 'AFTAG', 'AHS', 'AAS', 'AHST', 'AAST', 'AHC', 'AAC', 'AHF', 'AAF', 'AHY', 'AAY',
            'AHR', 'AAR', 'NOWH', 'NOWA'




---
Without weather:

'TMKO', 'PHFR_Won', 'PAFR_Won', 'PFTHG', 'PFTAG', 'PHS', 'PAS', 'PHST', 'PAST',
            'PHC', 'PAC', 'PHF', 'PAF', 'PHY', 'PAY', 'PHR', 'PAR', 'TBGH', 'TBGA', 'ATBGH', 'ATBGA',
            'AFTHG', 'AFTAG', 'AHS', 'AAS', 'AHST', 'AAST', 'AHC', 'AAC', 'AHF', 'AAF', 'AHY', 'AAY',
            'AHR', 'AAR', 'NOWH', 'NOWA'

In [None]:
# Model input columns. The order is kept as-is because it defines the column order of X.
features = (
    # Weather measurements at kickoff
    ['Start_Temp_C', 'Start_Wind_kmh', 'Start_Wind_Degree', 'Start_Humidity', 'Start_Precip_mm']
    # One-hot weather categories
    + ['Weather_clear_sky', 'Weather_cloudy', 'Weather_fog', 'Weather_heavy_rain',
       'Weather_moderate_rain', 'Weather_rain', 'Weather_snow', 'Weather_thunderstorm']
    # Kickoff-time class
    + ['TMKO']
    # Result of the previous match (home / away team)
    + ['PHFR_Won', 'PAFR_Won']
    # Stats of the previous match (home / away team)
    + ['PFTHG', 'PFTAG', 'PHS', 'PAS', 'PHST', 'PAST', 'PHC', 'PAC',
       'PHF', 'PAF', 'PHY', 'PAY', 'PHR', 'PAR']
    # Days since the last match, and their rolling average
    + ['TBGH', 'TBGA', 'ATBGH', 'ATBGA']
    # Rolling averages of match stats (home / away team)
    + ['AFTHG', 'AFTAG', 'AHS', 'AAS', 'AHST', 'AAST', 'AHC', 'AAC',
       'AHF', 'AAF', 'AHY', 'AAY', 'AHR', 'AAR']
    # Wins in the last matches
    + ['NOWH', 'NOWA']
)

# Training, Testing

In [None]:
# 1. Keep only matches where every column is available
#    (rolling features are NaN for each team's first matches)
df_clean = features_df.dropna().copy()

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# 1. check format
df_clean['Date'] = pd.to_datetime(df_clean['Date'])

# 2. Create a target variable (1 = home win, 0 = no home win)
df_clean['HomeWin_Binary'] = (df_clean['FTR'] == 'H').astype(int)

# 3. Chronological split (time separation)
split_date = '2021-08-12'

# Everything on or before split_date (August 12, 2021) is training.
train_df = df_clean[df_clean['Date'] <= split_date].copy()

# Everything after is testing
test_df = df_clean[df_clean['Date'] > split_date].copy()

print(f"Training till {split_date}: {len(train_df)} games")
print(f"Testing since {split_date}:       {len(test_df)} games")

# 4. Assign features and targets
X_train = train_df[features]
y_train = train_df['HomeWin_Binary']

# The first test matches are skipped for consistency with the paper.
# X and y are cut from the same frame so their rows cannot drift apart.
# No extra dropna is needed because df_clean contains no missing values.
n_test_rows_skipped = 27
eval_df = test_df.iloc[n_test_rows_skipped:]
X_test = eval_df[features]
y_test = eval_df['HomeWin_Binary']

X_train.to_csv("X_train.csv",index=False)
df_clean.to_csv("df_clean.csv", index=False)

Training till 2021-08-12: 722 games
Testing since 2021-08-12:       377 games


In [None]:
# Shapes of the four model inputs: X_train, y_train, X_test, y_test
tuple(part.shape for part in (X_train, y_train, X_test, y_test))

((722, 50), (722,), (350, 50), (350,))

# Fitting the models

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, matthews_corrcoef, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn import svm
from lightgbm import LGBMClassifier
from sklearn.ensemble import VotingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression

In [None]:
def train_and_evaluate(model, X_train, y_train, X_test, y_test, model_name="Modell"):
    """
    Fit `model`, predict on the test data and print a metrics report.

    Printed: confusion-matrix counts (TP/TN/FP/FN), accuracy, precision,
    recall, F1 and MCC for the positive class 1 (home win), followed by
    scikit-learn's classification report.

    Parameters
    ----------
    model : estimator with `fit` and `predict`.
    X_train, y_train : training features and binary (0/1) target.
    X_test, y_test : evaluation features and binary (0/1) target.
    model_name : label used in the report header.

    Returns the fitted `model`.
    """
    # 1. Training
    model.fit(X_train, y_train)

    # 2. Prediction
    y_pred = model.predict(X_test)

    # 3. Calculate metrics
    # Pinning both labels keeps the matrix 2x2 even if only one class occurs in
    # y_test and y_pred; otherwise the four-way unpacking below fails.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, zero_division=0)
    rec = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    mcc = matthews_corrcoef(y_test, y_pred)

    # 4. Report
    print(f"\n{'='*10} Results: {model_name} {'='*10}")
    print(f"TP (True Home victory): {tp}")
    print(f"TN (Home loss detected): {tn}")
    print(f"FP (False alarm):          {fp}")
    print(f"FN (Missed win):           {fn}")
    print("-" * 30)
    print(f"Accuracy:  {acc:.4f}")
    print(f"Precision: {prec:.4f} ")
    print(f"Recall:    {rec:.4f}")
    print(f"F1-Score:  {f1:.4f}")
    print(f"MCC:       {mcc:.4f}")
    print("\n=== Classification Report ===")
    print(classification_report(y_test, y_pred))

    return model

In [None]:
# Hyper-parameter sets per model family.
# NOTE(review): presumably the results of a parameter search; the visible cells do not
# reference these dicts - confirm they are still needed.

# Logistic regression
param_lr = dict(C=1, max_iter=10000, penalty='l2', solver='liblinear', tol=0.0001)

# Random forest
param_rf = dict(
    max_depth=5,
    max_features=0.8,
    max_samples=1.0,
    min_samples_leaf=5,
    min_samples_split=2,
    n_estimators=50,
)

# Support vector machine
param_svm = dict(C=1, class_weight='balanced', gamma=0.01, kernel='rbf', tol=0.001)

# XGBoost ('lambda' is a Python keyword, hence the literal form)
param_xgb = {
    'colsample_bytree': 1.0,
    'lambda': 10,
    'learning_rate': 0.01,
    'max_depth': 5,
    'n_estimators': 150,
    'subsample': 0.8,
}

# Gradient boosting
param_gbm = dict(
    max_depth=4,
    num_leaves=42,
    learning_rate=0.13867651282378762,
    min_child_samples=57,
    subsample=0.6231794015571818,
    colsample_bytree=0.6335058814681457,
    lambda_l1=8.071956365090848,
    lambda_l2=2.8873424651371224,
)


In [None]:
# Model definitions: display name -> unfitted estimator.
# Scale-sensitive models (logistic regression, SVM) are wrapped in a pipeline with a StandardScaler.
# NOTE(review): hyper-parameters are typed in by hand and differ from the param_* dicts:
#   - param_lr lists max_iter=10000, which is not passed to LogisticRegression here;
#   - LightGBM uses max_depth=-4 while param_gbm lists max_depth=4 (LightGBM documents
#     values <= 0 as "no limit"), and lambda_l1 is not set.
#   Confirm these are intended before changing them; the recorded results depend on them.
models = {
    "Logistic Regression": Pipeline(
        steps=[
            ('scaler', StandardScaler()),
            ('lr', LogisticRegression(penalty="l2", C=1.0, solver='liblinear', tol=0.0001)),
        ]
    ),
    "Random Forest": RandomForestClassifier(
        n_estimators=50,
        max_depth=5,
        min_samples_leaf=5,
        min_samples_split=2,
        max_features=0.8,
        max_samples=1.0,
        random_state=42,
    ),
    "SVM": Pipeline(
        steps=[
            ('scaler', StandardScaler()),
            ('svm', svm.SVC(kernel='rbf', C=1, gamma=0.01, class_weight="balanced",
                            tol=0.001, probability=True)),
        ]
    ),
    "XGBoost": XGBClassifier(
        n_estimators=150,
        max_depth=5,
        learning_rate=0.01,
        subsample=0.8,
        colsample_bytree=1.0,
        reg_lambda=10,
        random_state=42,
    ),
    "Gradient Boosting (GBM)": GradientBoostingClassifier(
        learning_rate=0.1387,
        max_depth=4,
        subsample=0.623,
        random_state=42,
    ),
    "LightGBM": LGBMClassifier(
        n_estimators=100,
        num_leaves=42,
        max_depth=-4,
        learning_rate=0.1387,
        min_child_samples=57,
        subsample=0.623,
        colsample_bytree=0.634,
        reg_lambda=2.89,
        random_state=42,
        verbose=-1,
    ),
}

# Fit and report every model; keep the fitted estimators by name
results = {}
for name in models:
    model = models[name]
    print(f"Trainiere {name}...")
    trained_model = train_and_evaluate(
        model, X_train, y_train, X_test, y_test, model_name=name
    )
    results[name] = trained_model

Trainiere Logistic Regression...

TP (True Home victory): 60
TN (Home loss detected): 155
FP (False alarm):          46
FN (Missed win):           89
------------------------------
Accuracy:  0.6143
Precision: 0.5660 
Recall:    0.4027
F1-Score:  0.4706
MCC:       0.1871

=== Classification Report ===
              precision    recall  f1-score   support

           0       0.64      0.77      0.70       201
           1       0.57      0.40      0.47       149

    accuracy                           0.61       350
   macro avg       0.60      0.59      0.58       350
weighted avg       0.61      0.61      0.60       350

Trainiere Random Forest...

TP (True Home victory): 51
TN (Home loss detected): 161
FP (False alarm):          40
FN (Missed win):           98
------------------------------
Accuracy:  0.6057
Precision: 0.5604 
Recall:    0.3423
F1-Score:  0.4250
MCC:       0.1615

=== Classification Report ===
              precision    recall  f1-score   support

           0      

# Ensemble

In [None]:
# Group 1: base learners for the "all" ensembles (every model except LightGBM)
estimators_all = [
    (alias, models[label])
    for alias, label in (
        ('lr', 'Logistic Regression'),
        ('rf', 'Random Forest'),
        ('svm', 'SVM'),
        ('xgb', 'XGBoost'),
        ('gbm', 'Gradient Boosting (GBM)'),
    )
]

# Group 2: reduced set (logistic regression, SVM, LightGBM)
estimators_subset = [
    (alias, models[label])
    for alias, label in (
        ('lr', 'Logistic Regression'),
        ('svm', 'SVM'),
        ('lgbm', 'LightGBM'),
    )
]

In [None]:
# --- Scenario 1: all base learners ---

# Stacking: a logistic regression combines the base learners' 5-fold cross-validated predictions.
# An RBF SVC (C=1, class_weight="balanced", gamma=0.01, probability=True) was tried
# as an alternative final estimator.
stack_all = StackingClassifier(
    cv=5,
    final_estimator=LogisticRegression(),
    estimators=estimators_all,
)

# Voting: majority vote over the predicted labels
vote_all = VotingClassifier(voting='hard', estimators=estimators_all)


# --- Scenario 2: subset (LR, SVM, LightGBM) ---

# Stacking, same setup as above on the reduced estimator set
stack_subset = StackingClassifier(
    cv=5,
    final_estimator=LogisticRegression(),
    estimators=estimators_subset,
)

# Voting
vote_subset = VotingClassifier(voting='hard', estimators=estimators_subset)


The choice of final estimator makes a big difference for stacking: with an SVM as the final estimator, only home wins are predicted.

In [None]:
# Display name -> ensemble estimator, in reporting order
ensemble_models = dict(
    zip(
        ["Stacking (All)", "Voting (All)", "Stacking (Subset)", "Voting (Subset)"],
        [stack_all, vote_all, stack_subset, vote_subset],
    )
)

# Register the ensembles next to the base models (updates `models` in place)
models.update(ensemble_models)

# Results

In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef

# 1. One metrics record per model
results = []

print("Start training of models...")

for name, model in models.items():
    # Training
    model.fit(X_train, y_train)

    # Prediction
    y_pred = model.predict(X_test)

    # Calculate metrics for the positive class (1 = home win).
    # zero_division=0 matches train_and_evaluate: a model that never predicts a
    # home win scores 0 without emitting an undefined-metric warning.
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, pos_label=1, zero_division=0)
    rec = recall_score(y_test, y_pred, pos_label=1, zero_division=0)
    f1 = f1_score(y_test, y_pred, pos_label=1, zero_division=0)
    mcc = matthews_corrcoef(y_test, y_pred)

    # Save results
    results.append({
        "Model": name,
        "Accuracy": acc,
        "Precision": prec,
        "Recall": rec,
        "F1-Score": f1,
        "MCC": mcc
    })

# 2. Table with one row per model, rounded to four decimals
df_results = pd.DataFrame(results)
df_results = df_results.round(4)

# 3. Metrics as rows and models as columns for the exported table
transposed = df_results.transpose()
transposed.to_csv("results_paper_with_weather_lin_estimator.csv", index=True)

# show table
display(transposed)

Start training of models...


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Model,Logistic Regression,Random Forest,SVM,XGBoost,Gradient Boosting (GBM),LightGBM,Stacking (All),Voting (All),Stacking (Subset),Voting (Subset)
Accuracy,0.6143,0.6057,0.6171,0.6114,0.5943,0.6314,0.5743,0.6086,0.5714,0.6343
Precision,0.566,0.5604,0.5517,0.6,0.5276,0.582,0.5,0.5682,0.4,0.5827
Recall,0.4027,0.3423,0.5369,0.2617,0.4497,0.4765,0.0067,0.3356,0.0134,0.4966
F1-Score,0.4706,0.425,0.5442,0.3645,0.4855,0.524,0.0132,0.4219,0.026,0.5362
MCC,0.1871,0.1615,0.2143,0.1683,0.1554,0.2312,0.0114,0.167,-0.0063,0.2396
