# Euro 2024 Prediction

Import the libraries used throughout this project.

In [225]:
import pandas as pd
import numpy as np

Define the Euro 2024 groups, load the summary of past Euro finals into one DataFrame, and then combine every Euro match from 1960 to 2020 into a second DataFrame.

In [226]:
# Euro 2024 group-stage draw: maps each group label to its four teams.
# Read by print_groups() further down the notebook.
groups = {
    "Group A": ["Germany", "Scotland", "Hungary", "Switzerland"],
    "Group B": ["Spain", "Croatia", "Italy", "Albania"],
    "Group C": ["England", "Denmark", "Serbia", "Slovenia"],
    "Group D": ["France", "Netherlands", "Austria", "Poland"],
    "Group E": ["Belgium", "Slovakia", "Romania", "Ukraine"],
    "Group F": ["Portugal", "Czechia", "Turkey", "Georgia"]
}

# Per-tournament summary of past finals (single CSV)
euro_summary_finals = pd.read_csv('euro_summary_finals.csv')

# Tournament years whose match files are loaded: 1960, 1964, ..., 2020
years = list(map(str, range(1960, 2021, 4)))

# One DataFrame per tournament, read from matches/<year>.csv
matches_list = [pd.read_csv(f'matches/{year}.csv') for year in years]

# Stack every tournament into a single frame with a fresh 0..n-1 index
all_matches = pd.concat(matches_list, ignore_index=True)


Taking a look at the euro_summary_finals dataframe

In [227]:
# Preview the first rows of the finals summary (head() shows 5 rows by default)
euro_summary_finals.head()

Unnamed: 0,year,winner,final,result,matches,goals,red_cards,attendance,attendance_avg
0,1960,USSR,USSR - Yugoslavia,1 - 1,4,17,0,78958.0,19739.5
1,1964,Spain,Spain - USSR,2 - 2,4,13,0,156253.0,39063.25
2,1968,Italy,Italy - Yugoslavia,2 - 2,5,7,1,260936.0,52187.2
3,1972,West Germany,West Germany - USSR,3 - 3,4,10,0,106510.0,26627.5
4,1976,Czechoslovakia,Czechoslovakia - West Germany,2 - 2 (5 - 3),4,19,3,106087.0,26521.75


Taking a look at the all_matches dataframe

In [228]:
# Preview the first rows of the combined 1960-2020 match history
all_matches.head()

Unnamed: 0,home_team,away_team,home_team_code,away_team_code,home_score,away_score,home_penalty,away_penalty,home_score_total,away_score_total,...,penalties_missed,penalties,red_cards,game_referees,stadium_city,stadium_name,stadium_name_media,stadium_name_official,stadium_name_event,stadium_name_sponsor
0,USSR,Yugoslavia,URS,YUG,1.0,1.0,,,2.0,1.0,...,,,,[],Paris,Parc des Princes,Parc des Princes,Parc des Princes,Parc des Princes,Parc des Princes
1,Czechoslovakia,France,TCH,FRA,2.0,0.0,,,2.0,0.0,...,,,,[],Marseille,Stade de Marseille,Stade de Marseille,Stade de Marseille,Stade de Marseille,Orange Vélodrome
2,Czechoslovakia,USSR,TCH,URS,0.0,3.0,,,0.0,3.0,...,,,,[],Marseille,Stade de Marseille,Stade de Marseille,Stade de Marseille,Stade de Marseille,Orange Vélodrome
3,France,Yugoslavia,FRA,YUG,4.0,5.0,,,4.0,5.0,...,,,,[],Paris,Parc des Princes,Parc des Princes,Parc des Princes,Parc des Princes,Parc des Princes
4,Spain,USSR,ESP,URS,2.0,1.0,,,2.0,1.0,...,,,,[],Madrid,Estadio Santiago Bernabéu,Estadio Santiago Bernabéu,Estadio Santiago Bernabéu,Estadio Santiago Bernabéu,


We have to predict the 2024 matches, so let's load those fixtures into their own DataFrame.

In [229]:
# The 2024 fixtures are kept in their own frame; these are the rows the models predict later
matches_2024 = pd.read_csv('matches/2024.csv')
# Preview: the first rows are knockout fixtures with placeholder team names (e.g. W49) and empty scores
matches_2024.head()

Unnamed: 0,home_team,away_team,home_team_code,away_team_code,home_score,away_score,home_penalty,away_penalty,home_score_total,away_score_total,...,penalties_missed,penalties,red_cards,game_referees,stadium_city,stadium_name,stadium_name_media,stadium_name_official,stadium_name_event,stadium_name_sponsor
0,W49,W50,,,,,,,,,...,,,,[],Berlin,Olympiastadion,Olympiastadion,Olympiastadion,Olympiastadion,Olympiastadion
1,W47,W48,,,,,,,,,...,,,,[],Dortmund,BVB Stadion Dortmund,BVB Stadion Dortmund,BVB Stadion Dortmund,BVB Stadion Dortmund,Signal Iduna Park
2,W45,W46,,,,,,,,,...,,,,[],Munich,Munich Football Arena,Football Arena Munich,Fußball Arena München,Munich Football Arena,Allianz Arena
3,W43,W44,,,,,,,,,...,,,,[],Berlin,Olympiastadion,Olympiastadion,Olympiastadion,Olympiastadion,Olympiastadion
4,W40,W38,,,,,,,,,...,,,,[],Dusseldorf,Düsseldorf Arena,Düsseldorf Arena,Düsseldorf Arena,Düsseldorf Arena,ESPRIT arena


In [230]:
# Data Preprocessing

# Parse the 'date' column of the historical frame, then of the 2024 fixtures,
# skipping any frame that has no such column.
for frame in (all_matches, matches_2024):
    if 'date' in frame.columns:
        frame['date'] = pd.to_datetime(frame['date'])

# Score columns that have to be numeric for the comparisons and models below
numeric_columns = ['home_score', 'away_score']

# Coerce the score columns, again only for frames that carry both of them
for frame in (all_matches, matches_2024):
    if all(col in frame.columns for col in numeric_columns):
        frame[numeric_columns] = frame[numeric_columns].apply(pd.to_numeric)



In [231]:
# Feature Engineering
# Binary outcome flags for every historical match. A row with a missing score
# compares as False everywhere, so it contributes 0 to all three flags.
all_matches['home_win'] = (all_matches['home_score'] > all_matches['away_score']).astype(int)
all_matches['away_win'] = (all_matches['away_score'] > all_matches['home_score']).astype(int)
all_matches['draw'] = (all_matches['home_score'] == all_matches['away_score']).astype(int)

# Per-team record when listed as the home side. Named aggregation ties each
# output column to its source column, instead of renaming columns by position.
# NOTE(review): these aggregates are computed over every row of all_matches and
# are merged back onto those same rows below, so each match's features include
# its own result.
home_performance = (
    all_matches.groupby('home_team')
    .agg(
        avg_home_score=('home_score', 'mean'),
        total_home_score=('home_score', 'sum'),
        avg_home_conceded=('away_score', 'mean'),
        total_home_conceded=('away_score', 'sum'),
        home_wins=('home_win', 'sum'),
        home_losses=('away_win', 'sum'),   # the home side loses when the away side wins
        home_draws=('draw', 'sum'),
    )
    .reset_index()
)

# Per-team record when listed as the away side.
away_performance = (
    all_matches.groupby('away_team')
    .agg(
        avg_away_score=('away_score', 'mean'),
        total_away_score=('away_score', 'sum'),
        avg_away_conceded=('home_score', 'mean'),
        total_away_conceded=('home_score', 'sum'),
        away_losses=('home_win', 'sum'),   # the away side loses when the home side wins
        away_wins=('away_win', 'sum'),
        away_draws=('draw', 'sum'),
    )
    .reset_index()
)

# Names of every engineered column (everything except the two merge keys).
engineered_columns = [
    col
    for col in list(home_performance.columns) + list(away_performance.columns)
    if col not in ('home_team', 'away_team')
]

# Remove engineered columns left over from an earlier run of this cell, so that
# re-running it does not produce *_x / *_y duplicates in the merges below.
all_matches = all_matches.drop(columns=engineered_columns, errors='ignore')
matches_2024 = matches_2024.drop(columns=engineered_columns, errors='ignore')

# Merge historical performance into the main dataset
all_matches = all_matches.merge(home_performance, how='left', on='home_team')
all_matches = all_matches.merge(away_performance, how='left', on='away_team')

# Apply the same feature engineering to the 2024 fixtures
matches_2024 = matches_2024.merge(home_performance, how='left', on='home_team')
matches_2024 = matches_2024.merge(away_performance, how='left', on='away_team')

# A team with no historical record has no aggregates after the left merge.
# Give it explicit zeros *before* any forward fill, otherwise it would inherit
# the statistics of whichever team appears in the previous row.
all_matches[engineered_columns] = all_matches[engineered_columns].fillna(0)
matches_2024[engineered_columns] = matches_2024[engineered_columns].fillna(0)

# Remaining raw columns keep the previous fill policy (forward fill, then 0).
# DataFrame.ffill() replaces the deprecated fillna(method='ffill').
all_matches = all_matches.ffill().fillna(0)
matches_2024 = matches_2024.ffill().fillna(0)

*Model Training and Prediction*

In [232]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Select features and targets.
# The features are the 14 per-team aggregate columns merged in by the
# feature-engineering cell (7 for the home side, 7 for the away side).
features = [
    'avg_home_score', 'total_home_score', 'avg_home_conceded', 'total_home_conceded', 'home_wins', 'home_losses', 'home_draws',
    'avg_away_score', 'total_away_score', 'avg_away_conceded', 'total_away_conceded', 'away_wins', 'away_losses', 'away_draws'
]

X = all_matches[features]
y_home = all_matches['home_score']
y_away = all_matches['away_score']

# Split data into training and testing sets (20% held out, fixed seed).
# A single call splits X and both targets together, so the rows stay aligned.
X_train, X_test, y_home_train, y_home_test, y_away_train, y_away_test = train_test_split(X, y_home, y_away, test_size=0.2, random_state=42)

# Train a model for home scores
home_model = RandomForestRegressor(random_state=42)
home_model.fit(X_train, y_home_train)

# Train a model for away scores (same features, separate regressor)
away_model = RandomForestRegressor(random_state=42)
away_model.fit(X_train, y_away_train)

# Evaluate the models on the held-out rows.
# NOTE(review): the features are aggregates over the full match history,
# which includes the held-out rows, so these errors are likely optimistic.
home_pred = home_model.predict(X_test)
away_pred = away_model.predict(X_test)

home_mse = mean_squared_error(y_home_test, home_pred)
away_mse = mean_squared_error(y_away_test, away_pred)

print(f"Home Score Model MSE: {home_mse}")
print(f"Away Score Model MSE: {away_mse}")

# Predict 2024 scores
X_2024 = matches_2024[features]  # relies on matches_2024 carrying the same feature columns as all_matches

home_scores_2024 = home_model.predict(X_2024)
away_scores_2024 = away_model.predict(X_2024)

# Add predictions to the 2024 matches DataFrame
matches_2024['predicted_home_score'] = home_scores_2024
matches_2024['predicted_away_score'] = away_scores_2024

# Check the predictions.
# NOTE(review): fixtures whose team names are placeholders (e.g. W49, "1st: D")
# presumably match no historical team, which would explain why they all receive
# the same predicted scores - confirm before relying on those rows.
print(matches_2024[['home_team', 'away_team', 'predicted_home_score', 'predicted_away_score']])

Home Score Model MSE: 1.449487939881045
Away Score Model MSE: 1.6122727175120048
      home_team     away_team  predicted_home_score  predicted_away_score
0           W49           W50              0.602000              0.507500
1           W47           W48              0.602000              0.507500
2           W45           W46              0.602000              0.507500
3           W43           W44              0.602000              0.507500
4           W40           W38              0.602000              0.507500
5           W41           W42              0.602000              0.507500
6           W39           W37              0.602000              0.507500
7        1st: D        2nd: F              0.602000              0.507500
8        1st: E  3rd: A/B/C/D              0.602000              0.507500
9        1st: F    3rd: A/B/C              0.602000              0.507500
10       2nd: D        2nd: E              0.602000              0.507500
11       1st: B  3rd: A/D/E/F  

In [233]:
def predict_match_result(home_team, away_team, matches=None):
    """Describe the predicted winner of one fixture.

    Parameters
    ----------
    home_team, away_team : str
        Team names, matched exactly (case-sensitive) against the
        ``home_team`` and ``away_team`` columns.
    matches : pandas.DataFrame, optional
        Fixture table holding the columns ``home_team``, ``away_team``,
        ``predicted_home_score`` and ``predicted_away_score``. When omitted,
        the notebook-level ``matches_2024`` frame is used, which keeps
        existing two-argument calls working unchanged.

    Returns
    -------
    str
        A multi-line summary naming the winner ("Draw" when both predicted
        scores are equal), or a "not found" message when no row matches.
        If several rows match, the first one is used.
    """
    if matches is None:
        # Fall back to the notebook's 2024 fixture table.
        matches = matches_2024

    is_fixture = (matches['home_team'] == home_team) & (matches['away_team'] == away_team)
    fixture = matches[is_fixture]
    if fixture.empty:
        return "Match not found in the 2024 dataset."

    predicted_home_score = fixture['predicted_home_score'].iloc[0]
    predicted_away_score = fixture['predicted_away_score'].iloc[0]

    if predicted_home_score > predicted_away_score:
        winner = home_team
    elif predicted_home_score < predicted_away_score:
        winner = away_team
    else:
        winner = "Draw"
    return f"Game:\n Home team:{home_team} \n Away team: {away_team}\n Winner: {winner}"

In [234]:
# Interactive Terminal Input
while True:
    user_input = input("Enter the match in the format 'HomeTeam vs AwayTeam' (or type 'exit' to quit): ")
    if user_input.lower() == 'exit':
        break
    try:
        home_team, away_team = user_input.split(" vs ")
        result = predict_match_result(home_team.strip(), away_team.strip())
        print(result)
    except ValueError:
        print("Invalid format. Please enter the match in the format 'HomeTeam vs AwayTeam'.")

In [235]:
def print_groups():
    """Print a "Group Stages:" header, then one "<group>: <team>, <team>, ..." line per entry of ``groups``."""
    print("Group Stages:")
    for group in groups:
        teams = groups[group]
        print(f"{group}: {', '.join(teams)}")



In [236]:
# Show the Euro 2024 group draw defined in the `groups` dict
print_groups()

Group Stages:
Group A: Germany, Scotland, Hungary, Switzerland
Group B: Spain, Croatia, Italy, Albania
Group C: England, Denmark, Serbia, Slovenia
Group D: France, Netherlands, Austria, Poland
Group E: Belgium, Slovakia, Romania, Ukraine
Group F: Portugal, Czechia, Turkey, Georgia
