<h3 style="color:darkblue;">Premier League Match Predictor - Machine Learning Approach</h3>

<h4 style="color:darkred;">Import libraries, download, load and a quick lookup of data</h4>

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Download multiple Premier League seasons
seasons = ['2324', '2425', '2526']  # Last 3 seasons
all_data = []

for season in seasons:
    url = f"https://www.football-data.co.uk/mmz4281/{season}/E0.csv"
    try:
        df_season = pd.read_csv(url)
        df_season['Season'] = season
        all_data.append(df_season)
        print(f"Season {season}: {len(df_season)} matches")
    except:
        print(f"Season {season}: Failed to download")

# Combine all seasons
df = pd.concat(all_data, ignore_index=True)
print(f"\nTotal matches downloaded: {len(df)}")

# Save combined data
df.to_csv('combined_seasons.csv', index=False)
print("Combined data saved as 'combined_seasons.csv'")

Season 2324: 380 matches
Season 2425: 380 matches
Season 2526: 70 matches

Total matches downloaded: 830
Combined data saved as 'combined_seasons.csv'


In [3]:
# Load and preview the data
df = pd.read_csv('combined_seasons.csv')
df.head()

Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,...,BMGMCA,BVCH,BVCD,BVCA,CLCH,CLCD,CLCA,LBCH,LBCD,LBCA
0,E0,11/08/2023,20:00,Burnley,Man City,0,3,A,0,2,...,,,,,,,,,,
1,E0,12/08/2023,12:30,Arsenal,Nott'm Forest,2,1,H,2,0,...,,,,,,,,,,
2,E0,12/08/2023,15:00,Bournemouth,West Ham,1,1,D,0,0,...,,,,,,,,,,
3,E0,12/08/2023,15:00,Brighton,Luton,4,1,H,1,0,...,,,,,,,,,,
4,E0,12/08/2023,15:00,Everton,Fulham,0,1,A,0,0,...,,,,,,,,,,


In [4]:
print(f"Dataset shape: {df.shape}")

Dataset shape: (830, 163)


In [5]:
print(f"Dataset Columns: {df.columns.tolist()}")

Dataset Columns: ['Div', 'Date', 'Time', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'HTHG', 'HTAG', 'HTR', 'Referee', 'HS', 'AS', 'HST', 'AST', 'HF', 'AF', 'HC', 'AC', 'HY', 'AY', 'HR', 'AR', 'B365H', 'B365D', 'B365A', 'BWH', 'BWD', 'BWA', 'IWH', 'IWD', 'IWA', 'PSH', 'PSD', 'PSA', 'WHH', 'WHD', 'WHA', 'VCH', 'VCD', 'VCA', 'MaxH', 'MaxD', 'MaxA', 'AvgH', 'AvgD', 'AvgA', 'B365>2.5', 'B365<2.5', 'P>2.5', 'P<2.5', 'Max>2.5', 'Max<2.5', 'Avg>2.5', 'Avg<2.5', 'AHh', 'B365AHH', 'B365AHA', 'PAHH', 'PAHA', 'MaxAHH', 'MaxAHA', 'AvgAHH', 'AvgAHA', 'B365CH', 'B365CD', 'B365CA', 'BWCH', 'BWCD', 'BWCA', 'IWCH', 'IWCD', 'IWCA', 'PSCH', 'PSCD', 'PSCA', 'WHCH', 'WHCD', 'WHCA', 'VCCH', 'VCCD', 'VCCA', 'MaxCH', 'MaxCD', 'MaxCA', 'AvgCH', 'AvgCD', 'AvgCA', 'B365C>2.5', 'B365C<2.5', 'PC>2.5', 'PC<2.5', 'MaxC>2.5', 'MaxC<2.5', 'AvgC>2.5', 'AvgC<2.5', 'AHCh', 'B365CAHH', 'B365CAHA', 'PCAHH', 'PCAHA', 'MaxCAHH', 'MaxCAHA', 'AvgCAHH', 'AvgCAHA', 'Season', 'BFH', 'BFD', 'BFA', '1XBH', '1XBD', '1XBA', 'BFEH'

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 830 entries, 0 to 829
Columns: 163 entries, Div to LBCA
dtypes: float64(138), int64(17), object(8)
memory usage: 1.0+ MB


In [7]:
# Check key columns
print(f"Key columns we'll use:")
print(f"- HomeTeam: {df['HomeTeam'].nunique()} unique teams")
print(f"- AwayTeam: {df['AwayTeam'].nunique()} unique teams")
print(f"- FTHG (Full Time Home Goals): {df['FTHG'].dtype}")
print(f"- FTAG (Full Time Away Goals): {df['FTAG'].dtype}")
print(f"- FTR (Full Time Result): {df['FTR'].value_counts().to_dict()}")

Key columns we'll use:
- HomeTeam: 25 unique teams
- AwayTeam: 25 unique teams
- FTHG (Full Time Home Goals): int64
- FTAG (Full Time Away Goals): int64
- FTR (Full Time Result): {'H': 364, 'A': 273, 'D': 193}


<h4 style="color:darkred;">Data Cleaning & Preparation</h4>

In [8]:
# Select only the needed columns
columns_needed = ['Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR']
df = df[columns_needed]

print("New shape after needed selection:", df.shape)

New shape after needed selection: (830, 6)


In [9]:
# Check for missing values in each column
missing_values = df.isnull().sum()

print("Missing values in each column:")
print(missing_values)

Missing values in each column:
Date        0
HomeTeam    0
AwayTeam    0
FTHG        0
FTAG        0
FTR         0
dtype: int64


In [10]:
# Convert Date to datetime
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y', errors='coerce')

# Sort by date
df = df.sort_values('Date').reset_index(drop=True)

print("New Data preview:")
print(df.head(10))

New Data preview:
        Date          HomeTeam        AwayTeam  FTHG  FTAG FTR
0 2023-08-11           Burnley        Man City     0     3   A
1 2023-08-12           Arsenal   Nott'm Forest     2     1   H
2 2023-08-12       Bournemouth        West Ham     1     1   D
3 2023-08-12          Brighton           Luton     4     1   H
4 2023-08-12           Everton          Fulham     0     1   A
5 2023-08-12  Sheffield United  Crystal Palace     0     1   A
6 2023-08-12         Newcastle     Aston Villa     5     1   H
7 2023-08-13         Brentford       Tottenham     2     2   D
8 2023-08-13           Chelsea       Liverpool     1     1   D
9 2023-08-14        Man United          Wolves     1     0   H


In [11]:
print("Data types:")
print(df.dtypes)

Data types:
Date        datetime64[ns]
HomeTeam            object
AwayTeam            object
FTHG                 int64
FTAG                 int64
FTR                 object
dtype: object


In [12]:
print("Full Time Result Distribution:")
print(df['FTR'].value_counts())

print("\nDate range:")
print(f"From: {df['Date'].min()} To {df['Date'].max()}")

Full Time Result Distribution:
FTR
H    364
A    273
D    193
Name: count, dtype: int64

Date range:
From: 2023-08-11 00:00:00 To 2025-10-05 00:00:00


<h4 style="color:darkred;">Feature Engineering</h4>

In [13]:
# Feature Engineering
def add_form_features(df, n_matches=5):
    """Add team form features"""
    df['home_points_last5'] = np.nan
    df['home_goals_scored_last5'] = np.nan
    df['home_goals_conceded_last5'] = np.nan
    df['away_points_last5'] = np.nan
    df['away_goals_scored_last5'] = np.nan
    df['away_goals_conceded_last5'] = np.nan
    
    teams = pd.concat([df['HomeTeam'], df['AwayTeam']]).unique()
    
    for team in teams:
        team_matches = df[(df['HomeTeam'] == team) | (df['AwayTeam'] == team)].index
        
        for idx in team_matches:
            prev = df[((df['HomeTeam'] == team) | (df['AwayTeam'] == team)) & (df.index < idx)].tail(n_matches)
            
            if len(prev) < 3:
                continue
            
            points = goals_for = goals_against = 0
            
            for _, match in prev.iterrows():
                if match['HomeTeam'] == team:
                    goals_for += match['FTHG']
                    goals_against += match['FTAG']
                    points += 3 if match['FTR'] == 'H' else (1 if match['FTR'] == 'D' else 0)
                else:
                    goals_for += match['FTAG']
                    goals_against += match['FTHG']
                    points += 3 if match['FTR'] == 'A' else (1 if match['FTR'] == 'D' else 0)
            
            if df.at[idx, 'HomeTeam'] == team:
                df.at[idx, 'home_points_last5'] = points
                df.at[idx, 'home_goals_scored_last5'] = goals_for
                df.at[idx, 'home_goals_conceded_last5'] = goals_against
            else:
                df.at[idx, 'away_points_last5'] = points
                df.at[idx, 'away_goals_scored_last5'] = goals_for
                df.at[idx, 'away_goals_conceded_last5'] = goals_against
    
    return df.dropna()

df_features = add_form_features(df)
print(f"Dataset with features shape: {df_features.shape}")
print("\nFeature columns added:")
print(df_features.columns.tolist())
print("\nSample with features:")
print(df_features[['HomeTeam', 'AwayTeam', 'FTR', 'home_points_last5', 'away_points_last5']].head(10))

Dataset with features shape: (784, 12)

Feature columns added:
['Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'home_points_last5', 'home_goals_scored_last5', 'home_goals_conceded_last5', 'away_points_last5', 'away_goals_scored_last5', 'away_goals_conceded_last5']

Sample with features:
            HomeTeam        AwayTeam FTR  home_points_last5  away_points_last5
30  Sheffield United         Everton   D                0.0                0.0
31         Brentford     Bournemouth   D                5.0                1.0
33           Chelsea   Nott'm Forest   A                4.0                3.0
34          Man City          Fulham   H                9.0                4.0
35          Brighton       Newcastle   H                6.0                3.0
36           Arsenal      Man United   H                7.0                6.0
37    Crystal Palace          Wolves   H                4.0                3.0
38         Liverpool     Aston Villa   H                7.0             

In [14]:
# Prepare data
feature_columns = [
    'home_points_last5', 'home_goals_scored_last5', 'home_goals_conceded_last5',
    'away_points_last5', 'away_goals_scored_last5', 'away_goals_conceded_last5'
]

X = df_features[feature_columns]
y = df_features['FTR']

# Encode target for XGBoost: H=0, D=1, A=2
y_encoded = y.map({'H': 0, 'D': 1, 'A': 2})

print(f"Final dataset: {len(X)} matches")
print(f"Target distribution:\n{y.value_counts()}")

Final dataset: 784 matches
Target distribution:
FTR
H    343
A    256
D    185
Name: count, dtype: int64


In [16]:
from scipy.stats import chisquare
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.models import ColumnDataSource, FixedTicker, HoverTool
from bokeh.layouts import column

output_notebook()

In [17]:
# Observed frequencies: Home Wins, Away Wins, Draws
observed = [343, 256, 185]
total_matches = sum(observed)

# Expected frequencies if balanced
expected = [total_matches / 3] * 3

# Chi-square test
chi2_stat, p_value = chisquare(f_obs=observed, f_exp=expected)

print("Chi-square Statistic:", chi2_stat)
print("p-value:", p_value)
print("\nInterpretation:", "Data is IMBALANCED" if p_value < 0.05 else "Data is balanced")

# Match Distribution
categories = ['Home Win', 'Away Win', 'Draw']
percentages = [(x/total_matches)*100 for x in observed]

p = figure(x_range=categories, width=700, height=400, 
           title="Match Outcome Distribution",
           toolbar_location="right", tools="pan,wheel_zoom,box_zoom,reset,save")

p.vbar(x=categories, top=percentages, width=0.6, color=['#2ecc71', '#e74c3c', '#f39c12'])

p.xaxis.axis_label = "Outcome"
p.yaxis.axis_label = "Percentage (%)"
p.xgrid.grid_line_color = None

show(p)

Chi-square Statistic: 47.92602040816327
p-value: 3.917390795744035e-11

Interpretation: Data is IMBALANCED


**Interpretation:** The match outcome data is IMBALANCED. This means that home wins happen much more often than away wins or draws. This imbalance is common and expected in football because of **"home advantage"** (teams usually play better at home due to crowd support, familiarity, less travel, etc.)

**Note:** This imbalance can affect the accuracy of prediction models.

<h4 style="color:darkred;">Split-Train-Test</h4>

In [18]:
# Split: 70% train, 30% test
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.3, random_state=42, shuffle=True
)

print(f"Training: {len(X_train)} | Testing: {len(X_test)}")

Training: 548 | Testing: 236


<h4 style="color:darkred;">Define Models</h4>

In [19]:
# Train 4 different models (all use encoded labels)
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, max_depth=5, random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'XGBoost': XGBClassifier(n_estimators=100, max_depth=5, learning_rate=0.1, random_state=42, eval_metric='mlogloss')
}
results = {}
label_map = {0: 'Home Win', 1: 'Draw', 2: 'Away Win'}

<h4 style="color:darkred;">Evaluate Models</h4>

In [20]:
for name, model in models.items():
    print(f"{name}:")
    print("-" * 40)
    
    # Train
    model.fit(X_train, y_train)
    
    # Predict
    y_pred = model.predict(X_test)
    
    # Evaluate
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = accuracy
    
    print(f"Accuracy: {accuracy:.2%}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred, target_names=['Home Win', 'Draw', 'Away Win'], zero_division=0))

Random Forest:
----------------------------------------
Accuracy: 50.85%
Classification Report:
              precision    recall  f1-score   support

    Home Win       0.52      0.70      0.60       101
        Draw       0.39      0.12      0.18        59
    Away Win       0.52      0.55      0.54        76

    accuracy                           0.51       236
   macro avg       0.48      0.46      0.44       236
weighted avg       0.49      0.51      0.47       236

Gradient Boosting:
----------------------------------------
Accuracy: 44.07%
Classification Report:
              precision    recall  f1-score   support

    Home Win       0.50      0.60      0.55       101
        Draw       0.24      0.14      0.17        59
    Away Win       0.44      0.46      0.45        76

    accuracy                           0.44       236
   macro avg       0.39      0.40      0.39       236
weighted avg       0.41      0.44      0.42       236

Logistic Regression:
---------------------

<h4 style="color:darkred;">Compare Models Results</h4>

In [21]:
# Summary
print("Model Comparison Summary")
print("-"*40)
for name, acc in sorted(results.items(), key=lambda x: x[1], reverse=True):
    print(f"{name:25s}: {acc:.2%}")

best_model_name = max(results, key=results.get)
print(f"\nBest Model: {best_model_name} ({results[best_model_name]:.2%})")

Model Comparison Summary
----------------------------------------
Logistic Regression      : 52.97%
Random Forest            : 50.85%
XGBoost                  : 50.42%
Gradient Boosting        : 44.07%

Best Model: Logistic Regression (52.97%)


<h4 style="color:darkred;">Match Predictions and Visuals</h4>

In [22]:
# Function to get team's recent form
def get_team_form(df, team, n_matches=5):
    """Get team's last n matches statistics"""
    team_matches = df[((df['HomeTeam'] == team) | (df['AwayTeam'] == team))].tail(n_matches)
    
    if len(team_matches) < 3:
        return None
    
    points = goals_for = goals_against = 0
    
    for _, match in team_matches.iterrows():
        if match['HomeTeam'] == team:
            goals_for += match['FTHG']
            goals_against += match['FTAG']
            if match['FTR'] == 'H':
                points += 3
            elif match['FTR'] == 'D':
                points += 1
        else:
            goals_for += match['FTAG']
            goals_against += match['FTHG']
            if match['FTR'] == 'A':
                points += 3
            elif match['FTR'] == 'D':
                points += 1
    
    return {
        'points': points,
        'goals_scored': goals_for,
        'goals_conceded': goals_against
    }

In [23]:
# Predict a match
def predict_match(model, df, home_team, away_team):
    """Predict outcome of a match between two teams"""
    
    # Get form for both teams
    home_form = get_team_form(df, home_team)
    away_form = get_team_form(df, away_team)
    
    if home_form is None or away_form is None:
        return "Not enough data for prediction"
    
    # Prepare features as DataFrame (to match training format)
    feature_columns = [
        'home_points_last5', 
        'home_goals_scored_last5', 
        'home_goals_conceded_last5',
        'away_points_last5', 
        'away_goals_scored_last5', 
        'away_goals_conceded_last5'
    ]
    
    features = pd.DataFrame([[
        home_form['points'],
        home_form['goals_scored'],
        home_form['goals_conceded'],
        away_form['points'],
        away_form['goals_scored'],
        away_form['goals_conceded']
    ]], columns=feature_columns)
    
    # Get prediction probabilities
    probabilities = model.predict_proba(features)[0]
    prediction = model.predict(features)[0]
    
    # Map back to labels
    outcome_map = {0: 'Home Win', 1: 'Draw', 2: 'Away Win'}
    predicted_outcome = outcome_map[prediction]
    
    return {
        'home_team': home_team,
        'away_team': away_team,
        'prediction': predicted_outcome,
        'home_win_prob': probabilities[0] * 100,
        'draw_prob': probabilities[1] * 100,
        'away_win_prob': probabilities[2] * 100,
        'home_form': home_form,
        'away_form': away_form
    }

In [24]:
# Get unique teams from dataset
teams = sorted(df_features['HomeTeam'].unique())
print(f"Available teams: {', '.join(teams[:10])}... (showing first 10)")

# Predict a match (change teams as needed)
home = teams[0]  # First team
away = teams[1]  # Second team

result = predict_match(models['Logistic Regression'], df_features, home, away)

print(f"\n{result['home_team']} vs {result['away_team']}")
print("-"*30)
print(f"Prediction: {result['prediction']}")
print(f"\nProbabilities:")
print(f"  Home Win: {result['home_win_prob']:.1f}%")
print(f"  Draw:     {result['draw_prob']:.1f}%")
print(f"  Away Win: {result['away_win_prob']:.1f}%")
print(f"\nTeam Form (Last 5 matches):")
print(f"  {result['home_team']}: {result['home_form']['points']} pts, {result['home_form']['goals_scored']} GF, {result['home_form']['goals_conceded']} GA")
print(f"  {result['away_team']}: {result['away_form']['points']} pts, {result['away_form']['goals_scored']} GF, {result['away_form']['goals_conceded']} GA")

Available teams: Arsenal, Aston Villa, Bournemouth, Brentford, Brighton, Burnley, Chelsea, Crystal Palace, Everton, Fulham... (showing first 10)

Arsenal vs Aston Villa
------------------------------
Prediction: Home Win

Probabilities:
  Home Win: 52.0%
  Draw:     25.7%
  Away Win: 22.3%

Team Form (Last 5 matches):
  Arsenal: 10 pts, 8 GF, 3 GA
  Aston Villa: 8 pts, 6 GF, 6 GA


In [25]:
# Prediction Probabilities
outcomes = ['Home Win', 'Draw', 'Away Win']
probabilities = [result['home_win_prob'], result['draw_prob'], result['away_win_prob']]
colors = ['#2ecc71', '#f39c12', '#e74c3c']

source = ColumnDataSource(data=dict(outcomes=outcomes, probabilities=probabilities, colors=colors))

p1 = figure(x_range=outcomes, width=800, height=350, title=f"{result['home_team']} vs {result['away_team']} - Prediction Probabilities",
           toolbar_location="right", tools="pan,wheel_zoom,box_zoom,reset,save")

p1.vbar(x='outcomes', top='probabilities', width=0.6, color='colors', source=source)
p1.y_range.start = 0
p1.y_range.end = 100
p1.xaxis.axis_label = "Outcome"
p1.yaxis.axis_label = "Probability (%)"
p1.xgrid.grid_line_color = None

show(p1)

In [26]:
# Team Form Comparison
p2 = figure(width=800, height=350, title="Team Form Comparison (Last 5 matches)",
           toolbar_location="right", tools="pan,wheel_zoom,box_zoom,reset,save")

x_positions = [0, 1, 2]  # Numeric positions for categories
x_offset = [-0.15, 0.15]

# Define team statistics
home_stats = [result['home_form']['points'], result['home_form']['goals_scored'], result['home_form']['goals_conceded']]
away_stats = [result['away_form']['points'], result['away_form']['goals_scored'], result['away_form']['goals_conceded']]

teams_list = [result['home_team'], result['away_team']]
stats_list = [home_stats, away_stats]
colors_teams = ['#3498db', '#e74c3c']

for i, (team, stats, color) in enumerate(zip(teams_list, stats_list, colors_teams)):
    x_pos = [x + x_offset[i] for x in x_positions]
    p2.vbar(x=x_pos, top=stats, width=0.25, color=color, legend_label=team)

# Set custom x-axis labels
p2.xaxis.ticker = FixedTicker(ticks=x_positions)
p2.xaxis.major_label_overrides = {0: 'Points', 1: 'Goals Scored', 2: 'Goals Conceded'}
p2.xaxis.axis_label = "Metric"
p2.yaxis.axis_label = "Value"
p2.legend.location = "top_right"
p2.xgrid.grid_line_color = None

show(p2)

In [27]:
# Feature Importance
feature_names = ['Home Points', 'Home Goals For', 'Home Goals Against', 
                'Away Points', 'Away Goals For', 'Away Goals Against']
importance = np.abs(models['Logistic Regression'].coef_[0])

source_imp = ColumnDataSource(data=dict(features=feature_names, importance=importance))

p3 = figure(y_range=feature_names, width=800, height=350, title="Feature Importance",
           toolbar_location="right", tools="pan,wheel_zoom,box_zoom,reset,save")

p3.hbar(y='features', right='importance', height=0.6, color='#9b59b6', source=source_imp)
p3.x_range.start = 0
p3.xaxis.axis_label = "Importance"
p3.ygrid.grid_line_color = None

show(p3)

**Observation:**
- The models are not performing very well — accuracy is barely above random guessing.
- This is likely because the match outcome data is IMBALANCED (more home wins than draws or away wins),
- which can confuse models that expect a balanced set of outcomes.
- If the model sees mostly home wins during training, it may learn to always predict home wins, missing important cases like draws or away wins, even when they do happen.

**Solutions to Improve the Model:**
- Use class weights in the models to reduce bias toward majority outcomes
- Gather more match data, especially draws, possibly from multiple seasons
- Use stratified sampling to make sure train/test sets reflect the real outcome distribution
- Try resampling methods like SMOTE to generate synthetic examples of minority classes
- Evaluate using F1-score (balances precision and recall), not just accuracy
- Focus on predicting probabilities (e.g. 70% chance of home win), not just win/draw/loss