<a href="https://colab.research.google.com/github/ishaandindwar/my-projects/blob/main/epl_prediction_model1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
# Install (usually already present in Colab, but safe)
!pip install pandas numpy scikit-learn

# Imports
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier




In [10]:
# Load CSV (change name if needed)
df = pd.read_csv("/content/epl_matches.csv")

# Quick look
print(df.head())
print(df.columns)
print(df.info())


         Date   Season      HomeTeam                AwayTeam  FTH Goals  \
0  16/01/2025  2024/25  Ipswich Town  Brighton & Hove Albion          0   
1  16/01/2025  2024/25    Man United             Southampton          3   
2  15/01/2025  2024/25       Everton             Aston Villa          0   
3  15/01/2025  2024/25     Leicester          Crystal Palace          0   
4  15/01/2025  2024/25     Newcastle                  Wolves          3   

   FTA Goals FT Result  HTH Goals  HTA Goals HT Result  ... H Fouls  A Fouls  \
0          2         A        0.0        1.0         A  ...    13.0     14.0   
1          1         H        0.0        1.0         A  ...     7.0     10.0   
2          1         A        0.0        0.0         D  ...    17.0     10.0   
3          2         A        0.0        0.0         D  ...     7.0      6.0   
4          0         H        1.0        0.0         H  ...    10.0     13.0   

   H Corners  A Corners  H Yellow  A Yellow  H Red  A Red  Display_O

In [11]:
# If Date exists, convert to datetime (optional but good)
if 'Date' in df.columns:
    df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

# Drop rows with missing critical values
critical_cols = ['HomeTeam', 'AwayTeam', 'FTH Goals', 'FTA Goals']
for c in critical_cols:
    if c not in df.columns:
        raise ValueError(f"Column {c} not found. Adjust code to your dataset.")
df = df.dropna(subset=critical_cols)

# Create target label: 'H', 'D', 'A'
def result(row):
    if row['FTH Goals'] > row['FTA Goals']:
        return 'H'
    elif row['FTH Goals'] < row['FTA Goals']:
        return 'A'
    else:
        return 'D'

df['FT Result'] = df.apply(result, axis=1)

print(df[['HomeTeam', 'AwayTeam', 'FTH Goals', 'FTA Goals', 'FT Result']].head())
df['FT Result'].value_counts()


  df['Date'] = pd.to_datetime(df['Date'], errors='coerce')


       HomeTeam                AwayTeam  FTH Goals  FTA Goals FT Result
0  Ipswich Town  Brighton & Hove Albion          0          2         A
1    Man United             Southampton          3          1         H
2       Everton             Aston Villa          0          1         A
3     Leicester          Crystal Palace          0          2         A
4     Newcastle                  Wolves          3          0         H


Unnamed: 0_level_0,count
FT Result,Unnamed: 1_level_1
H,5563
A,3482
D,3108


In [12]:
# Sort by date so rolling stats make sense
if 'Date' in df.columns:
    df = df.sort_values('Date')
else:
    # Fall back: keep current order if no Date
    df = df.reset_index(drop=True)

# We'll create per-team history stats for goals scored and conceded
home_stats = df[['HomeTeam', 'FTH Goals', 'FTA Goals']].copy()
home_stats.columns = ['Team', 'GoalsFor', 'GoalsAgainst']

away_stats = df[['AwayTeam', 'FTA Goals', 'FTH Goals']].copy()
away_stats.columns = ['Team', 'GoalsFor', 'GoalsAgainst']

all_stats = pd.concat([home_stats, away_stats], ignore_index=True)

# Rolling averages per team
all_stats['GoalsFor_rolling5'] = (all_stats
                                  .groupby('Team')['GoalsFor']
                                  .rolling(5, min_periods=1)
                                  .mean()
                                  .reset_index(level=0, drop=True)
                                 )

all_stats['GoalsAgainst_rolling5'] = (all_stats
                                      .groupby('Team')['GoalsAgainst']
                                      .rolling(5, min_periods=1)
                                      .mean()
                                      .reset_index(level=0, drop=True)
                                     )

# Shift by 1 so current match doesn't leak into its own features
all_stats['GoalsFor_rolling5'] = all_stats.groupby('Team')['GoalsFor_rolling5'].shift(1)
all_stats['GoalsAgainst_rolling5'] = all_stats.groupby('Team')['GoalsAgainst_rolling5'].shift(1)

# After shift, some first matches become NaN -> fill with global mean or 0
all_stats[['GoalsFor_rolling5', 'GoalsAgainst_rolling5']] = all_stats[['GoalsFor_rolling5', 'GoalsAgainst_rolling5']].fillna(0)

# Split back into home/away aligned with original df index
home_features = all_stats.iloc[:len(df)].reset_index(drop=True)
away_features = all_stats.iloc[len(df):].reset_index(drop=True)

# Attach features to main df
df = df.reset_index(drop=True)
df['home_gf_rolling5'] = home_features['GoalsFor_rolling5']
df['home_ga_rolling5'] = home_features['GoalsAgainst_rolling5']
df['away_gf_rolling5'] = away_features['GoalsFor_rolling5']
df['away_ga_rolling5'] = away_features['GoalsAgainst_rolling5']

df[['HomeTeam','AwayTeam','home_gf_rolling5','away_gf_rolling5']].head()


Unnamed: 0,HomeTeam,AwayTeam,home_gf_rolling5,away_gf_rolling5
0,Oldham,Ipswich,0.0,0.8
1,West Ham,Wimbledon,0.0,1.2
2,Man City,Leeds,0.0,1.2
3,Newcastle,Tottenham,0.0,2.0
4,Chelsea,Blackburn,0.0,1.2


In [13]:
from sklearn.preprocessing import LabelEncoder

team_encoder = LabelEncoder()
all_teams = pd.concat([df['HomeTeam'], df['AwayTeam']], axis=0).unique()
team_encoder.fit(all_teams)

df['HomeTeam_encoded'] = team_encoder.transform(df['HomeTeam'])
df['AwayTeam_encoded'] = team_encoder.transform(df['AwayTeam'])

df[['HomeTeam','HomeTeam_encoded','AwayTeam','AwayTeam_encoded']].head()


Unnamed: 0,HomeTeam,HomeTeam_encoded,AwayTeam,AwayTeam_encoded
0,Oldham,35,Ipswich,23
1,West Ham,49,Wimbledon,51
2,Man City,29,Leeds,25
3,Newcastle,32,Tottenham,46
4,Chelsea,15,Blackburn,4


In [14]:
feature_cols = [
    'HomeTeam_encoded',
    'AwayTeam_encoded',
    'home_gf_rolling5',
    'home_ga_rolling5',
    'away_gf_rolling5',
    'away_ga_rolling5'
]

X = df[feature_cols]
y = df['FT Result']

# Simple random split (for first version)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

X_train.shape, X_test.shape


((9722, 6), (2431, 6))

In [15]:
model = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    random_state=42,
    n_jobs=-1
)

model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)


In [16]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification report:\n", classification_report(y_test, y_pred))
print("\nConfusion matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.4586589880707528

Classification report:
               precision    recall  f1-score   support

           A       0.43      0.36      0.40       696
           D       0.22      0.10      0.14       622
           H       0.51      0.72      0.60      1113

    accuracy                           0.46      2431
   macro avg       0.39      0.39      0.38      2431
weighted avg       0.41      0.46      0.42      2431


Confusion matrix:
 [[254  96 346]
 [141  63 418]
 [192 123 798]]


In [17]:
def predict_match(home_team_name, away_team_name,
                  home_gf5, home_ga5, away_gf5, away_ga5):
    # Encode teams
    home_enc = team_encoder.transform([home_team_name])[0]
    away_enc = team_encoder.transform([away_team_name])[0]

    row = pd.DataFrame([{
        'HomeTeam_encoded': home_enc,
        'AwayTeam_encoded': away_enc,
        'home_gf_rolling5': home_gf5,
        'home_ga_rolling5': home_ga5,
        'away_gf_rolling5': away_gf5,
        'away_ga_rolling5': away_ga5
    }])

    pred = model.predict(row)[0]
    proba = model.predict_proba(row)[0]

    classes = model.classes_
    prob_dict = {cls: float(p) for cls, p in zip(classes, proba)}
    return pred, prob_dict

# Example (dummy averages)
prediction, probabilities = predict_match(
    home_team_name="Arsenal",
    away_team_name="Chelsea",
    home_gf5=2.0, home_ga5=0.8,
    away_gf5=1.5, away_ga5=1.2
)

print("Predicted result (H/D/A):", prediction)
print("Probabilities:", probabilities)


Predicted result (H/D/A): H
Probabilities: {'A': 0.135, 'D': 0.245, 'H': 0.62}


In [24]:
def readable_prediction(home_team_name, away_team_name,
                        home_gf5, home_ga5, away_gf5, away_ga5):
    # Get prediction and probabilities
    pred, prob_dict = predict_match(
        home_team_name=home_team_name,
        away_team_name=away_team_name,
        home_gf5=home_gf5,
        home_ga5=home_ga5,
        away_gf5=away_gf5,
        away_ga5=away_ga5
    )

    # Extract probabilities for H, D, A safely
    home_win_prob = prob_dict.get('H', 0.0) * 100
    draw_prob = prob_dict.get('D', 0.0) * 100
    away_win_prob = prob_dict.get('A', 0.0) * 100

    print(f"Match: {home_team_name} vs {away_team_name}")
    print(f"Predicted result: {pred}  (H=Home win, D=Draw, A=Away win)")
    print(f"{home_team_name} win percentage: {home_win_prob:.2f}%")
    print(f"Draw percentage: {draw_prob:.2f}%")
    print(f"{away_team_name} win percentage: {away_win_prob:.2f}%")

# Example usage (change teams and stats as you like)
readable_prediction(
    home_team_name="Man City",
    away_team_name="Arsenal",
    home_gf5=2.2, home_ga5=0.2,
    away_gf5=2.4, away_ga5=0.6
)


Match: Man City vs Arsenal
Predicted result: D  (H=Home win, D=Draw, A=Away win)
Man City win percentage: 30.50%
Draw percentage: 43.50%
Arsenal win percentage: 26.00%


In [23]:
# Independent code cell
# Input format for each of the 5 league matches:
#   Home team:  x-y  (x = goals scored by home team, y = goals conceded)
#   Away team:  a-b  (a = goals scored by away team, b = goals conceded)
# Output:
#   home_gf5, home_ga5, away_gf5, away_ga5

home_gf5 = 0
home_ga5 = 0
away_gf5 = 0
away_ga5 = 0

print("Enter scores for LAST 5 league matches.")
print("Format examples: 2-1, 0-0, 3-2\n")

for i in range(1, 6):
    print(f"Match {i}:")

    home_score_str = input("  Home team score (x-y): ").strip()
    away_score_str = input("  Away team score (a-b): ").strip()

    # Parse home score x-y
    x_str, y_str = home_score_str.split("-")
    x = int(x_str)
    y = int(y_str)

    # Parse away score a-b
    a_str, b_str = away_score_str.split("-")
    a = int(a_str)
    b = int(b_str)

    # Accumulate 5-match stats
    home_gf5 += x
    home_ga5 += y
    away_gf5 += a
    away_ga5 += b
    print()

home_gf5 /= 5.0
home_ga5 /= 5.0
away_gf5 /= 5.0
away_ga5 /= 5.0

print("===== Stats over last 5 matches =====")
print("home_gf5 =", home_gf5)
print("home_ga5 =", home_ga5)
print("away_gf5 =", away_gf5)
print("away_ga5 =", away_ga5)


Enter scores for LAST 5 league matches.
Format examples: 2-1, 0-0, 3-2

Match 1:
  Home team score (x-y): 0-0
  Away team score (a-b): 4-1

Match 2:
  Home team score (x-y): 2-1
  Away team score (a-b): 2-1

Match 3:
  Home team score (x-y): 3-0
  Away team score (a-b): 1-0

Match 4:
  Home team score (x-y): 3-0
  Away team score (a-b): 2-1

Match 5:
  Home team score (x-y): 3-0
  Away team score (a-b): 3-0

===== Stats over last 5 matches =====
home_gf5 = 11
home_ga5 = 1
away_gf5 = 12
away_ga5 = 3
