In [1]:
# 📦 Step 1: Import Libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
import pandas as pd

# Columns carried forward: match date, the two teams, the full-time result
# code, and the three Bet365 odds columns.
KEPT_COLUMNS = ['Date', 'HomeTeam', 'AwayTeam', 'FTR', 'B365H', 'B365D', 'B365A']

# Read the season file, parse the day-first dates, order the matches by date,
# trim to the kept columns, and drop any row with a missing value.
# Target is 1 when the full-time result is 'H', otherwise 0.
df = (
    pd.read_csv("23-24Season.csv")
    .assign(Date=lambda frame: pd.to_datetime(frame['Date'], dayfirst=True))
    .sort_values('Date')
    .loc[:, KEPT_COLUMNS]
    .dropna()
    .assign(Target=lambda frame: frame['FTR'].eq('H').astype('int64'))
)


In [3]:
# Fresh state for the form features (reset every time this cell runs):
#   team_history - team name -> list of that team's past results, in match order
#   home_form    - pre-match win rate of the home side, one entry per match
#   away_form    - pre-match win rate of the away side, one entry per match
team_history, home_form, away_form = {}, [], []

def get_win_rate(matches, n=5):
    """Return the mean of a team's most recent results.

    Args:
        matches: List of numeric results in match order, newest last
            (this notebook stores 1 for a win and 0 for anything else).
        n: Size of the look-back window; must be at least 1.

    Returns:
        0.5 when ``matches`` is empty (a neutral default for teams with no
        history yet); otherwise the average of the last
        ``min(n, len(matches))`` entries.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        # n == 0 previously divided by zero; a negative n sliced from the
        # front of the list and divided by a negative count.
        raise ValueError(f"n must be a positive integer, got {n!r}")
    if not matches:
        return 0.5  # neutral default
    recent = matches[-n:]
    # Dividing by the real window length handles histories shorter than n.
    return sum(recent) / len(recent)

# Walk the matches in frame order, recording each side's form *before* the
# current result is added to its history, so a match never sees its own outcome.
for home_team, away_team, outcome in zip(df['HomeTeam'], df['AwayTeam'], df['FTR']):
    home_record = team_history.setdefault(home_team, [])
    away_record = team_history.setdefault(away_team, [])

    # Pre-match form for both sides
    home_form.append(get_win_rate(home_record))
    away_form.append(get_win_rate(away_record))

    # Log the result afterwards: 1 only for the winner, so a draw
    # (any code other than 'H' or 'A') counts as 0 for both teams.
    home_record.append(1 if outcome == 'H' else 0)
    away_record.append(1 if outcome == 'A' else 0)

# Attach the collected form values as new columns
df['HomeForm'] = home_form
df['AwayForm'] = away_form


In [5]:
# Names used by this cell, grouped in one place (the first cell imports the same ones).
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# One-hot encode both team columns (first level of each dropped), then
# discard the raw result code and the date.
df = (
    pd.get_dummies(df, columns=['HomeTeam', 'AwayTeam'], drop_first=True)
    .drop(columns=['FTR', 'Date'])
)

# Label column versus everything else
y = df['Target']
X = df.drop(columns=['Target'])

# Hold-out split without shuffling, so the existing row order is preserved
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=False,
)

# Fit a random forest with a fixed seed
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Score the held-out rows and print both summaries
y_pred = model.predict(X_test)
report_text = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)
print("🔍 Classification Report:")
print(report_text)
print("🧮 Confusion Matrix:")
print(matrix)


🔍 Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.72      0.70        43
           1       0.61      0.58      0.59        33

    accuracy                           0.66        76
   macro avg       0.65      0.65      0.65        76
weighted avg       0.66      0.66      0.66        76

🧮 Confusion Matrix:
[[31 12]
 [14 19]]
