In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Read the current-season Premier League results file from disk and derive the
# model target in the same expression.
# Result encodes the FTR column: 1 = home win, 0 = draw, -1 = away win.
df = pd.read_csv("../data/E0.csv").assign(
    Result=lambda matches: matches["FTR"].map({"H": 1, "D": 0, "A": -1})
)

# Team strength tier per club (1 / 0 / -1), taken from the reference table.
# Keys must match the club names exactly as they appear in the HomeTeam and
# AwayTeam columns. NOTE(review): larger presumably means stronger - confirm.
team_strength = {
    "Arsenal": 1,
    "Man City": 1,
    "Chelsea": 1,
    "Sunderland": 1,
    "Tottenham": 1,
    "Aston Villa": 1,
    "Man United": 1,

    "Liverpool": 0,
    "Bournemouth": 0,
    "Crystal Palace": 0,
    "Brighton": 0,
    "Brentford": 0,
    "Everton": 0,
    "Newcastle": 0,

    "Fulham": -1,
    "Leeds": -1,
    "Burnley": -1,
    "West Ham": -1,
    "Nott'm Forest": -1,
    "Wolves": -1
}

# Series.map() turns any club that is missing from team_strength into NaN
# without warning, so fail fast and name the offenders instead of passing NaN
# strength features on to the model. Rows with no team name are ignored here.
unmapped_teams = sorted(
    (set(df["HomeTeam"].dropna()) | set(df["AwayTeam"].dropna())).difference(team_strength)
)
if unmapped_teams:
    raise ValueError(f"No team_strength entry for: {unmapped_teams}")

df["HomeStrength"] = df["HomeTeam"].map(team_strength)
df["AwayStrength"] = df["AwayTeam"].map(team_strength)

# Model inputs: six home/away pairs of match-stat columns followed by the two
# team-strength columns.
# NOTE(review): the stat codes presumably follow the football-data.co.uk key
# (shots, shots on target, fouls, corners, yellow cards, red cards) - confirm.
# NOTE(review): these look like full-match statistics, i.e. only known once
# the game has been played - confirm that is intended for this model.
feature_cols = [
    "HS", "AS",
    "HST", "AST",
    "HF", "AF",
    "HC", "AC",
    "HY", "AY",
    "HR", "AR",
    "HomeStrength", "AwayStrength",
]

# Feature matrix and target vector, selected explicitly by label.
X = df.loc[:, feature_cols]
y = df.loc[:, "Result"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Seeded 200-tree random forest; fit() hands back the fitted estimator itself.
model = RandomForestClassifier(n_estimators=200, random_state=42).fit(X_train, y_train)

# Accuracy on the held-out rows, left as the last expression so the cell displays it.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
accuracy


0.5454545454545454

In [3]:
# Every row gets the same value (1) in a new HomeAdvantage column.
# NOTE(review): a constant column presumably gives the model nothing to split
# on - confirm whether a varying home-advantage signal was intended.
df.loc[:, "HomeAdvantage"] = 1

In [4]:
# Guarded so that re-running this cell does not append "HomeAdvantage" a second
# time, which would put duplicate columns into any frame built from feature_cols.
if "HomeAdvantage" not in feature_cols:
    feature_cols.append("HomeAdvantage")

In [5]:
# Rebuild the feature matrix from the current feature list. The train/test
# split and the fitted model from the earlier cell are not refreshed by this line.
X = df.loc[:, feature_cols]