In [50]:
import pandas as pd

df = pd.read_csv("../data/E0.csv") # Load the CSV file into a DataFrame. This is Premier League 25/26 data.
df.head() # .head() prints the first 5 rows
df.columns # .columns attribute prints the column names

list(df.columns[:20])
df.columns
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110 entries, 0 to 109
Columns: 132 entries, Div to BFECAHA
dtypes: float64(108), int64(16), object(8)
memory usage: 113.6+ KB


In [51]:
df["Result"] = df["FTR"].map({"H": 1, "D": 0, "A": -1}) #Creates a new column called Result. Shows the first few rows so you can visually confirm it worked.

df[["FTR", "Result"]].head() #Shows the first few rows so you can visually confirm it worked.

Unnamed: 0,FTR,Result
0,H,1
1,D,0
2,D,0
3,H,1
4,H,1


In [52]:
# Feature columns for the ML model
feature_cols = ["HS", "AS", "HST", "AST", "HF", "AF", "HC", "AC", "HY", "AY", "HR", "AR"] #A Python list of the feature column names

X = df[feature_cols] #selects columns as your ML inputs
y = df["Result"] # this is what the model will try to predict

X.head(), y.head() #Shows the first few rows of X and y to visually confirm it worked

(   HS  AS  HST  AST  HF  AF  HC  AC  HY  AY  HR  AR
 0  19  10   10    3   7  10   6   7   1   2   0   0
 1   3  16    3    3  13  11   3   6   1   1   1   0
 2  10   7    4    2  16  15   4   3   3   3   0   0
 3  10  12    5    4   8  10   5   7   0   1   0   0
 4  16  14    6    4  14   8   6   5   0   0   0   0,
 0    1
 1    0
 2    0
 3    1
 4    1
 Name: Result, dtype: int64)

In [57]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Split into training (80%) and testing (20%)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Create the model
model = RandomForestClassifier(n_estimators=200, random_state=42)

# Train (fit) the model on the training data
model.fit(X_train, y_train)

# Predict on the test data
y_pred = model.predict(X_test)

# Calculate accuracy on the test set
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.5454545454545454

In [54]:
import pandas as pd

# Get feature importances from the trained model
importances = model.feature_importances_

# Pair feature names with their importance scores
feature_importances = pd.DataFrame({
    "feature": feature_cols,
    "importance": importances
}).sort_values(by="importance", ascending=False)

feature_importances

Unnamed: 0,feature,importance
2,HST,0.159976
0,HS,0.122965
5,AF,0.110794
7,AC,0.098838
3,AST,0.092519
4,HF,0.087598
8,HY,0.083253
6,HC,0.079033
9,AY,0.073266
1,AS,0.072429


In [55]:
# Add Team Strength Ratings

team_strength = {
    "Arsenal": 1,
    "Man City": 1,
    "Chelsea": 1,
    "Sunderland": 1,
    "Tottenham": 1,
    "Aston Villa": 1,
    "Man United": 1,

    "Liverpool": 0,
    "Bournemouth": 0,
    "Crystal Palace": 0,
    "Brighton": 0,
    "Brentford": 0,
    "Everton": 0,
    "Newcastle": 0,

    "Fulham": -1,
    "Leeds": -1,
    "Burnley": -1,
    "West Ham": -1,
    "Nott'm Forest": -1,
    "Wolves": -1
}

# Add new columns
df["HomeStrength"] = df["HomeTeam"].map(team_strength)
df["AwayStrength"] = df["AwayTeam"].map(team_strength)

df[["HomeTeam", "HomeStrength", "AwayTeam", "AwayStrength"]].head()


Unnamed: 0,HomeTeam,HomeStrength,AwayTeam,AwayStrength
0,Liverpool,0,Bournemouth,0
1,Aston Villa,1,Newcastle,0
2,Brighton,0,Fulham,-1
3,Sunderland,1,West Ham,-1
4,Tottenham,1,Burnley,-1


In [58]:
# Add team strength to the feature list for ML
feature_cols += ["HomeStrength", "AwayStrength"]

# Rebuild X with the new features
X = df[feature_cols]
y = df["Result"]

X.head()


Unnamed: 0,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR,HomeStrength,AwayStrength,HomeStrength.1,AwayStrength.1
0,19,10,10,3,7,10,6,7,1,2,0,0,0,0,0,0
1,3,16,3,3,13,11,3,6,1,1,1,0,1,0,1,0
2,10,7,4,2,16,15,4,3,3,3,0,0,0,-1,0,-1
3,10,12,5,4,8,10,5,7,0,1,0,0,1,-1,1,-1
4,16,14,6,4,14,8,6,5,0,0,0,0,1,-1,1,-1
