In [58]:
import pandas as pd

In [59]:
# Load the dataset, skipping the first column which is an unnecessary index

matches = pd.read_csv("matches.csv", index_col=0)

In [60]:
# Display the first few rows of the dataframe to verify it loaded correctly
matches.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,sh,sot,dist,fk,pk,pkatt,xg_y,npxg,team,season
0,2024-08-10,12:30 (13:30),Championship,Matchweek 1,Sat,Home,D,3,3,Portsmouth,...,21,9,16.6,1.0,1,1,3.3,2.5,Leeds United,2025
2,2024-08-17,12:30 (13:30),Championship,Matchweek 2,Sat,Away,D,0,0,West Brom,...,8,1,20.3,0.0,0,0,0.7,0.7,Leeds United,2025
3,2024-08-23,20:00 (21:00),Championship,Matchweek 3,Fri,Away,W,2,0,Sheffield Weds,...,13,5,18.4,1.0,0,0,1.2,1.2,Leeds United,2025
4,2024-08-31,15:00 (16:00),Championship,Matchweek 4,Sat,Home,W,2,0,Hull City,...,16,5,17.6,1.0,0,0,1.5,1.5,Leeds United,2025
5,2024-09-14,12:30 (13:30),Championship,Matchweek 5,Sat,Home,L,0,1,Burnley,...,17,4,18.4,0.0,0,0,1.2,1.2,Leeds United,2025


In [61]:
# Display the shape of the dataframe to understand its dimensions
matches.shape

(10024, 30)

# Analyze Data

In [62]:
# Value counts will not be the same for all teams due to promotions and relegations in previous seasons.
matches["team"].value_counts()

team
Bristol City                416
Queens Park Rangers         414
Preston North End           414
Middlesbrough               372
Cardiff City                370
Blackburn Rovers            368
Millwall                    368
Birmingham City             368
Derby County                327
Swansea City                327
Reading                     325
Sheffield Wednesday         324
Norwich City                324
Hull City                   322
Stoke City                  322
Huddersfield Town           282
Leeds United                281
West Bromwich Albion        280
Nottingham Forest           279
Coventry City               235
Luton Town                  235
Brentford                   235
Sheffield United            235
Barnsley                    232
Rotherham United            230
Fulham                      192
Sunderland                  189
Ipswich Town                184
Watford                     184
Wigan Athletic              184
Aston Villa                 144
Bou

In [63]:
matches["round"].value_counts()

round
Matchweek 1                          216
Matchweek 2                          216
Matchweek 3                          216
Matchweek 4                          216
Matchweek 5                          216
Matchweek 6                          216
Matchweek 7                          216
Matchweek 8                          216
Matchweek 9                          216
Matchweek 10                         216
Matchweek 11                         216
Matchweek 12                         216
Matchweek 13                         216
Matchweek 14                         216
Matchweek 15                         216
Matchweek 16                         216
Matchweek 17                         216
Matchweek 18                         216
Matchweek 19                         216
Matchweek 20                         216
Matchweek 21                         216
Matchweek 22                         216
Matchweek 23                         216
Matchweek 24                         216
Matchweek 

# Cleaning Data

In [64]:
# Display the data types of each column to understand the structure of the dataframe

matches.dtypes

date              object
time              object
comp              object
round             object
day               object
venue             object
result            object
gf                object
ga                object
opponent          object
xg_x             float64
xga              float64
poss             float64
attendance       float64
captain           object
formation         object
opp formation     object
referee           object
match report      object
notes             object
sh                 int64
sot                int64
dist             float64
fk               float64
pk                 int64
pkatt              int64
xg_y             float64
npxg             float64
team              object
season             int64
dtype: object

In [65]:
# Delete unnecessary columns that will never be used in the analysis or model training

del matches["comp"]
del matches["notes"]

In [66]:
# Convert the 'date' type from object to datetime format so that it can be used by the model.

matches["date"] = pd.to_datetime(matches["date"])

In [67]:
# Create the target variable by using the result variable converted into a binary format so that it can be used by the model.

matches["target"] = (matches["result"] == "W").astype("int") # 1 for win, 0 for draw or loss

In [68]:
# Extract the first number (including negative numbers) from the string and convert the columns to integer type so that they can be used by the model.

matches["gf"] = matches["gf"].str.extract(r'(-?\d+)').astype("Int64") # Goals For
matches["ga"] = matches["ga"].str.extract(r'(-?\d+)').astype("Int64") # Goals Against

In [69]:
# Categorize the venue column and create a new column with the corresponding category codes so that it can be used by the model.

matches["venue_code"] = matches["venue"].astype("category").cat.codes # Home=1, Away=0

In [70]:
# Create a new code for each opponent team so that it can be used by the model.

matches["opp_code"] = matches["opponent"].astype("category").cat.codes

In [71]:
# Remove the colon and everything after it from the time column (or replacing with nothing), then convert the hour to an integer type so that it can be used by the model.

matches["hour"] = matches["time"].str.replace(":.+", "", regex=True).astype("int")

In [72]:
# Generate a number for each day of the week (Monday=0, Sunday=6) from the date column so that it can be used by the model.

matches["day_code"]= matches["date"].dt.dayofweek

# Create the Machine Learning Model

In [73]:
from sklearn.ensemble import RandomForestClassifier

A Random Forest is a type of model that can pick up non-linearities in the data. 

For example, our opponent code doesn't have a linear relationship. Having an opponent code of 10 instead of 2 doesn't mean that you're facing a tougher opponent. A linear model cannot see this.

In [74]:
rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)

``n_estimators`` is the number of trees in the forest. 
This means that the model will create 50 different decision trees and combine their predictions to make a final prediction.

``min_samples_split`` is the minimum number of samples required to split an internal node.
This means that a node must have at least 10 samples in order to be considered for splitting into two child nodes.
The higher this number, the less likely the model is to overfit the training data, but it may also lead to underfitting if set too high.

``random_state`` is a seed value that ensures reproducibility of the results.

In [75]:
# Generate the training set by selecting all matches before 2025.

train = matches[matches["date"] < '2025-01-01']

In [76]:
# Generate the test set by selecting all matches in 2025 and beyond.

test = matches[matches["date"] >= '2025-01-01']

In [77]:
# Define the predictor variables that will be used to train the model.

predictors = ["venue_code", "opp_code", "hour", "day_code"]

In [78]:
# Fit the Random Forest model using the training data and the defined predictor variable.

rf.fit(train[predictors], train["target"])

# We are telling the model to learn the relationship between the predictor variables and the target variable (win or not win).

0,1,2
,n_estimators,50
,criterion,'gini'
,max_depth,
,min_samples_split,10
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [79]:
# After fitting, we can make predictions based on the predictors in the test data.

preds = rf.predict(test[predictors])

# Measuring Model's Performance

In [80]:
from sklearn.metrics import accuracy_score

In [81]:
# Evaluate the model's accuracy by comparing the predicted values to the actual target values in the test set.

accuracy_score(test["target"], preds)

0.5952380952380952

We get an **accuracy** score of **60%**

Let's dig a little bit deeper to see when our accuracy was high versus when it was low.

In order to do that we need to create a Pandas dataframe to combine our actual values and predicted values.

In [82]:
combined = pd.DataFrame(dict(actual=test["target"], prediction=preds))

In [83]:
pd.crosstab(index=combined["actual"], columns=combined["prediction"])

prediction,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,287,56
1,165,38


We can see that when predicting a loss or draw we are performing better than when we predict a win.

Since predicting wins is what we are trying to do, we're going to change our performance metric from **accuracy** to **precision**. 

This will show us how often we were correct when we predicted a win.

In [84]:
from sklearn.metrics import precision_score

In [85]:
precision_score(test["target"], preds)

0.40425531914893614

Our precision is only **40%**

## Rolling Averages

In order to attempt to improve our performance we're going to create some more predictors.

We'll start by splitting our dataframe up by team. This will allow us to calculate rolling averages for team performance.

What is their recent form like? 

How did each team do in the past few games? 

How many shots did they have? How many shots were made against them?

In [86]:
# Create a dataframe for every team in our dataset.

grouped_matches = matches.groupby("team")

In [87]:
# Display one of the new dataframes to verify it worked. Using Hull City as an example.


hull = grouped_matches.get_group("Hull City").sort_values("date")

hull

Unnamed: 0,date,time,round,day,venue,result,gf,ga,opponent,xg_x,...,pkatt,xg_y,npxg,team,season,target,venue_code,opp_code,hour,day_code
0,2017-08-05,17:30 (18:30),Matchweek 1,Sat,Away,D,1,1,Aston Villa,,...,0,,,Hull City,2018,0,0,0,17,5
1,2017-08-12,15:00 (16:00),Matchweek 2,Sat,Home,W,4,1,Burton Albion,,...,0,,,Hull City,2018,1,1,11,15,5
2,2017-08-15,19:45 (20:45),Matchweek 3,Tue,Home,L,2,3,Wolves,,...,1,,,Hull City,2018,0,1,45,19,1
3,2017-08-19,15:00 (16:00),Matchweek 4,Sat,Away,L,1,2,QPR,,...,0,,,Hull City,2018,0,0,33,15,5
5,2017-08-25,19:45 (20:45),Matchweek 5,Fri,Home,W,4,0,Bolton,,...,0,,,Hull City,2018,1,1,5,19,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44,2025-04-14,20:00 (21:00),Matchweek 42,Mon,Home,D,1,1,Coventry City,1.0,...,0,1.0,1.0,Hull City,2025,0,1,14,20,0
45,2025-04-18,15:00 (16:00),Matchweek 43,Fri,Away,L,0,1,Swansea City,0.6,...,0,0.6,0.6,Hull City,2025,0,0,41,15,4
46,2025-04-21,15:00 (16:00),Matchweek 44,Mon,Home,W,2,1,Preston,2.1,...,2,2.1,0.7,Hull City,2025,1,1,32,15,0
47,2025-04-26,15:00 (16:00),Matchweek 45,Sat,Home,L,0,1,Derby County,0.4,...,0,0.4,0.4,Hull City,2025,0,1,15,15,5


Using these new dataframes, if we're on matchweek 10 we want to ask how did they do in their last 6 previous weeks. 

Then we can pass that information into the model to be used to make better predictions.

In [88]:
# We create a function that will take in a group (a team's dataframe), the columns we want to calculate rolling averages for, and the names of the new columns to store those averages.

def rolling_averages(group, cols, new_cols):
    group = group.sort_values("date")
    rolling_stats = group[cols].rolling(6, closed="left").mean() # closed="left" ensures we don't include the current match in the average
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols) # Drop rows where the new columns have NaN values as Pandas cannot handle NaN values in the model
    return group

In [89]:
# We're going to calculate rolling averages for the following columns:

cols = ["gf", "ga", "xg_x", "xga", "sh", "sot", "dist", "fk", "pk", "pkatt"] # goals for, goals against, expected goals for, expected goals against, shots, shots on target, distance covered, free kicks, penalties scored, penalties attempted

# Now we columns for the rolling averages of each of the above columns.
new_cols = [f"{c}_rolling" for c in cols]

new_cols

['gf_rolling',
 'ga_rolling',
 'xg_x_rolling',
 'xga_rolling',
 'sh_rolling',
 'sot_rolling',
 'dist_rolling',
 'fk_rolling',
 'pk_rolling',
 'pkatt_rolling']

In [90]:
# Apply the rolling_averages function to each team's dataframe and combine the results into a single matches_rolling dataframe.

matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))

matches_rolling

  matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))


Unnamed: 0_level_0,Unnamed: 1_level_0,date,time,round,day,venue,result,gf,ga,opponent,xg_x,...,gf_rolling,ga_rolling,xg_x_rolling,xga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Aston Villa,8,2018-09-15,17:30 (18:30),Matchweek 7,Sat,Away,D,1,1,Blackburn,0.9,...,1.833333,1.833333,1.066667,1.233333,13.833333,5.333333,18.433333,0.500000,0.000000,0.000000
Aston Villa,9,2018-09-18,19:45 (20:45),Matchweek 8,Tue,Home,W,2,0,Rotherham Utd,1.4,...,1.500000,1.833333,1.066667,1.266667,14.833333,5.333333,18.600000,0.500000,0.000000,0.000000
Aston Villa,10,2018-09-22,15:00 (16:00),Matchweek 9,Sat,Home,L,1,2,Sheffield Weds,1.5,...,1.333333,1.500000,0.983333,1.166667,14.333333,5.000000,18.116667,0.666667,0.000000,0.000000
Aston Villa,11,2018-09-28,20:00 (21:00),Matchweek 10,Fri,Away,D,1,1,Bristol City,0.5,...,1.333333,1.666667,1.150000,1.250000,15.666667,5.000000,18.600000,0.833333,0.000000,0.000000
Aston Villa,12,2018-10-02,19:45 (20:45),Matchweek 11,Tue,Home,D,3,3,Preston,1.5,...,1.166667,1.500000,1.066667,1.033333,14.833333,4.500000,19.316667,0.833333,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wycombe Wanderers,45,2021-04-17,12:30 (13:30),Matchweek 42,Sat,Away,D,2,2,Swansea City,1.4,...,1.166667,1.000000,1.066667,1.450000,9.333333,3.166667,18.166667,0.333333,0.166667,0.166667
Wycombe Wanderers,46,2021-04-21,19:00 (20:00),Matchweek 43,Wed,Home,W,2,1,Bristol City,2.4,...,1.333333,1.333333,1.150000,1.816667,9.333333,3.000000,17.716667,0.333333,0.166667,0.166667
Wycombe Wanderers,47,2021-04-24,15:00 (16:00),Matchweek 44,Sat,Away,L,1,2,Cardiff City,1.5,...,1.500000,1.000000,1.350000,1.500000,11.833333,3.333333,16.866667,0.000000,0.333333,0.333333
Wycombe Wanderers,48,2021-05-01,15:00 (16:00),Matchweek 45,Sat,Home,W,1,0,Bournemouth,0.6,...,1.666667,1.333333,1.500000,1.416667,12.833333,3.333333,17.616667,0.000000,0.500000,0.500000


In [91]:
# We've now created a second index level called "team" which we don't need so we'll remove that.

matches_rolling = matches_rolling.droplevel("team")

In [92]:
# Values for the one index column that remains are not unique, they are repeated for each team.

matches_rolling.tail(50)

Unnamed: 0,date,time,round,day,venue,result,gf,ga,opponent,xg_x,...,gf_rolling,ga_rolling,xg_x_rolling,xga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
40,2023-03-14,19:45 (20:45),Matchweek 37,Tue,Home,D,1,1,Coventry City,0.9,...,0.5,1.333333,0.7,1.516667,9.0,3.166667,18.166667,0.333333,0.0,0.0
41,2023-03-18,15:00 (16:00),Matchweek 38,Sat,Away,D,1,1,Watford,0.3,...,0.5,1.333333,0.666667,1.833333,9.0,3.5,18.833333,0.333333,0.0,0.0
42,2023-04-01,15:00 (16:00),Matchweek 39,Sat,Home,W,1,0,QPR,1.2,...,0.666667,1.5,0.55,1.933333,8.166667,2.833333,19.15,0.166667,0.0,0.0
43,2023-04-07,15:00 (16:00),Matchweek 40,Fri,Away,L,0,1,Sheffield Utd,1.2,...,0.666667,1.166667,0.65,1.883333,7.0,2.833333,19.666667,0.333333,0.166667,0.166667
44,2023-04-10,15:00 (16:00),Matchweek 41,Mon,Home,L,0,2,Swansea City,0.9,...,0.5,1.166667,0.766667,2.216667,7.333333,2.5,19.266667,0.333333,0.166667,0.166667
45,2023-04-15,15:00 (16:00),Matchweek 42,Sat,Away,L,0,1,Blackpool,1.1,...,0.5,1.333333,0.833333,2.133333,7.0,2.5,18.15,0.333333,0.166667,0.166667
46,2023-04-18,19:45 (20:45),Matchweek 43,Tue,Away,W,1,0,Stoke City,0.7,...,0.5,1.0,0.933333,1.683333,8.833333,2.5,18.566667,0.333333,0.166667,0.166667
47,2023-04-22,15:00 (16:00),Matchweek 44,Sat,Home,W,2,1,Millwall,1.8,...,0.5,0.833333,0.9,1.35,8.333333,2.0,18.9,0.5,0.166667,0.166667
48,2023-04-29,15:00 (16:00),Matchweek 45,Sat,Away,D,1,1,Reading,1.7,...,0.666667,0.833333,1.15,1.3,10.0,2.0,17.716667,0.666667,0.166667,0.166667
49,2023-05-08,15:00 (16:00),Matchweek 46,Mon,Home,D,0,0,Rotherham Utd,1.0,...,0.666667,1.0,1.233333,1.516667,10.666667,2.0,15.666667,0.333333,0.0,0.0


In [93]:
# We'll reset the index to be a simple range index.

matches_rolling.index = range(matches_rolling.shape[0])

In [94]:
# Now each row has a unique index value once again.

matches_rolling

Unnamed: 0,date,time,round,day,venue,result,gf,ga,opponent,xg_x,...,gf_rolling,ga_rolling,xg_x_rolling,xga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
0,2018-09-15,17:30 (18:30),Matchweek 7,Sat,Away,D,1,1,Blackburn,0.9,...,1.833333,1.833333,1.066667,1.233333,13.833333,5.333333,18.433333,0.500000,0.000000,0.000000
1,2018-09-18,19:45 (20:45),Matchweek 8,Tue,Home,W,2,0,Rotherham Utd,1.4,...,1.500000,1.833333,1.066667,1.266667,14.833333,5.333333,18.600000,0.500000,0.000000,0.000000
2,2018-09-22,15:00 (16:00),Matchweek 9,Sat,Home,L,1,2,Sheffield Weds,1.5,...,1.333333,1.500000,0.983333,1.166667,14.333333,5.000000,18.116667,0.666667,0.000000,0.000000
3,2018-09-28,20:00 (21:00),Matchweek 10,Fri,Away,D,1,1,Bristol City,0.5,...,1.333333,1.666667,1.150000,1.250000,15.666667,5.000000,18.600000,0.833333,0.000000,0.000000
4,2018-10-02,19:45 (20:45),Matchweek 11,Tue,Home,D,3,3,Preston,1.5,...,1.166667,1.500000,1.066667,1.033333,14.833333,4.500000,19.316667,0.833333,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7521,2021-04-17,12:30 (13:30),Matchweek 42,Sat,Away,D,2,2,Swansea City,1.4,...,1.166667,1.000000,1.066667,1.450000,9.333333,3.166667,18.166667,0.333333,0.166667,0.166667
7522,2021-04-21,19:00 (20:00),Matchweek 43,Wed,Home,W,2,1,Bristol City,2.4,...,1.333333,1.333333,1.150000,1.816667,9.333333,3.000000,17.716667,0.333333,0.166667,0.166667
7523,2021-04-24,15:00 (16:00),Matchweek 44,Sat,Away,L,1,2,Cardiff City,1.5,...,1.500000,1.000000,1.350000,1.500000,11.833333,3.333333,16.866667,0.000000,0.333333,0.333333
7524,2021-05-01,15:00 (16:00),Matchweek 45,Sat,Home,W,1,0,Bournemouth,0.6,...,1.666667,1.333333,1.500000,1.416667,12.833333,3.333333,17.616667,0.000000,0.500000,0.500000


# Retraining our Model

Now we have a new set of rolling averages columns, we can add these to our existing predictors.

In [95]:
updated_predictors = predictors + new_cols

In [96]:
# We create a function that includes everything we need to make predictions.

def make_predictions(data, predictors):
    train = data[data["date"] < '2025-01-01']
    test = data[data["date"] >= '2025-01-01']
    rf.fit(train[predictors], train["target"])
    preds = rf.predict(test[predictors])
    combined = pd.DataFrame(dict(actual=test["target"], predicted=preds), index=test.index)
    precision = precision_score(test["target"], preds)
    return combined, precision

In [97]:
combined, precision = make_predictions(matches_rolling, updated_predictors)

In [98]:
precision

0.5571428571428572

Our **precision** now improves, going from **40%** to **56%**

We can look at this more closely by looking at the ``combined`` values.

In [99]:
combined

Unnamed: 0,actual,predicted
740,0,0
741,0,1
742,1,0
743,0,0
744,0,0
...,...,...
7349,1,0
7350,0,0
7351,0,0
7352,0,0


However, ``combined`` doesn't give us any information for each match, so we can't see how well we're doing for each team.

We can fix this by merging that extra information in a new copy of ``combined``

In [100]:
combined = combined.merge(matches_rolling[["date", "team", "opponent", "result"]], left_index=True, right_index=True)

combined.head()

Unnamed: 0,actual,predicted,date,team,opponent,result
740,0,0,2025-01-01,Blackburn Rovers,Leeds United,D
741,0,1,2025-01-04,Blackburn Rovers,Burnley,L
742,1,0,2025-01-15,Blackburn Rovers,Portsmouth,W
743,0,0,2025-01-18,Blackburn Rovers,Oxford United,L
744,0,0,2025-01-21,Blackburn Rovers,Coventry City,L


# Combining Home Team and Away Team Predictions

The final thing we'll do is look at how well our model did at correctly predicting the outcome for both teams in a match.

In our dataset we have data for **both** teams in **every** match and so **two separate rows** of data for **each** match.

Our model therefore might have predicted **both** teams to win in any given match. 

We can therefore combine the predictions for both teams in each match to resolve any issues.

In order to do this we need to replace some of the team names to their short-hand form so that they are matched with their short-hand forms in the ``opponent`` column. 

To do that we'll create a dictionary that includes all the teams that require editting in the ``team`` column and then use the Pandas ``map`` function with that dictionary.

In [101]:
class MissingDict(dict):
    __missing__ = lambda self, key: key # Return the key if it's not found in the dictionary

map_values = {
    "West Bromwich Albion": "West Brom",
    "Queens Park Rangers": "QPR",
    "Preston North End": "Preston",
    "Leeds United": "Leeds Utd",
    "Sheffield United": "Sheffield Utd",
    "Sheffield Wednesday": "Sheffield Weds",
    "Rotherham United": "Rotherham Utd",
    "Wolverhampton Wanderers": "Wolves",
    "Newcastle United": "Newcastle Utd",
    "Blackburn Rovers": "Blackburn",
    "Brighton and Hove Albion": "Brighton",
    "Peterborough Utd": "P'borough Utd",
    "Charlton Athletic": "Charlton Ath",
    "Peterborough United": "P'borough Utd",
    "Wycome Wanderers": "Wycome",
    "Bolton Wanderers": "Bolton",
    "Huddersfield Town": "Huddersfield",
    "Nottingham Forest": "Nott'ham Forest"
}


# Create an instance of MissingDict with the mapping values as well as a default behaviour for missing keys
mapping = MissingDict(**map_values)

In [102]:
# Test mapping to ensure a team that doesn't need changing is returned as-is.
mapping["Hull City"]

'Hull City'

In [103]:
# Test the mapping to ensure a team that does need changing to it's short-hand format is correctly mapped.
mapping["Queens Park Rangers"]

'QPR'

We can then use ``mapping`` with the Pandas ``map`` method.

In [104]:
combined["new_team"] = combined["team"].map(mapping)

If we had passed the basic ``map_values`` dictionary to the Pandas ``map`` method, the ``new_team`` column would be empty for Hull City etc. because they don't exist in the dictionary.

In [105]:
combined[combined["team"] == "Hull City"]

Unnamed: 0,actual,predicted,date,team,opponent,result,new_team
2849,0,0,2025-01-01,Hull City,Middlesbrough,L,Hull City
2850,0,0,2025-01-04,Hull City,Leeds United,D,Hull City
2851,1,0,2025-01-18,Hull City,Millwall,W,Hull City
2852,0,0,2025-01-21,Hull City,QPR,L,Hull City
2853,1,0,2025-01-24,Hull City,Sheffield Utd,W,Hull City
2854,0,0,2025-02-01,Hull City,Stoke City,L,Hull City
2855,0,0,2025-02-12,Hull City,Burnley,L,Hull City
2856,0,0,2025-02-15,Hull City,Norwich City,D,Hull City
2857,1,0,2025-02-22,Hull City,Sunderland,W,Hull City
2858,0,0,2025-02-25,Hull City,Cardiff City,L,Hull City


We can now use ``new_team`` to merge ``combined`` with itself. 

In [106]:
merged = combined.merge(combined, left_on=["date", "new_team"], right_on=["date", "opponent"])

In [107]:
merged

Unnamed: 0,actual_x,predicted_x,date,team_x,opponent_x,result_x,new_team_x,actual_y,predicted_y,team_y,opponent_y,result_y,new_team_y
0,0,0,2025-01-01,Blackburn Rovers,Leeds United,D,Blackburn,0,1,Leeds United,Blackburn,D,Leeds Utd
1,0,1,2025-01-04,Blackburn Rovers,Burnley,L,Blackburn,1,1,Burnley,Blackburn,W,Burnley
2,1,0,2025-01-15,Blackburn Rovers,Portsmouth,W,Blackburn,0,0,Portsmouth,Blackburn,L,Portsmouth
3,0,0,2025-01-18,Blackburn Rovers,Oxford United,L,Blackburn,1,0,Oxford United,Blackburn,W,Oxford United
4,0,0,2025-01-21,Blackburn Rovers,Coventry City,L,Blackburn,1,0,Coventry City,Blackburn,W,Coventry City
...,...,...,...,...,...,...,...,...,...,...,...,...,...
519,1,0,2025-04-12,West Bromwich Albion,Watford,W,West Brom,0,0,Watford,West Brom,L,Watford
520,0,0,2025-04-18,West Bromwich Albion,Coventry City,L,West Brom,1,1,Coventry City,West Brom,W,Coventry City
521,0,0,2025-04-21,West Bromwich Albion,Derby County,L,West Brom,1,0,Derby County,West Brom,W,Derby County
522,0,0,2025-04-26,West Bromwich Albion,Cardiff City,D,West Brom,0,0,Cardiff City,West Brom,D,Cardiff City


We can now look at all the games where the home team were predicted to win and the away team were predicted to not win and get the value count of home wins.

In [108]:
merged[(merged["predicted_x"] == 1) & (merged["predicted_y"] == 0)]["actual_x"].value_counts()

actual_x
1    27
0    22
Name: count, dtype: int64

In [109]:
27 / (27 + 22)


0.5510204081632653

Precision for home wins is **55%**

We can then do the same for games where the **home** team were predicted to **not win** and the **away** team **were** predicted win and get the value count of away wins.

In [110]:
merged[(merged["predicted_x"] == 0) & (merged["predicted_y"] == 1)]["actual_y"].value_counts()

actual_y
1    35
0    27
Name: count, dtype: int64

In [111]:
35 / (35 + 27)

0.5645161290322581

Precision for away wins is **56%**