In [1]:
import pandas as pd

# Pull one fixtures/results CSV per EPL season (start years 2016 through 2023)
# and stack them into a single DataFrame.
frames = []

for year in range(2016, 2024):
    url = "https://fixturedownload.com/download/epl-%s-GMTStandardTime.csv" % year
    print(url)
    # Tag every row with its season start year so the seasons stay
    # distinguishable once they are concatenated.
    frame = pd.read_csv(url).assign(Season=year)
    frames.append(frame)

# NOTE: each season keeps its own 0-based row labels, so index values repeat
# across seasons in the combined frame.
df = pd.concat(frames)
df.head()

https://fixturedownload.com/download/epl-2016-GMTStandardTime.csv
https://fixturedownload.com/download/epl-2017-GMTStandardTime.csv
https://fixturedownload.com/download/epl-2018-GMTStandardTime.csv
https://fixturedownload.com/download/epl-2019-GMTStandardTime.csv
https://fixturedownload.com/download/epl-2020-GMTStandardTime.csv
https://fixturedownload.com/download/epl-2021-GMTStandardTime.csv
https://fixturedownload.com/download/epl-2022-GMTStandardTime.csv
https://fixturedownload.com/download/epl-2023-GMTStandardTime.csv


Unnamed: 0,Match Number,Round Number,Date,Location,Home Team,Away Team,Result,Season
0,1,1,13/08/2016 12:30,KCOM Stadium,Hull,Leicester,2 - 1,2016
1,2,1,13/08/2016 15:00,Turf Moor,Burnley,Swansea,0 - 1,2016
2,3,1,13/08/2016 15:00,Selhurst Park,Crystal Palace,West Brom,0 - 1,2016
3,4,1,13/08/2016 15:00,Goodison Park,Everton,Spurs,1 - 1,2016
4,5,1,13/08/2016 15:00,Riverside Stadium,Middlesbrough,Stoke,1 - 1,2016


In [2]:
# Show the last rows of the combined frame (the end of the latest season loaded).
df.tail()

Unnamed: 0,Match Number,Round Number,Date,Location,Home Team,Away Team,Result,Season
375,376,38,19/05/2024 16:00,Selhurst Park,Crystal Palace,Aston Villa,,2023
376,377,38,19/05/2024 16:00,Anfield,Liverpool,Wolves,,2023
377,378,38,19/05/2024 16:00,Kenilworth Road,Luton,Fulham,,2023
378,379,38,19/05/2024 16:00,Etihad Stadium,Man City,West Ham,,2023
379,380,38,19/05/2024 16:00,Bramall Lane,Sheffield Utd,Spurs,,2023


In [3]:
# Filter out the matches that haven't been played yet (rows with no Result).
# .copy() makes the filtered frame independent of the frame it was sliced
# from, so the columns added to it in later cells do not trigger pandas'
# SettingWithCopyWarning.
df = df[df['Result'].notna()].copy()
df.tail()

Unnamed: 0,Match Number,Round Number,Date,Location,Home Team,Away Team,Result,Season
275,272,28,10/03/2024 13:00,Villa Park,Aston Villa,Spurs,0 - 4,2023
276,274,28,10/03/2024 14:00,Amex Stadium,Brighton,Nottingham Forest,1 - 0,2023
277,279,28,10/03/2024 14:00,London Stadium,West Ham,Burnley,2 - 2,2023
278,277,28,10/03/2024 15:45,Anfield,Liverpool,Man City,1 - 1,2023
279,275,28,11/03/2024 20:00,Stamford Bridge,Chelsea,Newcastle,3 - 2,2023


In [4]:
import re

def parse_scores(result):
    """Return every whole-number token found in ``result`` as a list of ints.

    A score string such as ``"2 - 1"`` yields ``[2, 1]``; a string without
    digits yields an empty list.
    """
    return [int(token) for token in re.findall(r'\b\d+\b', result)]

def parse_home_score(row):
    """Return the first number in ``row['Result']``, i.e. the home side's goals."""
    scores = parse_scores(row['Result'])
    return scores[0]


def parse_away_score(row):
    """Return the second number in ``row['Result']``, i.e. the away side's goals."""
    scores = parse_scores(row['Result'])
    return scores[1]


# Derive numeric score columns from the textual 'Result' column, one row at a time.
for column, parser in (('Home Score', parse_home_score),
                       ('Away Score', parse_away_score)):
    df[column] = df.apply(parser, axis=1)

df.head()

Unnamed: 0,Match Number,Round Number,Date,Location,Home Team,Away Team,Result,Season,Home Score,Away Score
0,1,1,13/08/2016 12:30,KCOM Stadium,Hull,Leicester,2 - 1,2016,2,1
1,2,1,13/08/2016 15:00,Turf Moor,Burnley,Swansea,0 - 1,2016,0,1
2,3,1,13/08/2016 15:00,Selhurst Park,Crystal Palace,West Brom,0 - 1,2016,0,1
3,4,1,13/08/2016 15:00,Goodison Park,Everton,Spurs,1 - 1,2016,1,1
4,5,1,13/08/2016 15:00,Riverside Stadium,Middlesbrough,Stoke,1 - 1,2016,1,1


In [5]:
# Collect the distinct team names from the 'Home Team' column, in order of
# first appearance. Teams are taken from home fixtures only.
team_names = pd.unique(df['Home Team'])
df.describe()

Unnamed: 0,Match Number,Round Number,Season,Home Score,Away Score
count,2939.0,2939.0,2939.0,2939.0,2939.0
mean,185.714529,19.101735,2019.379721,1.552229,1.263015
std,108.310702,10.878727,2.235324,1.328682,1.222894
min,1.0,1.0,2016.0,0.0,0.0
25%,92.0,10.0,2017.0,1.0,0.0
50%,184.0,19.0,2019.0,1.0,1.0
75%,276.0,28.0,2021.0,2.0,2.0
max,380.0,38.0,2023.0,9.0,9.0


In [6]:
# Select the rows whose 'Home Team' value is missing.
missing_teams = df.loc[df['Home Team'].isna()]

# Report the outcome either way so the check is visible in the output.
if missing_teams.empty:
    print("No missing teams.")
else:
    print("Missing Teams:")
    print(missing_teams['Home Team'])

No missing teams.


In [7]:
# Per-team scoring averages, keyed by team name.
team_data = {}

for team in team_names:
    # Matches this team hosted, and matches where it was the visitor.
    home_team_rows = df.loc[df['Home Team'] == team]
    away_team_rows = df.loc[df['Away Team'] == team]

    # Goals the team scored: 'Home Score' when hosting, 'Away Score' when visiting.
    avg_home_goals_for = home_team_rows['Home Score'].mean()
    avg_away_goals_for = away_team_rows['Away Score'].mean()

    # Goals the team conceded. NOTE: the naming follows the *opponent's* side:
    # "home goals against" are goals scored by home opponents (i.e. conceded
    # while visiting), "away goals against" are goals scored by visiting
    # opponents (i.e. conceded while hosting).
    avg_home_goals_against = away_team_rows['Home Score'].mean()
    avg_away_goals_against = home_team_rows['Away Score'].mean()

    # Ratings are the plain average of the two per-venue averages.
    OFF_rating = (avg_home_goals_for + avg_away_goals_for) / 2
    DEF_rating = (avg_home_goals_against + avg_away_goals_against) / 2

    team_data[team] = dict([
        ('Home Goals For', avg_home_goals_for),
        ('Away Goals For', avg_away_goals_for),
        ('Home Goals Against', avg_home_goals_against),
        ('Away Goals Against', avg_away_goals_against),
        ('OFF Rating', OFF_rating),
        ('DEF Rating', DEF_rating),
    ])

# Print every stored figure for each team, one team per paragraph.
print("Team Data:")
for team in team_data:
    data = team_data[team]
    print(f"{team}:")
    print(f"  Home Goals For: {data['Home Goals For']:.5f}")
    print(f"  Away Goals For: {data['Away Goals For']:.5f}")
    print(f"  Home Goals Against: {data['Home Goals Against']:.5f}")
    print(f"  Away Goals Against: {data['Away Goals Against']:.5f}")
    print(f"  OFF Rating: {data['OFF Rating']:.2f}")
    print(f"  DEF Rating: {data['DEF Rating']:.2f}")
    print()


Team Data:
Hull:
  Home Goals For: 1.47368
  Away Goals For: 0.47368
  Home Goals Against: 2.36842
  Away Goals Against: 1.84211
  OFF Rating: 0.97
  DEF Rating: 2.11

Burnley:
  Home Goals For: 1.05469
  Away Goals For: 0.95312
  Home Goals Against: 1.59375
  Away Goals Against: 1.39062
  OFF Rating: 1.00
  DEF Rating: 1.49

Crystal Palace:
  Home Goals For: 1.18367
  Away Goals For: 1.13605
  Home Goals Against: 1.65306
  Away Goals Against: 1.27211
  OFF Rating: 1.16
  DEF Rating: 1.46

Everton:
  Home Goals For: 1.40136
  Away Goals For: 1.02721
  Home Goals Against: 1.60544
  Away Goals Against: 1.21088
  OFF Rating: 1.21
  DEF Rating: 1.41

Middlesbrough:
  Home Goals For: 0.89474
  Away Goals For: 0.52632
  Home Goals Against: 1.57895
  Away Goals Against: 1.21053
  OFF Rating: 0.71
  DEF Rating: 1.39

Southampton:
  Home Goals For: 1.16541
  Away Goals For: 1.09023
  Home Goals Against: 1.79699
  Away Goals Against: 1.48872
  OFF Rating: 1.13
  DEF Rating: 1.64

Man City:
  Hom

In [8]:
# Print only the OFF and DEF ratings for each team in team_data.
print("Team Data:")
for team in team_data:
    data = team_data[team]
    print(f"{team}:")
    print(f"  OFF Rating: {data['OFF Rating']:.5f}")
    print(f"  DEF Rating: {data['DEF Rating']:.5f}")
    print()

Team Data:
Hull:
  OFF Rating: 0.97368
  DEF Rating: 2.10526

Burnley:
  OFF Rating: 1.00391
  DEF Rating: 1.49219

Crystal Palace:
  OFF Rating: 1.15986
  DEF Rating: 1.46259

Everton:
  OFF Rating: 1.21429
  DEF Rating: 1.40816

Middlesbrough:
  OFF Rating: 0.71053
  DEF Rating: 1.39474

Southampton:
  OFF Rating: 1.12782
  DEF Rating: 1.64286

Man City:
  OFF Rating: 2.45578
  DEF Rating: 0.82653

Bournemouth:
  OFF Rating: 1.25688
  DEF Rating: 1.77064

Arsenal:
  OFF Rating: 1.88435
  DEF Rating: 1.18367

Chelsea:
  OFF Rating: 1.70005
  DEF Rating: 1.10870

Man Utd:
  OFF Rating: 1.63265
  DEF Rating: 1.12245

Stoke:
  OFF Rating: 1.00000
  DEF Rating: 1.63158

Swansea:
  OFF Rating: 0.96053
  DEF Rating: 1.65789

Spurs:
  OFF Rating: 1.89036
  DEF Rating: 1.14398

Watford:
  OFF Rating: 1.08421
  DEF Rating: 1.74737

West Brom:
  OFF Rating: 0.95614
  DEF Rating: 1.60526

Leicester:
  OFF Rating: 1.51504
  DEF Rating: 1.46241

Sunderland:
  OFF Rating: 0.76316
  DEF Rating: 1.81

In [9]:
# Work out each team's SPI rating for a season

In [10]:
import pandas as pd
import numpy as np
from scipy.stats import poisson
from datetime import date


In [11]:
# Build the ClubElo API URL for today's date and download the ratings table.
# (The stray no-op `date.today()` statement that used to sit between the
# print and the download has been removed.)
elo_rating_url="http://api.clubelo.com/%s" % date.today()
print("Downloading ELO Ratings from %s" % elo_rating_url)
elo_df = pd.read_csv(elo_rating_url)
elo_df.head()

Downloading ELO Ratings from http://api.clubelo.com/2024-03-16


Unnamed: 0,Rank,Club,Country,Level,Elo,From,To
0,1.0,Man City,ENG,1,2052.299072,2024-03-15,2024-03-17
1,2.0,Inter,ITA,1,1976.278931,2024-03-15,2024-03-17
2,3.0,Liverpool,ENG,1,1967.693604,2024-03-15,2024-03-17
3,4.0,Real Madrid,ESP,1,1966.906494,2024-03-15,2024-03-16
4,5.0,Arsenal,ENG,1,1956.634888,2024-03-15,2024-03-16


In [12]:
# Keep only English clubs. .copy() detaches the result from the downloaded
# frame so the column assignments below act on an independent frame and do not
# raise pandas' SettingWithCopyWarning.
elo_df = elo_df[elo_df["Country"]=="ENG"].copy()

# Alias columns used by the later lookups.
# NOTE(review): club names here follow ClubElo's spelling (e.g. "Tottenham"),
# which differs from the fixture data (e.g. "Spurs") — lookups by fixture name
# may need a mapping.
elo_df['Team'] = elo_df['Club']
elo_df['ELO Rating'] = elo_df['Elo']

elo_df.head()

Unnamed: 0,Rank,Club,Country,Level,Elo,From,To,Team,ELO Rating
0,1.0,Man City,ENG,1,2052.299072,2024-03-15,2024-03-17,Man City,2052.299072
2,3.0,Liverpool,ENG,1,1967.693604,2024-03-15,2024-03-17,Liverpool,1967.693604
4,5.0,Arsenal,ENG,1,1956.634888,2024-03-15,2024-03-16,Arsenal,1956.634888
11,12.0,Tottenham,ENG,1,1858.789551,2024-03-15,2024-03-16,Tottenham,1858.789551
13,14.0,Aston Villa,ENG,1,1836.786865,2024-03-15,2024-03-17,Aston Villa,1836.786865


In [13]:
# Calculate the win/draw/lose probabilities from correct-score probabilities using a Poisson model


# Number of goal counts modelled per side (scores 0 .. number_of_goals - 1).
number_of_goals = 9

def simulate_poisson_distribution(home_team, away_team, max_goals=None):
    """Build the correct-score probability matrix for two independent Poisson scorers.

    Args:
        home_team: Poisson mean (expected goals) for the home side. Despite
            the name, this is a number, not a team label.
        away_team: Poisson mean (expected goals) for the away side.
        max_goals: Number of goal counts per side (0 .. max_goals - 1).
            Defaults to the module-level ``number_of_goals``.

    Returns:
        A ``(max_goals, max_goals)`` array whose entry ``[h][a]`` is the
        probability that the home side scores ``h`` and the away side ``a``.
        Scores beyond the cut-off are not represented, so the entries sum to
        less than 1 (noticeably so when the means are large).
    """
    if max_goals is None:
        max_goals = number_of_goals

    # Evaluate each side's pmf once over all goal counts, then combine them
    # with an outer product instead of calling pmf for every cell.
    goals = np.arange(max_goals)
    home_probs = poisson.pmf(goals, home_team)
    away_probs = poisson.pmf(goals, away_team)
    return np.outer(home_probs, away_probs)


home_team_name = 'Chelsea'
away_team_name = 'Man City'

# Look up each side's ELO rating (first matching row in the ELO table).
home_elo_rating = elo_df.loc[elo_df['Team'] == home_team_name, 'ELO Rating'].to_numpy()[0]
away_elo_rating = elo_df.loc[elo_df['Team'] == away_team_name, 'ELO Rating'].to_numpy()[0]

# Scale a base goal rate by rating / 1000 to get each side's Poisson mean.
# Placeholder scaling for demonstration only — replace with a real calculation.
home_xg = 1.74208 * (home_elo_rating / 1000)
away_xg = 2.35229 * (away_elo_rating / 1000)

result_matrix = simulate_poisson_distribution(home_xg, away_xg)

# Print one "home-away: probability" line per cell, home goals varying slowest.
for home_goals, away_goals in np.ndindex(number_of_goals, number_of_goals):
    print(f'{home_goals}-{away_goals}: {result_matrix[home_goals][away_goals]:.4f}')

0-0: 0.0004
0-1: 0.0017
0-2: 0.0041
0-3: 0.0066
0-4: 0.0080
0-5: 0.0077
0-6: 0.0062
0-7: 0.0043
0-8: 0.0026
1-0: 0.0011
1-1: 0.0053
1-2: 0.0129
1-3: 0.0207
1-4: 0.0250
1-5: 0.0241
1-6: 0.0194
1-7: 0.0134
1-8: 0.0081
2-0: 0.0017
2-1: 0.0083
2-2: 0.0201
2-3: 0.0323
2-4: 0.0390
2-5: 0.0376
2-6: 0.0303
2-7: 0.0209
2-8: 0.0126
3-0: 0.0018
3-1: 0.0086
3-2: 0.0209
3-3: 0.0336
3-4: 0.0405
3-5: 0.0391
3-6: 0.0315
3-7: 0.0217
3-8: 0.0131
4-0: 0.0014
4-1: 0.0067
4-2: 0.0163
4-3: 0.0262
4-4: 0.0316
4-5: 0.0305
4-6: 0.0245
4-7: 0.0169
4-8: 0.0102
5-0: 0.0009
5-1: 0.0042
5-2: 0.0101
5-3: 0.0163
5-4: 0.0197
5-5: 0.0190
5-6: 0.0153
5-7: 0.0105
5-8: 0.0064
6-0: 0.0005
6-1: 0.0022
6-2: 0.0053
6-3: 0.0085
6-4: 0.0102
6-5: 0.0099
6-6: 0.0079
6-7: 0.0055
6-8: 0.0033
7-0: 0.0002
7-1: 0.0010
7-2: 0.0023
7-3: 0.0038
7-4: 0.0046
7-5: 0.0044
7-6: 0.0035
7-7: 0.0024
7-8: 0.0015
8-0: 0.0001
8-1: 0.0004
8-2: 0.0009
8-3: 0.0015
8-4: 0.0018
8-5: 0.0017
8-6: 0.0014
8-7: 0.0010
8-8: 0.0006


In [14]:
import numpy as np
from scipy.stats import poisson

# Number of goal counts modelled per side (scores 0 .. number_of_goals - 1).
number_of_goals = 9

# NOTE(review): this function and the constant above are also defined in an
# earlier cell of this notebook; keeping a single definition would avoid the
# two copies drifting apart.
def simulate_poisson_distribution(home_team, away_team, max_goals=None):
    """Build the correct-score probability matrix for two independent Poisson scorers.

    Args:
        home_team: Poisson mean (expected goals) for the home side. Despite
            the name, this is a number, not a team label.
        away_team: Poisson mean (expected goals) for the away side.
        max_goals: Number of goal counts per side (0 .. max_goals - 1).
            Defaults to the module-level ``number_of_goals``.

    Returns:
        A ``(max_goals, max_goals)`` array whose entry ``[h][a]`` is the
        probability that the home side scores ``h`` and the away side ``a``.
        Scores beyond the cut-off are not represented, so the entries sum to
        less than 1 (noticeably so when the means are large).
    """
    if max_goals is None:
        max_goals = number_of_goals

    # Evaluate each side's pmf once over all goal counts, then combine them
    # with an outer product instead of calling pmf for every cell.
    goals = np.arange(max_goals)
    home_probs = poisson.pmf(goals, home_team)
    away_probs = poisson.pmf(goals, away_team)
    return np.outer(home_probs, away_probs)

def print_result_grid(result_matrix):
    """Print a score-probability matrix as a text grid.

    Rows are home goals and columns are away goals; every cell shows the
    probability rounded to two decimal places. The header and separator are
    sized from the matrix itself, so the grid stays aligned for any matrix
    dimensions rather than only for nine columns.

    Args:
        result_matrix: 2-D sequence or array of probabilities indexed as
            ``result_matrix[home_goals][away_goals]``.
    """
    n_rows = len(result_matrix)
    n_cols = len(result_matrix[0]) if n_rows else 0

    # Each column occupies seven characters: the cell text plus a trailing '|'.
    header = "  |" + "".join(f"  {goals}   |" for goals in range(n_cols))
    separator = "--+" + "------+" * n_cols
    print(header)
    print(separator)

    for home_goals in range(n_rows):
        cells = "".join(
            f" {result_matrix[home_goals][away_goals]:.2f} |"
            for away_goals in range(n_cols)
        )
        print(f"{home_goals} |{cells}")

def calculate_win_draw_probabilities(result_matrix):
    """Collapse a score matrix into home-win, away-win and draw probabilities.

    The matrix is indexed ``[home_goals][away_goals]``, so cells below the
    main diagonal are home wins, cells above it are away wins, and the
    diagonal itself holds the draws.

    Returns:
        A tuple ``(home_win_prob, away_win_prob, draw_prob)``.
    """
    below_diagonal = np.tril(result_matrix, k=-1)  # home goals > away goals
    above_diagonal = np.triu(result_matrix, k=1)   # away goals > home goals
    main_diagonal = np.diag(result_matrix)         # equal scores

    return below_diagonal.sum(), above_diagonal.sum(), main_diagonal.sum()

home_team_xg = 2.35229  # calculated from a separate source
away_team_xg = 1.74208  # calculated from a separate source

# Build the correct-score grid for this pairing and show it as a table.
result_matrix = simulate_poisson_distribution(home_team_xg, away_team_xg)
print_result_grid(result_matrix)

# Collapse the grid into the three match outcomes.
(home_win_prob,
 away_win_prob,
 draw_prob) = calculate_win_draw_probabilities(result_matrix)

print("\nWin Probabilities:")
print(f"\nHome Win Probability: {home_win_prob:.2f}")
print(f"Away Win Probability: {away_win_prob:.2f}")
print(f"Draw Probability: {draw_prob:.2f}")

# Prices are the reciprocals of the outcome probabilities.
home_price, draw_price, away_price = (
    1 / probability for probability in (home_win_prob, draw_prob, away_win_prob)
)

print("\nWin Prices:")
print(f"\nHome Price : {home_price:.2f}")
print(f"Draw Price : {draw_price:.2f}")
print(f"Away Price : {away_price:.2f}")

  |  0   |  1   |  2   |  3   |  4   |  5   |  6   |  7   |  8   |
--+------+------+------+------+------+------+------+------+------+
0 | 0.02 | 0.03 | 0.03 | 0.01 | 0.01 | 0.00 | 0.00 | 0.00 | 0.00 |
1 | 0.04 | 0.07 | 0.06 | 0.03 | 0.02 | 0.01 | 0.00 | 0.00 | 0.00 |
2 | 0.05 | 0.08 | 0.07 | 0.04 | 0.02 | 0.01 | 0.00 | 0.00 | 0.00 |
3 | 0.04 | 0.06 | 0.05 | 0.03 | 0.01 | 0.00 | 0.00 | 0.00 | 0.00 |
4 | 0.02 | 0.04 | 0.03 | 0.02 | 0.01 | 0.00 | 0.00 | 0.00 | 0.00 |
5 | 0.01 | 0.02 | 0.02 | 0.01 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
6 | 0.00 | 0.01 | 0.01 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
7 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
8 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |

Win Probabilities:

Home Win Probability: 0.52
Away Win Probability: 0.29
Draw Probability: 0.20

Win Prices:

Home Price : 1.94
Draw Price : 5.09
Away Price : 3.49


In [15]:
#Calculating win prob & supremacy

def calculate_win_probability(man_city_rating, liverpool_rating):
    """Return the first side's win probability from the rating difference.

    Uses a base-10 logistic curve with a scale of 400 rating points: equal
    ratings give 0.5, and a 400-point advantage gives 10/11.

    Args:
        man_city_rating: Rating of the side whose probability is returned.
        liverpool_rating: Rating of the opposing side.
    """
    rating_gap = man_city_rating - liverpool_rating
    return 1 / (1 + 10 ** (-rating_gap / 400))

def calculate_supremacy(man_city_rating, liverpool_rating):
    """Return the first side's supremacy over the second.

    Args:
        man_city_rating: Rating of the side whose supremacy is returned.
        liverpool_rating: Rating of the opposing side.

    Returns:
        ``+0.462476`` (up to floating-point rounding) when the first rating is
        higher, the negative of that when it is lower, and ``0.0`` when the
        ratings are equal.

    NOTE(review): the rating difference is divided by its own absolute value,
    so the size of the gap cancels out and only its sign survives. If the
    supremacy is meant to grow with the rating gap, the formula needs
    revisiting.
    """
    rating_gap = man_city_rating - liverpool_rating
    if rating_gap == 0:
        # Equal ratings previously divided zero by zero (ZeroDivisionError);
        # level teams have no supremacy either way.
        return 0.0
    return rating_gap * 0.462476 / abs(rating_gap)

# SPI ratings
# NOTE(review): the magnitudes resemble the ELO values downloaded earlier in
# this notebook — confirm which rating system these numbers come from.
man_city_rating, liverpool_rating = 2047, 1957

# Win probability and its reciprocal price for the first side.
win_probability = calculate_win_probability(man_city_rating, liverpool_rating)
price = 1 / win_probability

# Supremacy of the first side over the second.
supremacy = calculate_supremacy(man_city_rating, liverpool_rating)

print(f"Win Probability for Manchester City: {win_probability:.2f}")
print(f"Supremacy for Manchester City: {supremacy:.2f}")
print(f"Price for Manchester City: {price:.2f}")

Win Probability for Manchester City: 0.63
Supremacy for Manchester City: 0.46
Price for Manchester City: 1.60


In [17]:
# Win probabilities
# Home win probability = []
# Draw Probability = []
# Away win probability = []

# Prices
# Home price = []
# Draw Price = []
# Away Price = []

# Supremacy = []

# Results
# Home Goals = []
# Away Goals = []

# XG
# Home Team XG = []
# Away Team XG = []


In [18]:
import numpy as np

# Number of dice rolls
num_rolls = 10000

# Fixed seed so the simulated counts are identical on every run
# (Restart Kernel -> Run All reproduces the printed figures).
DICE_SEED = 42
rng = np.random.default_rng(DICE_SEED)

# Simulate rolling two six-sided dice; the upper bound (7) is exclusive.
dice_rolls = rng.integers(1, 7, size=(num_rolls, 2))

# Calculate the sum of each pair of dice
dice_sums = np.sum(dice_rolls, axis=1)

# Count occurrences of each sum. minlength=13 guarantees bins 0..12 exist even
# if a 12 never occurs; bins 0 and 1 are impossible sums and are dropped.
sum_counts = np.bincount(dice_sums, minlength=13)[2:]

# Calculate the percentage of each sum
sum_percentages = (sum_counts / num_rolls) * 100

# Print the count and percentage of each sum (sums start at 2)
for number, (count, percentage) in enumerate(zip(sum_counts, sum_percentages), start=2):
    print(f'Sum {number}: {count} times ({percentage:.2f}%)')


Sum 2: 258 times (2.58%)
Sum 3: 601 times (6.01%)
Sum 4: 845 times (8.45%)
Sum 5: 1082 times (10.82%)
Sum 6: 1383 times (13.83%)
Sum 7: 1717 times (17.17%)
Sum 8: 1352 times (13.52%)
Sum 9: 1115 times (11.15%)
Sum 10: 817 times (8.17%)
Sum 11: 561 times (5.61%)
Sum 12: 269 times (2.69%)
