In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from bs4 import BeautifulSoup
import requests
import warnings
warnings.filterwarnings('ignore')

In [2]:
def get_odd_or_text(td):
    """Extract the value to store for one table cell.

    Lookup order:
      1. the cell's own ``data-odd`` attribute;
      2. the ``data-odd`` attribute of the first descendant that has one;
      3. the cell's stripped text, or the integer ``0`` when that text is empty.
    """
    if "data-odd" not in td.attrs:
        nested = td.select_one("[data-odd]")
        if not nested:
            # No odd anywhere in the cell: fall back to its visible text.
            return td.get_text(strip=True) or 0
        return nested["data-odd"]
    return td["data-odd"]

In [3]:
# Common prefix of every season's results URL; a "<start>-<end>/results/" suffix
# is appended per season when the URL list is built.
base_url = "https://www.betexplorer.com/soccer/england/premier-league-"

In [4]:
# Accumulators filled by later cells: the scraped table rows, and the season
# result-page URLs to fetch.
all_data = []
url_list = []

In [5]:
# Build one results-page URL per season, from 2002-2003 through 2022-2023.
url_list.clear()  # rebuild from scratch so re-running this cell cannot duplicate URLs
for year in range(2002, 2023):
    year_next = year + 1
    url = f"{base_url}{year}-{year_next}/results/"
    url_list.append(url)

In [6]:
# Download every season's results page once and flatten its table rows into all_data.
all_data.clear()  # start empty so re-running this cell does not append every match twice
for url in url_list:
    response = requests.get(url)
    if response.status_code == 404:
        # No results page exists for this season; skip it.
        continue
    # Parse the body that was already downloaded instead of requesting the page a second time.
    soup = BeautifulSoup(response.content, "html.parser")
    for row in soup.select(".table-main tr:has(td)"):
        tds = [get_odd_or_text(td) for td in row.select("td")]
        # Walk back to the closest header cell above this row and read the first
        # <th> of its row (presumably the round heading -- verify against the page markup).
        round_ = row.find_previous("th").find_previous("tr").th.text
        all_data.append([round_, *tds])

In [7]:
# Assemble the scraped rows into a frame, then derive team and goal columns.
raw_columns = ["Round", "Match", "Score", "1", "X", "2", "Date"]
df = pd.DataFrame(all_data, columns=raw_columns)

# "Home-Away" -> separate team columns (each match string is split only once).
match_parts = [fixture.split('-') for fixture in df['Match']]
df['Home'] = [parts[0] for parts in match_parts]
df['Away'] = [parts[1] for parts in match_parts]

# "H:A" -> separate goal columns; the values stay strings at this point.
score_parts = [result.split(':', 1) for result in df['Score']]
df['HomeGoals'] = [parts[0] for parts in score_parts]
df['AwayGoals'] = [parts[1] for parts in score_parts]
df['Competition'] = "English Premier League"

In [8]:
# Dimensions of the assembled frame as (rows, columns).
df.shape

(7980, 12)

In [9]:
# Peek at the first three rows of the frame.
df.head(3)

Unnamed: 0,Round,Match,Score,1,X,2,Date,Home,Away,HomeGoals,AwayGoals,Competition
0,38. Round,Birmingham-West Ham,2:2,2.86,3.54,2.02,11.05.2003,Birmingham,West Ham,2,2,English Premier League
1,38. Round,Bolton-Middlesbrough,2:1,1.57,3.62,4.7,11.05.2003,Bolton,Middlesbrough,2,1,English Premier League
2,38. Round,Charlton-Fulham,0:1,2.19,3.14,3.06,11.05.2003,Charlton,Fulham,0,1,English Premier League


In [10]:
# Peek at the last three rows of the frame.
df.tail(3)

Unnamed: 0,Round,Match,Score,1,X,2,Date,Home,Away,HomeGoals,AwayGoals,Competition
7977,1. Round,Newcastle-Nottingham,2:0,1.58,4.03,6.17,06.08.2022,Newcastle,Nottingham,2,0,English Premier League
7978,1. Round,Tottenham-Southampton,4:1,1.36,5.2,8.73,06.08.2022,Tottenham,Southampton,4,1,English Premier League
7979,1. Round,Crystal Palace-Arsenal,0:2,4.58,3.57,1.84,05.08.2022,Crystal Palace,Arsenal,0,2,English Premier League


In [11]:
# Reorder "dd.mm.yyyy" into "yyyy-mm-dd"; the date string is split once and its
# pieces (0 = day, 1 = month, 2 = year) are reassembled.
date_parts = df['Date'].str.split('.', expand=True)
df['Date'] = date_parts[2] + '-' + date_parts[1] + '-' + date_parts[0]

In [12]:
# Keep only the analysis columns, in display order. The explicit copy makes the
# result an independent frame, so the column assignments in later cells do not
# write into a slice of the original (SettingWithCopyWarning).
df = df[['Date','Competition','Home','Away','HomeGoals','AwayGoals','1','X','2']].copy()
df.head(3)

Unnamed: 0,Date,Competition,Home,Away,HomeGoals,AwayGoals,1,X,2
0,2003-05-11,English Premier League,Birmingham,West Ham,2,2,2.86,3.54,2.02
1,2003-05-11,English Premier League,Bolton,Middlesbrough,2,1,1.57,3.62,4.7
2,2003-05-11,English Premier League,Charlton,Fulham,0,1,2.19,3.14,3.06


In [13]:
def Home_win(df):
    """Return 1 when the home side scored more goals than the away side, else 0.

    ``df`` is a single match row (the function is applied with ``axis=1``)
    exposing ``HomeGoals`` and ``AwayGoals``.

    The goal values are cast to ``int`` before comparing: the columns are
    produced by splitting the score text, so they hold strings, and string
    comparison is lexicographic (``"10" > "9"`` is False).

    Raises:
        ValueError: if a goal value cannot be parsed as an integer.
    """
    return 1 if int(df.HomeGoals) > int(df.AwayGoals) else 0

def Away_win(df):
    """Return 1 when the away side scored more goals than the home side, else 0.

    ``df`` is a single match row (the function is applied with ``axis=1``)
    exposing ``HomeGoals`` and ``AwayGoals``.

    The goal values are cast to ``int`` before comparing: the columns are
    produced by splitting the score text, so they hold strings, and string
    comparison is lexicographic (``"9" < "10"`` is False).

    Raises:
        ValueError: if a goal value cannot be parsed as an integer.
    """
    return 1 if int(df.HomeGoals) < int(df.AwayGoals) else 0

def draw(df):
    """Return 1 when both sides have the same goal value, else 0.

    ``df`` is a single match row (the function is applied with ``axis=1``)
    exposing ``HomeGoals`` and ``AwayGoals``; the two values are compared
    for equality exactly as stored.
    """
    level = df.HomeGoals == df.AwayGoals
    return 1 if level else 0


# Add one 0/1 flag column per match outcome, evaluated row by row.
for outcome_col, outcome_fn in (('HomeWin', Home_win), ('AwayWin', Away_win), ('Draw', draw)):
    df[outcome_col] = df.apply(outcome_fn, axis=1)

In [14]:
# Check the first rows now that the HomeWin / AwayWin / Draw columns exist.
df.head(3)

Unnamed: 0,Date,Competition,Home,Away,HomeGoals,AwayGoals,1,X,2,HomeWin,AwayWin,Draw
0,2003-05-11,English Premier League,Birmingham,West Ham,2,2,2.86,3.54,2.02,0,0,1
1,2003-05-11,English Premier League,Bolton,Middlesbrough,2,1,1.57,3.62,4.7,1,0,0
2,2003-05-11,English Premier League,Charlton,Fulham,0,1,2.19,3.14,3.06,0,1,0


In [15]:
# Convert the three odds columns (home / draw / away) to float64 so they can be
# compared numerically.
odds_columns = ['1', 'X', '2']
df[odds_columns] = df[odds_columns].astype(np.float64)

In [16]:
# Lowest odd that belongs to a favourite band; anything below it is 'Outsider'.
_MIN_BINNED_ODD = 1.01

# (exclusive upper edge, label) for each band, in ascending order. Each band
# starts where the previous one ends, so no value between two bands is left out.
_ODDS_BANDS = (
    (1.31, '1.01-1.3'),
    (1.51, '1.31-1.5'),
    (1.81, '1.51-1.8'),
    (2.01, '1.81-2.0'),
    (2.61, '2.01-2.6'),
)


def _odds_band(odd):
    """Return the band label for a single decimal odd, or 'Outsider' if none matches."""
    if odd >= _MIN_BINNED_ODD:
        for upper_edge, label in _ODDS_BANDS:
            if odd < upper_edge:
                return label
    # Reached for odds of 2.61 and above, odds below 1.01, and NaN.
    return 'Outsider'


def home_bins(df):
    """Label a match row by the band its home-win odd (column '1') falls into.

    ``df`` is a single row (the function is applied with ``axis=1``). Odds quoted
    to two decimals get the same label as before; values that used to fall
    between two bands (e.g. 1.505) now land in the lower band instead of being
    labelled 'Outsider'.
    """
    return _odds_band(df['1'])


def away_bins(df):
    """Label a match row by the band its away-win odd (column '2') falls into.

    ``df`` is a single row (the function is applied with ``axis=1``). Uses the
    same bands as ``home_bins``.
    """
    return _odds_band(df['2'])

# Attach the odds-band label for the home side and for the away side.
for bin_col, bin_fn in (('HomeBins', home_bins), ('AwayBins', away_bins)):
    df[bin_col] = df.apply(bin_fn, axis=1)

In [17]:
# Spot-check ten random rows; the fixed seed keeps the displayed sample the same
# on every Restart & Run All.
df.sample(10, random_state=42)

Unnamed: 0,Date,Competition,Home,Away,HomeGoals,AwayGoals,1,X,2,HomeWin,AwayWin,Draw,HomeBins,AwayBins
3004,2009-08-30,English Premier League,Aston Villa,Fulham,2,0,1.75,3.47,4.86,1,0,0,1.51-1.8,Outsider
4775,2014-12-20,English Premier League,Manchester City,Crystal Palace,3,0,1.3,5.6,10.87,1,0,0,1.01-1.3,Outsider
1525,2007-05-13,English Premier League,Middlesbrough,Fulham,3,1,1.84,3.31,4.07,1,0,0,1.81-2.0,Outsider
1723,2006-12-17,English Premier League,Manchester City,Tottenham,1,2,2.4,3.18,2.81,0,1,0,2.01-2.6,Outsider
7572,2021-08-29,English Premier League,Wolves,Manchester Utd,0,1,4.95,3.7,1.76,0,1,0,Outsider,1.51-1.8
104,2003-02-22,English Premier League,Bolton,Manchester Utd,1,1,5.02,3.62,1.6,0,0,1,Outsider,1.51-1.8
513,2004-02-11,English Premier League,Charlton,Tottenham,2,4,2.13,3.15,3.05,0,1,0,2.01-2.6,Outsider
5690,2016-08-15,English Premier League,Chelsea,West Ham,2,1,1.52,4.27,6.88,1,0,0,1.51-1.8,Outsider
2623,2008-09-13,English Premier League,Blackburn,Arsenal,0,4,4.5,3.38,1.81,0,1,0,Outsider,1.81-2.0
13,2003-05-03,English Premier League,Blackburn,West Brom,1,1,1.37,4.12,7.5,0,0,1,1.31-1.5,Outsider
