# Package installations and imports

In [1]:
# Module installations
# !pip install -U matplotlib
!pip install xgboost

Defaulting to user installation because normal site-packages is not writeable


In [2]:
import pandas as pd
import numpy as np
import sklearn as skn
import matplotlib.pyplot as plt
import xgboost as xgb
import requests
import datetime
import traceback
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import RandomOverSampler


# Load dataset

In [71]:
DATASET_NAME = "premier_league_copy"
matches = pd.read_csv(f"{DATASET_NAME}.csv").dropna()
matches.date = pd.to_datetime(matches.date)
matches = matches.sort_values("date", ascending=True).reset_index(drop=True)
matches

Unnamed: 0,id,date,status,league_name,league_id,league_type,stage,home_team,home_team_id,away_team,...,>2.5,>3.5,>4.5,elevation,temperature_2m,rain,cloud_cover,wind_speed_10m,wind_direction_10m,relative_humidity_2m
0,327362,2021-08-13 19:00:00+00:00,FINISHED,Premier League,2021,LEAGUE,REGULAR_SEASON,Brentford,402,Arsenal,...,0,0,0,10.0,17.2,0.0,44.0,14.1,218.0,82.0
1,327357,2021-08-14 11:30:00+00:00,FINISHED,Premier League,2021,LEAGUE,REGULAR_SEASON,Man United,66,Leeds United,...,1,0,0,32.0,16.4,0.3,89.0,11.0,251.0,76.0
2,327359,2021-08-14 14:00:00+00:00,FINISHED,Premier League,2021,LEAGUE,REGULAR_SEASON,Everton,62,Southampton,...,0,0,0,41.0,15.5,0.8,100.0,5.2,304.0,87.0
3,327360,2021-08-14 14:00:00+00:00,FINISHED,Premier League,2021,LEAGUE,REGULAR_SEASON,Chelsea,61,Crystal Palace,...,0,0,0,9.0,23.0,0.0,35.0,17.2,237.0,56.0
4,327358,2021-08-14 14:00:00+00:00,FINISHED,Premier League,2021,LEAGUE,REGULAR_SEASON,Leicester City,338,Wolverhampton,...,0,0,0,54.0,19.1,0.5,75.0,8.8,279.0,83.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1091,436232,2024-04-24 18:45:00+00:00,FINISHED,Premier League,2021,LEAGUE,REGULAR_SEASON,Wolverhampton,76,Bournemouth,...,0,0,0,130.0,4.6,0.0,4.0,7.4,61.0,73.0
1092,436230,2024-04-24 19:00:00+00:00,FINISHED,Premier League,2021,LEAGUE,REGULAR_SEASON,Man United,66,Sheffield Utd,...,1,0,0,32.0,3.0,0.0,70.0,8.7,263.0,86.0
1093,436227,2024-04-24 19:00:00+00:00,FINISHED,Premier League,2021,LEAGUE,REGULAR_SEASON,Everton,62,Liverpool,...,0,0,0,41.0,3.8,0.0,25.0,7.7,298.0,83.0
1094,436226,2024-04-24 19:00:00+00:00,FINISHED,Premier League,2021,LEAGUE,REGULAR_SEASON,Crystal Palace,354,Newcastle,...,0,0,0,55.0,4.2,0.0,46.0,6.1,315.0,79.0


In [72]:
matches = matches[["date", "home_team_id", "away_team_id", "home/away",]]
matches

Unnamed: 0,date,home_team_id,away_team_id,home/away
0,2021-08-13 19:00:00+00:00,402,57,1
1,2021-08-14 11:30:00+00:00,66,341,1
2,2021-08-14 14:00:00+00:00,62,340,1
3,2021-08-14 14:00:00+00:00,61,354,1
4,2021-08-14 14:00:00+00:00,338,76,1
...,...,...,...,...
1091,2024-04-24 18:45:00+00:00,76,1044,0
1092,2024-04-24 19:00:00+00:00,66,356,1
1093,2024-04-24 19:00:00+00:00,62,64,1
1094,2024-04-24 19:00:00+00:00,354,67,1


-0.11405392794053924

# Producing the required outcomes p/model

In [26]:
# Define the classes
matches["home/away"] = (matches.score_home_ft > matches.score_away_ft).astype(int)
matches["draw"] = (matches.score_home_ft == matches.score_away_ft).astype(int)
matches["both_score"] = ((matches.score_home_ft > 0) | (matches.score_away_ft > 0)).astype(int)
matches[">1.5"] = ((matches.score_home_ft + matches.score_away_ft) / 2  > 1.5).astype(int)
matches[">2.5"] = ((matches.score_home_ft + matches.score_away_ft) / 2  > 2.5).astype(int)
matches[">3.5"] = ((matches.score_home_ft + matches.score_away_ft) / 2  > 3.5).astype(int)
matches[">4.5"] = ((matches.score_home_ft + matches.score_away_ft) / 2  > 3.5).astype(int)
matches = matches.drop(columns=["score_home_ft", "score_away_ft", "score_home_ht", "score_away_ht"], axis=1).dropna().reset_index(drop=True)
matches

Unnamed: 0,id,date,status,league_name,league_id,league_type,stage,home_team,home_team_id,away_team,...,home_odds,draw_odds,away_odds,home/away,draw,both_score,>1.5,>2.5,>3.5,>4.5
0,327362,2021-08-13 19:00:00+00:00,FINISHED,Premier League,2021,LEAGUE,REGULAR_SEASON,Brentford,402,Arsenal,...,3.63,3.63,1.98,1,0,1,0,0,0,0
1,327357,2021-08-14 11:30:00+00:00,FINISHED,Premier League,2021,LEAGUE,REGULAR_SEASON,Man United,66,Leeds United,...,1.61,4.10,5.23,1,0,1,1,1,0,0
2,327359,2021-08-14 14:00:00+00:00,FINISHED,Premier League,2021,LEAGUE,REGULAR_SEASON,Everton,62,Southampton,...,1.93,3.67,3.80,1,0,1,1,0,0,0
3,327360,2021-08-14 14:00:00+00:00,FINISHED,Premier League,2021,LEAGUE,REGULAR_SEASON,Chelsea,61,Crystal Palace,...,1.28,5.69,10.14,1,0,1,0,0,0,0
4,327358,2021-08-14 14:00:00+00:00,FINISHED,Premier League,2021,LEAGUE,REGULAR_SEASON,Leicester City,338,Wolverhampton,...,1.60,3.89,5.64,1,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1098,436292,2024-04-27 14:00:00+00:00,FINISHED,Premier League,2021,LEAGUE,REGULAR_SEASON,Wolverhampton,76,Luton Town,...,1.95,3.85,3.74,1,0,1,0,0,0,0
1099,436288,2024-04-27 14:00:00+00:00,FINISHED,Premier League,2021,LEAGUE,REGULAR_SEASON,Newcastle,67,Sheffield Utd,...,1.21,7.38,12.32,1,0,1,1,1,0,0
1100,436287,2024-04-27 14:00:00+00:00,FINISHED,Premier League,2021,LEAGUE,REGULAR_SEASON,Man United,66,Burnley,...,1.55,4.83,5.25,0,1,1,0,0,0,0
1101,436285,2024-04-27 16:30:00+00:00,FINISHED,Premier League,2021,LEAGUE,REGULAR_SEASON,Everton,62,Brentford,...,2.49,3.47,2.83,1,0,1,0,0,0,0


In [44]:
# Save the current CSV file with the progress of the data
def save_dataset_copy():
    matches.to_csv(f"{DATASET_NAME}_copy.csv", index=False)
save_dataset_copy()

In [27]:
unallocated_stadium_names = list()
for index, match in matches.iterrows():
    if stadium_names.get(match.venue) is None:
        if match.venue not in unallocated_stadium_names:
            unallocated_stadium_names.append(match.venue)
unallocated_stadium_names # Should return an empty list

[]

# Location and weather functions

In [28]:
# Additions of weather conditions to the data
def get_stadium_coordinates(stadium_name):
  COORDINATES_ENDPOINT = f"https://geocode-api.arcgis.com/arcgis/rest/services/World/GeocodeServer/findAddressCandidates?=&f=json&token=AAPK4ccecd97951a4c1da6418e3fba9e2c58t5qYY8i5kRqFQkT8iKyh_VhKWIQnTVSrH7Cxw-w69tnHkqj91rbvOTyZlMkhQNBV&address={stadium_name}+stadium"
  coordinates_response = requests.get(COORDINATES_ENDPOINT)
  if coordinates_response.status_code == 200:
    locations_found = coordinates_response.json()["candidates"]
    if len(locations_found) == 0: return None, None, None

    best_location = None
    highest_score = 0
    for location_found in locations_found:
      if location_found["score"] > highest_score:
        best_location = location_found
        highest_score = location_found["score"]

    if best_location:
      lat = best_location["location"]["y"]
      lng = best_location["location"]["x"]
      place_id = abs(lat + lng) # use the unique lat/lng coords as the stadium ID
      return lat, lng, place_id


"""Gets the elevation of a place given the coordinates"""
def get_place_elevation(lat, lng):
    elevation = requests.get(f"https://api.open-elevation.com/api/v1/lookup?locations={lat},{lng}")
    if elevation.status_code == 200:
        return elevation.json()["results"][0]["elevation"]
    else:
        return None


def get_stadium_weather(lat, lng, timestamp, forecast=True):
    try:
        if lat != None and lng != None and timestamp:
            date = datetime.datetime.fromtimestamp(timestamp)
            day = date.strftime("%Y-%m-%d")
            weather_response = requests.get(
               f"https://archive-api.open-meteo.com/v1/archive?latitude={lat}&longitude={lng}&start_date={day}&end_date={day}&hourly=temperature_2m,rain,cloud_cover,wind_speed_10m,wind_direction_10m,relative_humidity_2m&timeformat=unixtime" if forecast == False else f"https://api.open-meteo.com/v1/forecast?latitude={lat}&longitude={lng}&start_date={day}&end_date={day}&hourly=temperature_2m,rain,cloud_cover,wind_speed_10m,wind_direction_10m,relative_humidity_2m&timeformat=unixtime")
            print()
               
            if weather_response.status_code == 200:
                response = weather_response.json()
                hour = date.hour
                
                return dict(
                    elevation=response["elevation"],
                    temperature_2m=response["hourly"]["temperature_2m"][hour],
                    rain=response["hourly"]["rain"][hour],
                    cloud_cover=response["hourly"]["cloud_cover"][hour],
                    wind_speed_10m=response["hourly"]["wind_speed_10m"][hour],
                    wind_direction_10m=response["hourly"]["wind_direction_10m"][hour],
                    relative_humidity_2m=response["hourly"]["relative_humidity_2m"][hour],
                )
            else:
                print(weather_response.json())
                return dict(elevation=None,
                            temperature_2m=None,
                            rain=None,
                            cloud_cover=None,
                            wind_speed_10m=None,
                            relative_humidity_2m=None,
                            wind_direction_10m=None)
        else:
            return dict(elevation=None,
                        temperature_2m=None,
                        rain=None,
                        cloud_cover=None,
                        wind_direction_10m=None,
                        relative_humidity_2m=None,
                        wind_speed_10m=None)
    except:
        print(traceback.format_exc())
        return dict(elevation=None,
                        temperature_2m=None,
                        rain=None,
                        cloud_cover=None,
                        wind_direction_10m=None,
                        relative_humidity_2m=None,
                        wind_speed_10m=None)

# Get stadium names and coordinates

In [None]:
# Outputs all the stadiums into a dictionary
stadium_names = dict()
for index, name in enumerate(matches.venue.values):
    print("Pgrs: ", round(index / matches.venue.values.shape[0] * 100, 2))
    stadium_names[name] = get_stadium_coordinates(name)
stadium_names

# Loading the stadium names by coordinates

In [48]:
# Coordinates and placeID (lat+lng)
stadium_names = {
    'City Ground, Nottingham': [52.9398241, -1.1354287],
    'St. Mary\'s Stadium': [50.9058106, -1.3926777],
    'The American Express Community Stadium': [50.8615651, -0.0862912],
    'Vicarage Road Stadium': [51.6499133, -0.4015284],
    'ARENA MRV': [-19.929952, -44.014057],
    'De Grolsch Veste': [52.2365348, 6.8352838],
    'Unipol Domus': [39.199621, 9.137395],
    'Stade Francis-Le Blé': [48.402995, -4.461427],
    'Hillsborough': [53.411664, -1.501752],
    'London Stadium': [51.538714, -0.016506],
    'Benito Villamarín': [37.35649, -5.98195],
    'Philips Stadion': [51.4415191, 5.4673271],
    'Kras Stadion': [40.533993, 22.20226],
    'Anfield': [53.4308435, -2.9633923],
    'Stade Pierre Mauroy': [50.6128586, 3.1301388],
    'Stade de la Mosson': [43.6226285, 3.8116955],
    'Stade Saint Symphorien': [49.109774, 6.159539],
    'Amex Stadium': [50.8615651, -0.0862912],
    'Stadio Brianteo': [45.5825046, 9.3091837],
    'Villa Park': [52.5091117, -1.8873577],
    'Deutsche Bank Park': [50.0686015, 8.6454229],
    'Campo de Fútbol de Vallecas': [40.392062, -3.6612824],
    'Old Trafford': [53.4597707, -2.288024],
    'Estadio dos Arcos': [41.3631, -8.74012],
    'Roazhon Park': [48.1074881, -1.7154413],
    'BayArena': [51.038209, 6.9973857],
    'Giuseppe Meazza': [45.4785976, 9.1233441],
    'San Mamés': [43.2641706, -2.951948],
    'Estadio Jose Gomes': [38.7520231, -9.2279409],
    'Serrinha': [-11.6666237, -39.0105109],
    'Arena da Baixada': [-25.4478575, -49.2759763],
    'Diego Armando Maradona': [40.8265023, 14.1920857],
    'Cívitas Metropolitano': [40.4362323, -3.6020568],
    'Estadio do Futebol Clube de Vizela': [41.3887095, -8.3073114],
    'Beira-Rio': [-30.0654496, -51.2384349],
    'Neo Química Arena': [-23.5463647, -46.4742934],
    'Engenhão': [-22.8932749, -43.2948877],
    'Stadio Carlo Castellani': [43.72667, 10.95464],
    'Stadio Olimpico, Rome': [41.9317718, 12.4556773],
    'Nuevo Los Cármenes': [37.152851, -3.595725],
    'Coventry Building Society Arena': [52.448158, -1.495712],
    'Estadio do Bessa XXI': [41.162187, -8.642561],
    'Arena Fonte Nova': [-12.9798324, -38.5042312],
    'Couto Pereira': [-25.4209759, -49.2620552],
    'Estádio Nacional Mané Garrincha': [-15.78356, -47.9018195],
    'Johan Cruijff ArenA': [52.3143557, 4.9392738],
    'Pantanal': [-15.6040194, -56.1241994],
    'Morumbí': [-23.600084, -46.720136],
    'Merck-Stadion am Böllenfalltor': [49.8576828, 8.6697562],
    "Stadio Renato Dell'Ara": [44.4925713, 11.3096494],
    'Parc des Princes': [48.8414355, 2.2481775],
    'King Power Stadium': [52.6203662, -1.1447644],
    'Estadio de Gran Canaria': [28.1003747, -15.4567183],
    'Estadio do Dragao': [41.16177, -8.5861659],
    'Craven Cottage': [51.4749002, -0.2264787],
    'El Sadar': [42.7966918, -1.6396947],
    'Stadio Arechi': [40.6511269, 14.7923143],
    'Reale Arena': [43.3013668, -1.9761617],
    'Red Bull Arena': [40.7368435, -74.1551064],
    'Emirates Stadium': [51.5550821, -0.1109747],
    'Allianz Arena': [48.2187971, 11.6221367],
    'Parken': [55.702755, 12.572343],
    'Estadio Santiago Bernabeu': [40.4521084, -3.6884956],
    'Independência': [-19.9087707, -43.9228718],
    'Urbano Caldeira': [-23.9510244, -46.3414042],
    'Maracanã': [-22.9160007, -43.2300071],
    'São Januario': [-22.89073, -43.22896],
    'Stadio Mapei - Citta del Tricolore': [44.7145752, 10.6464456],
    'Fortuna Sittard Stadion': [50.9920034, 5.8436585],
    'Borussia-Park': [51.1751503, 6.3843216],
    'Stadio Luigi Ferraris': [44.4161239, 8.9519026],
    'Ewood Park': [53.7286169, -2.49175482],
    'Antonio Coimbra da Mota': [38.715883, -9.406346],
    'Molineux Stadium': [52.5902362, -2.1330037],
    'Stadium of Light': [54.914561, -1.3909459],
    'Stadio Via del Mare': [40.3651295, 18.2064023],
    'Mercedes-Benz Arena': [52.5062262, 13.4412531],
    'WWK ARENA': [48.3231778, 10.8838436],
    'Cardiff City Stadium': [51.4728372, -3.2056092],
    "Saint Mary's": [50.9058105, -1.3949737],
    'MKM Stadium': [53.7459183, -0.3706354],
    'Elland Road': [53.777792, -1.572083],
    'Vicarage Road': [51.649818, -0.401838],
    'Selhurst Park': [51.397894, -0.085924],
    'Portman Road': [52.05494, 1.14525],
    'Riverside': [54.5779904, -1.2179441],
    'Loftus Road': [51.50912, -0.231915],
    'Estadio de los Juegos Mediterráneos': [36.8400327, -2.4354519],
    'Estádio Municipal de Portimão': [37.1358092, -8.5397795],
    'Stade Auguste Delaune': [49.2470549, 4.0213585],
    'Allianz Stadium': [-33.8893938, 151.2245596],
    'Vitality Stadium': [50.7348316, -1.8416532],
    'Vonovia Ruhrstadion': [51.4898225, 7.2376155],
    'Mandemakers Stadion': [51.6866276, 5.086397],
    'Santiago Bernabéu': [40.4532703, -3.690148],
    'Stade Océane': [49.4993417, 0.1667196],
    'GelreDome': [51.9631196, 5.8892381],
    'Estadio D. Afonso Henriques': [41.44584, -8.30072],
    'Arena Barueri': [-23.5121766, -46.9028573],
    'Yanmar Stadion': [52.3933654, 5.2384738],
    'Stade Gabriel Montpied': [45.81539, 3.1189],
    'Dacia Arena - Stadio Friuli': [46.0814817, 13.1971946],
    'Artemio Franchi, Firenze': [43.7811446, 11.2820626],
    'Olímpic Lluís Companys': [41.36475, 2.15561],
    'Municipal de Barcelos': [41.5510987, -8.6220715],
    'De Kuip': [51.8938354, 4.5220959],
    'Stamford Bridge': [51.481663, -0.1935314],
    'Wohninvest Weserstadion': [53.0671442, 8.8383548],
    'Ramón Sánchez Pizjuán': [37.38421, -5.97069],
    'Municipal de Arouca': [40.932975, -8.25032],
    'Nabizão': [-22.9653951, -46.5400473],
    'Stadion Galgenwaard': [52.07847, 5.14587],
    'Arena do Grêmio': [-29.9741742, -51.1956365],
    'Stade Bollaert-Delelis': [50.4328518, 2.8149571],
    'Estadio da Luz': [38.752592, -9.184668],
    'Castelão': [2.5487596, -44.2858445],
    'Mineirão': [19.865867, -43.9723101],
    'RheinEnergieStadion': [50.9334619, 6.875141],
    'Mendizorroza': [42.83709, -2.68824],
    'New York Stadium': [253.4270248, -1.3655963],
    'Etihad Stadium': [53.4831634, -41.2238111],
    'An der Alten Försterei': [52.4574353, 13.5632896],
    'Signal Iduna Park': [51.4925888, 7.4492825],
    'Europa-Park Stadion': [48.0216207, 7.8295546],
    'Volkswagen Arena': [52.43224, 10.8025807],
    'bet365 Stadium': [52.9882849, -2.1786709],
    'Ashton Gate': [51.439877, -2.621021],
    'Carrow Road': [52.6220876, 1.3066076],
    'John Smith’s Stadium': [53.654323, -1.768921],
    'Deepdale': [53.7723074, -2.6933985],
    "St James' Park": [54.9741178, -1.6277682],
    'Kenilworth Road': [51.88419, -0.43171],
    'The Den': [51.69447, -0.042399],
    'Bramall Lane': [53.3705246, -1.4735892],
    'Swansea.com Stadium': [51.6427932, -3.9382112],
    "St Andrew's Ground": [52.4751532, -1.8708628],
    'Home Park': [51.4242181, -4.3284733],
    'City Ground': [52.9398241, -1.1354287],
    'Turf Moor': [53.7889918, -2.2327442],
    'Estadio de Mestalla': [39.474549, -0.358205],
    'Van Donge & De Roo Stadion': [51.6438242, 4.751706],
    'Gewiss Stadium': [45.7090796, 9.678241],
    'Brentford Community Stadium': [51.490801, -0.2937049],
    'Coliseum Alfonso Pérez': [40.3257419, -3.7149057],
    'The Hawthorns': [52.5090842, -1.9664843],
    'Abe Lenstra Stadion': [52.9591087, 5.9344877],
    'Stade de la Meinau': [48.55942, 7.75738],
    'MAC³PARK stadion': [52.5163708, 6.1209173],
    'Allianz Riviera': [43.7044505, 7.1921797],
    'Estadio de la Cerámica': [39.9437535, -0.103732],
    'De Goffert': [51.8219656, 5.833922],
    'AFAS Stadion': [52.612973, 4.7392798],
    'Stade de la Beaujoire': [47.2561, -1.52466],
    'Stadio Benito Stirpe': [41.6356594, 13.3228588],
    'Stade du Moustoir': [47.7910622, -3.4160668],
    'Tottenham Hotspur Stadium': [51.6041837, -0.0661199],
    'Voith-Arena': [48.6684699, 10.1366995],
    'Het Kasteel': [51.9197992, 4.4306541],
    'PreZero Arena': [49.2383665, 8.886727],
    'Goodison Park': [53.4388599, -2.9687967],
    'Nuevo Mirandilla': [36.502601, -6.2755034],
    'Groupama Stadium': [45.765295, 4.9794541],
    'Estádio Parque do Sabiá': [-18.91, -48.23],
    'Stadio Marcantonio Bentegodi': [45.43531, 10.96874],
    'Municipal de Montilivi': [41.9610546, 2.8251803],
    'Volksparkstadion': [53.5871748, 9.8986017],
    'Stadion Wankdorf': [46.9627952, 7.4656044],
    'Nef Stadyumu': [41.103203, 28.9886681],
    'Visit Mallorca Estadi': [39.589952, 2.6274751],
    'Estadio Municipal de Braga': [41.5625546, -8.4298374],
    'Allianz Parque': [-23.5279676, -46.6792052],
    'Estadio Municipal Eng. Manuel Branco Teixeira': [41.7506825, -7.4674106],
    'Estadio de Sao Luis': [39.357684, -10.3367182],
    'Municipal 22 de Junho': [41.4013865, -8.525005],
    'Erve Asito': [52.3388758, 6.6475107],
    'Estadio Pina Manique': [38.7369042, -9.2050409],
    'De Adelaarshorst': [52.2605519, 6.1729166],
    'Stadium Municipal': [43.5832972, 1.4314731],
    'Stade Louis II': [43.726671, 7.4151808],
    'MEWA ARENA': [49.9839442, 8.2218067],
    'Estadio Comendador Joaquim de Almeida Freitas': [41.378895590364, -8.354907325555],
    'Orange Vélodrome': [43.2698327, 5.3958908],
    'Stadio Olimpico Grande Torino': [45.0425838, 7.6514638],
    'Municipal de Balaídos': [42.2127767, -8.7369502],
    'Estadio Jose Alvalade': [38.7612286, -9.1593912],
    'Marakana': [-22.9160007, -43.2300071],
    'Bosuil': [51.2324121, 4.4695293],
    'Celtic Park': [55.849696, -4.2081176],
    'Estádio Heriberto Hülse': [-28.684537, -49.367663],
    'Serra Dourada': [-16.699536, -49.234565],
    'Barradão': [-12.9195064, -38.4307914],
    'Octávio Mangabeira': [-12.2156168, -38.9333891],
    'Alfredo Jaconi': [-29.1629286, -51.1792717],
    'Antônio Accioly': [-16.6696586, -49.2865241]
}


In [76]:
# Recalculate the placeID since coordinates of the stadiuns gave changed.
for stadium_name in stadium_names.keys():
    stadium_names[stadium_name] = list(stadium_names[stadium_name])[:2]
stadium_names

{'ARENA MRV': [-19.929952, -44.014057],
 'De Grolsch Veste': [52.2365348, 6.8352838],
 'Unipol Domus': [39.199621, 9.137395],
 'Stade Francis-Le Blé': [48.402995, -4.461427],
 'Hillsborough': [53.411664, -1.501752],
 'London Stadium': [51.538714, -0.016506],
 'Benito Villamarín': [37.35649, -5.98195],
 'Philips Stadion': [51.4415191, 5.4673271],
 'Kras Stadion': [40.533993, 22.20226],
 'Anfield': [53.4308435, -2.9633923],
 'Stade Pierre Mauroy': [50.6128586, 3.1301388],
 'Stade de la Mosson': [43.6226285, 3.8116955],
 'Stade Saint Symphorien': [49.109774, 6.159539],
 'Amex Stadium': [50.8615651, -0.0862912],
 'Stadio Brianteo': [45.5825046, 9.3091837],
 'Villa Park': [52.5091117, -1.8873577],
 'Deutsche Bank Park': [50.0686015, 8.6454229],
 'Campo de Fútbol de Vallecas': [40.392062, -3.6612824],
 'Old Trafford': [53.4597707, -2.288024],
 'Estadio dos Arcos': [41.3631, -8.74012],
 'Roazhon Park': [48.1074881, -1.7154413],
 'BayArena': [51.038209, 6.9973857],
 'Giuseppe Meazza': [45.4785

# Find and append weather condition features to each row.

In [None]:
# Determine the weather condition of the matches played in the time of the match
matches_ = matches.copy()
for index, row in matches_.iterrows():
    stadium_lat, stadium_lng = stadium_names[row.venue]
    timestamp = row.date.timestamp()

    print(f"Progress: {round(index/len(matches_)*100, 2)}%")

    match_weather_conditions = get_stadium_weather(stadium_lat, stadium_lng, timestamp, forecast=False)
    # Iter through each weather variable and append it to the row.
    for weather_condition in match_weather_conditions.keys():
        matches_.loc[index, weather_condition] = match_weather_conditions[weather_condition]
matches_

In [None]:
matches = matches_.dropna().reset_index(drop=True)
save_dataset_copy()
matches

# Combine the weather features into one weather factor value

In [None]:
from sklearn.decomposition import PCA
pca = PCA(n_components=1)
matches_weather_factor = pca.fit_transform(matches[matches.columns[22:]])
matches_weather_factor = pd.concat([matches.drop(columns=matches.columns[22:]), pd.DataFrame(matches_weather_factor.reshape(-1, 1), columns=["weather_condition"])], axis=1)
matches_weather_factor = matches_weather_factor.dropna().reset_index(drop=True)
matches_weather_factor

# Convert categorical variables to integers

In [74]:
# Categorial encodings to integers
matches_encoded = matches.copy()

teams = {}

for index, match in matches_encoded.iterrows():
    encoded_teams = teams.keys()

    # Extract datetime features
    timestamp = datetime.datetime.fromtimestamp(match.date.timestamp())
    matches_encoded.loc[index, "weekday"] = timestamp.weekday()
    matches_encoded.loc[index, "hour"] = timestamp.hour

    # Venue feature
    # stdm_lat, stdm_lng = stadium_names[match.venue]
    # matches_encoded.loc[index, "venue"] = sum([stdm_lat + stdm_lng])

    # if match.home_team not in encoded_teams:
    #     teams[match.home_team] = match.home_team_id
    # if match.away_team not in encoded_teams:
    #     teams[match.away_team] = match.away_team_id
print(teams)
matches_encoded

{}


Unnamed: 0,date,home_team_id,away_team_id,home/away,weekday,hour
0,2021-08-13 19:00:00+00:00,402,57,1,4.0,21.0
1,2021-08-14 11:30:00+00:00,66,341,1,5.0,13.0
2,2021-08-14 14:00:00+00:00,62,340,1,5.0,16.0
3,2021-08-14 14:00:00+00:00,61,354,1,5.0,16.0
4,2021-08-14 14:00:00+00:00,338,76,1,5.0,16.0
...,...,...,...,...,...,...
1091,2024-04-24 18:45:00+00:00,76,1044,0,2.0,20.0
1092,2024-04-24 19:00:00+00:00,66,356,1,2.0,21.0
1093,2024-04-24 19:00:00+00:00,62,64,1,2.0,21.0
1094,2024-04-24 19:00:00+00:00,354,67,1,2.0,21.0


In [75]:
matches_encoded = matches_encoded.drop(columns=["date"], axis=1)
outcome = matches_encoded.pop("home/away")
matches_encoded["home/away"] = outcome
matches_encoded

Unnamed: 0,home_team_id,away_team_id,weekday,hour,home/away
0,402,57,4.0,21.0,1
1,66,341,5.0,13.0,1
2,62,340,5.0,16.0,1
3,61,354,5.0,16.0,1
4,338,76,5.0,16.0,1
...,...,...,...,...,...
1091,76,1044,2.0,20.0,0
1092,66,356,2.0,21.0,1
1093,62,64,2.0,21.0,1
1094,354,67,2.0,21.0,1


In [40]:
# Depending on the focus model, remove outcomes not needed.
matches_encoded = matches_encoded.drop(columns=["draw",
                                                "both_score", 
                                                ">1.5", 
                                                ">2.5", 
                                                ">3.5",
                                                ">4.5"], axis=1)

# Rearrange the columns for better visual format before training
matches_encoded = matches_encoded[["home_team_id", 
                                        "away_team_id", 
                                        "venue", 
                                        "weekday", 
                                        "hour", 
                                        "home_odds", 
                                        "draw_odds", 
                                        "away_odds", 
                                        "weather_condition", 
                                        "home/away"]]
matches_encoded

Unnamed: 0,home_team_id,away_team_id,venue,weekday,hour,home_odds,draw_odds,away_odds,weather_condition,home/away
0,402,57,51.197096,4.0,21.0,3.63,3.63,1.98,-22.823699,1
1,66,341,51.171747,5.0,13.0,1.61,4.10,5.23,-55.056913,1
2,62,340,50.470063,5.0,16.0,1.93,3.67,3.80,-108.012571,1
3,61,354,51.288132,5.0,16.0,1.28,5.69,10.14,-41.457191,1
4,338,76,51.475602,5.0,16.0,1.60,3.89,5.64,-83.173866,1
...,...,...,...,...,...,...,...,...,...,...
1091,76,1044,50.457232,2.0,20.0,2.89,3.61,2.40,134.410724,0
1092,66,356,51.171747,2.0,21.0,1.34,5.85,7.99,-67.531455,1
1093,62,64,50.470063,2.0,21.0,7.18,5.07,1.42,-102.922451,1
1094,354,67,51.31197,2.0,21.0,2.81,3.73,2.40,-119.516254,1


In [60]:
"""Clean up the data before feeding it into training."""
matches_encoded = matches_encoded.dropna().drop_duplicates().reset_index(drop=True)
matches_encoded

Unnamed: 0,home_team_id,away_team_id,cloud_cover,rain,venue,weekday,hour,home/away
0,402,57,44.0,0.0,51.197096,4.0,21.0,1
1,66,341,89.0,0.3,51.171747,5.0,13.0,1
2,62,340,100.0,0.8,50.470063,5.0,16.0,1
3,61,354,35.0,0.0,51.288132,5.0,16.0,1
4,338,76,75.0,0.5,51.475602,5.0,16.0,1
...,...,...,...,...,...,...,...,...
1090,76,1044,4.0,0.0,50.457232,2.0,20.0,0
1091,66,356,70.0,0.0,51.171747,2.0,21.0,1
1092,62,64,25.0,0.0,50.470063,2.0,21.0,1
1093,354,67,46.0,0.0,51.31197,2.0,21.0,1


# Combine game features into one for simplicity for the model


In [284]:
game_features = matches_encoded[matches_encoded.columns[:-2]]
game_features_comb = pd.DataFrame(pca.fit_transform(game_features.values), columns=["game_condition"])

model_data = matches_encoded[matches_encoded.columns[-2:]]
model_data = pd.concat([game_features_comb, model_data["home/away"]], axis=1).reset_index(drop=True)
model_data = matches_encoded
matches_encoded

Unnamed: 0,home_team_id,away_team_id,venue,weekday,hour,home_odds,draw_odds,away_odds,weather_condition,home/away
0,402,57,51.197096,4.0,21.0,3.63,3.63,1.98,-22.823699,1
1,66,341,51.171747,5.0,13.0,1.61,4.10,5.23,-55.056913,1
2,62,340,50.470063,5.0,16.0,1.93,3.67,3.80,-108.012571,1
3,61,354,51.288132,5.0,16.0,1.28,5.69,10.14,-41.457191,1
4,338,76,51.475602,5.0,16.0,1.60,3.89,5.64,-83.173866,1
...,...,...,...,...,...,...,...,...,...,...
1091,76,1044,50.457232,2.0,20.0,2.89,3.61,2.40,134.410724,0
1092,66,356,51.171747,2.0,21.0,1.34,5.85,7.99,-67.531455,1
1093,62,64,50.470063,2.0,21.0,7.18,5.07,1.42,-102.922451,1
1094,354,67,51.31197,2.0,21.0,2.81,3.73,2.40,-119.516254,1


# Model training

In [77]:
"""Train set"""
TRAIN_SAMPLE_SIZE = int(len(matches_encoded) * 0.8)
X = matches_encoded[:TRAIN_SAMPLE_SIZE]
Y = matches_encoded["home/away"][:TRAIN_SAMPLE_SIZE]

"""Test set"""
X_test = matches_encoded[matches_encoded.columns[:-1]][TRAIN_SAMPLE_SIZE-1:-1]
Y_test = matches_encoded["home/away"][TRAIN_SAMPLE_SIZE-1:-1]


print("Training set:")
print(X.shape, Y.shape)

print("\nTesting set:")
print(X_test.shape, Y_test.shape)

Training set:
(876, 5) (876,)

Testing set:
(220, 4) (220,)


In [62]:
X

Unnamed: 0,home_team_id,away_team_id,cloud_cover,rain,venue,weekday,hour,home/away
0,402,57,44.0,0.0,51.197096,4.0,21.0,1
1,66,341,89.0,0.3,51.171747,5.0,13.0,1
2,62,340,100.0,0.8,50.470063,5.0,16.0,1
3,61,354,35.0,0.0,51.288132,5.0,16.0,1
4,338,76,75.0,0.5,51.475602,5.0,16.0,1
...,...,...,...,...,...,...,...,...
871,57,328,20.0,0.0,51.444107,5.0,17.0,1
872,66,389,2.0,0.0,51.171747,5.0,17.0,1
873,1044,67,100.0,0.0,48.893178,5.0,19.0,1
874,563,351,100.0,0.8,51.522208,6.0,16.0,1


In [88]:
"""Normalize the data"""
scaler = StandardScaler()
X_normalized = scaler.fit_transform(X[X.columns[:-1]])
X_normalized.shape

(876, 4)

In [64]:
""""Equaly scale the data set using random oversampler"""
def random_oversample_dataset(dataset):
  X = dataset

  ros = RandomOverSampler()
  X_scaled, Y_scaled = ros.fit_resample(X, Y.values)

  return X_scaled, Y_scaled


""""Perfom an accuracy test"""
def perfomance_metric(X_test, Y_test, model):
  Y_preds = model.predict(X_test)
  combined_actual_to_preds = pd.DataFrame(dict(actual=Y_test, prediction=Y_preds))
  print(classification_report(Y_test, Y_preds))
  return pd.crosstab(index=combined_actual_to_preds["actual"], columns=combined_actual_to_preds["prediction"])

In [83]:
# To test without Standard or MinMax scaler.
X_normalized = X[X.columns[:-1]]

In [89]:
X_scaled, Y_scaled = random_oversample_dataset(X_normalized)
print(X_scaled.shape, Y_scaled.shape)

(952, 4) (952,)


In [90]:
"""Model defination"""
classifier = RandomForestClassifier(n_estimators=1024)
classifier.fit(X_scaled, Y_scaled)
print(perfomance_metric(X_test.values, Y_test.values, model=classifier))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       117
           1       0.47      1.00      0.64       103

    accuracy                           0.47       220
   macro avg       0.23      0.50      0.32       220
weighted avg       0.22      0.47      0.30       220

prediction    1
actual         
0           117
1           103


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [149]:
classifier.get_params()

{'objective': 'binary:hinge',
 'base_score': None,
 'booster': None,
 'callbacks': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'device': 'gpu',
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': None,
 'feature_types': None,
 'gamma': 120,
 'grow_policy': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': None,
 'max_bin': None,
 'max_cat_threshold': None,
 'max_cat_to_onehot': None,
 'max_delta_step': None,
 'max_depth': None,
 'max_leaves': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'multi_strategy': None,
 'n_estimators': None,
 'n_jobs': None,
 'num_parallel_tree': None,
 'random_state': None,
 'reg_alpha': 0.5,
 'reg_lambda': None,
 'sampling_method': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}