In [2]:
import datetime
import os
import time
import traceback

import pandas as pd
import requests

In [6]:
# Name and version of the model this dataset is prepared for. Both values are
# interpolated into the raw and cleaned CSV file names used further down.
MODEL_NAME = "fulltime_win_outcome"
MODEL_VERSION = "1.0"

# DATA COLLECTION AND SCRAPING

In [64]:

"""Gets head to head stats for a game for the previous 10 matches."""
def get_head2head(match_id, timeout=30):
    """Fetch the aggregated head-to-head record for a match.

    Calls the football-data.org ``head2head`` sub-resource of the match with
    ``limit=5`` and returns the ``aggregates`` object of the JSON response.
    The API key is taken from the module-level ``api_keys`` pool through
    ``refresh_expired_keys``.

    Args:
        match_id: Match identifier interpolated into the endpoint URL.
        timeout: Seconds to wait for the HTTP response before ``requests``
            raises, so a stalled connection cannot hang the scraper.

    Returns:
        dict: The ``aggregates`` payload, or an empty dict when no API key is
        available, the response status is not 200, or the payload carries no
        aggregates. A dict is always returned so callers can use ``.get``.
    """
    endpoint = f"https://api.football-data.org/v4/matches/{match_id}/head2head?limit=5"

    request_id = refresh_expired_keys(api_keys)
    if not request_id:
        # Previously this path fell through and returned None implicitly.
        return dict()

    response = requests.get(endpoint,
                            headers={'X-Unfold-Lineups': "true",
                                     'X-Unfold-Goals': "true",
                                     'X-Auth-Token': request_id},
                            timeout=timeout)

    if response.status_code != 200:
        return dict()

    # `or` also covers an explicit null "aggregates" value in the payload.
    return response.json().get("aggregates") or dict()


def refresh_expired_keys(api_keys, window_seconds=60, max_requests=10):
    """Pick an API key that still has request budget, waiting if none has.

    Each entry of ``api_keys`` is a mutable ``[key, requests_made, window_start]``
    list. Entries whose window started at least ``window_seconds`` ago get
    their counter reset to 0 and their window restarted at the current time.
    The first entry whose counter is below ``max_requests`` is then charged
    one request and its key is returned.

    When every key is exhausted the function sleeps until the oldest window
    is due to expire and tries again. The clock is re-read on every pass;
    the earlier version read it once, so it could never observe an expired
    window and spun forever.

    Args:
        api_keys: List of ``[key, requests_made, window_start]`` lists;
            mutated in place.
        window_seconds: Length of the rate-limit window in seconds.
        max_requests: Requests allowed per key within one window.

    Returns:
        The selected API key, or ``None`` when ``api_keys`` is empty.
    """
    if not api_keys:
        # Nothing to wait for: without this guard an empty pool loops forever.
        return None

    while True:
        current_time = time.time()

        # Reset every key whose rate-limit window has elapsed.
        for key_info in api_keys:
            if current_time - key_info[2] >= window_seconds:
                key_info[2] = current_time
                key_info[1] = 0

        # Hand out the first key that still has budget in its window.
        for key_info in api_keys:
            if key_info[1] < max_requests:
                key_info[1] += 1
                return key_info[0]

        # All keys are exhausted: sleep until the oldest window should expire
        # (at least a short pause) instead of busy-spinning.
        oldest_window_start = min(key_info[2] for key_info in api_keys)
        remaining = window_seconds - (current_time - oldest_window_start)
        time.sleep(max(remaining, 0.1))


def get_matches(api_key, today, past_days, timeout=30):
    """Fetch the matches of the tracked competitions within a date window.

    Args:
        api_key: Token sent in the ``X-Auth-Token`` header.
        today: End of the window; formatted as ``%Y-%m-%d`` into ``dateTo``.
        past_days: Start of the window; formatted as ``%Y-%m-%d`` into
            ``dateFrom``.
        timeout: Seconds to wait for the HTTP response before ``requests``
            raises, so a stalled connection cannot hang the scraper.

    Returns:
        list: The ``matches`` array of the JSON response, or an empty list
        when the response status is not 200.
    """
    # Competition codes requested in one call ("ELC" used to be listed twice).
    competitions = ["PL", "ELC", "EC", "FL1", "BL1", "SA", "DED", "CLI"]
    date_to = today.strftime('%Y-%m-%d')
    date_from = past_days.strftime('%Y-%m-%d')
    url = (f"https://api.football-data.org/v4/matches?competitions={','.join(competitions)}"
           f"&dateTo={date_to}&dateFrom={date_from}")

    response = requests.get(url,
                            headers={'X-Unfold-Lineups': "true",
                                     'X-Unfold-Goals': "true",
                                     'X-Auth-Token': api_key},
                            timeout=timeout)

    if response.status_code != 200:
        return []
    return response.json()["matches"]
  

# API key pool. Keys are read from the FOOTBALL_DATA_API_KEYS environment
# variable (comma-separated) rather than being stored in the notebook.
# Keys that were previously hard-coded here should be treated as exposed and
# rotated.
# Each entry is [KEY, No. REQUESTS MADE, LAST REQ. TIMESTAMP]; the counter and
# timestamp start at 0.
api_keys = [
    [token.strip(), 0, 0]
    for token in os.environ.get("FOOTBALL_DATA_API_KEYS", "").split(",")
    if token.strip()
]

# Fail fast: with an empty pool the scraping loop below could never obtain a key.
if not api_keys:
    raise RuntimeError(
        "No API keys configured: set FOOTBALL_DATA_API_KEYS to a comma-separated "
        "list of football-data.org tokens before running this cell."
    )

In [None]:
today = datetime.datetime.now()
past_ten_days = today - datetime.timedelta(days=10)
print(today, past_ten_days)

# Column names of the scraped rows; the order matches the row assembled in the
# loop below. "agAwayWins" is spelled the way the cleaning cells select it.
COLUMNS = ["id", "day", "month", "year", "weekday", "hour", "minute", "matchday", "status", "leagueName", "leagueId", "leagueType", "stage", "homeName", "homeId", "awayName", "awayId", "scoreHomeHt",
           "scoreAwayHt", "scoreHomeFt", "scoreAwayFt", "agHomeWins", "agDraws", "agAwayWins", "avgGoals"]
TOTAL_DATA_REQUIRED = 100_000
request_count = 0
data = []

# Walk backwards in time, one 10-day window per iteration, until enough rows
# are collected or the run is interrupted from the keyboard. The interrupt is
# caught around the whole loop so that Ctrl-C during a back-off sleep still
# reaches the save step at the bottom.
try:
    while len(data) < TOTAL_DATA_REQUIRED:
        try:
            selected_key = refresh_expired_keys(api_keys)
            if selected_key:
                matches = get_matches(selected_key, today=today, past_days=past_ten_days)
                print("Received matches:", len(matches))

                # Rows are staged per window and committed only once the whole
                # window succeeded, so a retry after an error does not append
                # the same matches twice.
                window_rows = []
                for match in matches:
                    print()
                    head2head = get_head2head(match["id"]) or {}
                    number_of_matches = head2head.get("numberOfMatches")

                    # Skip matches without usable head-to-head aggregates. A
                    # zero match count would raise ZeroDivisionError below and
                    # make the loop retry the same window indefinitely.
                    if head2head.get("homeTeam") is None or not number_of_matches:
                        continue

                    print(head2head)
                    matchdate = datetime.datetime.strptime(match["utcDate"], "%Y-%m-%dT%H:%M:%SZ")

                    window_rows.append([
                        match["id"], # id
                        matchdate.day, # day
                        matchdate.month, # month
                        matchdate.year, # year
                        matchdate.isoweekday(), # weekday
                        matchdate.hour, # hour
                        matchdate.minute, # minute
                        match["matchday"], # matchday
                        match["status"], # status
                        match["competition"]["name"], # leagueName
                        match["competition"]["id"], # leagueId
                        match["competition"]["type"], # leagueType
                        match["stage"], # stage
                        match["homeTeam"]["shortName"], # homeName
                        match["homeTeam"]["id"], # homeId
                        match["awayTeam"]["shortName"], # awayName
                        match["awayTeam"]["id"], # awayId
                        match["score"]["halfTime"]["home"], # scoreHomeHt
                        match["score"]["halfTime"]["away"], # scoreAwayHt
                        match["score"]["fullTime"]["home"], # scoreHomeFt
                        match["score"]["fullTime"]["away"], # scoreAwayFt
                        head2head["homeTeam"]["wins"], # agHomeWins
                        head2head["homeTeam"]["draws"], # agDraws
                        head2head["awayTeam"]["wins"], # agAwayWins
                        head2head["totalGoals"] / number_of_matches # avgGoals
                    ])
                data.extend(window_rows)

                # Book the window request against the selected key and restart
                # its rate-limit window from now.
                for key_info in api_keys:
                    if key_info[0] == selected_key:
                        key_info[1] += 1
                        key_info[2] = time.time()  # Update last request time
                        request_count += 1

                        print("Request count:", request_count)
                        print("Matches count:", len(data))
                        break

                # Move the window ten days further into the past.
                today = past_ten_days
                past_ten_days = today - datetime.timedelta(days=10)
                print("Today:", today, " ------ Past 10 days:", past_ten_days)
            else:
                print("No API Key with available requests less than 10. Waiting 60 seconds...")
                time.sleep(60)  # Wait for 60 seconds before trying again
                # Reset the api keys after this point
                api_keys = [[api_key[0], 0, 0] for api_key in api_keys]

        except Exception:
            # Only ordinary errors are retried; KeyboardInterrupt and SystemExit
            # are no longer swallowed here.
            print(traceback.format_exc())
            # Wait 60 seconds before retrying, in case the request limit has been reached.
            time.sleep(60)
except KeyboardInterrupt:
    print("Interrupted; saving the rows collected so far.")

# Save whatever was collected, whether the loop finished or was interrupted.
df = pd.DataFrame(data, columns=COLUMNS)
df.to_csv(f"../relative_datasets/raw/{MODEL_NAME}-v{MODEL_VERSION}_{request_count}.csv", index=False)
print("Total requests made:", request_count)

# DATA CLEANING AND SAVING

In [43]:
# Load the raw scrape, drop duplicate and incomplete rows, renumber the index,
# then keep only the matches whose status is "FINISHED".
DATA = (
    pd.read_csv(f"../relative_datasets/raw/{MODEL_NAME}-v{MODEL_VERSION}.csv")
    .drop_duplicates()
    .dropna()
    .reset_index(drop=True)
    .loc[lambda frame: frame["status"] == "FINISHED"]
)
DATA

Unnamed: 0,id,day,month,year,weekday,hour,minute,matchday,status,leagueName,...,awayName,awayId,scoreHomeHt,scoreAwayHt,scoreHomeFt,scoreAwayFt,agHomeWins,agDraws,agAwayWins,avgGoals
0,444610,12,5,2024,7,10,30,36.0,FINISHED,Serie A,...,Empoli,445,1.0,0.0,2.0,0.0,3,2,0,3.2
1,495747,12,5,2024,7,11,0,1.0,FINISHED,Championship,...,Leeds United,341,0.0,0.0,0.0,0.0,0,1,4,2.6
2,441772,12,5,2024,7,12,30,33.0,FINISHED,Eredivisie,...,Volendam,1919,5.0,2.0,7.0,2.0,4,0,1,4.0
3,441774,12,5,2024,7,12,30,33.0,FINISHED,Eredivisie,...,PSV,674,1.0,0.0,1.0,1.0,0,2,3,3.6
4,441770,12,5,2024,7,12,30,33.0,FINISHED,Eredivisie,...,Almere City,1911,3.0,0.0,3.0,0.0,1,1,0,3.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6948,285448,15,6,2021,2,16,0,1.0,FINISHED,European Championship,...,Portugal,765,0.0,0.0,0.0,3.0,0,1,4,3.4
6949,285449,15,6,2021,2,19,0,1.0,FINISHED,European Championship,...,Germany,759,1.0,0.0,1.0,0.0,2,0,3,4.2
6950,285426,16,6,2021,3,13,0,2.0,FINISHED,European Championship,...,Russia,808,0.0,1.0,0.0,1.0,0,0,1,1.0
6951,285420,16,6,2021,3,16,0,2.0,FINISHED,European Championship,...,Wales,833,0.0,1.0,0.0,2.0,1,1,1,2.0


In [45]:
# Project DATA onto its numeric columns (date parts, ids, head-to-head
# aggregates and full-time scores) and cast everything to float.
DATA_NUMERIC = DATA.loc[
    :,
    [
        'day', 'month', 'year', 'weekday', 'hour', 'minute', 'matchday',
        'leagueId', 'homeId', 'awayId',
        'agHomeWins', 'agDraws', 'agAwayWins', 'avgGoals',
        'scoreHomeFt', 'scoreAwayFt',
    ],
].astype(float)
DATA_NUMERIC

Unnamed: 0,day,month,year,weekday,hour,minute,matchday,leagueId,homeId,awayId,agHomeWins,agDraws,agAwayWins,avgGoals,scoreHomeFt,scoreAwayFt
0,12.0,5.0,2024.0,7.0,10.0,30.0,36.0,2019.0,110.0,445.0,3.0,2.0,0.0,3.2,2.0,0.0
1,12.0,5.0,2024.0,7.0,11.0,0.0,1.0,2016.0,68.0,341.0,0.0,1.0,4.0,2.6,0.0,0.0
2,12.0,5.0,2024.0,7.0,12.0,30.0,33.0,2003.0,666.0,1919.0,4.0,0.0,1.0,4.0,7.0,2.0
3,12.0,5.0,2024.0,7.0,12.0,30.0,33.0,2003.0,1920.0,674.0,0.0,2.0,3.0,3.6,1.0,1.0
4,12.0,5.0,2024.0,7.0,12.0,30.0,33.0,2003.0,678.0,1911.0,1.0,1.0,0.0,3.5,3.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6948,15.0,6.0,2021.0,2.0,16.0,0.0,1.0,2018.0,827.0,765.0,0.0,1.0,4.0,3.4,0.0,3.0
6949,15.0,6.0,2021.0,2.0,19.0,0.0,1.0,2018.0,773.0,759.0,2.0,0.0,3.0,4.2,1.0,0.0
6950,16.0,6.0,2021.0,3.0,13.0,0.0,2.0,2018.0,1976.0,808.0,0.0,0.0,1.0,1.0,0.0,1.0
6951,16.0,6.0,2021.0,3.0,16.0,0.0,2.0,2018.0,803.0,833.0,1.0,1.0,1.0,2.0,0.0,2.0


In [14]:
# READ ALREADY SCRAPED DATA
# Loads the odds training CSV and rebinds DATA_NUMERIC to it, replacing the
# frame derived from DATA in the cell above.
# NOTE(review): later cells reference scoreHomeFt / scoreAwayFt / leagueId,
# which are not among the columns displayed for this file (Month, Day, Time,
# AvgH, ...). Running the notebook top to bottom therefore looks like it would
# fail in those cells - confirm which of the two data sources is intended.
DATA_NUMERIC = pd.read_csv("../relative_datasets/raw/odds_training.csv")
DATA_NUMERIC

Unnamed: 0,Month,Day,Time,AvgH,AvgD,AvgA,Avg>2.5,Avg<2.5,HTR,FTR,FTHG,FTAG,HS,AS,HC,AC
0,7,4,19,2.49,3.31,2.80,1.92,1.86,1.0,1,2,1,11.0,13.0,4.0,3.0
1,7,5,19,2.77,3.47,2.43,1.77,2.03,0.0,0,1,2,13.0,16.0,6.0,3.0
2,7,5,12,2.38,3.51,2.81,1.65,2.22,1.0,1,3,2,16.0,12.0,4.0,4.0
3,7,5,12,3.15,3.24,2.29,1.92,1.86,1.0,1,2,1,13.0,14.0,5.0,6.0
4,7,5,12,3.55,3.84,1.93,1.60,2.31,1.0,1,2,0,17.0,13.0,3.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15071,5,0,17,4.96,3.84,1.71,1.88,1.94,0.0,0,1,2,9.0,17.0,4.0,6.0
15072,5,0,19,2.16,3.47,3.10,1.85,1.91,0.0,-1,2,2,25.0,8.0,8.0,1.0
15073,5,0,19,2.07,3.08,3.77,2.26,1.60,-1.0,1,2,1,10.0,9.0,2.0,7.0
15074,5,0,17,1.83,3.79,3.91,1.67,2.16,1.0,1,1,0,9.0,16.0,2.0,8.0


In [46]:
# Drop drawn matches (equal full-time scores), renumber the index, and add the
# binary target: 1.0 when the home side scored more than the away side, else 0.0.
DATA_NUMERIC = (
    DATA_NUMERIC
    .loc[lambda frame: frame["scoreHomeFt"].ne(frame["scoreAwayFt"])]
    .reset_index(drop=True)
    .assign(target=lambda frame: frame["scoreHomeFt"].gt(frame["scoreAwayFt"]).astype(float))
)
DATA_NUMERIC

Unnamed: 0,day,month,year,weekday,hour,minute,matchday,leagueId,homeId,awayId,agHomeWins,agDraws,agAwayWins,avgGoals,scoreHomeFt,scoreAwayFt,target
0,12.0,5.0,2024.0,7.0,10.0,30.0,36.0,2019.0,110.0,445.0,3.0,2.0,0.0,3.2,2.0,0.0,1.0
1,12.0,5.0,2024.0,7.0,12.0,30.0,33.0,2003.0,666.0,1919.0,4.0,0.0,1.0,4.0,7.0,2.0,1.0
2,12.0,5.0,2024.0,7.0,12.0,30.0,33.0,2003.0,678.0,1911.0,1.0,1.0,0.0,3.5,3.0,0.0,1.0
3,12.0,5.0,2024.0,7.0,12.0,30.0,33.0,2003.0,1915.0,675.0,0.0,1.0,4.0,5.6,2.0,3.0,0.0
4,12.0,5.0,2024.0,7.0,12.0,30.0,33.0,2003.0,676.0,6806.0,4.0,0.0,1.0,2.8,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5224,15.0,6.0,2021.0,2.0,16.0,0.0,1.0,2018.0,827.0,765.0,0.0,1.0,4.0,3.4,0.0,3.0,0.0
5225,15.0,6.0,2021.0,2.0,19.0,0.0,1.0,2018.0,773.0,759.0,2.0,0.0,3.0,4.2,1.0,0.0,1.0
5226,16.0,6.0,2021.0,3.0,13.0,0.0,2.0,2018.0,1976.0,808.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
5227,16.0,6.0,2021.0,3.0,16.0,0.0,2.0,2018.0,803.0,833.0,1.0,1.0,1.0,2.0,0.0,2.0,0.0


In [48]:
# Keep the pre-match features plus the target; the full-time score columns are
# left out because the target was derived from them.
DATA_NUMERIC = DATA_NUMERIC.loc[
    :,
    [
        'day', 'month', 'year', 'weekday', 'hour', 'minute', 'matchday',
        'leagueId', 'homeId', 'awayId',
        'agHomeWins', 'agDraws', 'agAwayWins', 'avgGoals',
        'target',
    ],
]
DATA_NUMERIC

Unnamed: 0,day,month,year,weekday,hour,minute,matchday,leagueId,homeId,awayId,agHomeWins,agDraws,agAwayWins,avgGoals,target
0,12.0,5.0,2024.0,7.0,10.0,30.0,36.0,2019.0,110.0,445.0,3.0,2.0,0.0,3.2,1.0
1,12.0,5.0,2024.0,7.0,12.0,30.0,33.0,2003.0,666.0,1919.0,4.0,0.0,1.0,4.0,1.0
2,12.0,5.0,2024.0,7.0,12.0,30.0,33.0,2003.0,678.0,1911.0,1.0,1.0,0.0,3.5,1.0
3,12.0,5.0,2024.0,7.0,12.0,30.0,33.0,2003.0,1915.0,675.0,0.0,1.0,4.0,5.6,0.0
4,12.0,5.0,2024.0,7.0,12.0,30.0,33.0,2003.0,676.0,6806.0,4.0,0.0,1.0,2.8,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5224,15.0,6.0,2021.0,2.0,16.0,0.0,1.0,2018.0,827.0,765.0,0.0,1.0,4.0,3.4,0.0
5225,15.0,6.0,2021.0,2.0,19.0,0.0,1.0,2018.0,773.0,759.0,2.0,0.0,3.0,4.2,1.0
5226,16.0,6.0,2021.0,3.0,13.0,0.0,2.0,2018.0,1976.0,808.0,0.0,0.0,1.0,1.0,0.0
5227,16.0,6.0,2021.0,3.0,16.0,0.0,2.0,2018.0,803.0,833.0,1.0,1.0,1.0,2.0,0.0


In [15]:
# Use the FTR column as the target and keep only the date parts, the average
# odds and HTR as features.
# NOTE(review): the displayed target contains -1 as well as 0 and 1, so this is
# not a binary label like the one built from the scores above - confirm.
# NOTE(review): HTR is presumably the half-time result; if so it is not known
# before kick-off - verify that it is meant to be a feature.
DATA_NUMERIC = DATA_NUMERIC.assign(target=DATA_NUMERIC["FTR"]).loc[
    :, ['Month', 'Day', 'Time', 'AvgH', 'AvgD', 'AvgA', 'HTR', 'target']
]
DATA_NUMERIC

Unnamed: 0,Month,Day,Time,AvgH,AvgD,AvgA,HTR,target
0,7,4,19,2.49,3.31,2.80,1.0,1
1,7,5,19,2.77,3.47,2.43,0.0,0
2,7,5,12,2.38,3.51,2.81,1.0,1
3,7,5,12,3.15,3.24,2.29,1.0,1
4,7,5,12,3.55,3.84,1.93,1.0,1
...,...,...,...,...,...,...,...,...
15071,5,0,17,4.96,3.84,1.71,0.0,0
15072,5,0,19,2.16,3.47,3.10,0.0,-1
15073,5,0,19,2.07,3.08,3.77,-1.0,1
15074,5,0,17,1.83,3.79,3.91,1.0,1


In [17]:
# SPLIT THE DATA INTO TRAIN AND TEST SETS
# Positional 70/30 split in the frame's current row order (no shuffling):
# the first 70% of the rows go to X, the remainder to X_test.
TRAIN_SAMPLE_SIZE = int(len(DATA_NUMERIC) * 0.7)
X, X_test = DATA_NUMERIC.iloc[:TRAIN_SAMPLE_SIZE], DATA_NUMERIC.iloc[TRAIN_SAMPLE_SIZE:]

In [62]:
# FOR BETTER PREDICTABILITY, ONLY CONSIDER THE LEAGUES TRAINED FOR IN PRODUCTION
# Distinct league ids present in the data, converted to plain ints.
leagues = list(map(int, DATA_NUMERIC["leagueId"].unique().tolist()))
leagues

[2019, 2003, 2002, 2021, 2015, 2152, 2016, 2018]

In [18]:
# SAVE THE CLEANED DATASET
# Writes the train and test splits as CSV files without the index column.
# Both frames are row slices of DATA_NUMERIC, so each file holds every column
# selected above, including "target".
# Note that the file names here use "_v" before the version, while the raw
# file names use "-v".
X.to_csv(f"../relative_datasets/cleaned/{MODEL_NAME}_v{MODEL_VERSION}-train-set.csv", index=False)
X_test.to_csv(f"../relative_datasets/cleaned/{MODEL_NAME}_v{MODEL_VERSION}-test-set.csv", index=False)