In [48]:
import pandas as pd
import datetime
import requests
import time
import traceback

In [None]:
# Name and version tags; both are interpolated into the filename of the
# fixtures CSV written by the collection cell further down.
MODEL_NAME: str = "PREMIER_LEAGUE"
MODEL_VERSION: str = "1.0"


def get_head2head(match_id, limit=5):
  """Gets head to head aggregate stats for a game over its previous `limit` matches.

  Args:
    match_id: football-data.org match id, interpolated into the request URL.
    limit: value sent as the `limit` query parameter. Defaults to 5, the
      value that used to be hard-coded in the URL.

  Returns:
    The "aggregates" object of the JSON response as a dict. An empty dict is
    returned when no API key is currently available, when the response status
    is not 200, or when the payload has no "aggregates" entry.

  Raises:
    requests.exceptions.RequestException: on network failure or when the
      server does not answer within the timeout.
  """
  endpoint = f"https://api.football-data.org/v4/matches/{match_id}/head2head?limit={limit}"

  request_id = refresh_expired_keys(api_keys)
  if not request_id:
    # Previously this path fell off the end of the function and returned None,
    # unlike every other failure path which returns an empty dict.
    return dict()

  response = requests.get(endpoint,
                    headers={'X-Unfold-Lineups': "true",
                            'X-Unfold-Goals': "true",
                            'X-Auth-Token': request_id},
                    timeout=30)  # without a timeout a stalled connection blocks forever

  if response.status_code != 200:
    return dict()
  return response.json().get("aggregates", dict())


def refresh_expired_keys(api_keys, max_requests=10, window_seconds=60):
    """Pick an API key that still has request budget, resetting stale counters first.

    Each entry of `api_keys` is a mutable list `[key, requests_made, timestamp]`.
    Entries whose timestamp is at least `window_seconds` old get their counter
    reset to 0 and their timestamp set to now. The first entry whose counter is
    below `max_requests` is then charged one request and its key is returned.

    Args:
        api_keys: list of `[key, requests_made, timestamp]` lists; mutated in place.
        max_requests: per-key request budget within one window (default 10).
        window_seconds: age after which a key's counter is reset (default 60).

    Returns:
        The selected key string, or None when every key is exhausted (or the
        list is empty). The callers already branch on a falsy result and wait.
        The previous implementation never returned in that case: it re-checked
        the keys against a timestamp captured once before the loop, so no key
        could ever expire and the loop spun forever.
    """
    current_time = time.time()

    # Start a fresh window for every key whose timestamp is old enough.
    for key_info in api_keys:
        if current_time - key_info[2] >= window_seconds:
            key_info[2] = current_time
            key_info[1] = 0

    # Hand out the first key that still has budget and charge it one request.
    for key_info in api_keys:
        if key_info[1] < max_requests:
            # Only the key's tail is printed so the full credential does not
            # end up in saved notebook output.
            print(f"...{key_info[0][-4:]}", key_info[1])
            key_info[1] += 1
            return key_info[0]

    return None


def get_matches(api_key):
  """Fetches the matches of the configured competitions for the current date window.

  The window is not passed in: the request URL is built from the module-level
  names `today` (sent as dateTo) and `past_ten_days` (sent as dateFrom), so
  both must be defined before this function is called.

  Args:
    api_key: value sent in the X-Auth-Token header.

  Returns:
    The "matches" list of the JSON response when the status code is 200,
    otherwise an empty list. Callers therefore cannot tell "no matches in this
    window" apart from a failed request.

  Raises:
    requests.exceptions.RequestException: on network failure or when the
      server does not answer within the timeout.
  """
  competitions = ["PL", "ELC"]
  response = requests.get(f"https://api.football-data.org/v4/matches?competitions={','.join(competitions)}&dateTo={today.strftime('%Y-%m-%d')}&dateFrom={past_ten_days.strftime('%Y-%m-%d')}",
                    headers={'X-Unfold-Lineups': "true",
                             'X-Unfold-Goals': "true",
                             'X-Auth-Token': api_key},
                    timeout=30)  # without a timeout a stalled connection blocks the whole run
  if response.status_code != 200:
    return []
  return response.json()["matches"]
  

# Initialize API keys with 0 requests made and a 0 timestamp.
# Credentials are read from the environment instead of being committed in the
# notebook. Set FOOTBALL_DATA_API_KEYS to a comma-separated list of keys before
# running. NOTE(review): the keys that used to be listed here literally should
# be treated as exposed and rotated.
def load_api_keys(raw=None):
    """Build the key-rotation table from a comma-separated string of API keys.

    Args:
        raw: comma-separated keys. When None, the FOOTBALL_DATA_API_KEYS
            environment variable is used (empty string if unset).

    Returns:
        A list of `[key, requests_made, timestamp]` lists with both counters
        at 0. Surrounding whitespace is stripped and empty items are skipped.
    """
    import os  # local import keeps this cell self-contained

    if raw is None:
        raw = os.environ.get("FOOTBALL_DATA_API_KEYS", "")
    stripped = (part.strip() for part in raw.split(","))
    # KEY, No. REQUESTS MADE, LAST REQ. TIMESTAMP
    return [[key, 0, 0] for key in stripped if key]


api_keys = load_api_keys()
if not api_keys:
    print("WARNING: no API keys configured - set the FOOTBALL_DATA_API_KEYS "
          "environment variable (comma-separated) before requesting data.")

In [None]:
# IMPORT THE DATASETS FROM FOOTBALL-DATA.CO.UK
# Season folders on the site, newest first (2023/24 back to 2009/10). For each
# season the E0 file is read before the E1 file, so the concatenation order is
# the same as listing all thirty URLs by hand.
season_codes = [
    "2324", "2223", "2122", "2021", "1920",
    "1819", "1718", "1617", "1516", "1415",
    "1314", "1213", "1112", "1011", "0910",
]
csv_urls = [
    f"https://www.football-data.co.uk/mmz4281/{season}/{division}.csv"
    for season in season_codes
    for division in ("E0", "E1")
]
season_frames = [pd.read_csv(url) for url in csv_urls]

# Stack everything, drop exact duplicate rows, then keep only the five columns
# needed downstream and discard rows missing any of them.
matches = pd.concat(season_frames, ignore_index=True).drop_duplicates()
wanted_columns = ["Date", "HomeTeam", "AwayTeam", "HTR", "FTR"]
matches = matches[wanted_columns].dropna()
matches.to_csv("../relative_datasets/raw/PREMIER_LEAGUE_FINAL.csv", index=False)
matches

In [None]:
#COLLECT FIXTURES SIMULATED DATA FOR TEAM NAME MAPPING
# get_matches() builds its date range from the module-level names `today` and
# `past_ten_days`, so this cell walks backwards in time by rebinding those two
# names after every successfully fetched window.
today = datetime.datetime.now()
past_ten_days = today - datetime.timedelta(days=10)
print(today, past_ten_days)

COLUMNS = ["id", "day", "month", "year", "weekday", "hour", "minute", "matchday", "status", "leagueName", "leagueId", "leagueType", "stage", "homeName", "homeId", "awayName", "awayId", "scoreHomeHt",
           "scoreAwayHt", "scoreHomeFt", "scoreAwayFt"]
TOTAL_DATA_REQUIRED = 100_000
# Stop after this many consecutive windows without any match. Without a bound
# the loop keeps stepping back in time forever once the API has no more data.
# 40 windows = 400 days; assumed to be longer than any gap between seasons -
# TODO confirm against the competitions being collected.
MAX_CONSECUTIVE_EMPTY_WINDOWS = 40
# Stop after this many consecutive failures instead of retrying forever.
MAX_CONSECUTIVE_ERRORS = 10
request_count = 0
data = []
seen_match_ids = set()  # match ids already stored, to skip repeats
empty_windows = 0
consecutive_errors = 0


def _match_to_row(match):
    """Flatten one match dict from the API into a row ordered like COLUMNS."""
    matchdate = datetime.datetime.strptime(match["utcDate"], "%Y-%m-%dT%H:%M:%SZ")
    return [
        match["id"], # id
        matchdate.day, # day
        matchdate.month, # month
        matchdate.year, # year
        matchdate.isoweekday(), # weekday (1 = Monday ... 7 = Sunday)
        matchdate.hour, # hour
        matchdate.minute, # minute
        match["matchday"], # matchday
        match["status"], # status
        match["competition"]["name"], # leagueName
        match["competition"]["id"], # leagueId
        match["competition"]["type"], # leagueType
        match["stage"], # stage
        match["homeTeam"]["shortName"], # homeName
        match["homeTeam"]["id"], # homeId
        match["awayTeam"]["shortName"], # awayName
        match["awayTeam"]["id"], # awayId
        match["score"]["halfTime"]["home"], # scoreHomeHt
        match["score"]["halfTime"]["away"], # scoreAwayHt
        match["score"]["fullTime"]["home"], # scoreHomeFt
        match["score"]["fullTime"]["away"], # scoreAwayFt
    ]


if not api_keys:
    # With no keys the loop below could never make a request.
    raise RuntimeError("api_keys is empty - configure at least one API key before collecting fixtures.")

# The outer try makes Ctrl-C at any point (including during the sleeps) fall
# through to the single save at the bottom of the cell.
try:
    while (len(data) < TOTAL_DATA_REQUIRED
           and empty_windows < MAX_CONSECUTIVE_EMPTY_WINDOWS
           and consecutive_errors < MAX_CONSECUTIVE_ERRORS):
        try:
            selected_key = refresh_expired_keys(api_keys)
            if not selected_key:
                print("No API Key with available requests less than 10. Waiting 60 seconds...")
                time.sleep(60)  # Wait for 60 seconds before trying again
                # Reset the api keys after this point
                api_keys = [[api_key[0], 0, 0] for api_key in api_keys]
                # The date window is deliberately NOT advanced here: nothing was
                # fetched for it yet, so moving on would silently skip 10 days.
                continue

            matches = get_matches(selected_key)
            print("Received matches:", len(matches))
            for match in matches:
                # Consecutive windows share a boundary date and a failed window
                # is retried, so the same match can be returned more than once.
                if match["id"] in seen_match_ids:
                    continue
                data.append(_match_to_row(match))
                seen_match_ids.add(match["id"])

            # refresh_expired_keys() already charged this request to the key's
            # counter; only the time of the request is recorded here. Counting
            # it again would halve each key's usable budget.
            for key_info in api_keys:
                if key_info[0] == selected_key:
                    key_info[2] = time.time()  # Update last request time
                    break
            request_count += 1
            print("Request count:", request_count)
            print("Matches count:", len(data))

            empty_windows = 0 if matches else empty_windows + 1
            consecutive_errors = 0

            today = past_ten_days
            past_ten_days = today - datetime.timedelta(days=10)
            print(today, past_ten_days)
        except Exception:
            consecutive_errors += 1
            print(traceback.format_exc())
            print("Today:", today, "Past 10 days:", past_ten_days)
            # Await the next 60 seconds to be allowed to request more data, incase limit has been reached.
            time.sleep(60)
except KeyboardInterrupt:
    print("Interrupted - saving current progress.")

if empty_windows >= MAX_CONSECUTIVE_EMPTY_WINDOWS:
    print("Stopping:", empty_windows, "consecutive windows returned no matches.")
if consecutive_errors >= MAX_CONSECUTIVE_ERRORS:
    print("Stopping:", consecutive_errors, "consecutive errors.")

# Save whatever was collected, whether the loop finished, gave up or was interrupted.
df = pd.DataFrame(data, columns=COLUMNS)
df.to_csv(f"../relative_datasets/raw/{MODEL_NAME}-v{MODEL_VERSION}_{request_count}.csv", index=False)
print("Total requests made:", request_count)