In [1]:
import boto3
from dotenv import load_dotenv
import os
import warnings
from io import StringIO
import pandas as pd
from tqdm import tqdm
import numpy as np
import io
import requests
from datetime import datetime


import sys
sys.path.append('..')
import utils
# Register tqdm's progress-bar hooks on pandas (enables `.progress_apply`).
tqdm.pandas()

# Let wide / long frames render in full (up to 500 columns and 500 rows).
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 500)

# NOTE: this silences every warning for the rest of the session, including
# pandas deprecation and chained-assignment warnings.
warnings.filterwarnings('ignore')

# Read the AWS settings from environment variables; `load_dotenv()` is called
# first so that values kept in a local .env file are available to `os.getenv`.
load_dotenv()
aws_access_key, aws_secret_access, aws_region = (
    os.getenv(variable)
    for variable in ('AWS_ACCESS_KEY', 'AWS_SECRET_ACCESS', 'AWS_REGION')
)

# S3 client used by the cells below to read objects from `bucket`.
s3 = boto3.client(
    's3',
    aws_access_key_id=aws_access_key,
    aws_secret_access_key=aws_secret_access,
    region_name=aws_region,
)

# Run parameters: target bucket, league label and season code.
bucket = 'footballbets'
league = "ENG-Premier League"
season = 2223

In [25]:
# Make the directory two levels above the working directory importable so the
# shared `_config` module can be loaded from there.
# Note: despite its name, `parent_directory` is the parent of the parent.
current_directory = os.getcwd()
one_level_up = os.path.dirname(current_directory)
parent_directory = os.path.dirname(one_level_up)
sys.path.append(parent_directory)

import _config

# Mapping passed to `DataFrame.replace` further down to rewrite team names.
TEAMNAME_REPLACEMENTS = _config.TEAMNAME_REPLACEMENTS

In [2]:
# Download the stored schedule for the configured league/season from S3.
# The key is built from the `league` and `season` constants defined in the
# first cell instead of repeating their values in a placeholder-less f-string,
# so changing the constants is enough to load a different schedule.
schedule_key = f'{league}/{season}/schedule.csv'
scheduler = s3.get_object(Bucket=bucket, Key=schedule_key)

# The object body is raw bytes; decode it and parse it as CSV.
schedule = pd.read_csv(StringIO(scheduler['Body'].read().decode('utf-8')))

In [97]:
# Inspect the column names of the schedule loaded from S3.
schedule.columns

Index(['Unnamed: 0', 'index', 'league', 'season', 'game', 'stage_id',
       'ws_game_id', 'status', 'start_time', 'home_team_id', 'home_team',
       'home_yellow_cards', 'home_red_cards', 'away_team_id', 'away_team',
       'away_yellow_cards', 'away_red_cards', 'has_incidents_summary',
       'has_preview', 'score_changed_at', 'elapsed', 'last_scorer',
       'is_top_match', 'home_team_country_code', 'away_team_country_code',
       'comment_count', 'is_lineup_confirmed', 'is_stream_available',
       'match_is_opta', 'home_team_country_name', 'away_team_country_name',
       'date', 'home_score', 'away_score', 'incidents', 'bets',
       'aggregate_winner_field', 'winner_field', 'period',
       'extra_result_field', 'home_extratime_score', 'away_extratime_score',
       'home_penalty_score', 'away_penalty_score', 'started_at_utc',
       'first_half_ended_at_utc', 'second_half_started_at_utc', 'stage',
       'fbref_game_id', 'und_game_id'],
      dtype='object')

In [86]:
# The fixture files are keyed by a four-digit year built from the first two
# digits of the season code, e.g. season 2223 -> "2022".
season_id = f'20{str(season)[:2]}'

# Download the Europa League and Champions League fixture lists and stack them.
# The loop variable is deliberately NOT called `league`: that name holds the
# domestic league label set in the first cell and must not be overwritten.
european_frames = []
for competition in ['europa', 'champions']:

    # NOTE(review): the header below is named 'Cookies', not the standard
    # 'Cookie', and carries values copied from a browser session — confirm
    # whether the site actually needs it.
    headers = {
        'Cookies' : '_ga_DTCKHDGKYF=GS1.1.1722868866.6.1.1722869089.0.0.0; _ga=GA1.2.1274569263.1721488882; ARRAffinity=3587c3b28f299ba120e848a3ba122803c40823fd58ac197de099244cf70e116d; ARRAffinitySameSite=3587c3b28f299ba120e848a3ba122803c40823fd58ac197de099244cf70e116d; _gid=GA1.2.1211098860.1722868867; Timezone=Eastern Standard Time',
        'Referer' : f'https://fixturedownload.com/download/csv/{competition}-league-{season_id}',
        'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:128.0) Gecko/20100101 Firefox/128.0'
    }

    # A timeout keeps the cell from hanging forever on a stalled connection.
    response = requests.get(
        f"https://fixturedownload.com/download/{competition}-league-{season_id}-EasternStandardTime.csv",
        headers=headers,
        timeout=30,
    )
    # Fail loudly on 4xx/5xx instead of parsing an error page as CSV.
    response.raise_for_status()

    temp = pd.read_csv(io.StringIO(response.text))
    temp['league'] = 'Europa League' if competition == 'europa' else 'Champions League'
    european_frames.append(temp)

# Concatenate once at the end rather than growing a frame inside the loop
# (also avoids concatenating with an empty placeholder DataFrame).
europe = pd.concat(european_frames)

In [90]:
# Normalise team names so they can be compared with the names used in
# `schedule`.
team_cols = ['Home Team', 'Away Team']
europe[team_cols] = europe[team_cols].replace(TEAMNAME_REPLACEMENTS)

# Keep only fixtures involving a club that appears as a home team in the
# domestic schedule.
pl_teams = list(schedule.home_team.unique())
involves_pl_team = europe['Home Team'].isin(pl_teams) | europe['Away Team'].isin(pl_teams)
# `.copy()` makes the filtered frame independent, so the column assignments
# below never write to a slice of the unfiltered data (the resulting
# SettingWithCopyWarning is otherwise hidden by the global warnings filter).
europe = europe[involves_pl_team].copy()

# `Date` holds "dd/mm/YYYY HH:MM" strings; parse them once and split the result
# into `datetime.date` and `datetime.time` columns.
# NOTE(review): the file requested is the "EasternStandardTime" variant, so
# these kickoff times are presumably Eastern time — confirm this matches the
# time zone used by `schedule.start_time`.
kickoff = pd.to_datetime(europe['Date'], format='%d/%m/%Y %H:%M')
europe['date'] = kickoff.dt.date
europe['time'] = kickoff.dt.time

In [91]:
# Tag the European fixtures with the season configured in the first cell
# (instead of repeating the literal season code here).
europe['season'] = season

# Build the identifiers with vectorized string concatenation; unlike a row-wise
# `apply(axis=1)`, this also works when `europe` has no rows.
#   game       -> "<date> <home>-<away>"
#   start_time -> "<date>T<time>"
kickoff_date = europe['date'].astype(str)
europe['game'] = (
    kickoff_date + ' '
    + europe['Home Team'].astype(str) + '-' + europe['Away Team'].astype(str)
)
europe['start_time'] = kickoff_date + 'T' + europe['time'].astype(str)

# NOTE: from here on the cell renames and drops columns of `europe` in place,
# so re-running it requires re-running the cells that build `europe` first.
europe = europe.rename(columns={'Home Team':'home_team', 'Away Team':'away_team'})

# Keep only the columns that can be filled from the fixture file ...
cols_to_keep = ['league', 'season', 'game', 'start_time', 'home_team', 'away_team']
nul_cols = schedule.columns.difference(cols_to_keep)

europe = europe.drop(europe.columns.difference(cols_to_keep), axis=1)
# ... and add every other schedule column as NaN so both frames share a schema.
europe[nul_cols] = np.nan

In [92]:
# Stack the domestic and European fixtures under a fresh 0..n-1 index, then
# order the rows by their `start_time` value.
final_sched = (
    pd.concat(objs=[schedule, europe], ignore_index=True)
    .sort_values(by='start_time')
)

In [None]:
# Show only the rows whose league is not the Premier League, i.e. the European
# fixtures that were appended to the schedule.
final_sched.loc[final_sched['league'].ne('ENG-Premier League')]