In [53]:
# selenium-trigger-1 must be invoked from outside the Lambda flow. The other functions are all connected in a chain.

#### **SELENIUM MAIN**

In [110]:
# body of lambda function - works locally as well as on lambda, with platform-agnostic logic

from bs4 import BeautifulSoup as BS
from selenium import webdriver
from io import StringIO, BytesIO
import pandas as pd
import numpy as np
import datetime
import platform
import zipfile
import boto3
import time
import json
import csv
import re
import os

from datetime import datetime as dt
from datetime import timedelta as td

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException

# AWS handles created once when the cell/module is loaded.
# s3_resource is the handle used by write_s3 / read_s3 below.
s3_resource = boto3.resource('s3')
# lambda_client is not referenced by any function defined in this cell.
lambda_client = boto3.client('lambda')

# Local data folder: "<parent of the current working directory>/data".
# The path is built by splitting on '/', so it assumes POSIX-style paths.
mypath = os.path.join('/'.join(os.getcwd().split('/')[:-1]), 'data')


# helper functions
def make_driver():
    """Create a headless Chrome WebDriver configured for the current platform.

    On Linux the browser binary and chromedriver are taken from ``/opt``;
    on every other platform a chromedriver under ``/Users/harveymanhood`` is used
    and the browser binary location is left at its default.
    """
    chrome_flags = (
        '--ignore-certificate-errors',
        '--incognito',
        '--headless',
        '--no-sandbox',
        '--single-process',
        '--disable-dev-shm-usage',
    )
    options = webdriver.ChromeOptions()
    for flag in chrome_flags:
        options.add_argument(flag)

    running_on_linux = platform.system() == 'Linux'
    if running_on_linux:
        options.binary_location = '/opt/headless-chromium'
    chromedriver_path = '/opt/chromedriver' if running_on_linux else '/Users/harveymanhood/chromedriver'
    return webdriver.Chrome(chromedriver_path, options=options)


def write_s3(file_path, myfile, bucket='hwm-nba', dedupe_cols=None, sort=None, ascending=True, compression='zip'):
    """Upload a DataFrame (serialised as CSV) or an in-memory buffer to S3.

    Parameters
    ----------
    file_path : str
        Destination key. For a zipped DataFrame the extension is replaced by ``.zip``.
    myfile : pandas.DataFrame or buffer
        A DataFrame is sorted, de-duplicated and written as CSV; anything else must
        expose ``getvalue()`` (e.g. ``StringIO`` / ``BytesIO``) and is uploaded as-is.
    bucket : str
        Target bucket name.
    dedupe_cols : list or None
        Columns passed to ``drop_duplicates(subset=...)``; the first row is kept.
    sort, ascending :
        Passed to ``sort_values``. Sorting runs BEFORE de-duplication, so the sort
        order decides which of several duplicate rows survives.
    compression : str or None
        ``'zip'`` writes a zip archive into a binary buffer; other values are handed
        to ``DataFrame.to_csv`` with a text buffer, as before.
    """
    file_name = file_path.split('/')[-1]
    if isinstance(myfile, pd.DataFrame):
        if sort is not None:
            myfile = myfile.sort_values(by=sort, ascending=ascending)
        if dedupe_cols is not None:
            myfile = myfile.drop_duplicates(subset=dedupe_cols, keep='first')
        if compression == 'zip':
            output_buffer = BytesIO()
            # splitext only looks at the last path component, so keys without an
            # extension (or with dots in a folder name) keep their full stem.
            file_path = os.path.splitext(file_path)[0] + '.zip'
        else:
            output_buffer = StringIO()
        myfile.to_csv(output_buffer, index=False, compression={'method': compression, 'archive_name': file_name})
        myfile = output_buffer
    s3_resource.Object(bucket, file_path).put(Body=myfile.getvalue())


def read_s3(file_path, bucket='hwm-nba', output=None, columns=None):
    """Download an S3 object and return it as text or as a DataFrame.

    Keys ending in ``zip`` are treated as zip archives and the first member is read.
    The bytes are decoded as UTF-8. With ``output='dataframe'`` the text is parsed as
    CSV whose first row is a header; ``columns`` (if given) replaces the header names.
    Any other ``output`` value returns the decoded string.
    """
    raw = s3_resource.Object(bucket, file_path).get()['Body'].read()
    if file_path.endswith('zip'):
        with zipfile.ZipFile(BytesIO(raw)) as archive:
            raw = archive.read(archive.namelist()[0])
    text = raw.decode('utf-8')
    if output != 'dataframe':
        return text
    csv_options = {'header': 0}
    if columns is not None:
        csv_options['names'] = columns
    return pd.read_csv(StringIO(text), **csv_options)


def append_s3(file_path, mydata, bucket='hwm-nba', output=None, columns=None, dedupe_cols=None, sort=None, ascending=True):
    """Append ``mydata`` to the table stored at ``file_path`` in S3 and write it back.

    The existing object is read with ``read_s3`` (callers pass ``output='dataframe'``
    so that it can be concatenated), the new rows are added after the existing ones,
    and the result is handed to ``write_s3`` together with the de-duplication and
    sorting options.

    If the object does not exist yet (for example the first write of a new
    per-season plays file) the table is created from ``mydata`` alone instead of
    failing with ``NoSuchKey``.
    """
    try:
        existing = read_s3(file_path, bucket=bucket, output=output, columns=columns)
    except s3_resource.meta.client.exceptions.NoSuchKey:
        existing = None
    frames = [mydata] if existing is None else [existing, mydata]
    combined = pd.concat(frames, axis=0)
    # write_s3 derives the archive member name from the key, so hand it a .csv name
    csv_path = os.path.splitext(file_path)[0] + '.csv'
    write_s3(csv_path, combined.reset_index(drop=True), bucket=bucket, dedupe_cols=dedupe_cols, sort=sort, ascending=ascending)


# temp function for parsing
def append_local(file_path, mydata, output=None, columns=None, drop_duplicates=True, sort=None):
    """Merge ``mydata`` into the CSV at ``file_path`` on local disk and re-zip it.

    The new rows are placed before the rows already on disk, exact duplicate rows are
    dropped keeping the last occurrence, and the result is sorted by 'Date' and
    'Detail Path' before being written back. ``output``, ``columns``,
    ``drop_duplicates`` and ``sort`` are accepted but not used by this function.
    """
    on_disk = pd.read_csv(file_path)
    merged = pd.concat([mydata, on_disk], axis=0)
    merged = merged.drop_duplicates(keep='last')
    merged = merged.sort_values(by=['Date', 'Detail Path'], ascending=True)
    merged.to_csv(file_path, index=False)
    zip_write(file_path)


def record_status(data):
    """Flag every game present in ``data`` as having its detail data collected.

    ``data`` must carry a 'Detail Path' column. The games table is read from
    ``data/games.zip``, 'Detail Data' is set to 1 for each game whose 'Detail Path'
    occurs in ``data``, and the table is written back to S3. On non-Linux platforms
    a local CSV and zip copy are written to ``mypath`` first.
    """
    scraped_paths = pd.unique(data['Detail Path'])
    games = read_s3('data/games.zip', output='dataframe')
    # vectorised membership test instead of merging a helper column and
    # re-deriving the flag row by row
    games.loc[games['Detail Path'].isin(scraped_paths), 'Detail Data'] = 1
    if platform.system() != 'Linux':
        games.to_csv(os.path.join(mypath, 'games.csv'), index=False)
        games.to_csv(os.path.join(mypath, 'games.zip'), index=False, compression=dict(method='zip', archive_name='games.csv'))
    # every column except the last two is part of the duplicate key
    write_s3('data/games.csv', games, dedupe_cols=list(games.columns)[:-2], sort=['Date', 'Detail Path'])


def retrieve_dates(dates, table='games', hour_offset=-100):
    """Expand a date specification into a list of 'YYYY-MM-DD' strings.

    Parameters
    ----------
    dates : list/tuple of str, 'infer', or int-like
        * list/tuple: treated as a range; every day from the earliest to the latest
          entry (inclusive) is returned. An empty sequence yields ``[]``. The
          caller's sequence is not modified.
        * ``'infer'``: dates are derived from ``data/games.zip``. For
          ``table='games'`` these are the days between the earliest stored date and
          the look-ahead limit that are either not stored yet or later than the last
          date with a score. For any other table they are the stored dates that have
          a score ('AS' != '-') but 'Detail Data' == 0.
        * otherwise: a number of days, counted backwards from the look-ahead limit.
    table : str
        Only consulted for ``dates='infer'``.
    hour_offset : int
        Offset in hours subtracted from "now" to obtain the look-ahead limit; the
        default of -100 therefore reaches 100 hours (4-5 days) into the future.
    """
    day_format = '%Y-%m-%d'
    days = int(hour_offset // 24)
    hours = hour_offset % 24
    max_date = datetime.datetime.today() - datetime.timedelta(days=days, hours=hours)

    def _day_span(first, last):
        # every calendar day from first to last, inclusive
        return [datetime.datetime.strftime(first + datetime.timedelta(days=d), day_format)
                for d in range((last - first).days + 1)]

    if isinstance(dates, (list, tuple)):
        if len(dates) == 0:
            return []
        ordered = sorted(dates)  # sorted copy: do not reorder the caller's list
        first = datetime.datetime.strptime(ordered[0], day_format)
        last = datetime.datetime.strptime(ordered[-1], day_format)
        return _day_span(first, last)

    if isinstance(dates, str) and dates == 'infer':
        games = read_s3('data/games.zip', output='dataframe')
        if table != 'games':
            pending = games[(games['Detail Data'] == 0) & (games['AS'] != '-')]
            return list(np.unique(pending['Date']))
        existing = set(np.unique(games['Date']))
        max_played = np.max(np.unique(games[games['AS'] != '-']['Date']))
        first = datetime.datetime.strptime(min(existing), day_format)
        return [d for d in _day_span(first, max_date) if (d not in existing or d > max_played)]

    # int() also accepts a numeric string such as '3' coming from a JSON payload
    return [datetime.datetime.strftime(max_date - datetime.timedelta(days=d), day_format) for d in range(int(dates))]


def retrieve_urls(df=False, **params):
    """Build the list of nba.com pages to scrape for the requested table.

    Parameters
    ----------
    df : bool
        When ``True`` a DataFrame with columns 'urls', 'game_ids', 'dates' is
        returned; otherwise the tuple ``(urls, game_ids, dates)``.
    **params :
        'table' is required. The date specification is taken from 'dates' when
        present, else from 'num_pages', and expanded by ``retrieve_dates``.
        For tables other than 'games': 'rand' (optional) restricts the games to
        those whose 'Rand' value is listed, and 'allow_repeats' (default 'false')
        controls whether games with 'Detail Data' != 0 are included again.
    """
    site_root = 'https://www.nba.com'
    date_param = 'dates' if 'dates' in params else 'num_pages'
    dates = retrieve_dates(params[date_param], table=params['table'])
    if params['table'] == 'games':
        game_ids = [None] * len(dates)  # to give symmetry
        urls = [site_root + '/games?date=' + d for d in dates]
    else:
        def _score_as_int(value):
            # a leading run of digits counts as a score; anything else ('-', NaN, ...) becomes 0
            whole = str(value).split('.')[0]
            return int(whole) if whole.isdigit() else 0

        games = read_s3('data/games.zip', output='dataframe')
        if 'rand' in params:
            # copy so the score columns below are assigned on an independent frame
            games = games[games['Rand'].isin(params['rand'])].copy()
        games['AS'] = games['AS'].apply(_score_as_int)
        games['HS'] = games['HS'].apply(_score_as_int)
        new_games = games[(games['Date'].isin(dates)) & (games['Away'] != 'No Games') & (games['AS'] != 0)]
        if params.get('allow_repeats', 'false') == 'false':
            new_games = new_games[new_games['Detail Data'] == 0]
        dates, game_ids = np.array(new_games['Date']), np.array(new_games['Detail Path'])
        detail_url = '/play-by-play?period=All'
        urls = [site_root + '/game/' + g + detail_url for g in game_ids]
    if df is True:
        df = pd.DataFrame([urls, game_ids, dates]).T
        df.columns = ['urls', 'game_ids', 'dates']
        return df
    return urls, game_ids, dates


def scrape(url, date, driver, **params):
    """Load ``url`` in ``driver`` and return the page source once the data script is present.

    Waits up to 8 seconds for the ``script[id="__NEXT_DATA__"]`` element. Returns the
    string ``'incomplete'`` when the wait times out. When ``params['upload_scrape']``
    is ``'true'`` the raw page is also uploaded to ``scrapes/<table>_<date>.txt``;
    the option defaults to ``'false'`` when absent.
    """
    driver.get(url)
    delay = 8
    try:
        WebDriverWait(driver, delay).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'script[id="__NEXT_DATA__"]')))
    except TimeoutException:
        return 'incomplete'
    page_source = driver.page_source
    if params.get('upload_scrape', 'false') == 'true':
        # NOTE(review): the key contains only table and date, so several pages
        # scraped for the same table and date overwrite each other.
        file_path = 'scrapes/' + params['table'] + '_' + date + '.txt'
        write_s3(file_path, StringIO(page_source))
    return page_source


def zip_write(file, dest=None):
    """Write ``file`` into a new deflate-compressed zip archive.

    ``dest`` defaults to the path of ``file`` with its extension replaced by ``.zip``.
    Only the extension of the final path component is replaced, so dots in folder
    names are left alone. The file is stored under the path it was given as.
    """
    if dest is None:
        dest = os.path.splitext(file)[0] + '.zip'
    # the context manager closes the archive even if writing fails
    with zipfile.ZipFile(dest, 'w', zipfile.ZIP_DEFLATED) as my_zip:
        my_zip.write(file)


def fetch_nba_teams():
    """Return a fresh list of the three-letter team codes this module recognises."""
    codes = (
        'ATL', 'BKN', 'BOS', 'CHA', 'CHH', 'CHI', 'CLE', 'DAL',
        'DEN', 'DET', 'GOS', 'GSW', 'HOU', 'IND', 'LAC', 'LAL',
        'MEM', 'MIA', 'MIL', 'MIN', 'NJN', 'NOH', 'NOK', 'NOP',
        'NYK', 'OKC', 'ORL', 'PHI', 'PHL', 'PHX', 'POR', 'SAC',
        'SAN', 'SAS', 'SEA', 'TOR', 'UTA', 'UTH', 'VAN', 'WAS',
    )
    return list(codes)


def parse(page_source, game_id, date, **params):
    """Extract tabular data from a scraped nba.com page.

    Parameters
    ----------
    page_source : str
        HTML of the page.
    game_id, date :
        Identifiers stamped onto the extracted rows ('Detail Path' and 'Date').
    **params :
        'table' selects the parser. 'include_plays' (default 'true') controls
        whether play-by-play rows are extracted for 'game_details'.

    Returns
    -------
    * table 'games': ``{'games': DataFrame}`` with one row per game card (or a single
      'No Games' placeholder row when the page has no cards); ``{}`` when nothing
      could be extracted.
    * table 'game_details': a dict with 'boxes' and/or 'plays' DataFrames (possibly
      empty), or the string ``'incomplete'`` when the embedded JSON lacks the
      player data.
    * any other table: ``None``.
    """
    teams = fetch_nba_teams()
    soup = BS(page_source, 'html.parser')
    if params['table'] == 'games':
        try:
            columns = ['Date', 'Away', 'AS', 'Home', 'HS', 'OT', 'Detail Path', 'Game Type', 'Detail Data', 'Rand']
            data = soup.select('div[class*="gamecard"]')
            game_data = []
            if not data:
                # no game cards on the page: record a placeholder row for the date
                game_data.append([date, 'No Games', '-', 'No Games', '-', '-', '-', '-', 2, np.random.randint(10)])
            for d in data:
                links = d.select('a[data-id*="box-score"], a[data-id*="preview"]')
                # skip cards without a usable link instead of abandoning the whole date
                if not links or len(links[0]) == 0:
                    continue
                box_score = links[0]
                game_url = box_score.get_attribute_list('href')[0]
                game = re.split('\\/|\\-', game_url.upper())
                game_id = game_url.split('game/')[1].split('/box-score')[0]
                gametype = 'playoffs' if (len(d.select('[class*="gameSeriesText"]')) > 0 and game[2] in teams and game[4] in teams) \
                    else 'preseason' if d.select('[data-is-preseason]')[0].get_attribute_list('data-is-preseason')[0] == 'true' else 'regular'
                scores = [c.get_text() for c in d.select('p[class*="MatchupCardScore"]')]
                scores = ['-', '-'] if len(scores) < 2 else scores
                # last character of the status text, with 'T' mapped to '1';
                # a value of 'L' is treated as "no overtime" below
                overtime_flag = d.select('p[class*="GameCardMatchupStatusText"]')[0].get_text().upper().split('/')[-1][-1].replace('T', '1')
                overtime = 'N' if (overtime_flag == 'L') or (scores[-1] == '-') else 'Y'
                if (overtime == 'Y') and (scores[-1] != '-'): scores[-1] = overtime_flag + scores[-1]
                game_data.append([date, game[2], scores[0], game[4], scores[-1], overtime, game_id, gametype, 0, np.random.randint(10)])
            if not game_data:
                return {}
            print(date, end=' ')
            return {'games': pd.DataFrame(game_data, columns=columns)}
        except Exception:
            # any unexpected page layout is reported as "nothing extracted"
            return {}
    elif params['table'] == 'game_details':
        box_cols = ['Date', 'Detail Path', 'Player ID', 'First Name', 'Last Name', 'Name', 'Slug', 'Position',
                    'Comment', 'Jersey', 'Home', 'MIN', 'FGM', 'FGA', 'FG%', '3PM', '3PA', '3P%', 'FTM', 'FTA',
                    'FT%', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TO', 'PF', 'PTS', '+/-', ]
        play_cols = ['Date', 'Detail Path', 'Action Number', 'Clock', 'Period', 'Team ID', 'Team Tricode', 'Person Id', 'Player Name',
                     'Player Init', 'xLegacy', 'yLegacy', 'Shot Distance', 'Shot Result', 'Field Goal', 'Score Home', 'Score Away',
                     'Points Total', 'Location', 'Description', 'Action Type', 'Sub-Type', 'Video Available', 'Action Id']
        include_plays = params.get('include_plays', 'true') == 'true'
        plays_data = []
        data = soup.select('script[id="__NEXT_DATA__"]')
        try:
            # decode the embedded JSON once and reuse it for both tables
            page_props = json.loads(data[0].text)['props']['pageProps']
            home_stats = pd.DataFrame.from_dict(page_props['game']['homeTeam']['players'])
            home_stats['Home'] = 'Y'
            away_stats = pd.DataFrame.from_dict(page_props['game']['awayTeam']['players'])
            away_stats['Home'] = 'N'
            box_data = pd.concat([home_stats, away_stats], axis=0).reset_index(drop=True)
            try:
                # flatten the per-player 'statistics' dict into one column per statistic
                for key in box_data['statistics'][0].keys():
                    box_data[key] = box_data['statistics'].apply(lambda x: x[key])
                del box_data['statistics']
                box_data.insert(0, 'Date', date)
                box_data.insert(1, 'Detail Path', game_id)
                box_data.columns = box_cols
            except Exception:
                box_data = ''

            if include_plays:
                try:
                    plays_data = pd.DataFrame.from_dict(page_props['playByPlay']['actions'])
                    plays_data.insert(0, 'Date', date)
                    plays_data.insert(1, 'Detail Path', game_id)
                    plays_data.columns = play_cols
                except Exception:
                    plays_data = []
        except Exception:
            return 'incomplete'
        print(game_id, end=' ')
        final_data = {}
        if not isinstance(box_data, str): final_data['boxes'] = box_data
        if include_plays and (len(plays_data) > 0): final_data['plays'] = plays_data
        return final_data


# main function
def _write_batch(pages_dict, params):
    """Append one accumulated batch of parsed tables to the S3 data files."""
    print('writing batch to s3...', end=' ')
    if params['table'] != 'games':
        record_status(pages_dict[list(pages_dict.keys())[0]])

    # we write the play by play data to a per-season file for capacity reasons
    if 'plays' in pages_dict:
        current_date = dt.strftime(dt.now(), '%Y-%m-%d')
        if current_date[-5:] < '08-15':
            season = '-'.join([str(int(current_date[:4]) - 1), current_date[:4]])
        else:
            season = '-'.join([current_date[:4], str(int(current_date[:4]) + 1)])
        pages_dict['plays/plays_' + season] = pages_dict.pop('plays')

    for k in list(pages_dict.keys()):
        if k == 'games':
            subset = ['Date', 'Detail Path']
            sort = ['Date', 'Detail Path', 'Detail Data', 'AS']
            ascending = [True, True, False, False]
        elif k == 'boxes':
            subset = ['Date', 'Detail Path', 'Player ID', 'Slug']
            sort = ['Date', 'Detail Path']
            ascending = [True, True]
        elif re.search('plays', k) is not None:
            subset = ['Date', 'Detail Path', 'Action Number']
            sort = ['Date', 'Detail Path']
            ascending = [True, True]
        else:
            # parse() only produces the three tables handled above
            continue
        append_s3('data/' + k + '.zip', pages_dict[k].reset_index(drop=True), output='dataframe', dedupe_cols=subset, sort=sort, ascending=ascending)
    print('complete!')


# main function
def main(*args): # event, context
    """Scrape, parse and store every page selected by the request parameters.

    ``args[0]`` (optional) is a dict of parameters; missing entries get defaults:
    table='games', num_pages='infer', batch_size='10', upload_scrape='false',
    include_plays='true', allow_repeats='false'. Without arguments the last three
    days of the 'games' table are processed. The caller's dict is not modified.

    Each page is tried up to three times. A page that cannot be scraped or parsed,
    or that yields no usable table, is skipped and removed from the queue. Parsed
    tables are accumulated and written to S3 every ``batch_size`` pages, whenever a
    ``send_s3`` folder exists under ``mypath``, and once more after the queue is
    empty so the last partial batch is never lost. Returns ``None``.
    """
    if args:
        params = dict(args[0])
        params.setdefault('table', 'games')
        params.setdefault('num_pages', 'infer')
        params.setdefault('batch_size', '10')
    else:
        params = {'table': 'games', 'num_pages': 3, 'batch_size': 10}
    # scrape() reads 'upload_scrape'; accept the shorter 'upload' spelling as an alias
    if 'upload_scrape' not in params:
        params['upload_scrape'] = params.get('upload', 'false')
    params.setdefault('include_plays', 'true')
    params.setdefault('allow_repeats', 'false')
    if params['num_pages'] != 'infer': params['num_pages'] = int(params['num_pages'])

    retries = 3
    batch_size = int(params['batch_size'])
    url_data = retrieve_urls(**params, df=True)
    driver = make_driver()
    num_urls = len(url_data)
    completed = 0
    pending = 0          # pages parsed since the last write
    pages_dict = {}
    print(num_urls, 'pages to process...')
    try:
        while len(url_data) > 0:
            url_data = url_data.sample(frac=1).reset_index(drop=True)
            url, game_id, date = url_data['urls'][0], url_data['game_ids'][0], url_data['dates'][0]

            # read page source from the scrape, and only keep the result if the dict is valid
            page_dict = None
            for j in range(retries):
                page_source = scrape(url, date, driver, **params)
                if page_source == 'incomplete':
                    if j < retries - 1: time.sleep(6 ** (j + 1))
                    continue
                parsed = parse(page_source, game_id, date, **params)
                if isinstance(parsed, dict):
                    page_dict = parsed
                    break

            time.sleep(3)
            # the page leaves the queue whatever the outcome, so a page that
            # keeps failing cannot stall the loop
            url_data = url_data[url_data['urls'] != url]

            if page_dict is None:
                print(url, 'skipping - no usable page after', retries, 'attempts')
                continue
            if 'games' not in page_dict and 'boxes' not in page_dict:
                print(url, 'skipping - not available')
                continue
            if 'games' not in page_dict and 'plays' not in page_dict and params['include_plays'] == 'true':
                print('retrieved boxes only')

            for k, table in page_dict.items():
                if k not in pages_dict:
                    pages_dict[k] = table
                else:
                    pages_dict[k] = pd.concat([pages_dict[k], table], axis=0)
            completed += 1
            pending += 1
            print(completed)
            if completed % batch_size == 0 or os.path.isdir(os.path.join(mypath, 'send_s3')):
                _write_batch(pages_dict, params)
                pages_dict, pending = {}, 0

        # flush whatever is left once the queue is empty
        if pending > 0:
            _write_batch(pages_dict, params)
    finally:
        # always release the browser, even when scraping or writing fails
        driver.close()
        driver.quit()

In [113]:
# Example run of the scraper defined above.
# Alternative request with an explicit date range:
# args = {'table': 'games', 'dates': ['2023-02-08', '2023-02-20'], 'batch_size': 20}

# 'rand' and 'allow_repeats' are only read by retrieve_urls for tables other than 'games',
# and 'include_plays' only matters for 'game_details'.
args = {
    'table': 'games',
    'num_pages': 'infer',
    # 'dates': ['2023-03-07', '2023-03-07'],
    'batch_size': 20,
    'rand': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
    'include_plays': 'true',
    'allow_repeats': 'false'
}
# main() as defined above has no return statement, so `data` is None.
data = main(args)

  return webdriver.Chrome(chromedriver_path, options=options)


5 pages to process...
2023-03-11 1
2023-03-12 2
2023-03-13 3
2023-03-15 4
2023-03-14 5
writing batch to s3... complete!


#### **SELENIUM-TRIGGER-1**

In [2]:
from io import StringIO, BytesIO
import pandas as pd
import numpy as np
import datetime
import zipfile
import boto3
import json
import time

# AWS handles shared by the functions in this cell:
# s3_client is used by file_checker, s3_resource by write_s3 / read_s3,
# and lambda_client by main to invoke the downstream functions.
s3_client = boto3.client('s3')
s3_resource = boto3.resource('s3')
lambda_client = boto3.client('lambda')

def write_s3(file_path, myfile, bucket='hwm-nba', dedupe_cols=None, sort=None, compression='zip'):
    """Upload a DataFrame (serialised as CSV) or an in-memory buffer to S3.

    A DataFrame is de-duplicated on ``dedupe_cols`` (first row kept), then sorted
    ascending by ``sort``, and written as CSV. With ``compression='zip'`` the CSV is
    stored in a zip archive and the key's extension is replaced by ``.zip``. Any
    other ``myfile`` must expose ``getvalue()`` and is uploaded unchanged.
    """
    import os  # local import: this cell's import block does not bring in os

    file_name = file_path.split('/')[-1]
    if isinstance(myfile, pd.DataFrame):
        if dedupe_cols is not None:
            myfile = myfile.drop_duplicates(subset=dedupe_cols, keep='first')
        if sort is not None:
            myfile = myfile.sort_values(by=sort, ascending=True)
        if compression == 'zip':
            output_buffer = BytesIO()
            # splitext only looks at the last path component, so keys without an
            # extension (or with dots in a folder name) keep their full stem.
            file_path = os.path.splitext(file_path)[0] + '.zip'
        else:
            output_buffer = StringIO()
        myfile.to_csv(output_buffer, index=False, compression={'method': compression, 'archive_name': file_name})
        myfile = output_buffer
    s3_resource.Object(bucket, file_path).put(Body=myfile.getvalue())

def read_s3(file_path, bucket='hwm-nba', output=None, columns=None):
    """Download an S3 object and return it as text or as a DataFrame.

    Keys ending in ``zip`` are opened as zip archives and the first member is read.
    The bytes are decoded as UTF-8. With ``output='dataframe'`` the text is parsed as
    CSV (first row is the header, ``low_memory=False``); ``columns`` replaces the
    header names when given. Any other ``output`` returns the decoded string.
    """
    raw = s3_resource.Object(bucket, file_path).get()['Body'].read()
    if file_path.endswith('zip'):
        with zipfile.ZipFile(BytesIO(raw)) as archive:
            raw = archive.read(archive.namelist()[0])
    text = raw.decode('utf-8')
    if output != 'dataframe':
        return text
    csv_options = {'header': 0}
    if columns is not None:
        csv_options['names'] = columns
    return pd.read_csv(StringIO(text), **csv_options, low_memory=False)

def file_checker(bucket_name, file_name, start_utc):
    """Return how many whole seconds after ``start_utc`` the S3 object was last modified.

    The result is negative when the object was last written before ``start_utc``.
    ``start_utc`` must be timezone-aware so it can be compared with the
    timezone-aware timestamp reported by S3.

    The timestamp is read with a single ``head_object`` call for the exact key
    rather than by listing the bucket, which returns at most 1000 keys per call.
    A missing key raises the client's error for that request.
    """
    updated_utc = s3_client.head_object(Bucket=bucket_name, Key=file_name)['LastModified']
    diff = updated_utc - start_utc
    return (diff.days * 86400) + diff.seconds

def retrieve_dates(dates, table='games', hour_offset=-100):
    """Expand a date specification into a list of 'YYYY-MM-DD' strings.

    Parameters
    ----------
    dates : list/tuple of str, 'infer', or int-like
        * list/tuple: treated as a range; every day from the earliest to the latest
          entry (inclusive) is returned. An empty sequence yields ``[]``. The
          caller's sequence is not modified.
        * ``'infer'``: the games table in ``data/games.zip`` is read (restricted to
          rows with 'Detail Data' != 0 when ``table`` is not 'games') and the days
          between its earliest date and the look-ahead limit are returned when they
          are either not stored yet or later than the last date with a score.
        * otherwise: a number of days, counted backwards from the look-ahead limit.
    hour_offset : int
        Offset in hours subtracted from "now" to obtain the look-ahead limit; the
        default of -100 therefore reaches 100 hours into the future.
    """
    day_format = '%Y-%m-%d'
    days = int(hour_offset // 24)
    hours = hour_offset % 24
    max_date = datetime.datetime.today() - datetime.timedelta(days=days, hours=hours)

    def _day_span(first, last):
        # every calendar day from first to last, inclusive
        return [datetime.datetime.strftime(first + datetime.timedelta(days=d), day_format)
                for d in range((last - first).days + 1)]

    if isinstance(dates, (list, tuple)):
        if len(dates) == 0:
            return []
        ordered = sorted(dates)  # sorted copy: do not reorder the caller's list
        first = datetime.datetime.strptime(ordered[0], day_format)
        last = datetime.datetime.strptime(ordered[-1], day_format)
        return _day_span(first, last)

    if isinstance(dates, str) and dates == 'infer':
        games = read_s3('data/games.zip', output='dataframe')
        if table != 'games': games = games[games['Detail Data'] != 0]
        existing = set(np.unique(games['Date']))
        max_played = np.max(np.unique(games[games['AS'] != '-']['Date']))
        first = datetime.datetime.strptime(min(existing), day_format)
        return [d for d in _day_span(first, max_date) if (d not in existing or d > max_played)]

    # int() also accepts a numeric string such as '3' coming from a JSON payload
    return [datetime.datetime.strftime(max_date - datetime.timedelta(days=d), day_format) for d in range(int(dates))]

def main(*args):
    """Start the 'games' scrape and, once it has written its output, trigger stage 2.

    ``args[0]`` is the request payload, either a dict or a string representation of
    one. When it has no 'dates' entry, the first and last date produced by
    ``retrieve_dates`` for 'num_pages' (default 'infer') are used. If there are
    dates to process, 'selenium-test' is invoked asynchronously with the payload,
    ``data/games.zip`` is polled every 10 seconds until it has been modified after
    this function started, and 'selenium-trigger-2' is then invoked asynchronously
    with a 'game_details' payload. Returns ``None``.
    """
    start_utc = datetime.datetime.now(datetime.timezone.utc)
    lambda_payload_1 = args[0]
    if not isinstance(lambda_payload_1, dict):
        # NOTE(review): eval() executes arbitrary code contained in the payload
        # string; kept for compatibility with existing callers, but it should be
        # replaced by json.loads / ast.literal_eval once the format is confirmed.
        lambda_payload_1 = eval(lambda_payload_1)
    elif 'table' in lambda_payload_1:
        print(lambda_payload_1['table'])

    if 'dates' not in lambda_payload_1:
        lambda_payload_1['dates'] = []
        date_list = retrieve_dates(lambda_payload_1.get('num_pages', 'infer'))
        if len(date_list) > 0:
            lambda_payload_1['dates'].extend([date_list[0], date_list[-1]])

    if len(lambda_payload_1['dates']) == 0:
        return

    lambda_client.invoke(
        FunctionName='selenium-test',
        InvocationType='Event', # RequestResponse
        Payload=json.dumps(lambda_payload_1)
    )

    # wait for the first stage to rewrite the games file; there is no upper bound
    # here, so the surrounding runtime's timeout is what ends a wait that never succeeds
    while True:
        diff = file_checker('hwm-nba', 'data/games.zip', start_utc)
        print(diff)
        if diff > 0:
            break
        time.sleep(10)
    time.sleep(6)

    lambda_payload_2 = {'table': 'game_details', 'dates': lambda_payload_1['dates']}
    # forward only the options the caller actually supplied, so that a missing
    # optional entry no longer fails after the first stage has already run
    for key in ('batch_size', 'rand', 'include_plays', 'allow_repeats'):
        if key in lambda_payload_1:
            lambda_payload_2[key] = lambda_payload_1[key]
    print('Lambda Payload is:', lambda_payload_2)
    lambda_client.invoke(
        FunctionName='selenium-trigger-2',
        InvocationType='Event',
        Payload=json.dumps(lambda_payload_2)
    )

#### **SELENIUM-TRIGGER-2**

In [None]:
from io import StringIO, BytesIO
import pandas as pd
import numpy as np
import datetime
import zipfile
import boto3
import json
import time

# AWS handles shared by the functions in this cell:
# s3_client is used by file_checker, s3_resource by write_s3 / read_s3,
# and lambda_client by main to invoke the scraper function.
s3_client = boto3.client('s3')
s3_resource = boto3.resource('s3')
lambda_client = boto3.client('lambda')

def write_s3(file_path, myfile, bucket='hwm-nba', dedupe_cols=None, sort=None, compression='zip'):
    """Upload a DataFrame (serialised as CSV) or an in-memory buffer to S3.

    A DataFrame is de-duplicated on ``dedupe_cols`` (first row kept), then sorted
    ascending by ``sort``, and written as CSV. With ``compression='zip'`` the CSV is
    stored in a zip archive and the key's extension is replaced by ``.zip``. Any
    other ``myfile`` must expose ``getvalue()`` and is uploaded unchanged.
    """
    import os  # local import: this cell's import block does not bring in os

    file_name = file_path.split('/')[-1]
    if isinstance(myfile, pd.DataFrame):
        if dedupe_cols is not None:
            myfile = myfile.drop_duplicates(subset=dedupe_cols, keep='first')
        if sort is not None:
            myfile = myfile.sort_values(by=sort, ascending=True)
        if compression == 'zip':
            output_buffer = BytesIO()
            # splitext only looks at the last path component, so keys without an
            # extension (or with dots in a folder name) keep their full stem.
            file_path = os.path.splitext(file_path)[0] + '.zip'
        else:
            output_buffer = StringIO()
        myfile.to_csv(output_buffer, index=False, compression={'method': compression, 'archive_name': file_name})
        myfile = output_buffer
    s3_resource.Object(bucket, file_path).put(Body=myfile.getvalue())

def read_s3(file_path, bucket='hwm-nba', output=None, columns=None):
    """Download an S3 object and return it as text or as a DataFrame.

    Keys ending in ``zip`` are opened as zip archives and the first member is read.
    The bytes are decoded as UTF-8. With ``output='dataframe'`` the text is parsed as
    CSV whose first row is a header; ``columns`` replaces the header names when
    given. Any other ``output`` returns the decoded string.
    """
    raw = s3_resource.Object(bucket, file_path).get()['Body'].read()
    if file_path.endswith('zip'):
        with zipfile.ZipFile(BytesIO(raw)) as archive:
            raw = archive.read(archive.namelist()[0])
    text = raw.decode('utf-8')
    if output != 'dataframe':
        return text
    csv_options = {'header': 0}
    if columns is not None:
        csv_options['names'] = columns
    return pd.read_csv(StringIO(text), **csv_options)

def file_checker(bucket_name, file_name, start_utc):
    """Return how many whole seconds after ``start_utc`` the S3 object was last modified.

    The result is negative when the object was last written before ``start_utc``.
    ``start_utc`` must be timezone-aware so it can be compared with the
    timezone-aware timestamp reported by S3.

    The timestamp is read with a single ``head_object`` call for the exact key
    rather than by listing the bucket, which returns at most 1000 keys per call.
    A missing key raises the client's error for that request.
    """
    updated_utc = s3_client.head_object(Bucket=bucket_name, Key=file_name)['LastModified']
    diff = updated_utc - start_utc
    return (diff.days * 86400) + diff.seconds

def retrieve_dates(dates, table='games', hour_offset=-100):
    """Expand a date specification into a list of 'YYYY-MM-DD' strings.

    Parameters
    ----------
    dates : list/tuple of str, 'infer', or int-like
        * list/tuple: treated as a range; every day from the earliest to the latest
          entry (inclusive) is returned. An empty sequence yields ``[]``. The
          caller's sequence is not modified.
        * ``'infer'``: the games table in ``data/games.zip`` is read (restricted to
          rows with 'Detail Data' != 0 when ``table`` is not 'games') and the days
          between its earliest date and the look-ahead limit are returned when they
          are either not stored yet or later than the last date with a score.
        * otherwise: a number of days, counted backwards from the look-ahead limit.
    hour_offset : int
        Offset in hours subtracted from "now" to obtain the look-ahead limit; the
        default of -100 therefore reaches 100 hours into the future.
    """
    day_format = '%Y-%m-%d'
    days = int(hour_offset // 24)
    hours = hour_offset % 24
    max_date = datetime.datetime.today() - datetime.timedelta(days=days, hours=hours)

    def _day_span(first, last):
        # every calendar day from first to last, inclusive
        return [datetime.datetime.strftime(first + datetime.timedelta(days=d), day_format)
                for d in range((last - first).days + 1)]

    if isinstance(dates, (list, tuple)):
        if len(dates) == 0:
            return []
        ordered = sorted(dates)  # sorted copy: do not reorder the caller's list
        first = datetime.datetime.strptime(ordered[0], day_format)
        last = datetime.datetime.strptime(ordered[-1], day_format)
        return _day_span(first, last)

    if isinstance(dates, str) and dates == 'infer':
        games = read_s3('data/games.zip', output='dataframe')
        if table != 'games': games = games[games['Detail Data'] != 0]
        existing = set(np.unique(games['Date']))
        max_played = np.max(np.unique(games[games['AS'] != '-']['Date']))
        first = datetime.datetime.strptime(min(existing), day_format)
        return [d for d in _day_span(first, max_date) if (d not in existing or d > max_played)]

    # int() also accepts a numeric string such as '3' coming from a JSON payload
    return [datetime.datetime.strftime(max_date - datetime.timedelta(days=d), day_format) for d in range(int(dates))]

def main(*args):
    """Start the detail scrape and wait until the box-score file has been rewritten.

    ``args[0]`` is the request payload, either a dict or a string representation of
    one. When it has no 'dates' entry, the first and last date produced by
    ``retrieve_dates`` for 'num_pages' (default 'infer') are used. If there is
    nothing to process the function returns immediately. Otherwise 'selenium-test'
    is invoked asynchronously with the payload and ``data/boxes.zip`` is polled
    every 10 seconds until it has been modified after this function started.
    Returns ``None``.
    """
    start_utc = datetime.datetime.now(datetime.timezone.utc)
    lambda_payload_2 = args[0]
    if not isinstance(lambda_payload_2, dict):
        # NOTE(review): eval() executes arbitrary code contained in the payload
        # string; kept for compatibility with existing callers, but it should be
        # replaced by json.loads / ast.literal_eval once the format is confirmed.
        lambda_payload_2 = eval(lambda_payload_2)
    elif 'table' in lambda_payload_2:
        print(lambda_payload_2['table'])

    if 'dates' not in lambda_payload_2:
        date_list = retrieve_dates(lambda_payload_2.get('num_pages', 'infer'))
        lambda_payload_2['dates'] = [date_list[0], date_list[-1]] if len(date_list) > 0 else []
    if len(lambda_payload_2['dates']) == 0:
        # nothing to scrape: do not start the scraper or wait for a file update
        print('Date Payload is:', lambda_payload_2['dates'])
        return

    lambda_client.invoke(
        FunctionName='selenium-test',
        InvocationType='Event',
        Payload=json.dumps(lambda_payload_2)
    )

    print('Date Payload is:', lambda_payload_2['dates'])

    # Poll until the scraper has rewritten the box-score file. There is no upper
    # bound here, so the surrounding runtime's timeout ends a wait that never succeeds.
    # (A hand-off to 'selenium-trigger-3' with {'table': 'plays', 'dates': ...} used to
    # follow this wait; it was already disabled and is intentionally not performed.)
    while True:
        diff = file_checker('hwm-nba', 'data/boxes.zip', start_utc)
        print(diff)
        if diff > 0:
            return
        time.sleep(10)

#### **SELENIUM-TRIGGER-3**

In [None]:
from io import StringIO
import pandas as pd
import json
import time
import boto3

# AWS handles shared by the functions in this cell:
# s3_resource is used by write_s3, lambda_client by main.
s3_resource = boto3.resource('s3')
lambda_client = boto3.client('lambda')

def write_s3(file_path, myfile, bucket='hwm-nba', dedupe_cols=None, sort=None, compression='zip'):
    """Upload a DataFrame (serialised as CSV) or an in-memory buffer to S3.

    A DataFrame is de-duplicated on ``dedupe_cols`` (first row kept), then sorted
    ascending by ``sort``, and written as CSV. With ``compression='zip'`` the CSV is
    stored in a zip archive and the key's extension is replaced by ``.zip``. Any
    other ``myfile`` must expose ``getvalue()`` and is uploaded unchanged.
    """
    # local imports: this cell's import block only brings in StringIO
    import os
    from io import BytesIO

    file_name = file_path.split('/')[-1]
    if isinstance(myfile, pd.DataFrame):
        if dedupe_cols is not None:
            myfile = myfile.drop_duplicates(subset=dedupe_cols, keep='first')
        if sort is not None:
            myfile = myfile.sort_values(by=sort, ascending=True)
        if compression == 'zip':
            output_buffer = BytesIO()
            # splitext only looks at the last path component, so keys without an
            # extension (or with dots in a folder name) keep their full stem.
            file_path = os.path.splitext(file_path)[0] + '.zip'
        else:
            output_buffer = StringIO()
        myfile.to_csv(output_buffer, index=False, compression={'method': compression, 'archive_name': file_name})
        myfile = output_buffer
    s3_resource.Object(bucket, file_path).put(Body=myfile.getvalue())

def main(*args):
    """Forward the request payload to the 'selenium-test' function.

    ``args[0]`` is the payload, either a dict or a string representation of one.
    The function is invoked asynchronously (``InvocationType='Event'``) and nothing
    is returned.
    """
    lambda_payload_3 = args[0]
    if not isinstance(lambda_payload_3, dict):
        # NOTE(review): eval() executes arbitrary code contained in the payload
        # string; kept for compatibility with existing callers, but it should be
        # replaced by json.loads / ast.literal_eval once the format is confirmed.
        lambda_payload_3 = eval(lambda_payload_3)
    elif 'table' in lambda_payload_3:
        print(lambda_payload_3['table'])

    lambda_client.invoke(
        FunctionName='selenium-test',
        InvocationType='Event',
        Payload=json.dumps(lambda_payload_3)
    )