In [None]:
import requests
import pandas as pd
import os
import fastparquet
import datetime as dt
import time
import snappy

In [None]:
#def is_time_between(begin_time, end_time, check_time=None):
    # If check time is not given, default to current UTC time
    #check_time = dt.datetime.now()
    #if begin_time < end_time:
        #return check_time >= begin_time and check_time <= end_time
    #else: # crosses midnight
        #return check_time >= begin_time or check_time <= end_time

# Original test case from OP
#is_time_between(time(10,30), time(16,30))

# Test case when range crosses midnight
#is_time_between(time(22,0), time(4,00))

In [None]:
# function to get vehicle locations by route
def get_vehicles_byroute(routenum, timestamp, timeout=30):
    """Fetch the vehicles currently reported for one route as a DataFrame.

    Queries the api.metro.net ``lametro`` vehicles endpoint for ``routenum``
    and turns the ``items`` list of the JSON response into a DataFrame.

    Parameters
    ----------
    routenum : int or str
        Route identifier; interpolated into the request URL and stored in
        the ``route`` index level.
    timestamp : datetime.datetime
        Value stored in the ``comparison_time`` index level of every row, so
        that rows fetched in the same polling round can be grouped together.
    timeout : float or tuple, optional
        Passed to ``requests.get``. Without a timeout a stalled connection
        would block the caller indefinitely.

    Returns
    -------
    pandas.DataFrame or None
        One row per item in the response, plus a ``call_time`` column holding
        the local time the response was processed. The index is a MultiIndex
        with levels ``route`` and ``comparison_time`` (identical for all rows).
        ``None`` is returned when the status code is not OK but is also not
        an error that ``raise_for_status`` raises for.

    Raises
    ------
    requests.HTTPError
        Raised by ``resp.raise_for_status()`` for error status codes.
    """
    #make request with route number
    resp = requests.get(
        'http://api.metro.net/agencies/lametro/routes/%s/vehicles/' % routenum,
        timeout=timeout)

    #check if call is successful
    if resp.status_code != requests.codes.ok:
        print('API call unsuccessful')
        resp.raise_for_status()
        # Only reached for non-OK statuses that raise_for_status() lets
        # through; callers treat None as "no data for this route".
        return None

    #store json response as data
    data = resp.json()

    #convert json to dataframe
    routedata = pd.DataFrame(data['items'])

    #add current time as a value to dataframe "call_time"
    routedata['call_time'] = dt.datetime.now()

    #index with MultiIndex: every row is labelled (route, comparison_time)
    index_frame = pd.DataFrame(
        {'route': routenum, 'comparison_time': timestamp},
        index=routedata.index)
    routedata.index = pd.MultiIndex.from_frame(index_frame)

    return routedata

In [None]:
#call multiple routes
def get_vehicles_byroutes(*routes, delay):
    """Query several routes in one polling round and combine the results.

    Every route is fetched with ``get_vehicles_byroute`` using the same
    ``timestamp`` (taken once, before the first request), with a pause of
    ``delay`` seconds after each request.

    Parameters
    ----------
    *routes
        Route identifiers to query, in order.
    delay : float
        Seconds to sleep after each request (keyword-only).

    Returns
    -------
    list
        ``[queries, single_time_df]`` where ``queries`` is the number of
        requests attempted and ``single_time_df`` is the row-wise
        concatenation of all per-route frames with columns sorted. It is an
        empty DataFrame when no route produced a frame.
    """
    timestamp = dt.datetime.now()
    frames = []
    queries = 0

    for route in routes:
        frames.append(get_vehicles_byroute(routenum=route, timestamp=timestamp))
        time.sleep(delay)
        queries += 1

    # get_vehicles_byroute may return None; skip those results.
    frames = [frame for frame in frames if frame is not None]

    if frames:
        # DataFrame.append was removed in pandas 2.0; concatenate once instead
        # of growing the frame inside the loop. concat(sort=True) only sorts
        # columns that are not already aligned, so sort explicitly to always
        # return sorted columns.
        single_time_df = pd.concat(frames, sort=True).sort_index(axis=1)
    else:
        single_time_df = pd.DataFrame()

    return [queries, single_time_df]

In [None]:
def get_vehicles_days(*routes, days=1, delay=3, interval=120):
    """Poll vehicle positions for ``routes`` repeatedly for ``days`` days.

    Each polling round calls ``get_vehicles_byroutes`` and adds its rows to a
    growing DataFrame. After every round the whole DataFrame is written as a
    parquet checkpoint under ``<cwd>/data/processed`` and the previous
    checkpoint file is removed. When the loop ends, the total number of
    queries is written to ``<cwd>/data/log.txt``.

    Parameters
    ----------
    *routes
        Route identifiers to poll.
    days : float, optional
        How long to keep polling, in days from the moment of the call.
    delay : float, optional
        Seconds to pause after each single-route request.
    interval : float, optional
        Target number of seconds between the starts of polling rounds. The
        pause between rounds is ``interval - len(routes) * delay`` and is
        never negative.

    Returns
    -------
    pandas.DataFrame
        All rows collected during the run (empty if no round was executed).
    """
    #create route name string for file output
    # (the trailing '-' is kept so checkpoint file names stay unchanged)
    route_names = 'routes_' + ''.join('{}-'.format(route) for route in routes)

    #make output directory, if necessary
    cwd = os.getcwd()
    out_dir = os.path.join(cwd, 'data', 'processed')
    os.makedirs(out_dir, exist_ok=True)

    #create datetime objects for now and days days from now
    now = dt.datetime.now()
    then = now + dt.timedelta(days=days)

    bigdf = pd.DataFrame()
    queries = 0
    prev_path = None

    #request data once per interval; time.sleep() rejects negative values,
    #so clamp the compensated pause at zero
    sleep_time = max(0, interval - len(routes) * delay)

    #loop will end when current time passes target
    while now < then:
        round_queries, round_df = get_vehicles_byroutes(*routes, delay=delay)
        queries += round_queries

        # DataFrame.append was removed in pandas 2.0; use pd.concat instead.
        if bigdf.empty:
            bigdf = round_df
        elif not round_df.empty:
            bigdf = pd.concat([bigdf, round_df])

        time.sleep(sleep_time)

        #update current time
        now = dt.datetime.now()

        #write the new checkpoint, then drop the previous one
        fname = route_names + '_' + now.strftime('%Y-%m-%d-%H-%M') + '.parquet'
        path = os.path.join(out_dir, fname)
        bigdf.to_parquet(path)

        # File names only have minute resolution: two rounds within the same
        # minute share a name, and that file was just overwritten, so it must
        # not be deleted.
        if prev_path is not None and prev_path != path and os.path.isfile(prev_path):
            os.remove(prev_path)
        prev_path = path

    with open(os.path.join(cwd, 'data', 'log.txt'), mode='w') as log:
        log.write('total queries executed:{}'.format(queries))

    return bigdf

In [None]:
# Test run: poll the eight routes below with get_vehicles_days' defaults
# (days=1, delay=3, interval=120), i.e. one polling round every two minutes
# for one day. NOTE: this call blocks the kernel until the run has finished.

df = get_vehicles_days(20, 720, 33, 733, 204, 754, 4, 704)