In [6]:
import concurrent.futures
import json
import os
import warnings
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime

import numpy as np
import pandas as pd
import pytz
import requests
from polygon.rest import RESTClient
from tqdm import tqdm

# Silence FutureWarning messages for the whole notebook.
warnings.simplefilter(action='ignore', category=FutureWarning)

# The Polygon API key is read from the environment so that it is never stored
# in the notebook source. Export POLYGON_API_KEY before starting the kernel.
# NOTE(review): any key that was previously committed in this cell should be
# treated as exposed and rotated.
api_key = os.environ.get('POLYGON_API_KEY')
if not api_key:
    raise RuntimeError('Set the POLYGON_API_KEY environment variable before running this notebook.')
client = RESTClient(api_key)

In [7]:
def est_all(unix_ms_timestamp):
    """Format a Unix timestamp given in milliseconds as US/Eastern local time.

    Args:
        unix_ms_timestamp: Milliseconds since the Unix epoch (UTC).

    Returns:
        str: The instant rendered as ``'YYYY-MM-DD HH:MM'`` in the
        ``US/Eastern`` zone; seconds and the UTC offset are not included.
        Despite the function name, the zone follows daylight saving time,
        so the result is EDT rather than EST for part of the year.
    """
    # Build a timezone-aware UTC datetime in one step.
    # datetime.utcfromtimestamp() is deprecated since Python 3.12 and returns
    # a naive value that would have to be localized afterwards.
    utc_time = datetime.fromtimestamp(unix_ms_timestamp / 1000.0, tz=pytz.utc)
    eastern_time = utc_time.astimezone(pytz.timezone('US/Eastern'))
    # The format string has no %z/%Z, so tzinfo does not need to be stripped.
    return eastern_time.strftime('%Y-%m-%d %H:%M')

In [8]:
# Minute aggregate-based variable function

# 'HH:MM' labels for every minute from 09:30 through 15:59, i.e. the window
# that minute_data keeps after filtering.
_SESSION_MINUTES = [f'{h:02}:{m:02}' for h in range(9, 16) for m in range(30 if h == 9 else 0, 60)]


def _pivot_by_time(df, value_col, times, suffix):
    """Pivot one bar field to one row per key with '<HH:MM>_<suffix>' columns."""
    wide = df.pivot(index='key', columns='time', values=value_col)
    # reindex guarantees a column for every requested minute (NaN when no bar exists).
    wide = wide.reindex(columns=times).reset_index()
    wide.columns = [f'{col}_{suffix}' if col != 'key' else col for col in wide.columns]
    return wide


def minute_data(ticker, min_date, max_date, timeout=60):
    """Build one row per date of per-minute features for ``ticker``.

    Downloads unadjusted 1-minute aggregate bars from the Polygon REST API
    (following ``next_url`` pagination), converts each bar timestamp with
    ``est_all``, keeps bars between 09:30 and 15:59 and pivots them wide.

    Args:
        ticker: Ticker symbol; inserted into the request URL and the row key.
        min_date: First date to keep. Expected as a ``'YYYY-MM-DD'`` string
            because it is compared lexicographically with formatted bar dates.
        max_date: End of the requested range, same format. It is sent to the
            API as the range end, but the local filter is exclusive, so rows
            dated exactly ``max_date`` are dropped.
        timeout: Seconds passed to ``requests.get`` as ``timeout`` so a
            stalled connection cannot block a worker thread indefinitely.

    Returns:
        pandas.DataFrame: One row per ``'<ticker>_<date>'`` key with columns
        ``key``, ``'<HH:MM>_vw'`` and ``'<HH:MM>_vol'`` for every minute from
        09:30 to 15:59, plus ``'15:58_c'``, ``'15:59_c'`` and ``'15:59_o'``.
        Minutes without a bar are NaN. An empty DataFrame is returned when
        the API yields no results or when any error occurs; in the latter
        case ``'<ticker>:<error>'`` is printed.

    Example:
        minute_data('AAPL', '2024-01-01', '2024-03-01')
    """
    try:
        # Authenticate with a header instead of an apiKey query parameter so
        # the key never appears in URLs or in printed exception messages.
        headers = {'Authorization': f'Bearer {api_key}'}
        url = (f'https://api.polygon.io/v2/aggs/ticker/{ticker}/range/1/minute/'
               f'{min_date}/{max_date}?adjusted=false&sort=asc&limit=50000')
        all_data = []
        while url:
            response = requests.get(url, headers=headers, timeout=timeout)
            # Surface HTTP errors (rate limits, auth failures) through the
            # except handler instead of mistaking them for "no data".
            response.raise_for_status()
            data = response.json()
            if 'results' not in data:
                # Stop paging but keep whatever earlier pages returned.
                break
            all_data.extend(data['results'])
            url = data.get('next_url')
        if not all_data:
            return pd.DataFrame()

        bars = pd.DataFrame(all_data)
        # Get dates/times, filter
        bars['date_time'] = bars['t'].apply(est_all)
        bars[['date', 'time']] = bars['date_time'].str.split(' ', expand=True)
        in_range = (bars['date'] >= min_date) & (bars['date'] < max_date)
        in_session = (bars['time'] >= '09:30') & (bars['time'] <= '15:59')
        # .copy() so adding 'key' below writes to an independent frame.
        bars = bars[in_range & in_session].copy()
        # Create key
        bars['key'] = ticker + '_' + bars['date']

        # VWAP and volume for every session minute, close for the last two
        # minutes, open for the last minute.
        parts = [
            _pivot_by_time(bars, 'vw', _SESSION_MINUTES, 'vw'),
            _pivot_by_time(bars, 'v', _SESSION_MINUTES, 'vol'),
            _pivot_by_time(bars, 'c', ['15:58', '15:59'], 'c'),
            _pivot_by_time(bars, 'o', ['15:59'], 'o'),
        ]
        # Merge all
        mer = parts[0]
        for part in parts[1:]:
            mer = mer.merge(part, on='key', how='left')
        return mer
    except Exception as e:
        # Best effort: report the failure and let the batch continue.
        print(f'{ticker}:{e}')
        return pd.DataFrame()

In [9]:
# For feed0.csv .. feed8.csv: fetch minute features for every row in
# parallel and write the stacked result to the matching out<x>.csv.
for x in range(9):
    feed = pd.read_csv(f'feed{x}.csv')
    with ThreadPoolExecutor() as executor:
        # executor.map yields results in input order; tqdm only reports progress.
        results = list(
            tqdm(
                executor.map(minute_data, feed['ticker'], feed['min_date'], feed['max_date']),
                total=len(feed),
            )
        )
    min_df = pd.concat(results, ignore_index=True)
    # No index=False here, so the row index is written as the first CSV column.
    min_df.to_csv(f'out{x}.csv')

 12%|█▏        | 21/182 [28:03<5:05:52, 113.99s/it] IOStream.flush timed out
100%|██████████| 182/182 [54:13<00:00, 17.88s/it]  
100%|██████████| 182/182 [27:17<00:00,  9.00s/it] 
100%|██████████| 182/182 [32:33<00:00, 10.73s/it]  
100%|██████████| 182/182 [26:27<00:00,  8.72s/it]  
100%|██████████| 181/181 [30:59<00:00, 10.27s/it] 
100%|██████████| 181/181 [28:44<00:00,  9.53s/it] 
100%|██████████| 181/181 [19:06<00:00,  6.34s/it] 
100%|██████████| 181/181 [09:48<00:00,  3.25s/it]
100%|██████████| 181/181 [07:04<00:00,  2.34s/it]


In [10]:
# Same pipeline as the feed0-feed8 loop, run separately for feed9.csv.
x = 9
feed = pd.read_csv(f'feed{x}.csv')
with ThreadPoolExecutor() as executor:
    # executor.map yields results in input order; tqdm only reports progress.
    results = list(
        tqdm(
            executor.map(minute_data, feed['ticker'], feed['min_date'], feed['max_date']),
            total=len(feed),
        )
    )
min_df = pd.concat(results, ignore_index=True)
# No index=False here, so the row index is written as the first CSV column.
min_df.to_csv(f'out{x}.csv')

100%|██████████| 181/181 [32:59<00:00, 10.93s/it]  
