In [2]:
from pyspark.sql.types import StructType, StructField, StringType, FloatType, LongType
from pyspark.sql.window import Window
import pyspark.sql.functions as F
from datetime import date
import aiohttp
import asyncio
import logging
import json 
import os
from concurrent.futures import ThreadPoolExecutor
import hopsworks

# Polygon.io URL templates; callers fill the {placeholders} with str.format().
# Daily aggregate bars for one ticker between two dates.
BASE_PRICE_URL = (
    "https://api.polygon.io/v2/aggs/ticker/{ticker}"
    "/range/1/day/{start_date}/{end_date}"
    "?adjusted=true&sort=asc&limit=50000&apiKey={api_key}"
)
# News articles for one ticker published on or before {end_date}, oldest first.
BASE_NEWS_URL = (
    "https://api.polygon.io/v2/reference/news"
    "?ticker={ticker}&published_utc.lte={end_date}"
    "&order=asc&limit=1000&sort=published_utc&apiKey={api_key}"
)

# Log in to Hopsworks and keep handles to the project, its feature store and
# its secrets API for the rest of the notebook.
proj = hopsworks.login()
fs = proj.get_feature_store()
secrets = hopsworks.get_secrets_api()
# Read the Polygon API key from the Hopsworks secret store.
# Deliberately NOT printed or logged: cell outputs are saved with the notebook,
# so echoing the value would leak the credential to anyone who can read the file.
polygon_key = secrets.get_secret('POLYGON_API_KEY').value


Logged in to project, explore it here https://hopsworks0.logicalclocks.com/p/119
[REDACTED - an API key was printed here; it has been removed from the saved output and should be rotated]


In [6]:

# Timestamped INFO-level logging for the rest of the notebook.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

# Symbols to download, decoded from a JSON array literal.
tickers = json.loads(
    '["AAPL", "AMZN", "GOOGL", "MSFT", "NVDA", "TSLA"]'
)

# Optional wider universe (index/bond ETFs), currently disabled:
# final_tickers = tickers.copy()
# final_tickers.extend(['SPY', 'QQQ', 'DIA', 'IWM', 'VTI', 'EFA', 'BIL', 'SHV', 'IEI', 'IEF', 'TLT'])


async def fetch_data(session, url):
    """GET ``url`` through ``session`` and return the decoded JSON body.

    Args:
        session: Object whose ``get(url)`` returns an async context manager
            yielding a response (an ``aiohttp.ClientSession`` in this notebook).
        url: Fully-formed request URL, including any API key.

    Returns:
        The parsed JSON payload of the response.

    Raises:
        Whatever ``response.raise_for_status()`` raises for an error status
        (``aiohttp.ClientResponseError`` when used with aiohttp).
    """
    async with session.get(url) as response:
        # Fail loudly on 4xx/5xx. Without this check an error body (rate limit,
        # bad key, ...) is returned like normal data, and callers that read
        # `.get('results', [])` silently end up with no rows.
        response.raise_for_status()
        return await response.json()

async def _fetch_all_pages(session, first_url):
    """Collect every ``results`` item from ``first_url`` and its follow-up pages.

    Pages are requested one after another via ``fetch_data`` for as long as the
    response carries a non-empty ``next_url``.
    """
    collected = []
    page_url = first_url
    while page_url:
        payload = await fetch_data(session, page_url)
        # `or []` also covers a page whose `results` is present but null,
        # which would otherwise make extend() raise TypeError.
        collected.extend(payload.get('results') or [])

        page_url = payload.get('next_url')
        if page_url:
            # The key is re-appended on every follow-up request; presumably
            # Polygon's next_url does not include it (see pagination docs).
            page_url = f"{page_url}&apiKey={polygon_key}"
    return collected


async def get_ticker_data(session, ticker, start_date="2024-01-01"):
    """Download all daily price bars for ``ticker`` from ``start_date`` to today.

    Args:
        session: HTTP session handed through to ``fetch_data``.
        ticker: Symbol substituted into ``BASE_PRICE_URL``.
        start_date: First day of the range, formatted into the URL as-is.
            Defaults to the previously hard-coded "2024-01-01".

    Returns:
        ``{ticker: [bar, ...]}`` with the bars from every page, in page order.
    """
    url = BASE_PRICE_URL.format(
        ticker=ticker,
        start_date=start_date,
        end_date=date.today(),
        api_key=polygon_key,
    )
    return {ticker: await _fetch_all_pages(session, url)}


async def get_news_data(session, ticker):
    """Download all news articles for ``ticker`` published up to today.

    Args:
        session: HTTP session handed through to ``fetch_data``.
        ticker: Symbol substituted into ``BASE_NEWS_URL``.

    Returns:
        ``{ticker: [article, ...]}`` with the articles from every page, in page order.
    """
    url = BASE_NEWS_URL.format(
        ticker=ticker,
        end_date=date.today(),
        api_key=polygon_key,
    )
    return {ticker: await _fetch_all_pages(session, url)}

async def call_price(tickers):
    """Fetch price data for every ticker concurrently over one HTTP session.

    Args:
        tickers: Iterable of ticker symbols.

    Returns:
        A list with one ``get_ticker_data`` result per ticker, in input order.
    """
    async with aiohttp.ClientSession() as http:
        pending = [get_ticker_data(http, symbol) for symbol in tickers]
        gathered = await asyncio.gather(*pending)
    return gathered

async def call_news(tickers):
    """Fetch news data for every ticker concurrently over one HTTP session.

    Args:
        tickers: Iterable of ticker symbols.

    Returns:
        A list with one ``get_news_data`` result per ticker, in input order.
    """
    async with aiohttp.ClientSession() as http:
        pending = [get_news_data(http, symbol) for symbol in tickers]
        gathered = await asyncio.gather(*pending)
    return gathered

def process_price_data(price_data):
    """Flatten per-ticker price results into a list of row tuples.

    Args:
        price_data: Iterable of ``{ticker: [bar, ...]}`` dicts, where each bar is
            a dict with keys "t", "o", "h", "l", "c", "v" and optionally "vw"
            and "n" (presumably Polygon's timestamp/open/high/low/close/volume/
            VWAP/trade-count fields - confirm against the API docs).

    Returns:
        A list of ``(ticker, t, o, h, l, c, v, vw, n)`` tuples. "o" through "vw"
        are converted with ``float()``; a missing "vw" or "n" becomes 0.

    Raises:
        KeyError: if a bar lacks one of the required keys.
    """
    def _as_row(symbol, bar):
        # Field order must match the column order of the DataFrame schema.
        return (
            symbol,
            bar["t"],
            float(bar["o"]),
            float(bar["h"]),
            float(bar["l"]),
            float(bar["c"]),
            float(bar["v"]),
            float(bar.get("vw", 0)),
            bar.get("n", 0),
        )

    return [
        _as_row(symbol, bar)
        for per_ticker in price_data
        for symbol, bars in per_ticker.items()
        for bar in bars
    ]

def process_news_data(news_data):
    """Flatten per-ticker news results into one dict per article.

    Args:
        news_data: Iterable of ``{ticker: [article_dict, ...]}`` dicts.

    Returns:
        A list of new dicts, one per article, each starting with a
        "search_ticker" entry holding the ticker the article was fetched for,
        followed by the article's own fields. An article that already has a
        "search_ticker" key keeps its own value.
    """
    flattened = []
    for per_ticker in news_data:
        for symbol, articles in per_ticker.items():
            for article in articles:
                row = {"search_ticker": symbol}
                row.update(article)
                flattened.append(row)
    return flattened

def run_in_thread(fn):
    """Run ``fn()`` on a worker thread and block until it finishes.

    The notebook passes ``lambda: asyncio.run(...)`` here; ``asyncio.run``
    refuses to start in a thread that already has a running event loop
    (presumably the case for the notebook kernel's main thread), so the call
    is moved to a fresh thread.

    Args:
        fn: Zero-argument callable.

    Returns:
        Whatever ``fn()`` returns.

    Raises:
        Any exception raised by ``fn`` is re-raised in the calling thread.
    """
    # No event loop is created or installed here: `fn` runs on the pool thread,
    # so a loop set on the calling thread would never be used, would never be
    # closed, and would replace the caller's current loop as a side effect.
    with ThreadPoolExecutor(max_workers=1) as pool:
        return pool.submit(fn).result()

In [7]:
# Explicit Spark schema for the price rows. Column order matches the tuples
# built by process_price_data; every column is nullable.
price_schema = StructType([
    StructField(column, dtype, True)
    for column, dtype in (
        ("ticker", StringType()),
        ("t", LongType()),
        ("o", FloatType()),
        ("h", FloatType()),
        ("l", FloatType()),
        ("c", FloatType()),
        ("v", FloatType()),
        ("vw", FloatType()),
        ("n", LongType()),
    )
])

logging.info("Calling Polygon API")

# Price data is required: a failure here propagates and stops the cell.
price_data = run_in_thread(lambda: asyncio.run(call_price(tickers)))

# News is best-effort. Bind a default first so a failed fetch cannot leave
# `news_data` undefined (or stale from an earlier run) and surface further
# down as a misleading NameError.
news_data = []
try:
    news_data = run_in_thread(lambda: asyncio.run(call_news(tickers)))
except Exception as e:
    logging.error(f"Error calling news API: {e}")

logging.info("Done calling Polygon API.. Processing price data")

price_rows = process_price_data(price_data)
price_df = spark.createDataFrame(price_rows, schema=price_schema)

# `news_df` is always bound after this cell: a DataFrame on success, else None.
news_df = None
try:
    logging.info("Creating News Data")
    news_rows = process_news_data(news_data)
    if news_rows:
        # No explicit schema: Spark infers it from the article dicts.
        news_df = spark.createDataFrame(news_rows)
    else:
        # Spark cannot infer a schema from an empty list, so skip explicitly.
        logging.warning("No news rows available; news_df was not created")
except Exception as e:
    logging.error(f"Error processing news data: {e}")



2024-09-18 14:59:38,803 INFO: Calling Polygon API
2024-09-18 14:59:40,433 INFO: Done calling Polygon API.. Processing price data
2024-09-18 14:59:42,157 INFO: Creating News Data


In [8]:
# Preview the first rows of the price DataFrame as a text table.
price_df.show()

+------+-------------+-------+--------+--------+------+-----------+--------+-------+
|ticker|            t|      o|       h|       l|     c|          v|      vw|      n|
+------+-------------+-------+--------+--------+------+-----------+--------+-------+
|  AAPL|1704171600000| 187.15|  188.44| 183.885|185.64|8.1964872E7|185.9465|1008871|
|  AAPL|1704258000000| 184.22|  185.88|  183.43|184.25| 5.841446E7|184.3226| 656853|
|  AAPL|1704344400000| 182.15|183.0872|  180.88|181.91|7.1878672E7|182.0183| 712692|
|  AAPL|1704430800000| 181.99|  182.76|  180.17|181.18| 6.237116E7| 181.474| 682334|
|  AAPL|1704690000000|182.085|   185.6|   181.5|185.56|5.9144472E7|184.3702| 669173|
|  AAPL|1704776400000| 183.92|  185.15|  182.73|185.14|4.2841808E7|184.3706| 538180|
|  AAPL|1704862800000| 184.35|   186.4|  183.92|186.19|4.6192908E7|185.2509| 554777|
|  AAPL|1704949200000| 186.54|  187.05|  183.62|185.59|4.9128408E7|185.0604| 584008|
|  AAPL|1705035600000| 186.06|  186.74|  185.19|185.92|4.0477784E

In [9]:
# Preview the first rows of the news DataFrame; long cell values are truncated.
news_df.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+-----------+--------------------+--------------------+-------------+--------------------+--------------------+-------+
|         article_url|              author|         description|                  id|           image_url|   keywords|       published_utc|           publisher|search_ticker|             tickers|               title|amp_url|
+--------------------+--------------------+--------------------+--------------------+--------------------+-----------+--------------------+--------------------+-------------+--------------------+--------------------+-------+
|https://www.fool....|newsfeedback@fool...|Our industry expe...|vHF34aOm44qBBn5Uv...|https://g.foolcdn...|[investing]|2018-01-06T01:10:00Z|{name -> The Motl...|         AMZN|      [GS, AMZN, WW]|The 2017 Industry...|   null|
|https://www.fool....|newsfeedback@fool...|They're big, beau...|oc77BKWDEc996_Z22...|https://g.foolc

In [10]:
from pyspark.sql.functions import map_values

# Replace the map-typed `publisher` column with an array of its values
# (presumably because the feature store cannot ingest map columns - confirm).
# The guard keeps the cell re-runnable: after the first run `publisher` has
# already been dropped, and referencing it again would fail.
if 'publisher' in news_df.columns:
    news_df = (
        news_df
        .withColumn('publisher_values', map_values('publisher'))
        .drop('publisher')
    )
news_df.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+-----------+--------------------+-------------+--------------------+--------------------+-------+--------------------+
|         article_url|              author|         description|                  id|           image_url|   keywords|       published_utc|search_ticker|             tickers|               title|amp_url|    publisher_values|
+--------------------+--------------------+--------------------+--------------------+--------------------+-----------+--------------------+-------------+--------------------+--------------------+-------+--------------------+
|https://www.fool....|newsfeedback@fool...|Our industry expe...|vHF34aOm44qBBn5Uv...|https://g.foolcdn...|[investing]|2018-01-06T01:10:00Z|         AMZN|      [GS, AMZN, WW]|The 2017 Industry...|   null|[The Motley Fool,...|
|https://www.fool....|newsfeedback@fool...|They're big, beau...|oc77BKWDEc996_Z22...|https://g.foolc

In [11]:
# Fetch (or create on first run) version 1 of the "stock_price" feature group,
# keyed by ticker and bar timestamp.
stock_fg = fs.get_or_create_feature_group(
    name="stock_price",
    version=1,
    primary_key=['ticker', 't'],
)
# Write the price rows. As the cell's last expression, the return value of
# insert() is echoed as the cell output.
stock_fg.insert(price_df)

Feature Group created successfully, explore it at 
https://hopsworks0.logicalclocks.com/p/119/fs/67/fg/13


(None, None)

In [12]:
# Fetch (or create on first run) version 1 of the "stock_news" feature group,
# keyed by article URL.
# NOTE(review): news rows are built per (search_ticker, article), so the same
# article_url can occur once per ticker it was fetched for - confirm whether
# the key should include search_ticker or the rows should be de-duplicated.
news_fg = fs.get_or_create_feature_group(
    name="stock_news",
    version=1,
    primary_key=['article_url'],
)
# Write the news rows. As the cell's last expression, the return value of
# insert() is echoed as the cell output.
news_fg.insert(news_df)

Feature Group created successfully, explore it at 
https://hopsworks0.logicalclocks.com/p/119/fs/67/fg/14


(None, None)