In [0]:


# %sql
# DROP TABLE IF EXISTS stocks_ai.stocks_news_data.stock_news_bronze_layer;
# DROP SCHEMA IF EXISTS stocks_ai.stocks_news_data;



In [0]:
import os
import time
from datetime import datetime, timedelta

import pandas as pd
import requests
from pyspark.sql import SparkSession
from pyspark.sql.functions import max, col

# Initialize Spark session
spark = SparkSession.builder.getOrCreate()

# Finnhub API key. Read from the FINNHUB_API_KEY environment variable so the real
# secret never has to be typed into (and committed with) the notebook; the original
# placeholder is only used when the variable is not set.
api_key = os.environ.get("FINNHUB_API_KEY", "Your Finhub news Api")

# 0 -> the bronze table does not exist yet (full load)
# 1 -> the bronze table already exists (incremental load)
is_incremental_flag = int(
    spark.catalog.tableExists("stocks_ai.stocks_news_data.stock_news_bronze_layer")
)

def get_finnhub_news(tickers, api_key, start_date):
    """
    This function fetches news from the Finnhub API for a list of tickers.
    Only the date (not time) is returned.
    """
    all_records = []
    start_date_str = start_date.strftime("%Y-%m-%d")
    end_date_str = datetime.now().strftime("%Y-%m-%d")

    for ticker in tickers:
        url = f"https://finnhub.io/api/v1/company-news?symbol={ticker}&from={start_date_str}&to={end_date_str}&token={api_key}"
        response = requests.get(url)
        if response.status_code != 200:
            continue
        data = response.json()

        for article in data:
            if 'summary' in article and 'datetime' in article:
                record = {
                    'stock': ticker.upper(),
                    'summary': article['summary'],
                    'date': datetime.fromtimestamp(article['datetime']).date()
                }
                all_records.append(record)
        time.sleep(1)  

    return pd.DataFrame(all_records)

# Get tickers (only the ticker column is brought back to the driver)
stock_names_df = spark.sql("SELECT * FROM stocks_ai.stocks_name_ticker.stock_names")
stock_list = [row['ticker'] for row in stock_names_df.select("ticker").collect()]

bronze_table = "stocks_ai.stocks_news_data.stock_news_bronze_layer"
dedup_keys = ["stock", "summary", "date"]
latest_date = None

if is_incremental_flag == 0:
    # Full load (last 5 days)
    start_date = datetime.now() - timedelta(days=5)
else:
    # Incremental load from latest date in table
    latest_date = (
        spark.read.table(bronze_table)
        .agg(max("date").alias("latest_date"))
        .collect()[0]["latest_date"]
    )
    if latest_date is None:
        # The table exists but holds no rows: use the full-load window.
        start_date = datetime.now() - timedelta(days=5)
    else:
        # Re-fetch the last loaded day as well: articles published on that day
        # after the previous run would otherwise never be picked up. Rows that
        # are already stored are removed again before the append below.
        start_date = latest_date

news_df = get_finnhub_news(stock_list, api_key, start_date=start_date)

if news_df.empty:
    # spark.createDataFrame cannot infer a schema from an empty pandas frame,
    # so there is nothing to write on a run that returned no articles.
    print("No news articles returned; nothing written to " + bronze_table)
else:
    df_spark = spark.createDataFrame(news_df)

    if is_incremental_flag == 0:
        spark.sql("CREATE SCHEMA IF NOT EXISTS stocks_ai.stocks_news_data")
        df_spark.write.format("delta").mode("overwrite").saveAsTable(bronze_table)
    else:
        if latest_date is not None:
            # Keep only rows that are not already in the table for the re-fetched window.
            already_loaded = (
                spark.read.table(bronze_table)
                .where(col("date") >= latest_date)
                .select(*dedup_keys)
            )
            df_spark = df_spark.join(already_loaded, on=dedup_keys, how="left_anti")
        df_spark.write.format("delta").mode("append").saveAsTable(bronze_table)


In [0]:
# Things that can be improved:
# - unit testing
# - logging and monitoring


stock,summary,date
INTC,"Cornelis Networks onTuesday released a suite of networking hardware and softwareaimed at linking together up to half a million artificialintelligence chips. Cornelis, which was spun...",2025-06-03
INTC,New Intel--SoftBank Tie-Up Targets Japan's AI Cloud Market,2025-06-02
INTC,"Intel axed 22,000 jobs and Microsoft 6,000 as overall cuts drop from 2024 levels",2025-06-02
INTC,"The next big breakthrough in AI isn't too far off, says one industry expert at the ground floor of tech innovation.",2025-06-02
INTC,Intel Shares Tumble 30% from Highs as Wall Street Questions Strategy Shift,2025-06-02
ORCL,"Zacks.com users have recently been watching Oracle (ORCL) quite a bit. Thus, it is worth knowing the facts that could determine the stock's prospects.",2025-06-05
ORCL,"(Bloomberg) -- Microsoft Corp. Chief Executive Officer Satya Nadella said his company’s crucial partnership with OpenAI is changing, but remains strong.Most Read from BloombergICE Moves to DNA-Test Families Targeted for Deportation with New ContractThe Global Struggle to Build Safer CarsNYC Residents Want Safer Streets, Cheaper Housing, Survey SaysThe Buffalo Architect Fighting for Women in DesignUS Housing Agency Vulnerable to Fraud After DOGE Cuts, Documents Warn“Any company that has gone from",2025-06-05
ORCL,"(Bloomberg) -- US Commerce Secretary Howard Lutnick said the Trump administration has been reworking agreements forged with semiconductor makers under the 2022 Chips Act to secure what he called better terms aimed at generating additional domestic investment.Most Read from BloombergICE Moves to DNA-Test Families Targeted for Deportation with New ContractThe Global Struggle to Build Safer CarsNYC Residents Want Safer Streets, Cheaper Housing, Survey SaysThe Buffalo Architect Fighting for Women in",2025-06-05
ORCL,"The network infrastructure includes Remote Direct Memory Access technology, allowing servers to communicate directly with each other, speeding up data flow.",2025-06-05
ORCL,"Equity Insider News CommentaryIssued on behalf of Avant Technologies Inc.VANCOUVER, BC, June 4, 2025 /PRNewswire/ -- Equity Insider News Commentary – Generative AI is transforming healthcare...",2025-06-04
