In [1]:
import hopsworks
import pandas as pd
import yfinance as yf
import matplotlib.pyplot as plt
import os

In [2]:
import hsfs

# 1. Log in to Hopsworks; `project` is the handle used to reach the feature store.
project = hopsworks.login()

# 2. Get the Feature Store.
# A failure is reported and then re-raised: every later cell that writes a
# feature group needs `fs`, so swallowing the error here would only defer the
# failure to a confusing NameError further down the notebook.
try:
    fs = project.get_feature_store("A1ID2223")
except Exception as e:
    print(f"Feature Store Connection Error: {e}")
    raise
else:
    print(f"Successfully connected to Feature Store: {fs.name}")

# 3. Report the installed HSFS client version (useful when debugging API mismatches)
print(f"HSFS Version: {hsfs.__version__}")

2025-12-29 11:34:50,691 INFO: Initializing external client
2025-12-29 11:34:50,692 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-12-29 11:34:52,318 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1267871
Successfully connected to Feature Store: a1id2223_featurestore
HSFS Version: 4.2.10


In [3]:
# Location of the raw Apple news CSV.
# Set the APPLE_NEWS_CSV environment variable to point at the file on machines
# where the repository is checked out elsewhere; when it is unset, the original
# hard-coded location is used so existing behavior is unchanged.
news_csv_path = os.environ.get(
    "APPLE_NEWS_CSV",
    "/Users/sambarati/Documents/GitHub/nlp-stock-prediction/data/apple_news_data.csv",
)
data = pd.read_csv(news_csv_path)


In [4]:
# Build one row of sentiment scores per calendar day.

# Parse the article timestamps and truncate each one to midnight, so every
# article published on the same day shares a single grouping key.
data['date'] = pd.to_datetime(data['date']).dt.normalize()

# A day may carry several articles: collapse them by averaging each score.
sentiment_columns = [
    'sentiment_polarity',
    'sentiment_neg',
    'sentiment_neu',
    'sentiment_pos',
]
sentiment_daily = data.groupby('date')[sentiment_columns].mean().reset_index()

# Earliest day that has sentiment data; used later to trim the price history.
from_date = sentiment_daily['date'].min()

# Index by date so the frame can be aligned with the price data later on.
sentiment_daily = sentiment_daily.set_index('date')

print(f"Sentiment data: {len(sentiment_daily)} rows, {sentiment_daily.index.nunique()} unique dates")
print(sentiment_daily.iloc[0])

Sentiment data: 1574 rows, 1574 unique dates
sentiment_polarity    0.994
sentiment_neg         0.023
sentiment_neu         0.869
sentiment_pos         0.108
Name: 2016-02-19 00:00:00+00:00, dtype: float64


In [5]:
# Handle for Apple's ticker on Yahoo Finance.
stock_data = yf.Ticker("AAPL")

# Get 10 yr price history
stock_data_price_history = stock_data.history(period="10y")

# Keep only the opening price, starting at the first date that has sentiment
# data (the original note lists 'High', 'Low', 'Close', 'Volume' as the other
# columns one could keep).
# Selecting rows and columns in a single .loc call and taking an explicit
# .copy() yields an independent frame, so adding `target_open` below does not
# write into a slice of the history frame (avoids SettingWithCopyWarning).
stock_data_clean = stock_data_price_history.loc[from_date:, ['Open']].copy()

# Target for each row is the opening price of the following row (next trading day).
stock_data_clean["target_open"] = stock_data_clean["Open"].shift(-1)
# The last row has no following row, so its target is NaN: drop it.
stock_data_clean = stock_data_clean.dropna(subset=["target_open"])

In [6]:
# Make the stock index timezone-naive so it can be compared with the sentiment
# index once that one has been made naive as well (done in a later cell).
# tz_convert(None) converts to UTC before dropping the tz info, which is why
# the printed timestamps are not at midnight; the time component is removed
# when the index is normalized later.
# The guard keeps this cell re-runnable: tz_convert raises on an index that is
# already tz-naive.
if stock_data_clean.index.tz is not None:
    stock_data_clean.index = stock_data_clean.index.tz_convert(None)
print(stock_data_clean.index)

DatetimeIndex(['2016-02-19 05:00:00', '2016-02-22 05:00:00',
               '2016-02-23 05:00:00', '2016-02-24 05:00:00',
               '2016-02-25 05:00:00', '2016-02-26 05:00:00',
               '2016-02-29 05:00:00', '2016-03-01 05:00:00',
               '2016-03-02 05:00:00', '2016-03-03 05:00:00',
               ...
               '2025-12-11 05:00:00', '2025-12-12 05:00:00',
               '2025-12-15 05:00:00', '2025-12-16 05:00:00',
               '2025-12-17 05:00:00', '2025-12-18 05:00:00',
               '2025-12-19 05:00:00', '2025-12-22 05:00:00',
               '2025-12-23 05:00:00', '2025-12-24 05:00:00'],
              dtype='datetime64[ns]', name='Date', length=2478, freq=None)


In [7]:
# Drop the tz info from the sentiment index so that both indices are
# timezone-naive before they are compared or joined.
naive_sentiment_index = sentiment_daily.index.tz_localize(None)
sentiment_daily.index = naive_sentiment_index

# Sanity check: both timezones should now print as None.
print(f"Stock tz: {stock_data_clean.index.tz}, Sentiment tz: {sentiment_daily.index.tz}")
print(f"Sentiment index sample: {sentiment_daily.index[:5]}")

Stock tz: None, Sentiment tz: None
Sentiment index sample: DatetimeIndex(['2016-02-19', '2017-10-05', '2017-11-27', '2017-11-30',
               '2018-01-31'],
              dtype='datetime64[ns]', name='date', freq=None)


In [8]:
# Truncate the stock timestamps to midnight and give the index the shared
# lowercase name 'date' in one step. The sentiment index was already truncated
# to midnight when it was built, so it only needs the name.
stock_data_clean.index = stock_data_clean.index.normalize().rename('date')
sentiment_daily.index = sentiment_daily.index.rename('date')

# Show a sample of each index, then confirm neither contains repeated dates.
print(f"Stock dates: {stock_data_clean.index[:5]}")
print(f"Sentiment dates: {sentiment_daily.index[:5]}")
print(f"\nStock data has {stock_data_clean.index.duplicated().sum()} duplicate dates")
print(f"Sentiment data has {sentiment_daily.index.duplicated().sum()} duplicate dates")

Stock dates: DatetimeIndex(['2016-02-19', '2016-02-22', '2016-02-23', '2016-02-24',
               '2016-02-25'],
              dtype='datetime64[ns]', name='date', freq=None)
Sentiment dates: DatetimeIndex(['2016-02-19', '2017-10-05', '2017-11-27', '2017-11-30',
               '2018-01-31'],
              dtype='datetime64[ns]', name='date', freq=None)

Stock data has 0 duplicate dates
Sentiment data has 0 duplicate dates


In [9]:
# Move the 'date' index into a regular column so it can serve as the feature
# group's primary key and event time.
# Guarded so that re-running this cell does not call reset_index() a second
# time, which would add a stray 'index' column to the frame being inserted.
if 'date' not in stock_data_clean.columns:
    stock_data_clean = stock_data_clean.reset_index()

# Lower-case all column names (e.g. 'Open' -> 'open').
stock_data_clean.columns = stock_data_clean.columns.str.lower()

# Create (or fetch, if it already exists) the feature group for stock opening prices
fg = fs.get_or_create_feature_group(
    name="opening_prices",
    description="AAPL opening prices with next-day target",
    version=2,
    primary_key=['date'],
    event_time='date',
)
# wait=True blocks until the materialization job reports completion.
fg.insert(stock_data_clean, wait=True)


Uploading Dataframe: 100.00% |██████████| Rows 2478/2478 | Elapsed Time: 00:00 | Remaining Time: 00:00


Launching job: opening_prices_2_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1267871/jobs/named/opening_prices_2_offline_fg_materialization/executions
2025-12-29 11:35:22,042 INFO: Waiting for execution to finish. Current state: INITIALIZING. Final status: UNDEFINED
2025-12-29 11:35:34,842 INFO: Waiting for execution to finish. Current state: SUBMITTED. Final status: UNDEFINED
2025-12-29 11:35:38,031 INFO: Waiting for execution to finish. Current state: RUNNING. Final status: UNDEFINED
2025-12-29 11:37:39,618 INFO: Waiting for execution to finish. Current state: AGGREGATING_LOGS. Final status: SUCCEEDED
2025-12-29 11:37:39,795 INFO: Waiting for log aggregation to finish.
2025-12-29 11:37:55,140 INFO: Execution finished successfully.


(Job('opening_prices_2_offline_fg_materialization', 'SPARK'), None)

In [10]:
# Move the 'date' index into a regular column so it can serve as the feature
# group's primary key and event time.
# Guarded so that re-running this cell does not call reset_index() a second
# time, which would add a stray 'index' column to the frame being inserted.
if 'date' not in sentiment_daily.columns:
    sentiment_daily = sentiment_daily.reset_index()

# Lower-case all column names for consistency with the price feature group.
sentiment_daily.columns = sentiment_daily.columns.str.lower()

print(sentiment_daily.columns)

# Create (or fetch, if it already exists) the feature group for the daily
# averaged news sentiment scores
fg = fs.get_or_create_feature_group(
    name="sentiments",
    description="AAPL stock sentiments",
    version=2,
    primary_key=['date'],
    event_time='date',
)
# wait=True blocks until the materialization job reports completion.
fg.insert(sentiment_daily, wait=True)

Index(['date', 'sentiment_polarity', 'sentiment_neg', 'sentiment_neu',
       'sentiment_pos'],
      dtype='object')


Uploading Dataframe: 100.00% |██████████| Rows 1574/1574 | Elapsed Time: 00:00 | Remaining Time: 00:00


Launching job: sentiments_2_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1267871/jobs/named/sentiments_2_offline_fg_materialization/executions
2025-12-29 11:38:21,342 INFO: Waiting for execution to finish. Current state: SUBMITTED. Final status: UNDEFINED
2025-12-29 11:38:30,911 INFO: Waiting for execution to finish. Current state: RUNNING. Final status: UNDEFINED
2025-12-29 11:40:38,372 INFO: Waiting for execution to finish. Current state: AGGREGATING_LOGS. Final status: SUCCEEDED
2025-12-29 11:40:38,532 INFO: Waiting for log aggregation to finish.
2025-12-29 11:40:47,124 INFO: Execution finished successfully.


(Job('sentiments_2_offline_fg_materialization', 'SPARK'), None)