In [2]:
import hopsworks
import pandas as pd
import yfinance as yf
import matplotlib.pyplot as plt
import os

In [3]:
import hsfs

# 1. Login
project = hopsworks.login()

# 2. Get the Feature Store (This triggers the metadata check)
try:
    fs = project.get_feature_store(os.getenv("FEATURE_STORE_NAME"))
    print(f"Successfully connected to Feature Store: {fs.name}")
except Exception as e:
    print(f"Feature Store Connection Error: {e}")

# 3. Check versions
print(f"HSFS Version: {hsfs.__version__}")

2025-12-23 19:57:27,676 INFO: Initializing external client
2025-12-23 19:57:27,677 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-12-23 19:57:29,651 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1267871
Successfully connected to Feature Store: a1id2223_featurestore
HSFS Version: 4.2.10


In [4]:
data = pd.read_csv("/Users/sambarati/Documents/GitHub/nlp-stock-prediction/data/apple_news_data.csv")


In [6]:
# Aggregate sentiment data by date
# Convert date column to datetime first
data['date'] = pd.to_datetime(data['date'])
# Normalize to date only (remove time component)
data['date'] = data['date'].dt.normalize()

# Since we may have multiple news articles per day, calculate daily averages
sentiment_daily = data.groupby('date').agg({
    'sentiment_polarity': 'mean',
    'sentiment_neg': 'mean',
    'sentiment_neu': 'mean',
    'sentiment_pos': 'mean'
}).reset_index()

# Get smallest and highest dates
from_date = sentiment_daily['date'].min()
to_date = sentiment_daily['date'].max()
# Set index to date for later join
sentiment_daily.set_index('date', inplace=True)

print(f"Sentiment data: {len(sentiment_daily)} rows, {sentiment_daily.index.nunique()} unique dates")
print(sentiment_daily.iloc[0])

Sentiment data: 1574 rows, 1574 unique dates
sentiment_polarity    0.994
sentiment_neg         0.023
sentiment_neu         0.869
sentiment_pos         0.108
Name: 2016-02-19 00:00:00+00:00, dtype: float64


In [7]:
stock_data = yf.Ticker("AAPL") 
stock_data_price_history = stock_data.history(period="10y") # Get 10 yr price history
stock_data_clean = stock_data_price_history[['Open', 'High', 'Low', 'Close', 'Volume']]
stock_data_clean = stock_data_clean[from_date:to_date]
print(stock_data_clean.index)

DatetimeIndex(['2016-02-19 00:00:00-05:00', '2016-02-22 00:00:00-05:00',
               '2016-02-23 00:00:00-05:00', '2016-02-24 00:00:00-05:00',
               '2016-02-25 00:00:00-05:00', '2016-02-26 00:00:00-05:00',
               '2016-02-29 00:00:00-05:00', '2016-03-01 00:00:00-05:00',
               '2016-03-02 00:00:00-05:00', '2016-03-03 00:00:00-05:00',
               ...
               '2024-11-13 00:00:00-05:00', '2024-11-14 00:00:00-05:00',
               '2024-11-15 00:00:00-05:00', '2024-11-18 00:00:00-05:00',
               '2024-11-19 00:00:00-05:00', '2024-11-20 00:00:00-05:00',
               '2024-11-21 00:00:00-05:00', '2024-11-22 00:00:00-05:00',
               '2024-11-25 00:00:00-05:00', '2024-11-26 00:00:00-05:00'],
              dtype='datetime64[ns, America/New_York]', name='Date', length=2209, freq=None)


In [8]:
# Remove timezone from stock data to match sentiment data (which is timezone-naive)
stock_data_clean.index = stock_data_clean.index.tz_convert(None)
print(stock_data_clean.index)

DatetimeIndex(['2016-02-19 05:00:00', '2016-02-22 05:00:00',
               '2016-02-23 05:00:00', '2016-02-24 05:00:00',
               '2016-02-25 05:00:00', '2016-02-26 05:00:00',
               '2016-02-29 05:00:00', '2016-03-01 05:00:00',
               '2016-03-02 05:00:00', '2016-03-03 05:00:00',
               ...
               '2024-11-13 05:00:00', '2024-11-14 05:00:00',
               '2024-11-15 05:00:00', '2024-11-18 05:00:00',
               '2024-11-19 05:00:00', '2024-11-20 05:00:00',
               '2024-11-21 05:00:00', '2024-11-22 05:00:00',
               '2024-11-25 05:00:00', '2024-11-26 05:00:00'],
              dtype='datetime64[ns]', name='Date', length=2209, freq=None)


In [9]:
# Remove timezone from sentiment data to match stock data (which is timezone-naive)
sentiment_daily.index = sentiment_daily.index.tz_localize(None)
print(f"Stock tz: {stock_data_clean.index.tz}, Sentiment tz: {sentiment_daily.index.tz}")
print(f"Sentiment index sample: {sentiment_daily.index[:5]}")

Stock tz: None, Sentiment tz: None
Sentiment index sample: DatetimeIndex(['2016-02-19', '2017-10-05', '2017-11-27', '2017-11-30',
               '2018-01-31'],
              dtype='datetime64[ns]', name='date', freq=None)


In [10]:
# Normalize stock data index to date only (remove time component)
stock_data_clean.index = stock_data_clean.index.normalize()
# Sentiment data is already normalized

# Rename both indices to 'date' (lowercase) for consistency
stock_data_clean.index.name = 'date'
sentiment_daily.index.name = 'date'

print(f"Stock dates: {stock_data_clean.index[:5]}")
print(f"Sentiment dates: {sentiment_daily.index[:5]}")
print(f"\nStock data has {stock_data_clean.index.duplicated().sum()} duplicate dates")
print(f"Sentiment data has {sentiment_daily.index.duplicated().sum()} duplicate dates")

Stock dates: DatetimeIndex(['2016-02-19', '2016-02-22', '2016-02-23', '2016-02-24',
               '2016-02-25'],
              dtype='datetime64[ns]', name='date', freq=None)
Sentiment dates: DatetimeIndex(['2016-02-19', '2017-10-05', '2017-11-27', '2017-11-30',
               '2018-01-31'],
              dtype='datetime64[ns]', name='date', freq=None)

Stock data has 0 duplicate dates
Sentiment data has 0 duplicate dates


In [11]:
# Merge on the date index
merged_data = stock_data_clean.join(sentiment_daily, how='inner')

# Reset index to convert date from index to column
merged_data = merged_data.reset_index()

# Rename columns to lowercase to match feature group schema
merged_data.columns = merged_data.columns.str.lower()

print(merged_data.iloc[0])


# Display the merged datamerged_data.size
print(f"Merged data shape: {merged_data.shape}")
print(f"Columns: {merged_data.columns.tolist()}")

date                  2016-02-19 00:00:00
open                             21.76247
high                            21.934757
low                             21.717132
close                           21.771538
volume                          141496800
sentiment_polarity                  0.994
sentiment_neg                       0.023
sentiment_neu                       0.869
sentiment_pos                       0.108
Name: 0, dtype: object
Merged data shape: (1132, 10)
Columns: ['date', 'open', 'high', 'low', 'close', 'volume', 'sentiment_polarity', 'sentiment_neg', 'sentiment_neu', 'sentiment_pos']


In [19]:
# Create the feature group for merged sentiment and stock price data
fg = fs.get_or_create_feature_group(
    name="stock_price_and_history", 
    description="joined stock price and sentiment history",
    version = 1,
    primary_key = ['date'],
    event_time = 'date'
    )

In [20]:
fg.insert(merged_data, wait=True)

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1267871/fs/1262659/fg/1878362


Uploading Dataframe: 100.00% |██████████| Rows 1132/1132 | Elapsed Time: 00:00 | Remaining Time: 00:00


Launching job: stock_price_and_history_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1267871/jobs/named/stock_price_and_history_1_offline_fg_materialization/executions
2025-12-22 14:34:19,184 INFO: Waiting for execution to finish. Current state: INITIALIZING. Final status: UNDEFINED
2025-12-22 14:34:22,364 INFO: Waiting for execution to finish. Current state: RUNNING. Final status: UNDEFINED
2025-12-22 14:36:16,965 INFO: Waiting for execution to finish. Current state: AGGREGATING_LOGS. Final status: SUCCEEDED
2025-12-22 14:36:17,135 INFO: Waiting for log aggregation to finish.
2025-12-22 14:36:42,601 INFO: Execution finished successfully.


(Job('stock_price_and_history_1_offline_fg_materialization', 'SPARK'), None)

In [None]:
# Check for duplicate columns
print("Columns:", merged_data.columns.tolist())
print("Duplicate columns:", merged_data.columns[merged_data.columns.duplicated()].tolist())
print("\nDataFrame info:")
merged_data.info()

Columns: ['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'sentiment_polarity', 'sentiment_neg', 'sentiment_neu', 'sentiment_pos']
Duplicate columns: []

DataFrame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24982 entries, 0 to 24981
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   Date                24982 non-null  datetime64[ns]
 1   Open                24982 non-null  float64       
 2   High                24982 non-null  float64       
 3   Low                 24982 non-null  float64       
 4   Close               24982 non-null  float64       
 5   Volume              24982 non-null  int64         
 6   sentiment_polarity  24971 non-null  float64       
 7   sentiment_neg       24971 non-null  float64       
 8   sentiment_neu       24971 non-null  float64       
 9   sentiment_pos       24971 non-null  float64       
dtypes: datetime64[ns](1), float64(8), int64(1)