In [1]:
import hopsworks
import pandas as pd
import yfinance as yf
import matplotlib.pyplot as plt
import os

In [2]:
import hsfs

# 1. Login
# The returned project handle is used below for the feature store and the model registry.
project = hopsworks.login()

# 2. Get the Feature Store (This triggers the metadata check)
try:
    fs = project.get_feature_store("A1ID2223")
except Exception as e:
    # Report the problem, then re-raise. Swallowing the error would leave `fs`
    # undefined, and the next cell would fail with an unrelated-looking NameError.
    print(f"Feature Store Connection Error: {e}")
    raise
else:
    print(f"Successfully connected to Feature Store: {fs.name}")

# 3. Check versions
print(f"HSFS Version: {hsfs.__version__}")

2026-01-05 18:22:50,405 INFO: Initializing external client
2026-01-05 18:22:50,413 INFO: Base URL: https://c.app.hopsworks.ai:443
2026-01-05 18:22:52,317 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1267871
Successfully connected to Feature Store: a1id2223_featurestore
HSFS Version: 4.2.10


In [3]:
# Fetch the version-2 handles of both feature groups in one pass:
# first "sentiments", then "opening_prices".
sentiments, opening = (
    fs.get_feature_group(name=fg_name, version=2)
    for fg_name in ("sentiments", "opening_prices")
)

In [4]:
# Read both feature groups into DataFrames.
# Note: the names are rebound from the feature-group handles to the data itself,
# so this cell can only be re-run after the handles have been fetched again.
sentiments, opening = sentiments.read(), opening.read()

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.50s) 
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.82s) 


In [5]:
# Display the most recent date present in the sentiment data.
sentiments['date'].max()

Timestamp('2026-01-05 00:00:00+0000', tz='Etc/UTC')

In [6]:
# Display the most recent date present in the opening-price data.
opening['date'].max()

Timestamp('2026-01-05 00:00:00+0000', tz='Etc/UTC')

In [7]:
# Join sentiment rows with opening prices on their shared date.
# An inner join keeps only dates present in both frames (this drops weekends).
merged = pd.merge(sentiments, opening, how='inner', on='date')

In [8]:
# Keep only the dates that have both a sentiment row and an opening price.
# Any non-key column present in both frames is disambiguated with '_sent' / '_open'.
merged = pd.merge(
    sentiments,
    opening,
    how='inner',
    on='date',
    suffixes=('_sent', '_open'),
)
print(f"After filtering: {len(merged)} dates with both sentiment and opening prices")

After filtering: 1138 dates with both sentiment and opening prices


In [9]:
# ---------------------------------------------------------
# FIX: Fill missing target_open values
# ---------------------------------------------------------
# The daily ingestion script appends new data with target_open=NaN but doesn't
# update the previous day's target. We can fix this here by looking up the next
# available open price for each date.

# Build the "next open" lookup from the full opening-price frame rather than from
# `merged`. `merged` only contains dates that also have a sentiment row, so shifting
# inside it would pair a date with the open of the next *sentiment* date, which can
# be several trading days later whenever sentiment is missing in between.
next_open_by_date = (
    opening[['date', 'open']]
    .drop_duplicates(subset='date', keep='last')  # Series.map needs a unique index
    .sort_values('date')
    .set_index('date')['open']
    .shift(-1)
)

# Keep the merged frame in chronological order for the cells that follow
merged = merged.sort_values('date')

# Fill only the missing targets; target_open values that are already set stay untouched
merged['target_open'] = merged['target_open'].fillna(merged['date'].map(next_open_by_date))

print("Filled missing target_open values using next day's open price.")
print(f"Remaining null target_open: {merged['target_open'].isna().sum()} (should be 1 for the last date)")

Filled missing target_open values using next day's open price.
Remaining null target_open: 1 (should be 1 for the last date)


In [10]:
# Display the most recent date that survived the merge.
merged['date'].max()

Timestamp('2026-01-05 00:00:00+0000', tz='Etc/UTC')

## Prepare Data for Inference
Remove rows with missing target_open (like today's date) and prepare features for prediction.

In [11]:
# Order the merged frame chronologically and report the period it covers
merged_sorted = merged.sort_values(by='date')
print(f"Date range: {merged_sorted['date'].min()} to {merged_sorted['date'].max()}")

# Peek at both ends of the timeline
print("\nFirst few rows:", merged_sorted.head(), sep="\n")
print("\nLast few rows:", merged_sorted.tail(), sep="\n")

Date range: 2016-02-19 00:00:00+00:00 to 2026-01-05 00:00:00+00:00

First few rows:
                         date  sentiment_polarity  sentiment_neg  \
230 2016-02-19 00:00:00+00:00               0.994          0.023   
852 2017-10-05 00:00:00+00:00               0.997          0.008   
731 2017-11-27 00:00:00+00:00               0.997          0.008   
845 2017-11-30 00:00:00+00:00               0.989          0.021   
14  2018-01-31 00:00:00+00:00               0.995          0.009   

     sentiment_neu  sentiment_pos       open  target_open  
230          0.869          0.108  21.762466    21.832740  
852          0.925          0.067  35.978361    36.162716  
731          0.926          0.065  40.995288    40.819656  
845          0.804          0.174  39.913317    39.800894  
14           0.937          0.054  39.079592    39.149856  

Last few rows:
                          date  sentiment_polarity  sentiment_neg  \
0    2025-12-23 00:00:00+00:00            0.298535       0.131

In [12]:
# Keep only rows whose target_open is known (the newest date typically has none yet)
has_target = merged['target_open'].notna()
inference_data = merged.loc[has_target].copy()

print(f"Filtered data: {len(inference_data)} rows with valid target_open")
print(f"\nNull target_open count: {(~has_target).sum()}")

Filtered data: 1137 rows with valid target_open

Null target_open count: 1


In [13]:
# Build the model inputs (X) and the ground truth (y).
# The model was trained with a column named 'opening_prices_open' rather than 'open',
# so the price column is renamed before the features are selected.
inference_data = inference_data.rename(columns={'open': 'opening_prices_open'})

# Four sentiment scores plus the same-day opening price
feature_cols = [
    'sentiment_polarity',
    'sentiment_neg',
    'sentiment_neu',
    'sentiment_pos',
    'opening_prices_open',
]

X_inference = inference_data.loc[:, feature_cols]
y_actual = inference_data.loc[:, 'target_open']

print(f"Features shape: {X_inference.shape}")
print(f"Features: {feature_cols}")
print(f"\nColumn names in inference_data: {inference_data.columns.tolist()}")

Features shape: (1137, 5)
Features: ['sentiment_polarity', 'sentiment_neg', 'sentiment_neu', 'sentiment_pos', 'opening_prices_open']

Column names in inference_data: ['date', 'sentiment_polarity', 'sentiment_neg', 'sentiment_neu', 'sentiment_pos', 'opening_prices_open', 'target_open']


## Load Model from Hopsworks Model Registry

In [14]:
import xgboost

# Which registered model to fetch
model_name = "sentiment_stock_price_model_AAPL"

# Look the model up (version 2) in the project's model registry
mr = project.get_model_registry()
model = mr.get_model(model_name, version=2)

# Pull the model artifacts down; download() returns the local directory they were saved to
model_dir = model.download()
print(f"Model downloaded to: {model_dir}")

Downloading: 0.000%|          | 0/463798 elapsed<00:00 remaining<?

Model downloaded to: /var/folders/2x/6wpkl49n4bsfkkvhr3gm3tt00000gn/T/b3857a55-2175-48ff-b8ac-e7e027bebc70/sentiment_stock_price_model_AAPL/2


In [15]:
# Restore the regressor from the model.json file inside the downloaded artifact directory
xgb_model = xgboost.XGBRegressor()
model_path = os.path.join(model_dir, "model.json")
xgb_model.load_model(model_path)

print(f"Model loaded from {model_path}")

Model loaded from /var/folders/2x/6wpkl49n4bsfkkvhr3gm3tt00000gn/T/b3857a55-2175-48ff-b8ac-e7e027bebc70/sentiment_stock_price_model_AAPL/2/model.json


## Make Predictions

In [16]:
# Score every row that has a known target
y_pred = xgb_model.predict(X_inference)

# Collect date, same-day open, actual next open and predicted next open in one frame.
# The price column goes back to its short name 'open' for display clarity.
results = (
    inference_data[['date', 'opening_prices_open', 'target_open']]
    .assign(predicted_open=y_pred)
    .rename(columns={'opening_prices_open': 'open'})
)

print(f"Predictions made for {len(results)} dates")
print("\nLast 10 predictions:", results.tail(10), sep="\n")

Predictions made for 1137 dates

Last 10 predictions:
                          date        open  target_open  predicted_open
985  2024-11-19 00:00:00+00:00  225.958047   227.033178      222.754395
448  2024-11-20 00:00:00+00:00  227.033178   227.849499      231.013245
336  2024-11-22 00:00:00+00:00  227.033189   230.417878      229.423584
1118 2024-11-25 00:00:00+00:00  230.417878   232.279472      231.898621
407  2024-11-26 00:00:00+00:00  232.279472   233.414318      231.898621
1088 2024-11-27 00:00:00+00:00  233.414318   233.752786      231.898621
0    2025-12-23 00:00:00+00:00  270.839996   272.339996      224.523178
1134 2025-12-30 00:00:00+00:00  272.809998   273.059998      224.150162
1135 2025-12-31 00:00:00+00:00  273.059998   272.049988      223.421127
1136 2026-01-02 00:00:00+00:00  272.049988   270.714996      224.267426


## Evaluate Performance

In [17]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np

# Compare the predictions with the known next-day opens.
# NOTE(review): these scores cover every row with a known target; if those rows overlap
# with the data the model was trained on, the numbers are optimistic -- confirm.
mse = mean_squared_error(y_actual, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_actual, y_pred)
r2 = r2_score(y_actual, y_pred)

# Emit the whole report in one call; the line content is unchanged
report_lines = [
    "=" * 50,
    "INFERENCE PERFORMANCE METRICS",
    "=" * 50,
    f"Mean Squared Error (MSE):  {mse:.4f}",
    f"Root Mean Squared Error (RMSE): {rmse:.4f}",
    f"Mean Absolute Error (MAE): {mae:.4f}",
    f"R² Score: {r2:.4f}",
    "=" * 50,
]
print("\n".join(report_lines))







INFERENCE PERFORMANCE METRICS
Mean Squared Error (MSE):  8.9943
Root Mean Squared Error (RMSE): 2.9991
Mean Absolute Error (MAE): 0.5617
R² Score: 0.9944


## Recent Predictions (Including Unvalidated)
Show predictions for the most recent dates, including today's prediction for tomorrow (which can't be validated yet).

In [18]:
# Check if there are rows with null target_open (recent dates we can predict but not validate)
unvalidated_data = merged[merged['target_open'].isna()].copy()

# Always (re)define recent_results. Without this, re-running the cell when nothing is
# unvalidated would leave a stale frame from an earlier execution in the kernel, and
# later cells that test for `recent_results` would store or export outdated predictions.
recent_results = pd.DataFrame(columns=['date', 'current_open', 'predicted_next_day_open'])

if len(unvalidated_data) > 0:
    print(f"Found {len(unvalidated_data)} recent dates without validation data (target_open is null)")
    print(f"These are dates: {unvalidated_data['date'].tolist()}")

    # Rename column to match model expectations
    unvalidated_data = unvalidated_data.rename(columns={'open': 'opening_prices_open'})

    # Make predictions for these dates
    X_recent = unvalidated_data[feature_cols]
    y_pred_recent = xgb_model.predict(X_recent)

    # Create results dataframe: date, same-day open, predicted next open
    recent_results = unvalidated_data[['date', 'opening_prices_open']].copy()
    recent_results['predicted_next_day_open'] = y_pred_recent
    recent_results = recent_results.rename(columns={'opening_prices_open': 'current_open'})

    print("\n" + "=" * 60)
    print("PREDICTIONS FOR RECENT DATES (Not Yet Validated)")
    print("=" * 60)
    print(recent_results.to_string(index=False))
    print("=" * 60)
    print("\nNote: These predictions are for the next trading day's opening price.")
    print("They cannot be validated yet because that trading day hasn't occurred.")
else:
    print("No recent unvalidated dates found. All data has known target_open values.")

Found 1 recent dates without validation data (target_open is null)
These are dates: [Timestamp('2026-01-05 00:00:00+0000', tz='Etc/UTC')]

PREDICTIONS FOR RECENT DATES (Not Yet Validated)
                     date  current_open  predicted_next_day_open
2026-01-05 00:00:00+00:00    270.714996               224.588852

Note: These predictions are for the next trading day's opening price.
They cannot be validated yet because that trading day hasn't occurred.


In [19]:
# Summarise the full timeline: predictions with known outcomes first,
# then the ones that cannot be checked yet.
validated_dates = results['date']
print("\nComplete Prediction Timeline:")
print(f"Validated predictions (with actual outcomes): {len(results)} dates")
print(f"  Date range: {validated_dates.min()} to {validated_dates.max()}")

if len(unvalidated_data):
    pending_dates = recent_results['date']
    latest_prediction = recent_results['predicted_next_day_open'].iloc[-1]

    print(f"\nUnvalidated predictions (future): {len(recent_results)} dates")
    print(f"  Date range: {pending_dates.min()} to {pending_dates.max()}")
    print(f"\n  Latest prediction: On {pending_dates.max().date()}, ")
    print(f"  predict next trading day open = ${latest_prediction:.2f}")


Complete Prediction Timeline:
Validated predictions (with actual outcomes): 1137 dates
  Date range: 2016-02-19 00:00:00+00:00 to 2026-01-02 00:00:00+00:00

Unvalidated predictions (future): 1 dates
  Date range: 2026-01-05 00:00:00+00:00 to 2026-01-05 00:00:00+00:00

  Latest prediction: On 2026-01-05, 
  predict next trading day open = $224.59


## Store Predictions
Save the predictions to a new Feature Group `opening_price_preds` for use in dashboards or external applications.

In [20]:
# ---------------------------------------------------------
# Store Predictions in Feature Store
# ---------------------------------------------------------

# 1. Select the predictions to store.
# We only need to insert new predictions (recent/unvalidated ones).
# The historical validated predictions are likely already in the Feature Store from previous runs.
# The frame is built directly instead of concatenating onto an empty DataFrame: concat
# with an empty frame is deprecated in recent pandas and can change column dtypes.
if 'recent_results' in locals() and not recent_results.empty:
    preds_to_insert = (
        recent_results[['date', 'current_open', 'predicted_next_day_open']]
        .rename(columns={'predicted_next_day_open': 'predicted_open'})
        .sort_values('date')
        .reset_index(drop=True)
    )
else:
    # Nothing new to store; keep an empty frame with the expected columns
    preds_to_insert = pd.DataFrame(columns=['date', 'current_open', 'predicted_open'])

# 2. Insert into Feature Store
if not preds_to_insert.empty:
    print(f"Preparing to store {len(preds_to_insert)} new predictions...")
    print(preds_to_insert)

    # Get or create the feature group
    preds_fg = fs.get_or_create_feature_group(
        name="opening_price_preds",
        version=1,
        description="Predictions for AAPL opening prices (Next Day Open)",
        primary_key=["date"],
        event_time="date"
    )

    # Insert data
    preds_fg.insert(preds_to_insert, wait=True)
    print("Successfully inserted predictions into 'opening_price_preds' feature group")
else:
    print("No new predictions found to store.")

Preparing to store 1 new predictions...
                       date  current_open  predicted_open
0 2026-01-05 00:00:00+00:00    270.714996      224.588852


Uploading Dataframe: 100.00% |██████████| Rows 1/1 | Elapsed Time: 00:00 | Remaining Time: 00:00


Launching job: opening_price_preds_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1267871/jobs/named/opening_price_preds_1_offline_fg_materialization/executions
2026-01-05 18:23:21,246 INFO: Waiting for execution to finish. Current state: INITIALIZING. Final status: UNDEFINED
2026-01-05 18:23:27,752 INFO: Waiting for execution to finish. Current state: RUNNING. Final status: UNDEFINED
2026-01-05 18:25:11,491 INFO: Waiting for execution to finish. Current state: AGGREGATING_LOGS. Final status: SUCCEEDED
2026-01-05 18:25:11,677 INFO: Waiting for log aggregation to finish.
2026-01-05 18:25:23,877 INFO: Execution finished successfully.
Successfully inserted predictions into 'opening_price_preds' feature group


In [21]:
# ---------------------------------------------------------
# Export Data for GitHub Pages Dashboard
# ---------------------------------------------------------
import json
import numpy as np

# Column layout shared by the validated and the unvalidated part of the export
export_cols = ['date', 'current_open', 'future_open', 'predicted_open', 'sentiment']

# 1. Prepare Validated Data (Past)
# `results` holds 'date', 'open', 'target_open' and 'predicted_open' for every row with a
# known target; the sentiment score is joined in from inference_data by date.
web_data_past = (
    results
    .merge(inference_data[['date', 'sentiment_polarity']], on='date', how='left')
    .rename(columns={
        'open': 'current_open',        # Price on 'date'
        'target_open': 'future_open',  # Next open after 'date' (the target)
        'sentiment_polarity': 'sentiment',
    })
)[export_cols]

# 2. Prepare Recent Data (Future/Unvalidated)
if 'recent_results' in locals() and not recent_results.empty:
    # recent_results columns: date, current_open, predicted_next_day_open
    web_data_future = (
        recent_results
        .merge(unvalidated_data[['date', 'sentiment_polarity']], on='date', how='left')
        .rename(columns={
            'predicted_next_day_open': 'predicted_open',
            'sentiment_polarity': 'sentiment',
        })
        # We don't know the future yet. NaN (not None) keeps the column numeric so the
        # concat below does not fall back to object dtype.
        .assign(future_open=np.nan)
    )[export_cols]

    # Combine
    web_df = pd.concat([web_data_past, web_data_future], ignore_index=True)
else:
    # Copy so the date formatting below does not modify web_data_past through an alias
    web_df = web_data_past.copy()

# 3. Sort chronologically, then format the date as an ISO day string
web_df = web_df.sort_values('date')
web_df['date'] = web_df['date'].dt.strftime('%Y-%m-%d')

# 4. Save to JSON
output_path = '../docs/predictions.json'
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Replace NaN with None (which becomes null in JSON). Casting to object first and
# masking with where() avoids the version-dependent behaviour of replace(np.nan, None).
web_df = web_df.astype(object).where(web_df.notna(), None)

json_data = web_df.to_dict(orient='records')

with open(output_path, 'w', encoding='utf-8') as f:
    # allow_nan=False: fail loudly instead of writing NaN/Infinity, which are not valid JSON
    json.dump(json_data, f, indent=2, allow_nan=False)

print(f"Successfully exported {len(json_data)} records to {output_path}")
print("Updated JSON structure to include 'current_open' (today's price) and 'future_open' (target).")


Successfully exported 1138 records to ../docs/predictions.json
Updated JSON structure to include 'current_open' (today's price) and 'future_open' (target).
