In [57]:
import hopsworks
import pandas as pd
import yfinance as yf
import matplotlib.pyplot as plt
import xgboost 

In [58]:
import hsfs

# 1. Login
project = hopsworks.login()

# 2. Get the Feature Store (This triggers the metadata check)
try:
    fs = project.get_feature_store("A1ID2223")
except Exception as e:
    # Report the failure, then re-raise: every later cell needs `fs`, so
    # carrying on would only postpone the problem to a confusing NameError.
    print(f"Feature Store Connection Error: {e}")
    raise
else:
    print(f"Successfully connected to Feature Store: {fs.name}")

# 3. Check versions
print(f"HSFS Version: {hsfs.__version__}")

2025-12-24 17:53:45,219 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-12-24 17:53:45,221 INFO: Initializing external client
2025-12-24 17:53:45,221 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-12-24 17:53:46,828 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1267871
Successfully connected to Feature Store: a1id2223_featurestore
HSFS Version: 4.2.10


In [None]:
# Handles to the two feature groups that feed the model.
sentiment_feature_group = fs.get_feature_group(name="sentiments")
opening_prices_feature_group = fs.get_feature_group(name="opening_prices", version=2)

# Join every sentiment feature with every opening-price feature on "date".
# No filter is applied at query level so the raw join can be inspected first.
sentiment_opening_price_view = sentiment_feature_group.select_all().join(
    opening_prices_feature_group.select_all(),
    on=["date"],
)

# Read the join, drop the "opening_prices_" prefix from the two price
# columns, and keep only the rows that have a non-null target_open.
price_column_aliases = {
    'opening_prices_open': 'open',
    'opening_prices_target_open': 'target_open',
}
df = (
    sentiment_opening_price_view.read()
    .rename(columns=price_column_aliases)
    .loc[lambda frame: frame['target_open'].notna()]
)

print(f"Rows after filtering: {len(df)}")


Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.29s) 
Rows after filtering: 1131


In [60]:
# Show the five earliest rows by date, then the frame's dtype / non-null summary.
print(df.sort_values(by="date").head(5))
df.info()

                          date  sentiment_polarity  sentiment_neg  \
901  2016-02-19 00:00:00+00:00               0.994          0.023   
1385 2017-10-05 00:00:00+00:00               0.997          0.008   
1199 2017-11-27 00:00:00+00:00               0.997          0.008   
804  2017-11-30 00:00:00+00:00               0.989          0.021   
1295 2018-01-31 00:00:00+00:00               0.995          0.009   

      sentiment_neu  sentiment_pos       open  target_open  
901           0.869          0.108  21.762474    21.832745  
1385          0.925          0.067  35.978361    36.162712  
1199          0.926          0.065  40.995288    40.819637  
804           0.804          0.174  39.913309    39.800916  
1295          0.937          0.054  39.079596    39.149848  
<class 'pandas.core.frame.DataFrame'>
Index: 1131 entries, 0 to 1542
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype                  
---  ------              --------------  -----       

In [61]:
# Feature view (version 1) built from the joined query, with target_open
# declared as the label column.
# NOTE(review): get_or_create_* presumably returns the already-registered view
# when this name/version exists, in which case `query` may be ignored — confirm.
feature_view = fs.get_or_create_feature_view(
    name="sentiment_and_opening_price_view",
    version=1,
    description="sentiment + opening price view",
    labels=["target_open"],
    query=sentiment_opening_price_view,
)

In [67]:
def _drop_rows_with_missing_target(features, targets):
    """Keep only the rows whose target value(s) are present.

    Args:
        features: Feature frame sharing its row index with ``targets``.
        targets: Label frame (one or more columns) or a label series.

    Returns:
        Tuple ``(features, targets)`` restricted to rows with no NaN target.
    """
    has_target = targets.notna()
    if has_target.ndim > 1:
        # Reduce across columns so the mask is always a 1-D boolean Series.
        # `.squeeze()` would collapse a single-row frame to a scalar and would
        # leave a multi-column frame 2-D, breaking row selection in both cases.
        has_target = has_target.all(axis=1)
    return features[has_target], targets[has_target]


# Time-based split: rows from 2024-01-01 onwards form the test set.
X_train, X_test, y_train, y_test = feature_view.train_test_split(
    test_start="2024-01-01"
)

# Drop date column - it's for temporal ordering, not a feature
X_train = X_train.drop(columns=['date'], errors='ignore')
X_test = X_test.drop(columns=['date'], errors='ignore')

# Remove rows with NaN targets (last day from backfill, today from daily pipeline)
X_train, y_train = _drop_rows_with_missing_target(X_train, y_train)
X_test, y_test = _drop_rows_with_missing_target(X_test, y_test)

print(f"Filtered train: {len(X_train)} rows, test: {len(X_test)} rows")

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.68s) 

2025-12-24 17:55:22,534 INFO: Provenance cached data - overwriting last accessed/created training dataset from 7 to 8.
Filtered train: 905 rows, test: 226 rows


In [68]:
# Debug: confirm the label container's type and shape, count NaNs per column,
# and preview the first rows.
print(f"y_train type: {type(y_train)}")
print(f"y_train shape: {y_train.shape}")
print(f"NaN count in y_train: {y_train.isna().sum()}")
print("\ny_train head:")
print(y_train.head())

y_train type: <class 'pandas.core.frame.DataFrame'>
y_train shape: (905, 1)
NaN count in y_train: opening_prices_target_open    0
dtype: int64

y_train head:
   opening_prices_target_open
0                   52.729787
1                   41.576759
2                   48.915226
3                   45.511297
4                   44.696488


In [63]:
# Validate train/test split
print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")
print(f"\nFeatures: {X_train.columns.tolist()}")

# The 'date' column was dropped from X_train/X_test, so the index holds row
# labels, not dates. Label the output as an index range to avoid implying a
# temporal check that is not being performed.
print(f"\nTrain index range: {X_train.index.min()} to {X_train.index.max()}")
print(f"Test index range: {X_test.index.min()} to {X_test.index.max()}")

# Take the label column explicitly before averaging: float() on the
# one-element Series returned by DataFrame.mean() is deprecated in pandas.
train_target_mean = float(y_train.iloc[:, 0].mean())
test_target_mean = float(y_test.iloc[:, 0].mean())
print(f"\nTarget stats - Train mean: {train_target_mean:.2f}, Test mean: {test_target_mean:.2f}")

Train shape: (1244, 5), Test shape: (332, 5)

Features: ['sentiment_polarity', 'sentiment_neg', 'sentiment_neu', 'sentiment_pos', 'opening_prices_open']

Train date range: 0 to 1568
Test date range: 64 to 1575


Target stats - Train mean: 142.15, Test mean: 201.38


In [69]:
# Fit an XGBoost regressor on the training split. Only random_state is set
# (fixed seed for repeatable runs); no other hyperparameters are passed.
xgb_regressor = xgboost.XGBRegressor(random_state=42)
xgb_regressor.fit(X_train, y_train)

In [70]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import mean_absolute_error
import numpy as np

# Score the model on the test split. The first column of y_test is pulled out
# once as a 1-D series and reused by every metric.
y_true = y_test.iloc[:, 0]
y_pred = xgb_regressor.predict(X_test)

mse = mean_squared_error(y_true, y_pred)
rmse = np.sqrt(mse)  # root of the MSE, i.e. error in the target's own units
mae = mean_absolute_error(y_true, y_pred)
r2 = r2_score(y_true, y_pred)

print(f"MSE:  {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MAE:  {mae:.4f}")
print(f"R²:   {r2:.4f}")







MSE:  479.7301
RMSE: 21.9027
MAE:  16.2354
R²:   0.0844


In [73]:
import os

# Local directory that holds the serialized model before it is uploaded.
model_dir = "sentiment_stock_price_model_AAPL"

# exist_ok=True keeps the cell safe to re-run and avoids the check-then-create
# race of os.path.exists() followed by os.mkdir().
os.makedirs(model_dir, exist_ok=True)

In [74]:
# Write the fitted regressor as JSON inside the model directory.
model_path = f"{model_dir}/model.json"
xgb_regressor.save_model(model_path)
print(f"Model saved to {model_dir}")

Model saved to sentiment_stock_price_model_AAPL


In [75]:
# Evaluation metrics keyed by name, each cast to a built-in float
# (presumably so the registry can serialize them — confirm).
res_dict = {
    metric_name: float(metric_value)
    for metric_name, metric_value in (
        ("MSE", mse),
        ("RMSE", rmse),
        ("MAE", mae),
        ("R2", r2),
    )
}

In [76]:
# Handle to the project's model registry.
mr = project.get_model_registry()

# The registry entry reuses the local model directory's name.
model_name = model_dir
model_description = "Opening price predictor for AAPL stock based on sentiment"

# Create the registry entry with its metrics and originating feature view,
# then hand the local model directory to save().
aq_model = mr.python.create_model(
    name=model_name,
    description=model_description,
    metrics=res_dict,
    feature_view=feature_view,
)
aq_model.save(model_dir)

print(f"Model '{model_name}' uploaded to registry")

  0%|          | 0/6 [00:00<?, ?it/s]

Uploading /Users/sambarati/Documents/GitHub/nlp-stock-prediction/notebooks/sentiment_stock_price_model_AAPL/mo…

Uploading /Users/sambarati/Documents/GitHub/nlp-stock-prediction/notebooks/model_schema.json: 0.000%|         …

Model created, explore it at https://c.app.hopsworks.ai:443/p/1267871/models/sentiment_stock_price_model_AAPL/1
Model 'sentiment_stock_price_model_AAPL' uploaded to registry


In [77]:
# Check the actual schema that was saved
print("Input schema saved to model registry:")
print(f"Features used for training: {X_train.columns.tolist()}")
print(f"\nFeature count: {len(X_train.columns)}")
print("\nNote: The schema includes 'date' but it was dropped during training!")
print("This may cause issues during inference.")

Input schema saved to model registry:
Features used for training: ['sentiment_polarity', 'sentiment_neg', 'sentiment_neu', 'sentiment_pos', 'opening_prices_open']

Feature count: 5

Note: The schema includes 'date' but it was dropped during training!
This may cause issues during inference.
