In [15]:
# Import required libraries
import os
import sys

from pathlib import Path
from datetime import datetime

sys.path.insert(0, str(Path.cwd().parent / 'utils'))

from data_loader import StockDataLoader  # type: ignore
from indicators import TechnicalIndicators  # type: ignore

print("✓ Libraries imported successfully")
print(f"Execution time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

✓ Libraries imported successfully
Execution time: 2025-12-04 17:15:29


## Step 1: Load Data from Bronze/Silver Layer

In [16]:
# Load data
config_dir = str(Path.cwd().parent / 'config')
loader = StockDataLoader(config_path=os.path.join(config_dir, 'config.json'))
LAKEHOUSE_PATH = str(Path.cwd().parent / 'data')

df = loader.read_from_bronze(LAKEHOUSE_PATH, ticker='MSFT')

print(f"Loaded {len(df):,} records")
print(f"Date range: {df['date'].min()} to {df['date'].max()}")
df.head()

Reading data from Bronze layer: bronze/stocks
✓ Read 1489 records from Bronze layer
Loaded 1,489 records
Date range: 2020-01-02 00:00:00-05:00 to 2025-12-03 00:00:00-05:00


Unnamed: 0,date,open,high,low,close,volume,dividends,stock_splits,Ticker,FetchTimestamp,year,month
0,2020-01-02 00:00:00-05:00,150.758649,152.610135,150.331386,152.505692,22622100,0.0,0.0,MSFT,2025-12-04 11:43:51.620688,2020,1
1,2020-01-03 00:00:00-05:00,150.321918,151.869563,150.075043,150.60675,21116200,0.0,0.0,MSFT,2025-12-04 11:43:51.620688,2020,1
2,2020-01-06 00:00:00-05:00,149.144487,151.062442,148.603275,150.995972,20813700,0.0,0.0,MSFT,2025-12-04 11:43:51.620688,2020,1
3,2020-01-07 00:00:00-05:00,151.271396,151.603706,149.372433,149.619293,21634100,0.0,0.0,MSFT,2025-12-04 11:43:51.620688,2020,1
4,2020-01-08 00:00:00-05:00,150.901085,152.676625,149.970597,152.002487,27746500,0.0,0.0,MSFT,2025-12-04 11:43:51.620688,2020,1


## Step 2: Calculate Technical Indicators

Calculate comprehensive set of indicators:
- **Trend**: SMA, EMA, ADX, Aroon
- **Momentum**: RSI, MACD, Stochastic, ROC
- **Volatility**: Bollinger Bands, ATR
- **Volume**: OBV, CMF, Volume Ratio

In [17]:
# Initialize technical indicators calculator
ti = TechnicalIndicators()

# Calculate all indicators
df_indicators = ti.calculate_all_indicators(df)

print(f"\n✓ Added {len(ti.indicator_columns)} indicator columns")
print(f"Total columns: {len(df_indicators.columns)}")
print("\nNew indicator columns:")
print(ti.indicator_columns[:20])  # Show first 20

# Display latest values
print("\nLatest indicator values:")
df_indicators[['date', 'close', 'RSI_14', 'MACD', 'BB_upper', 'BB_lower', 'Volume_ratio']].tail()

Calculating technical indicators...
✓ Calculated 69 technical indicators

✓ Added 69 indicator columns
Total columns: 81

New indicator columns:
['SMA_20', 'SMA_50', 'SMA_200', 'EMA_12', 'EMA_26', 'EMA_50', 'SMA_20_50_cross', 'golden_cross', 'death_cross', 'RSI_14', 'MACD', 'MACD_signal', 'MACD_hist', 'MACD_cross_bullish', 'MACD_cross_bearish', 'STOCH_k', 'STOCH_d', 'ROC_10', 'WILLR_14', 'BB_lower']

Latest indicator values:


Unnamed: 0,date,close,RSI_14,MACD,BB_upper,BB_lower,Volume_ratio
1484,2025-11-26 00:00:00-05:00,485.5,40.371878,-9.803868,529.942193,467.711341,0.951971
1485,2025-11-28 00:00:00-05:00,492.01001,45.338489,-8.842521,525.908917,468.467835,0.560353
1486,2025-12-01 00:00:00-05:00,486.73999,42.269131,-8.408961,523.196684,468.169802,0.952001
1487,2025-12-02 00:00:00-05:00,490.0,44.76041,-7.71339,520.248735,468.511336,0.781519
1488,2025-12-03 00:00:00-05:00,477.730011,38.096759,-8.059328,517.862063,467.334088,1.34424


## Step 3: Generate Trading Signals

In [18]:
# Generate composite trading signals
df_signals = ti.generate_trading_signals(df_indicators)

# Display signal summary
print("Signal distribution:")
print(df_signals['final_signal'].value_counts())

# Show recent signals
print("\nRecent trading signals:")
df_signals[['date', 'close', 'signal_RSI', 'signal_MACD', 'signal_BB', 'composite_signal', 'final_signal']].tail(10)

✓ Generated trading signals
Signal distribution:
final_signal
 0    1153
-1     190
 1     146
Name: count, dtype: int64

Recent trading signals:


Unnamed: 0,date,close,signal_RSI,signal_MACD,signal_BB,composite_signal,final_signal
1479,2025-11-19 00:00:00-05:00,486.209991,0,0,0,0.0,0
1480,2025-11-20 00:00:00-05:00,478.429993,0,0,0,0.0,0
1481,2025-11-21 00:00:00-05:00,472.119995,1,0,0,0.2,0
1482,2025-11-24 00:00:00-05:00,474.0,0,0,0,0.0,0
1483,2025-11-25 00:00:00-05:00,476.98999,0,0,0,0.0,0
1484,2025-11-26 00:00:00-05:00,485.5,0,0,0,-0.2,0
1485,2025-11-28 00:00:00-05:00,492.01001,0,0,0,-0.2,0
1486,2025-12-01 00:00:00-05:00,486.73999,0,0,0,-0.2,0
1487,2025-12-02 00:00:00-05:00,490.0,0,1,0,0.0,0
1488,2025-12-03 00:00:00-05:00,477.730011,0,-1,0,-0.4,-1


## Step 4: Target Label Creation

**UPGRADED Configuration:**
- `horizon=10`: Predict 10 days ahead (instead of 1 day) for less noisy, more reliable signals
- `threshold_buy=0.05`: Price needs to increase 5%+ within 10 days for BUY signal
- `threshold_sell=-0.05`: Price needs to decrease 5%+ within 10 days for SELL signal
- Remaining cases: HOLD

**Why 5% thresholds?** Higher thresholds capture stronger, more actionable trends while filtering out weak signals. This should improve prediction accuracy by focusing on meaningful price movements.

In [19]:
# Create target labels
# Using 10-day horizon with 5% thresholds for stronger signals
df_final = ti.create_target_labels(
    df_signals,
    threshold_buy=0.05,   # 5% gain threshold for 5-day horizon (stronger signals)
    threshold_sell=-0.05, # 5% loss threshold for 5-day horizon (stronger signals)
    horizon=10             # 10-day ahead prediction (less noise, more signal)
)

# Remove rows with NaN targets (last few rows)
df_clean = df_final.dropna(subset=['target'])

print("\n✓ Created target labels (10-day horizon)")
print(f"Valid records: {len(df_clean):,}")
print("\nClass distribution:")
print(df_clean['target'].value_counts())
print("\nClass percentages:")
print(df_clean['target'].value_counts(normalize=True) * 100)

✓ Created target labels (Buy: 1, Hold: 0, Sell: -1)
  Valid rows: 1489
  Label distribution:
    Buy (1):  295 (19.8%)
    Hold (0): 1017 (68.3%)
    Sell (-1): 177 (11.9%)

✓ Created target labels (10-day horizon)
Valid records: 1,489

Class distribution:
target
 0    1017
 1     295
-1     177
Name: count, dtype: int64

Class percentages:
target
 0    68.300873
 1    19.811954
-1    11.887173
Name: proportion, dtype: float64


## Step 5: Feature Summary Statistics

In [20]:
# Get current indicator summary
indicator_summary = ti.get_indicator_summary(df_clean)

print("\nCurrent Market Indicators (Latest Data):")
print("=" * 60)
for key, value in indicator_summary.items():
    if isinstance(value, float):
        print(f"{key:20s}: {value:.2f}")
    else:
        print(f"{key:20s}: {value}")
print("=" * 60)


Current Market Indicators (Latest Data):
RSI                 : 38.10
MACD                : -8.06
MACD_signal         : -7.82
BB_position         : Middle
Volume_ratio        : 1.34
ADX                 : 26.62
composite_signal    : -0.40
final_signal        : SELL


## Step 6: Store Enriched Data in Gold Layer

In [21]:
# Save to Gold layer
import os

gold_path = os.path.join(LAKEHOUSE_PATH, 'gold/stocks')
os.makedirs(gold_path, exist_ok=True)

# Save as parquet (in Fabric, use Delta table)
output_file = os.path.join(gold_path, 'msft_features.parquet')
df_clean.to_parquet(output_file, engine='pyarrow', compression='snappy', index=False)

print(f"\n{'='*60}")
print("FEATURE ENGINEERING COMPLETE")
print(f"{'='*60}")
print(f"Records processed: {len(df_clean):,}")
print(f"Features created: {len(ti.indicator_columns)}")
print(f"Storage location: {Path(output_file).name}")
print("Ready for model training")
print(f"{'='*60}")


FEATURE ENGINEERING COMPLETE
Records processed: 1,489
Features created: 69
Storage location: msft_features.parquet
Ready for model training


## Next Steps

1. **Run Notebook 04**: Model training with XGBoost/Random Forest
2. **Feature Selection**: Consider feature importance analysis
3. **Monitoring**: Track feature distributions over time

---
**Note**: Gold layer data is ready for machine learning model training and prediction.