In [11]:
import numpy as np
import pandas as pd
import os
import time
from sklearn.preprocessing import StandardScaler
from keras.layers import Dense, Activation, Dropout, LSTM
from keras.models import Sequential
import warnings
warnings.filterwarnings("ignore")
import lstm as lstm
import matplotlib.pyplot as plt


Configs: 

In [12]:
integrated_file = r'data\input\media_integrated\HLF_processed.csv'
company = 'HLF'
seq_len = 50
epochs = 20

Process Data:

In [13]:
pipeline = lstm.EnhancedLSTMPipeline(
    integrated_file=integrated_file,
    seq_len=seq_len,
    company=company
)

Training:

In [14]:
model_baseline, history_baseline = pipeline.train_and_predict(
            use_gdelt=False, 
            epochs=epochs
        )


Loading data - BASELINE (volume only)
Total observations: 1221
Date range: 2012-11-21 00:00:00 to 2017-09-27 00:00:00
Using 7 features: ['Volume', 'Returns', 'Volume_Change', 'MA_5', 'MA_20', 'Volatility_5', 'Price_Range']
After removing NaN: 1203 samples

Data split:
  Training: 806 samples
  Testing: 230 samples
  Validation: 116 samples

Building LSTM model...

Training for 20 epochs...
Epoch 1/20
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 499ms/step - loss: 3.3385 - val_loss: 1.6616
Epoch 2/20
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 247ms/step - loss: 2.8051 - val_loss: 1.5597
Epoch 3/20
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 244ms/step - loss: 2.9804 - val_loss: 1.5539
Epoch 4/20
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 269ms/step - loss: 2.3992 - val_loss: 1.5432
Epoch 5/20
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 246ms/step - loss: 2.8555 - val_loss: 1.5268
Epoch 6/20
[1m2

In [15]:
model_enhanced, history_enhanced = pipeline.train_and_predict(
            use_gdelt=True, 
            epochs=epochs
        )


Loading data - ENHANCED (with GDELT)
Total observations: 1221
Date range: 2012-11-21 00:00:00 to 2017-09-27 00:00:00
Using 15 features: ['Volume', 'Returns', 'Volume_Change', 'MA_5', 'MA_20', 'Volatility_5', 'Price_Range', 'ArticleCount', 'Tone', 'Polarity', 'ArticleCount_MA_7', 'Tone_MA_7', 'High_Coverage', 'Negative_Tone', 'Very_Negative_Tone']
After removing NaN: 1203 samples

Data split:
  Training: 806 samples
  Testing: 230 samples
  Validation: 116 samples

Building LSTM model...

Training for 20 epochs...
Epoch 1/20
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 490ms/step - loss: 3.6033 - val_loss: 1.7486
Epoch 2/20
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 231ms/step - loss: 3.1803 - val_loss: 1.5813
Epoch 3/20
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 239ms/step - loss: 2.8494 - val_loss: 1.5451
Epoch 4/20
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 241ms/step - loss: 2.4624 - val_loss: 2.2576
Epoch 5/

Comparison between companies: 

In [16]:
methods = {
            'window': 'Window-Based Forecasting',
            'point': 'Day-Ahead Forecasting',
            'sequence': 'Historical-Based Forecasting'
        }


In [17]:
metrics = {}
results = []
for model_type in ['baseline', 'enhanced']:
    print(f"\n=== Results for {model_type} model ===")
    for method, method_name in methods.items():
                print(f"\n{method_name}:")
            
                pred_file = f'data/output/{company}_full_{method}_pred_{model_type}.csv'
                actual_file = f'data/output/{company}_full_{method}_act.csv'
                predictions = pd.read_csv(pred_file, header=None).values.flatten()
                actuals = pd.read_csv(actual_file, header=None).values.flatten()
            
                    
                metrics[model_type] = pipeline.calculate_metrics(predictions, actuals)
                print(f"  R² Score:  {metrics[model_type]['R2']:.4f}")
                print(f"  RMSE:      {metrics[model_type]['RMSE']:.6f}")
                print(f"  MAE:       {metrics[model_type]['MAE']:.6f}")
                print(f"  MAPE:      {metrics[model_type]['MAPE']:.2f}%")
                
                print(f"\nEnhanced Model:")
                print(f"  R² Score:  {metrics[model_type]['R2']:.4f}")
                print(f"  MAE:       {metrics[model_type]['MAE']:.6f}")
                print(f"  RMSE:      {metrics[model_type]['RMSE']:.6f}")
                print(f"  MAPE:      {metrics[model_type]['MAPE']:.2f}%")  
                model_results = {
                    'Company': company,
                    'Model': model_type,
                    'Method': method,
                    f'{model_type}_predicted': predictions,
                    'Actual': actuals,
                    'method_name': method_name,
                    f'{model_type}_r2': metrics[model_type]['R2'],
                    f'{model_type}_rmse': metrics[model_type]['RMSE'],
                    f'{method_name}_mae': metrics[model_type]['MAE'],    
                }
                results.append(model_results)     
    



=== Results for baseline model ===

Window-Based Forecasting:
  R² Score:  -0.2147
  RMSE:      3.505151
  MAE:       1.483734
  MAPE:      859.08%

Enhanced Model:
  R² Score:  -0.2147
  MAE:       1.483734
  RMSE:      3.505151
  MAPE:      859.08%

Day-Ahead Forecasting:
  R² Score:  0.0588
  RMSE:      3.085417
  MAE:       0.905837
  MAPE:      624.54%

Enhanced Model:
  R² Score:  0.0588
  MAE:       0.905837
  RMSE:      3.085417
  MAPE:      624.54%

Historical-Based Forecasting:
  R² Score:  -0.2147
  RMSE:      3.505151
  MAE:       1.483734
  MAPE:      859.08%

Enhanced Model:
  R² Score:  -0.2147
  MAE:       1.483734
  RMSE:      3.505151
  MAPE:      859.08%

=== Results for enhanced model ===

Window-Based Forecasting:
  R² Score:  -0.0207
  RMSE:      3.213070
  MAE:       0.937921
  MAPE:      128.73%

Enhanced Model:
  R² Score:  -0.0207
  MAE:       0.937921
  RMSE:      3.213070
  MAPE:      128.73%

Day-Ahead Forecasting:
  R² Score:  0.0501
  RMSE:      3.099669

In [18]:
r2_improvement = ((metrics['baseline']['R2'] - metrics['enhanced']['R2']) / 
                             abs(metrics['baseline']['R2']) * 100) if metrics['baseline'] != 0 else 0
rmse_improvement = ((metrics['baseline']['RMSE'] - metrics['enhanced']['RMSE']) / 
                    metrics['baseline']['RMSE'] * 100)
print(f"\nR² Improvement from Baseline to Enhanced: {r2_improvement:.2f}%")
print(f"RMSE Improvement from Baseline to Enhanced: {rmse_improvement:.2f}%")    



R² Improvement from Baseline to Enhanced: -90.37%
RMSE Improvement from Baseline to Enhanced: 8.33%


In [19]:
print(len(results))

6


In [20]:
fig = plt.figure(figsize=(16, 12))
gs = fig.add_gridspec(3, 3, hspace=0.3, wspace=0.3)

# Extract per-method baseline/enhanced entries
baseline_results = [r for r in results if r['Model'] == 'baseline']
enhanced_results = [r for r in results if r['Model'] == 'enhanced']

# 1. R² Comparison -------------------------------------------------------------
ax1 = fig.add_subplot(gs[0, :2])
x = np.arange(len(methods))
width = 0.35

baseline_r2 = [r['baseline_r2'] for r in baseline_results]
enhanced_r2 = [r['enhanced_r2'] for r in enhanced_results]

ax1.bar(x - width/2, baseline_r2, width, label='Baseline', alpha=0.8, color='steelblue')
ax1.bar(x + width/2, enhanced_r2, width, label='Enhanced', alpha=0.8, color='coral')

ax1.set_ylabel('R² Score', fontsize=12)
ax1.set_title('R² Score Comparison by Method', fontsize=14, fontweight='bold')
ax1.set_xticks(x)
ax1.set_xticklabels(methods, rotation=15, ha='right')
ax1.legend()
ax1.grid(True, alpha=0.3, axis='y')
ax1.axhline(y=0, color='black', linestyle='-', linewidth=0.5)

# 2. RMSE Comparison -----------------------------------------------------------
ax2 = fig.add_subplot(gs[0, 2])

baseline_rmse = [r['baseline_rmse'] for r in baseline_results]
enhanced_rmse = [r['enhanced_rmse'] for r in enhanced_results]

ax2.bar(x - width/2, baseline_rmse, width, label='Baseline', alpha=0.8, color='steelblue')
ax2.bar(x + width/2, enhanced_rmse, width, label='Enhanced', alpha=0.8, color='coral')

ax2.set_ylabel('RMSE', fontsize=12)
ax2.set_title('RMSE Comparison', fontsize=14, fontweight='bold')
ax2.set_xticks(x)
ax2.set_xticklabels(['Window', 'Day', 'Historical'], rotation=15, ha='right')
ax2.legend()
ax2.grid(True, alpha=0.3, axis='y')

# 3–5. Prediction Plots per method --------------------------------------------
for idx, method in enumerate(methods):
    ax = fig.add_subplot(gs[1 + idx // 3, idx % 3])

    baseline_entry = baseline_results[idx]
    enhanced_entry = enhanced_results[idx]

    actuals = baseline_entry['Actual'][:200]
    pred_baseline = baseline_entry['baseline_predicted'][:200]
    pred_enhanced = enhanced_entry['enhanced_predicted'][:200]

    ax.plot(actuals, label='Actual', linewidth=2, alpha=0.7, color='black')
    ax.plot(pred_baseline, label='Baseline', linewidth=1.5, alpha=0.7, color='steelblue')
    ax.plot(pred_enhanced, label='Enhanced', linewidth=1.5, alpha=0.7, color='coral')

    ax.set_xlabel('Time Step', fontsize=10)
    ax.set_ylabel('Normalized Volume', fontsize=10)

    ax.set_title(
        f"{baseline_entry['method_name']}\n"
        f"Base R²: {baseline_entry['baseline_r2']:.3f} | "
        f"Enh R²: {enhanced_entry['enhanced_r2']:.3f}",
        fontsize=11, fontweight='bold'
    )

    ax.legend(fontsize=9)
    ax.grid(True, alpha=0.3)

plt.suptitle('Baseline vs Enhanced Model Comparison', fontsize=16, fontweight='bold', y=0.995)
output_file = f'{company}_output.png'
plt.savefig(output_file, dpi=300, bbox_inches='tight')
print(f"✓ Saved: {output_file}")
plt.close()

✓ Saved: HLF_output.png
