In [2]:
import numpy as np
import pandas as pd
import os
import time
from sklearn.preprocessing import StandardScaler
from keras.layers import Dense, Activation, Dropout, LSTM
from keras.models import Sequential
import warnings
warnings.filterwarnings("ignore")
import lstm as lstm
import matplotlib.pyplot as plt


Configs: 

In [3]:
# Notebook-wide settings. Tune values here rather than inside later cells.

# Integrated media + stock feature CSV (rows for every ticker, distinguished
# by a numeric 'Ticker' column). Assembled with os.path.join so the path
# resolves on any OS; on Windows this yields the same backslash-separated
# string that was previously hard-coded.
integrated_file = os.path.join('data', 'input', 'media_integrated', 'media_stock_features.csv')

# Ticker symbol handed to the single-company pipeline built in the next cell.
company = 'HLF'

# Forwarded to EnhancedLSTMPipeline as `seq_len`.
# NOTE(review): presumably the look-back window length in time steps — confirm in lstm.py.
seq_len = 50

# Number of training epochs intended for each model.
epochs = 20

Process Data:

In [4]:
# Build the pipeline for the single company chosen in the config cell.
# NOTE(review): the per-ticker training loops below rebind `pipeline`, so this
# instance is only in effect until one of those cells runs.
pipeline_settings = {
    'integrated_file': integrated_file,
    'seq_len': seq_len,
    'company': company,
}
pipeline = lstm.EnhancedLSTMPipeline(**pipeline_settings)

Training:

In [15]:
# --- Baseline training: one model per ticker, trained with use_gdelt=False ---
# (the run log labels this configuration "BASELINE (volume only)").

# Numeric code stored in the CSV's 'Ticker' column -> ticker symbol.
tickers = {1: "AMSC", 2: "BP", 3: "EVR", 4: "GOOGL", 5: "HLF", 6: "MDRX", 7: "ORCL", 8: "WFC"}

# Read the integrated feature table once; the path comes from the config cell
# so that changing it there actually takes effect here.
full_df = pd.read_csv(integrated_file)

# Per-ticker results of the baseline run, keyed by ticker symbol.
# Only the baseline containers are (re)initialised here: the enhanced
# containers belong to the enhanced training cell, so re-running this cell
# no longer wipes enhanced results (and the duplicated
# `histories_baseline = {}` assignment is gone).
models_baseline = {}
histories_baseline = {}

# The pipeline takes a file path, so each ticker's rows are written to a
# temporary CSV first.
os.makedirs('data/temp', exist_ok=True)

for ticker_code, ticker_name in tickers.items():
    print(f"\n{'='*70}")
    print(f"Training model for {ticker_name} (code: {ticker_code})")
    print(f"{'='*70}")

    # Keep only the rows belonging to this ticker.
    ticker_data = full_df[full_df['Ticker'] == ticker_code].copy()

    if len(ticker_data) == 0:
        # Nothing to train on; leave this ticker out of the result dicts.
        print(f"WARNING: No data found for ticker {ticker_name} (code {ticker_code})")
        continue

    print(f"Found {len(ticker_data)} records for {ticker_name}")

    temp_file = f'data/temp/{ticker_name}_processed.csv'
    ticker_data.to_csv(temp_file, index=False)

    # NOTE(review): the company name is lower-cased here, while the evaluation
    # cell reads 'data/output/{TICKER}_full_...' with the upper-case symbol —
    # confirm how the pipeline names its output files (matters on
    # case-sensitive filesystems).
    pipeline = lstm.EnhancedLSTMPipeline(
        integrated_file=temp_file,
        seq_len=seq_len,
        company=ticker_name.lower()
    )

    model_baseline, history_baseline = pipeline.train_and_predict(
        use_gdelt=False,
        epochs=epochs
    )

    models_baseline[ticker_name] = model_baseline
    histories_baseline[ticker_name] = history_baseline


Training model for AMSC (code: 1)
Found 2737 records for AMSC

Loading data - BASELINE (volume only)
Total observations: 2737
Date range: 2015-01-02 05:00:00 to 2025-11-18 05:00:00
Using 1 features: ['Volume']
After removing NaN: 2736 samples

Data split:
  Training: 1879 samples
  Testing: 537 samples
  Validation: 269 samples

Building LSTM model...

Training for 20 epochs...
Epoch 1/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 303ms/step - loss: 0.5836 - val_loss: 0.3417
Epoch 2/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 218ms/step - loss: 0.4115 - val_loss: 0.3455
Epoch 3/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 209ms/step - loss: 0.2778 - val_loss: 0.4701
Epoch 4/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 204ms/step - loss: 0.4000 - val_loss: 0.3478
Epoch 5/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 199ms/step - loss: 0.4445 - val_loss: 0.3478
Epoch 6/20
[1m4/4[0m [32m

In [16]:
# --- Enhanced training: one model per ticker, trained with use_gdelt=True ---
# (the run log labels this configuration "ENHANCED (with GDELT)").

# Numeric code stored in the CSV's 'Ticker' column -> ticker symbol.
tickers = {1: "AMSC", 2: "BP", 3: "EVR", 4: "GOOGL", 5: "HLF", 6: "MDRX", 7: "ORCL", 8: "WFC"}

# Read the integrated feature table; the path comes from the config cell.
full_df = pd.read_csv(integrated_file)

# Per-ticker results of the enhanced run, keyed by ticker symbol.
models_enhanced = {}
histories_enhanced = {}

# The pipeline takes a file path, so each ticker's rows are written to a
# temporary CSV first.
os.makedirs('data/temp', exist_ok=True)

for ticker_code, ticker_name in tickers.items():
    print(f"\n{'='*70}")
    print(f"Training model for {ticker_name} (code: {ticker_code})")
    print(f"{'='*70}")

    # Keep only the rows belonging to this ticker.
    ticker_data = full_df[full_df['Ticker'] == ticker_code].copy()

    if len(ticker_data) == 0:
        # Nothing to train on; leave this ticker out of the result dicts.
        print(f"WARNING: No data found for ticker {ticker_name} (code {ticker_code})")
        continue

    print(f"Found {len(ticker_data)} records for {ticker_name}")

    temp_file = f'data/temp/{ticker_name}_processed.csv'
    ticker_data.to_csv(temp_file, index=False)

    # NOTE(review): the company name is lower-cased here, while the evaluation
    # cell reads 'data/output/{TICKER}_full_...' with the upper-case symbol —
    # confirm how the pipeline names its output files.
    pipeline = lstm.EnhancedLSTMPipeline(
        integrated_file=temp_file,
        seq_len=seq_len,
        company=ticker_name.lower()
    )

    model_enhanced, history_enhanced = pipeline.train_and_predict(
        use_gdelt=True,
        epochs=epochs
    )

    # Store under the *enhanced* containers. Previously the history was
    # written into histories_baseline, which overwrote the baseline training
    # histories and left histories_enhanced empty.
    models_enhanced[ticker_name] = model_enhanced
    histories_enhanced[ticker_name] = history_enhanced


Training model for AMSC (code: 1)
Found 2737 records for AMSC

Loading data - ENHANCED (with GDELT)
Total observations: 2737
Date range: 2015-01-02 05:00:00 to 2025-11-18 05:00:00
Using 4 features: ['Volume', 'ArticleCount', 'Tone', 'Polarity']
After removing NaN: 2736 samples

Data split:
  Training: 1879 samples
  Testing: 537 samples
  Validation: 269 samples

Building LSTM model...

Training for 20 epochs...
Epoch 1/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 353ms/step - loss: 0.5888 - val_loss: 0.3482
Epoch 2/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 255ms/step - loss: 0.4218 - val_loss: 0.3928
Epoch 3/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 263ms/step - loss: 0.3653 - val_loss: 0.3613
Epoch 4/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 259ms/step - loss: 0.3078 - val_loss: 0.3635
Epoch 5/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 281ms/step - loss: 0.2978 - val_loss: 

Comparison between companies: 

In [17]:
# Forecasting modes compared below.
# Key: token used in the prediction/actual CSV file names; value: display label.
methods = dict(
    window='Window-Based Forecasting',
    point='Day-Ahead Forecasting',
    sequence='Historical-Based Forecasting',
)


In [22]:
# Evaluate every (company, model type, forecasting method) combination from
# the prediction/actual CSVs under data/output.
#   metrics[company][model_type][method] -> dict returned by calculate_metrics
#   results                               -> flat list of per-combination records
#
# NOTE(review): `pipeline` is whichever instance the most recently executed
# cell left bound (the last ticker's after a training loop). This assumes
# calculate_metrics does not depend on per-ticker state — confirm in lstm.py.
# NOTE(review): in the recorded output the 'window' and 'sequence' metrics are
# identical for each model shown — check that the pipeline writes distinct
# files for those two methods.
metrics = {}
results = []

for company in tickers.values():
    print(f"\n{'='*70}")
    print(f"COMPANY: {company}")
    print(f"{'='*70}")

    # Nested dict level for this company (created on first visit).
    company_metrics = metrics.setdefault(company, {})

    for model_type in ['baseline', 'enhanced']:
        print(f"\n=== Results for {model_type} model ===")

        # Nested dict level for this model type (created on first visit).
        model_metrics = company_metrics.setdefault(model_type, {})

        for method, method_name in methods.items():
            print(f"\n{method_name}:")

            # Predictions are stored per model type; actuals are shared.
            pred_file = f'data/output/{company}_full_{method}_pred_{model_type}.csv'
            actual_file = f'data/output/{company}_full_{method}_act.csv'

            # Header-less CSVs, flattened to 1-D arrays.
            predictions = pd.read_csv(pred_file, header=None).values.flatten()
            actuals = pd.read_csv(actual_file, header=None).values.flatten()

            method_metrics = pipeline.calculate_metrics(predictions, actuals)
            model_metrics[method] = method_metrics

            # Four-line metric summary, emitted in a single write.
            print("\n".join([
                f"  R² Score:  {method_metrics['R2']:.4f}",
                f"  RMSE:      {method_metrics['RMSE']:.6f}",
                f"  MAE:       {method_metrics['MAE']:.6f}",
                f"  MAPE:      {method_metrics['MAPE']:.2f}%",
            ]))

            # Flat record consumed by the plotting cell.
            model_results = {
                'Company': company,
                'Model': model_type,
                'Method': method,
                'Method_Name': method_name,
                'Predictions': predictions,
                'Actuals': actuals,
                **{key: method_metrics[key] for key in ('R2', 'RMSE', 'MAE', 'MAPE')},
            }
            results.append(model_results)


COMPANY: AMSC

=== Results for baseline model ===

Window-Based Forecasting:
  R² Score:  -0.0202
  RMSE:      0.612686
  MAE:       0.405466
  MAPE:      125.47%

Day-Ahead Forecasting:
  R² Score:  0.4724
  RMSE:      0.440574
  MAE:       0.280161
  MAPE:      624.68%

Historical-Based Forecasting:
  R² Score:  -0.0202
  RMSE:      0.612686
  MAE:       0.405466
  MAPE:      125.47%

=== Results for enhanced model ===

Window-Based Forecasting:
  R² Score:  -15.7082
  RMSE:      2.479425
  MAE:       1.919569
  MAPE:      2220.37%

Day-Ahead Forecasting:
  R² Score:  0.4786
  RMSE:      0.438015
  MAE:       0.273694
  MAPE:      667.10%

Historical-Based Forecasting:
  R² Score:  -15.7082
  RMSE:      2.479425
  MAE:       1.919569
  MAPE:      2220.37%

COMPANY: BP

=== Results for baseline model ===

Window-Based Forecasting:
  R² Score:  -0.0072
  RMSE:      0.966984
  MAE:       0.617710
  MAPE:      374.42%

Day-Ahead Forecasting:
  R² Score:  0.5514
  RMSE:      0.645337
  M

In [23]:
def _percent_gain(delta, denominator):
    """Express ``delta`` as a percentage of ``denominator``; 0 when the denominator is 0."""
    if denominator == 0:
        return 0
    return delta / denominator * 100


# Per company and forecasting method, report how the enhanced model compares
# with the baseline as a percentage of the baseline value.
# NOTE(review): the relative R² change becomes extreme when the baseline R² is
# close to zero (the recorded output shows -77505.82% for a baseline of
# -0.0202); the absolute difference may be the more informative figure.
for company in tickers.values():
    print(f"IMPROVEMENTS FOR {company}")
    for method, method_label in methods.items():
        print(f"\n{method_label}:")

        base = metrics[company]['baseline'][method]
        enh = metrics[company]['enhanced'][method]

        # (label, metric key, decimals shown, improvement in percent)
        # R²: higher is better -> enhanced - baseline, relative to |baseline|.
        # RMSE / MAE: lower is better -> baseline - enhanced, relative to baseline.
        report_rows = (
            ('R²', 'R2', 4, _percent_gain(enh['R2'] - base['R2'], abs(base['R2']))),
            ('RMSE', 'RMSE', 6, _percent_gain(base['RMSE'] - enh['RMSE'], base['RMSE'])),
            ('MAE', 'MAE', 6, _percent_gain(base['MAE'] - enh['MAE'], base['MAE'])),
        )

        for label, key, decimals, gain in report_rows:
            print(f"  {label} Improvement: {gain:+.2f}% (Baseline: {base[key]:.{decimals}f} → Enhanced: {enh[key]:.{decimals}f})")

IMPROVEMENTS FOR AMSC

Window-Based Forecasting:
  R² Improvement: -77505.82% (Baseline: -0.0202 → Enhanced: -15.7082)
  RMSE Improvement: -304.68% (Baseline: 0.612686 → Enhanced: 2.479425)
  MAE Improvement: -373.42% (Baseline: 0.405466 → Enhanced: 1.919569)

Day-Ahead Forecasting:
  R² Improvement: +1.29% (Baseline: 0.4724 → Enhanced: 0.4786)
  RMSE Improvement: +0.58% (Baseline: 0.440574 → Enhanced: 0.438015)
  MAE Improvement: +2.31% (Baseline: 0.280161 → Enhanced: 0.273694)

Historical-Based Forecasting:
  R² Improvement: -77505.82% (Baseline: -0.0202 → Enhanced: -15.7082)
  RMSE Improvement: -304.68% (Baseline: 0.612686 → Enhanced: 2.479425)
  MAE Improvement: -373.42% (Baseline: 0.405466 → Enhanced: 1.919569)
IMPROVEMENTS FOR BP

Window-Based Forecasting:
  R² Improvement: -28716.73% (Baseline: -0.0072 → Enhanced: -2.0693)
  RMSE Improvement: -74.57% (Baseline: 0.966984 → Enhanced: 1.688058)
  MAE Improvement: -117.81% (Baseline: 0.617710 → Enhanced: 1.345432)

Day-Ahead Forecas

In [24]:
# Sanity check on the metrics cell above: it appends one record to `results`
# per (company, model type, method) combination, so this prints how many
# combinations were evaluated.
print(len(results))

48


In [25]:
import matplotlib.pyplot as plt
import numpy as np

# For each company, draw a 3x3 comparison figure (R² bars, RMSE bars, and one
# actual-vs-predicted panel per forecasting method) and save it as a PNG under
# data/output. Relies on `results`, `methods` and `tickers` from earlier cells.
for company in tickers.values():
    print(f"\nGenerating visualization for {company}...")

    # Per-method records for THIS company, in the order they were collected.
    baseline_results = [r for r in results if r['Model'] == 'baseline' and r['Company'] == company]
    enhanced_results = [r for r in results if r['Model'] == 'enhanced' and r['Company'] == company]

    # Check for data BEFORE creating the figure, so a skipped company does not
    # leave an unclosed figure behind.
    if len(baseline_results) == 0 or len(enhanced_results) == 0:
        print(f"  WARNING: No results found for {company}, skipping...")
        continue

    fig = plt.figure(figsize=(16, 12))
    gs = fig.add_gridspec(3, 3, hspace=0.3, wspace=0.3)

    # Shared bar geometry: one group per forecasting method.
    x = np.arange(len(methods))
    width = 0.35

    # 1. R² comparison (top row, first two columns) ---------------------------
    ax1 = fig.add_subplot(gs[0, :2])

    baseline_r2 = [r['R2'] for r in baseline_results]
    enhanced_r2 = [r['R2'] for r in enhanced_results]

    ax1.bar(x - width/2, baseline_r2, width, label='Baseline', alpha=0.8, color='steelblue')
    ax1.bar(x + width/2, enhanced_r2, width, label='Enhanced', alpha=0.8, color='coral')

    ax1.set_ylabel('R² Score', fontsize=12)
    ax1.set_title(f'{company} - R² Score Comparison by Method', fontsize=14, fontweight='bold')
    ax1.set_xticks(x)
    ax1.set_xticklabels([r['Method_Name'] for r in baseline_results], rotation=15, ha='right')
    ax1.legend()
    ax1.grid(True, alpha=0.3, axis='y')
    # Zero line: R² values can be negative, so mark the baseline explicitly.
    ax1.axhline(y=0, color='black', linestyle='-', linewidth=0.5)

    # 2. RMSE comparison (top row, last column) -------------------------------
    ax2 = fig.add_subplot(gs[0, 2])

    baseline_rmse = [r['RMSE'] for r in baseline_results]
    enhanced_rmse = [r['RMSE'] for r in enhanced_results]

    ax2.bar(x - width/2, baseline_rmse, width, label='Baseline', alpha=0.8, color='steelblue')
    ax2.bar(x + width/2, enhanced_rmse, width, label='Enhanced', alpha=0.8, color='coral')

    ax2.set_ylabel('RMSE', fontsize=12)
    ax2.set_title('RMSE Comparison', fontsize=14, fontweight='bold')
    ax2.set_xticks(x)
    # Short labels for the narrow panel; order matches the `methods` mapping.
    ax2.set_xticklabels(['Window', 'Day', 'Historical'], rotation=15, ha='right')
    ax2.legend()
    ax2.grid(True, alpha=0.3, axis='y')

    # 3-5. Prediction plots, one panel per method (second row onwards) --------
    for idx, (baseline_entry, enhanced_entry) in enumerate(zip(baseline_results, enhanced_results)):
        ax = fig.add_subplot(gs[1 + idx // 3, idx % 3])

        # Only the first 200 points are drawn to keep the panel readable.
        actuals = baseline_entry['Actuals'][:200]
        pred_baseline = baseline_entry['Predictions'][:200]
        pred_enhanced = enhanced_entry['Predictions'][:200]

        ax.plot(actuals, label='Actual', linewidth=2, alpha=0.7, color='black')
        ax.plot(pred_baseline, label='Baseline', linewidth=1.5, alpha=0.7, color='steelblue')
        ax.plot(pred_enhanced, label='Enhanced', linewidth=1.5, alpha=0.7, color='coral')

        ax.set_xlabel('Time Step', fontsize=10)
        ax.set_ylabel('Normalized Volume', fontsize=10)

        ax.set_title(
            f"{baseline_entry['Method_Name']}\n"
            f"Base R²: {baseline_entry['R2']:.3f} | "
            f"Enh R²: {enhanced_entry['R2']:.3f}",
            fontsize=11, fontweight='bold'
        )

        ax.legend(fontsize=9)
        ax.grid(True, alpha=0.3)

    fig.suptitle(f'{company} - Baseline vs Enhanced Model Comparison',
                 fontsize=16, fontweight='bold', y=0.995)

    output_file = f'data/output/{company}_comparison.png'
    fig.savefig(output_file, dpi=300, bbox_inches='tight')
    print(f"  ✓ Saved: {output_file}")
    # Close this specific figure so figures do not accumulate across companies.
    plt.close(fig)

# f-strings are required here: without the prefix the literal text
# "{'='*70}" was printed instead of a 70-character rule.
print(f"\n{'='*70}")
print("ALL VISUALIZATIONS COMPLETE")
print(f"{'='*70}")


Generating visualization for AMSC...
  ✓ Saved: data/output/AMSC_comparison.png

Generating visualization for BP...
  ✓ Saved: data/output/BP_comparison.png

Generating visualization for EVR...
  ✓ Saved: data/output/EVR_comparison.png

Generating visualization for GOOGL...
  ✓ Saved: data/output/GOOGL_comparison.png

Generating visualization for HLF...
  ✓ Saved: data/output/HLF_comparison.png

Generating visualization for MDRX...
  ✓ Saved: data/output/MDRX_comparison.png

Generating visualization for ORCL...
  ✓ Saved: data/output/ORCL_comparison.png

Generating visualization for WFC...
  ✓ Saved: data/output/WFC_comparison.png

{'='*70}
ALL VISUALIZATIONS COMPLETE
{'='*70}
