Everything is the same.
The only differences are the following:
A pre-training split is used for the short-term and the long-term LSTM embeddings to get more accurate results.
In the final model, there is an 80-20 split over the 309 days that were not used for pre-training. This 20% is the old test set (TEST1).
Then there is a new test split for the dates given in the PDF. That is TEST2.

In [None]:
import os
import pandas as pd
import numpy as np
import yfinance as yf
import pickle
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

# Load the Nifty500 list
# '.NS' is appended to every value of the 'Symbol' column to build the ticker
# strings that are handed to yf.Ticker below.
nifty500_list = pd.read_csv('ind_nifty500list_filtered_final.csv')
nifty500_tickers = nifty500_list['Symbol'] + '.NS'
start_date = "2022-01-10"
end_date = "2025-04-12" #added the new dates for testing purposes

# Define the new start date (29 calendar days after the original start date).
# It is used further down in this cell to cut the leading rows off the
# processed data when the "adjusted" data set is written.
adjusted_start_date = pd.to_datetime(start_date) + pd.Timedelta(days=29)

# Create directories (no error if they already exist)
raw_data_dir = 'New_HistoricalData'
processed_data_dir = 'New_ProcessedHistoricalData'
adjusted_data_dir = 'New_AdjustedHistoricalData'
pickle_dir = 'New_PickleData'
os.makedirs(raw_data_dir, exist_ok=True)
os.makedirs(processed_data_dir, exist_ok=True)
os.makedirs(adjusted_data_dir, exist_ok=True)
os.makedirs(pickle_dir, exist_ok=True)

# Dictionaries keyed by the full ticker string (including '.NS'):
# ticker -> price history DataFrame, and ticker -> sector string.
stock_data_dict = {}
sector_data_dict = {}

# Download stock data (at most the first 500 tickers of the list).
# Any failure for a ticker is reported and that ticker is skipped, so one bad
# symbol does not abort the whole download.
for ticker in tqdm(nifty500_tickers[0:500], desc="Downloading Stock Data"):
    try:
        ticker_obj = yf.Ticker(ticker)
        company_info = ticker_obj.info
        company_history = ticker_obj.history(start=start_date, end=end_date)
        # Drop the timezone from the index so the dates are naive timestamps.
        company_history.index = company_history.index.tz_localize(None)

        # Drop unwanted columns (errors='ignore': fine if they are absent)
        company_history.drop(columns=['Dividends', 'Stock Splits'], inplace=True, errors='ignore')

        # Remove weeks with no data: keep only rows whose calendar week has at
        # least one non-null 'Close'. The helper 'Week' column is removed again.
        company_history['Week'] = company_history.index.to_period('W')
        valid_weeks = company_history.groupby('Week')['Close'].count() > 0
        company_history = company_history[company_history['Week'].isin(valid_weeks[valid_weeks].index)]
        company_history.drop(columns=['Week'], inplace=True)

        # Get sector information ("Unknown" when the info dict has no 'sector')
        # and store it as the second column of the history frame.
        sector = company_info.get("sector", "Unknown")
        company_history.insert(1, "Sector", sector)

        # Save cleaned raw data as <symbol>.csv (the '.NS' suffix is stripped)
        file_name = ticker.replace('.NS', '') + '.csv'
        company_history.to_csv(os.path.join(raw_data_dir, file_name), index=True)

        # Store in dictionary
        stock_data_dict[ticker] = company_history
        sector_data_dict[ticker] = sector
    except Exception as e:
        print(f"Couldn't fetch data for {ticker}: {e}")

# Save raw data dictionary as pickle
with open(os.path.join(pickle_dir, 'raw_stock_data.pkl'), 'wb') as f:
    pickle.dump(stock_data_dict, f)

# Save the ticker -> sector mapping as pickle
with open(os.path.join(pickle_dir, 'sector_data.pkl'), 'wb') as f:
    pickle.dump(sector_data_dict, f)

# Function to calculate RSI
def calculate_rsi(data, period=14):
    """Return the Relative Strength Index of ``data['Close']``.

    Average gain and average loss are simple rolling means over ``period``
    rows (``min_periods=1``, so early rows use however many rows exist).

    Args:
        data: DataFrame with a numeric 'Close' column.
        period: Rolling window length in rows.

    Returns:
        A Series aligned with ``data`` holding RSI values in [0, 100].
        Rows whose window contains no price movement at all (including the
        very first row, which has no previous close) get the neutral value
        50 instead of NaN.
    """
    delta = data['Close'].diff()
    # The NaN of the first diff is treated as "no move" by both masks.
    gains = delta.where(delta > 0, 0)
    losses = -delta.where(delta < 0, 0)
    avg_gain = gains.rolling(window=period, min_periods=1).mean()
    avg_loss = losses.rolling(window=period, min_periods=1).mean()

    # avg_loss == 0 with avg_gain > 0 divides to inf, which maps to RSI 100.
    rs = avg_gain / avg_loss
    rsi = 100 - (100 / (1 + rs))

    # 0 / 0 would leave NaN in the feature; a flat window is neither
    # overbought nor oversold, so report the midpoint.
    flat_window = (avg_gain == 0) & (avg_loss == 0)
    return rsi.where(~flat_window, 50.0)

# Function to calculate MACD
def calculate_macd(data, short_window=12, long_window=26, signal_window=9):
    """Compute the MACD line and its signal line from ``data['Close']``.

    Args:
        data: DataFrame with a numeric 'Close' column.
        short_window: Span of the fast exponential moving average.
        long_window: Span of the slow exponential moving average.
        signal_window: Span of the EMA applied to the MACD line.

    Returns:
        Tuple ``(macd, signal)`` of Series aligned with ``data``.
    """
    def _ema(series, span):
        # Recursive (adjust=False) exponential moving average.
        return series.ewm(span=span, adjust=False).mean()

    close = data['Close']
    macd_line = _ema(close, short_window) - _ema(close, long_window)
    signal_line = _ema(macd_line, signal_window)
    return macd_line, signal_line

# Process stock data
def process_stock_data(stock_data):
    """Add the derived feature columns to ``stock_data`` in place.

    Columns added, in this order: 'normalized_close', 'return_ratio',
    'MA_5' .. 'MA_30', 'Open_pct_change', 'High_pct_change',
    'Low_pct_change', 'RSI', 'MACD', 'MACD_signal'.

    Args:
        stock_data: DataFrame with 'Open', 'High', 'Low' and 'Close' columns.

    Returns:
        The same DataFrame object that was passed in, now with the extra columns.
    """
    close = stock_data['Close']

    # Z-score of the close, fitted on this frame's own rows.
    stock_data['normalized_close'] = StandardScaler().fit_transform(stock_data[['Close']])
    stock_data['return_ratio'] = close.pct_change()

    # Simple moving averages; min_periods=1 keeps the early rows defined.
    ma_windows = (5, 10, 15, 20, 25, 30)
    for span in ma_windows:
        stock_data[f'MA_{span}'] = close.rolling(window=span, min_periods=1).mean()

    # Open/High/Low expressed relative to the same row's close.
    for price_col in ('Open', 'High', 'Low'):
        stock_data[f'{price_col}_pct_change'] = stock_data[price_col] / close - 1

    stock_data['RSI'] = calculate_rsi(stock_data)
    macd_line, signal_line = calculate_macd(stock_data)
    stock_data['MACD'] = macd_line
    stock_data['MACD_signal'] = signal_line
    return stock_data

# Dictionaries keyed by symbol (file name up to its first '.'):
# symbol -> fully processed frame, and symbol -> frame restricted to dates on
# or after adjusted_start_date.
processed_data_dict = {}
adjusted_data_dict = {}

# Process and save data: every CSV in the raw directory is read back from
# disk, enriched with the technical indicators and written out again.
for file_name in tqdm(os.listdir(raw_data_dir), desc="Processing Stock Data"):
    if file_name.endswith('.csv'):
        symbol = file_name.split('.')[0]
        # First CSV column becomes the (parsed) date index.
        stock_data = pd.read_csv(os.path.join(raw_data_dir, file_name), index_col=0, parse_dates=True)
        processed_data = process_stock_data(stock_data)
        processed_data.to_csv(os.path.join(processed_data_dir, file_name), index=True)

        # Store in dictionary
        processed_data_dict[symbol] = processed_data

        # Keep only rows dated on/after adjusted_start_date, which removes the
        # leading rows whose rolling indicators are built from few observations.
        adjusted_data = processed_data[processed_data.index >= adjusted_start_date]
        # Symbols with no rows left after the cut are not written at all.
        if not adjusted_data.empty:
            adjusted_data.to_csv(os.path.join(adjusted_data_dir, file_name), index=True)
            adjusted_data_dict[symbol] = adjusted_data

# Save processed and adjusted data dictionaries as pickle
with open(os.path.join(pickle_dir, 'processed_stock_data.pkl'), 'wb') as f:
    pickle.dump(processed_data_dict, f)

with open(os.path.join(pickle_dir, 'adjusted_stock_data.pkl'), 'wb') as f:
    pickle.dump(adjusted_data_dict, f)


In [None]:
import os
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import MinMaxScaler

# Directories: read the adjusted per-stock CSVs, write normalized copies and
# one pickle holding all normalized frames.
input_dir = 'New_AdjustedHistoricalData'
output_dir = 'New_NormalizedProcessedData'
pickle_dir = 'New_PickleData'
os.makedirs(output_dir, exist_ok=True)
os.makedirs(pickle_dir, exist_ok=True)
pickle_file = os.path.join(pickle_dir, 'normalized_stock_data.pkl')

# Dictionary to store normalized data, keyed by the input file name
# (including the '.csv' extension).
normalized_data_dict = {}

# Process each file in input directory
for filename in os.listdir(input_dir):
    if filename.endswith('.csv'):
        file_path = os.path.join(input_dir, filename)
        df = pd.read_csv(file_path)

        # Ensure Date and Sector columns exist
        if 'Date' not in df.columns:
            df['Date'] = df.index  # Add Date as a column if missing
        if 'Sector' not in df.columns:
            df['Sector'] = "Unknown"  # Fallback if sector info is missing

        # Identify column groups
        non_numeric_cols = ['Date', 'Sector']
        minmax_cols = ['MA_5', 'MA_10', 'MA_15', 'MA_20', 'MA_25', 'MA_30', 'RSI', 'MACD', 'MACD_signal', 'Open_pct_change', 'High_pct_change', 'Low_pct_change']  # MinMaxScaler
        volume_cols = ['Volume']  # Log + MinMaxScaler

        # Apply MinMaxScaler (0-1 normalization) to moving averages, momentum indicators, and percentage changes.
        # The scaler is fitted per file, i.e. on this stock's own rows only.
        scaler_minmax = MinMaxScaler()
        df[minmax_cols] = scaler_minmax.fit_transform(df[minmax_cols])

        # Apply Log Scaling + MinMaxScaler to Volume
        # (the same scaler object is re-fitted here, replacing the fit above)
        df[volume_cols] = np.log1p(df[volume_cols])  # Log transform
        df[volume_cols] = scaler_minmax.fit_transform(df[volume_cols])  # MinMax scale

        # Compute normalized close price (min-max over this file's rows);
        # this overwrites any 'normalized_close' column read from the CSV.
        df['normalized_close'] = (df['Close'] - df['Close'].min()) / (df['Close'].max() - df['Close'].min())

        # Compute return ratio; the first row has no previous close and is set to 0
        df['return_ratio'] = df['Close'].pct_change().fillna(0)

        # Save the normalized data as <name>_norm.csv
        output_filename = f"{filename.replace('.csv', '_norm.csv')}"
        output_path = os.path.join(output_dir, output_filename)
        df.to_csv(output_path, index=False)

        # Store in dictionary
        normalized_data_dict[filename] = df

        #print(f"✅ Normalization complete! Saved: {output_filename}")

# Save the dictionary as a pickle file
with open(pickle_file, 'wb') as f:
    pickle.dump(normalized_data_dict, f)

print("✅ All normalized data saved as a pickle file.")


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
import os
import datetime
from tqdm import tqdm
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Enhanced Configuration
# Plain container of class-level settings; instances carry no state of their own.
class Config:
    data_dir = "New_NormalizedProcessedData"  # directory the per-stock CSVs are read from
    output_dir = "New_ShortTerm_Stock_Embeddings"  # directory plots, metrics and embeddings are written to
    seq_length = 30  # number of consecutive rows in each input window
    hidden_dim = 128  # LSTM hidden size (also the size of the returned context vector)
    num_layers = 2  # number of stacked LSTM layers
    batch_size = 128  # mini-batch size used in the training loop
    epochs = 50  # upper bound on training epochs (early stopping may end sooner)
    initial_lr = 0.001  # starting learning rate for Adam
    weight_decay = 1e-5  # L2 regularization
    dropout = 0.25  # dropout probability passed to the model
    # Input features, in the column order fed to the model.
    feature_columns = [
        "MA_5", "MA_10", "MA_15", "MA_20", "MA_25", "MA_30",
        "RSI", "MACD", "MACD_signal", "Volume",
        "Open_pct_change", "High_pct_change", "Low_pct_change"
    ]
    target_column = "return_ratio"  # column the model is trained to predict
    train_cutoff_date = "2024-01-11"  # All data before this is training

# Volatility-Aware Attentive LSTM
class VolatilityAwareAttentiveLSTM(nn.Module):
    """LSTM encoder with self-attention pooled by a learned per-timestep score.

    The LSTM output goes through 2-head self-attention. A small MLP ending in
    a sigmoid gives every timestep a score in (0, 1); the attention output is
    averaged over time using those scores as weights. The resulting context
    vector is both returned (it is the embedding) and fed to the regression
    head.

    Args:
        input_dim: Number of features per timestep.
        hidden_dim: LSTM hidden size; must be divisible by 2 (attention heads).
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability used between LSTM layers and in the MLPs.
    """

    def __init__(self, input_dim, hidden_dim, num_layers, dropout=0.25):
        super().__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout)

        # Standard attention mechanism
        self.attention = nn.MultiheadAttention(hidden_dim, num_heads=2, batch_first=True)

        # Volatility awareness components
        self.volatility_detector = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim // 2, 1),
            nn.Sigmoid()  # Normalize volatility score between 0-1
        )

        # Final prediction layers
        self.fc_layers = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.LayerNorm(hidden_dim // 2),  # Layer normalization for better stability
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim // 2, 1)
        )

        self.hidden_dim = hidden_dim

    def forward(self, x):
        """Run the model.

        Args:
            x: Tensor of shape [batch_size, seq_len, input_dim].

        Returns:
            Tuple ``(prediction, context)`` with shapes [batch_size, 1] and
            [batch_size, hidden_dim].
        """
        lstm_out, _ = self.lstm(x)  # [batch_size, seq_len, hidden_dim]

        # Score all timesteps in one call: the MLP's layers act on the last
        # dimension only, so no per-timestep Python loop is needed and the
        # scores keep the dtype/device of lstm_out.
        volatility_scores = self.volatility_detector(lstm_out).squeeze(-1)  # [batch_size, seq_len]

        # Apply standard attention
        attn_out, _ = self.attention(lstm_out, lstm_out, lstm_out)

        # Apply volatility weighting to attention output
        weighted_out = attn_out * volatility_scores.unsqueeze(-1)

        # Context vector: score-weighted average over time (epsilon guards
        # against a zero denominator).
        context = torch.sum(weighted_out, dim=1) / (torch.sum(volatility_scores, dim=1, keepdim=True) + 1e-8)

        # Prediction
        prediction = self.fc_layers(context)

        return prediction, context

# Training function with enhanced evaluation
def train_and_evaluate(stock_name, df, config):
    """Train a model for one stock, evaluate it on the held-out period and save artefacts.

    Sliding windows of ``config.seq_length`` rows are built from
    ``config.feature_columns``; each window's target is the
    ``config.target_column`` value of the row right after it. Windows whose
    target date is before ``config.train_cutoff_date`` are used for training
    (their chronologically last fifth is held out for validation), the rest
    form the test set. The weights with the lowest validation loss are
    restored before the test evaluation.

    Args:
        stock_name: Label used in progress output and output file names.
        df: DataFrame with a 'Date' column, the feature columns and the target column.
        config: Object exposing the attributes of ``Config``.

    Returns:
        Tuple ``(model, test_embeddings, test_dates, test_predictions, test_targets)``;
        embeddings, predictions and targets are NumPy arrays.

    Raises:
        ValueError: If the train, validation or test split is empty.
        RuntimeError: If the validation loss never improved (e.g. it was NaN
            every epoch), so there are no best weights to restore.
    """
    import copy  # local import: only needed to snapshot the best weights

    # Prepare data
    data = df[config.feature_columns].values
    targets = df[config.target_column].values
    dates = pd.to_datetime(df['Date'])

    def create_sequences(data, targets, seq_length):
        """Pair every window of ``seq_length`` rows with the following row's target."""
        X, y = [], []
        for i in range(len(data) - seq_length):
            X.append(data[i:i + seq_length])
            y.append(targets[i + seq_length])
        return np.array(X, dtype=np.float32), np.array(y, dtype=np.float32)

    X, y = create_sequences(data, targets, config.seq_length)
    # seq_dates[i] is the date of the target of window i, i.e. df row i + seq_length.
    seq_dates = dates[config.seq_length:].reset_index(drop=True)

    # Time-based train-test split
    cutoff_date = pd.to_datetime(config.train_cutoff_date)
    train_indices = seq_dates[seq_dates < cutoff_date].index.tolist()
    test_indices = seq_dates[seq_dates >= cutoff_date].index.tolist()
    dates_test = seq_dates.iloc[test_indices]

    # Use the last 20% of the training windows for validation.
    val_indices = train_indices[-len(train_indices)//5:]
    train_indices = train_indices[:-len(train_indices)//5]

    # Fail early with a clear message instead of a ZeroDivisionError or a
    # metric error further down.
    if not train_indices or not val_indices or not test_indices:
        raise ValueError(
            f"Not enough sequences for {stock_name}: "
            f"{len(train_indices)} train, {len(val_indices)} validation, "
            f"{len(test_indices)} test"
        )

    # Check if CUDA is available
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Model setup
    model = VolatilityAwareAttentiveLSTM(
        len(config.feature_columns),
        config.hidden_dim,
        config.num_layers,
        config.dropout
    ).to(device)

    optimizer = optim.Adam(
        model.parameters(),
        lr=config.initial_lr,
        weight_decay=config.weight_decay  # L2 regularization
    )

    # Learning rate scheduler: halve the LR after 5 epochs without improvement
    scheduler = ReduceLROnPlateau(
        optimizer,
        mode='min',
        factor=0.5,
        patience=5,
        verbose=True
    )

    criterion = nn.MSELoss()

    # Move all splits to the device once; batches are sliced from these tensors.
    X_train_final = torch.tensor(X[train_indices]).float().to(device)
    y_train_final = torch.tensor(y[train_indices]).float().unsqueeze(1).to(device)
    X_val = torch.tensor(X[val_indices]).float().to(device)
    y_val = torch.tensor(y[val_indices]).float().unsqueeze(1).to(device)
    X_test = torch.tensor(X[test_indices]).float().to(device)
    y_test = torch.tensor(y[test_indices]).float().unsqueeze(1).to(device)

    train_losses = []
    best_val_loss = float('inf')
    best_model_state = None
    patience = 10
    patience_counter = 0

    for epoch in tqdm(range(config.epochs), desc=f"Training {stock_name}"):
        # Training
        model.train()
        epoch_loss = 0
        batch_indices = torch.randperm(len(X_train_final))

        for i in range(0, len(X_train_final), config.batch_size):
            batch_idx = batch_indices[i:i + config.batch_size]
            X_batch = X_train_final[batch_idx]
            y_batch = y_train_final[batch_idx]

            optimizer.zero_grad()
            preds, _ = model(X_batch)
            loss = criterion(preds, y_batch)
            loss.backward()

            # Gradient clipping to prevent exploding gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()
            # Weight by batch size so the epoch average is per sample.
            epoch_loss += loss.item() * len(X_batch)

        avg_train_loss = epoch_loss / len(X_train_final)
        train_losses.append(avg_train_loss)

        # Validation
        model.eval()
        with torch.no_grad():
            val_preds, _ = model(X_val)
            val_loss = criterion(val_preds, y_val).item()

        # Update learning rate
        scheduler.step(val_loss)

        # Early stopping
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # state_dict() hands out references to the live parameters and
            # dict.copy() is shallow, so the tensors must be deep-copied or
            # the snapshot would keep changing with further optimizer steps.
            best_model_state = copy.deepcopy(model.state_dict())
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f"Early stopping at epoch {epoch}")
                break

    if best_model_state is None:
        raise RuntimeError(
            f"Validation loss never improved for {stock_name}; no model state to restore"
        )

    # Load best model for final evaluation
    model.load_state_dict(best_model_state)

    # Final test evaluation
    model.eval()
    with torch.no_grad():
        test_preds, test_embs = model(X_test)

        # Move to CPU for numpy operations
        test_preds_np = test_preds.cpu().numpy()
        y_test_np = y_test.cpu().numpy()
        test_embs_np = test_embs.cpu().numpy()

        # Calculate metrics
        test_mse = mean_squared_error(y_test_np, test_preds_np)
        test_rmse = np.sqrt(test_mse)
        test_mae = mean_absolute_error(y_test_np, test_preds_np)
        test_r2 = r2_score(y_test_np, test_preds_np)

        # Directional accuracy: share of samples where the sign matches
        actual_direction = np.sign(y_test_np)
        pred_direction = np.sign(test_preds_np)
        directional_accuracy = np.mean(actual_direction == pred_direction)

        # Information coefficient (IC) - correlation between predictions and actuals
        ic = np.corrcoef(test_preds_np.flatten(), y_test_np.flatten())[0, 1]

        # Maximum drawdown of the cumulative ACTUAL returns over the test period
        cumulative_actual = np.cumprod(1 + y_test_np.flatten())
        peak = np.maximum.accumulate(cumulative_actual)
        drawdown = (peak - cumulative_actual) / peak
        max_drawdown = np.max(drawdown)

    # Window i targets df row i + seq_length, so shift the indices to pick the
    # df rows that actually belong to the test samples.
    test_rows = df.iloc[[i + config.seq_length for i in test_indices]]

    # Save results
    save_results(
        stock_name, train_losses,
        {
            'MSE': test_mse,
            'RMSE': test_rmse,
            'MAE': test_mae,
            'R²': test_r2,
            'Directional_Accuracy': directional_accuracy,
            'Information_Coefficient': ic,
            'Max_Drawdown': max_drawdown
        },
        test_embs_np, test_rows, dates_test, config
    )

    return model, test_embs_np, dates_test, test_preds_np, y_test_np

def save_results(stock_name, train_losses, metrics, 
                 test_embs, test_df, test_dates, config):
    """Write the per-stock artefacts into ``config.output_dir``.

    Files produced (all prefixed with ``stock_name``): the training-loss plot,
    a text file with one ``name: value`` line per metric, the embeddings CSV
    and, when there are more than two test samples, a 2-D PCA scatter of the
    embeddings coloured by their order in time.

    Args:
        stock_name: Prefix for every output file.
        train_losses: Sequence of per-epoch training losses.
        metrics: Mapping of metric name to numeric value.
        test_embs: 2-D array, one embedding row per test sample.
        test_df: DataFrame whose first 'Sector' value (if the column exists) labels the stock.
        test_dates: Dates aligned with the rows of ``test_embs``.
        config: Object exposing ``output_dir``.
    """
    out_dir = config.output_dir
    # Create directory if it doesn't exist
    os.makedirs(out_dir, exist_ok=True)

    # --- training loss curve ---
    plt.figure(figsize=(10, 5))
    plt.plot(train_losses, label='Training Loss (MSE)')
    plt.title(f"Training Curve for {stock_name}")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.legend()
    plt.savefig(f"{out_dir}/{stock_name}_train_loss.png")
    plt.close()

    # --- test metrics, one per line with four decimals ---
    with open(f"{out_dir}/{stock_name}_metrics.txt", "w") as f:
        f.writelines(f"{label}: {value:.4f}\n" for label, value in metrics.items())

    # --- embeddings with date and sector information ---
    if 'Sector' in test_df.columns:
        sector_label = test_df['Sector'].iloc[0]
    else:
        sector_label = 'Unknown'

    table = {'date': test_dates, 'stock': stock_name, 'sector': sector_label}
    for dim in range(test_embs.shape[1]):
        table[f'emb_{dim}'] = test_embs[:, dim]
    embedding_df = pd.DataFrame(table)

    # This is the primary output - the embeddings CSV file
    embedding_df.to_csv(f"{out_dir}/{stock_name}_embeddings.csv", index=False)

    # --- embedding evolution over time (2D PCA), only with more than two samples ---
    if test_embs.shape[0] <= 2:
        return

    from sklearn.decomposition import PCA
    pca = PCA(n_components=2)
    emb_2d = pca.fit_transform(test_embs)
    var_x, var_y = pca.explained_variance_ratio_[0], pca.explained_variance_ratio_[1]

    plt.figure(figsize=(12, 8))
    sc = plt.scatter(emb_2d[:, 0], emb_2d[:, 1],
                     c=range(len(emb_2d)), cmap='viridis', alpha=0.7)
    plt.colorbar(sc, label='Time Sequence')
    plt.title(f"Temporal Embedding Evolution for {stock_name}")
    plt.xlabel(f"PCA Component 1 (Var: {var_x:.2f})")
    plt.ylabel(f"PCA Component 2 (Var: {var_y:.2f})")
    plt.savefig(f"{out_dir}/{stock_name}_embedding_evolution.png")
    plt.close()

# Function to visualize prediction performance
def visualize_predictions(stock_name, dates, y_true, y_pred, config):
    """Plot actual versus predicted returns and save the figure.

    The gap between the two curves is shaded, and the share of samples whose
    predicted sign equals the actual sign is printed on the figure. The image
    is written to ``<config.output_dir>/<stock_name>_prediction_performance.png``.

    Args:
        stock_name: Used in the title and the output file name.
        dates: X-axis values, aligned with ``y_true`` / ``y_pred``.
        y_true: NumPy array of actual returns.
        y_pred: NumPy array of predicted returns.
        config: Object exposing ``output_dir``.
    """
    plt.figure(figsize=(14, 7))

    curves = (
        (y_true, 'Actual Returns', 'blue'),
        (y_pred, 'Predicted Returns', 'red'),
    )
    for values, legend_label, colour in curves:
        plt.plot(dates, values, label=legend_label, color=colour, alpha=0.7)

    plt.fill_between(dates, y_true.flatten(), y_pred.flatten(),
                     color='gray', alpha=0.3, label='Prediction Error')

    plt.title(f"Return Prediction Performance for {stock_name}")
    plt.xlabel("Date")
    plt.ylabel("Return Ratio")
    plt.legend()
    plt.grid(True, alpha=0.3)

    # Directional accuracy, as a percentage
    sign_matches = np.sign(y_true) == np.sign(y_pred)
    dir_acc = np.mean(sign_matches) * 100
    plt.figtext(0.15, 0.85, f"Directional Accuracy: {dir_acc:.2f}%",
                bbox=dict(facecolor='white', alpha=0.8))

    plt.tight_layout()
    plt.savefig(f"{config.output_dir}/{stock_name}_prediction_performance.png")
    plt.close()

# Main execution
def main():
    """Train, evaluate and plot every stock CSV found in ``Config.data_dir``.

    A failure on one stock is printed and the loop moves on to the next file.
    Per-stock metrics and embeddings are gathered in local collections.
    """
    config = Config()
    os.makedirs(config.output_dir, exist_ok=True)

    all_metrics = []
    all_embeddings = {}

    for file_name in tqdm(os.listdir(config.data_dir), desc="Processing Stocks"):
        if not file_name.endswith(".csv"):
            continue

        stock_name = file_name.replace("_norm.csv", "")
        df = pd.read_csv(os.path.join(config.data_dir, file_name))
        df['Date'] = pd.to_datetime(df['Date'])

        try:
            model, test_embs, test_dates, test_preds, y_test = train_and_evaluate(stock_name, df, config)

            # Visualize predictions
            visualize_predictions(stock_name, test_dates, y_test, test_preds, config)

            # Metrics for comparative analysis
            mse = mean_squared_error(y_test, test_preds)
            rmse = np.sqrt(mse)
            mae = mean_absolute_error(y_test, test_preds)
            r2 = r2_score(y_test, test_preds)

            # Directional accuracy
            directional_accuracy = np.mean(np.sign(y_test) == np.sign(test_preds))

            sector = df['Sector'].iloc[0] if 'Sector' in df.columns else 'Unknown'

            all_metrics.append({
                'stock': stock_name,
                'sector': sector,
                'MSE': mse,
                'RMSE': rmse,
                'MAE': mae,
                'R²': r2,
                'Directional_Accuracy': directional_accuracy,
            })

            # Embeddings kept for later cross-stock analysis
            all_embeddings[stock_name] = {
                'embeddings': test_embs,
                'dates': test_dates,
                'sector': sector
            }

        except Exception as e:
            print(f"Failed on {stock_name}: {str(e)}")

if __name__ == "__main__":
    main()

In [None]:
import os
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from itertools import combinations
import matplotlib.colors as mcolors
import torch
import random

# ========== CONFIGURATION ==========
# Input: directory holding the per-stock embedding CSVs.
# Output: directory for the edge list and the graph images.
embedding_dir = "New_ShortTerm_Stock_Embeddings"
output_dir = "New_Graphs"
os.makedirs(output_dir, exist_ok=True)

# Edge-list CSV of the intra-sector graph and the folder for one plot per sector.
intra_sector_graph_path = os.path.join(output_dir, "Intra_Sector_Graph.csv")
sector_graphs_dir = os.path.join(output_dir, "Sector_Graphs")
os.makedirs(sector_graphs_dir, exist_ok=True)

# The device is only used for the cosine-similarity computation further down.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# ========== DATA LOADING ==========
# stock_embeddings: stock -> 2-D array of embedding rows (one per CSV row)
# stock_sectors:    stock -> sector string
# embedding_size:   number of 'emb_*' columns, fixed by the first file loaded
embedding_files = [f for f in os.listdir(embedding_dir) if f.endswith(".csv")]
stock_embeddings = {}
stock_sectors = {}
embedding_size = None

# First pass: Identify all sectors and handle ISEC
# Only ISEC is touched here: its sector is hard-coded rather than read from file.
# The stock name is the part of the file name before the first underscore.
print("\nScanning sectors and processing ISEC...")
for file in embedding_files:
    try:
        stock_name = file.split("_")[0]
        if stock_name == "ISEC":
            stock_sectors["ISEC"] = "Financial Services"
            print("✅ Manually set ISEC sector to Financial Services")
    except Exception as e:
        print(f"⚠️ Error scanning {file}: {str(e)}")

# Second pass: Load embeddings
# Files that fail any check are reported and skipped; ISEC's embeddings are
# never loaded here because its file is skipped outright.
print("\nLoading stock embeddings...")
for file in embedding_files:
    try:
        stock_name = file.split("_")[0]
        if stock_name == "ISEC":
            continue  # Already handled
            
        df = pd.read_csv(os.path.join(embedding_dir, file))
        
        # Process embedding columns
        emb_cols = [col for col in df.columns if col.startswith('emb_')]
        if not emb_cols:
            print(f"⚠️ {file}: No embedding columns found")
            continue
            
        # The first file seen defines the expected dimension; files with a
        # different number of embedding columns are rejected.
        if embedding_size is None:
            embedding_size = len(emb_cols)
            print(f"Detected embedding dimension: {embedding_size}")
        elif len(emb_cols) != embedding_size:
            print(f"⚠️ {file}: Dimension mismatch ({len(emb_cols)} vs {embedding_size})")
            continue
            
        # Numeric conversion and validation: non-numeric cells become NaN and
        # any NaN disqualifies the whole file.
        df[emb_cols] = df[emb_cols].apply(pd.to_numeric, errors='coerce')
        if df[emb_cols].isnull().values.any():
            print(f"⚠️ {file}: Contains invalid numeric values")
            continue
            
        # Sector is read positionally: first row, third column.
        # NOTE(review): assumes the CSV columns start with date, stock, sector — confirm against the writer.
        sector = df.iloc[0, 2]
        stock_sectors[stock_name] = sector
        stock_embeddings[stock_name] = df[emb_cols].to_numpy()
        
        print(f"✓ Loaded {stock_name} ({sector}) with {len(df)} days")
        
    except Exception as e:
        print(f"⚠️ Failed to process {file}: {str(e)}")

# ========== GRAPH CONSTRUCTION ==========
# The graph mixes two node kinds: one node per sector (type="sector") and one
# node per stock (carrying its sector and its time-averaged embedding).
print("\nBuilding intra-sector graph...")
G_intra = nx.Graph()

# Create comprehensive color palette; colors are reused cyclically if there
# are more sectors than palette entries.
all_sectors = sorted(set(stock_sectors.values()))
base_colors = list(mcolors.TABLEAU_COLORS.values()) + list(mcolors.XKCD_COLORS.values())
sector_colors = {sector: base_colors[i % len(base_colors)] for i, sector in enumerate(all_sectors)}

# Add sector nodes
for sector in all_sectors:
    G_intra.add_node(sector, type="sector", color=sector_colors[sector])

# Add stock nodes and sector edges: every stock is linked to its sector node
# with weight 1.0. Stocks whose averaged embedding contains NaN are left out.
for stock, embeddings in stock_embeddings.items():
    sector = stock_sectors[stock]
    try:
        # Collapse the per-row embeddings into a single vector per stock.
        avg_embedding = np.mean(embeddings, axis=0)
        if np.isnan(avg_embedding).any():
            print(f"⚠️ {stock}: NaN in averaged embeddings")
            continue
            
        G_intra.add_node(stock, sector=sector, embedding=avg_embedding)
        G_intra.add_edge(stock, sector, weight=1.0)
    except Exception as e:
        print(f"⚠️ Failed to add {stock}: {str(e)}")

# Ensure ISEC is properly connected: it has a sector but no loaded embedding,
# so it is added with an all-zero placeholder vector (128 wide if no
# dimension was detected) and linked to its sector node only.
if "ISEC" in stock_sectors and "ISEC" not in G_intra:
    sector = stock_sectors["ISEC"]
    dummy_embedding = np.zeros(embedding_size) if embedding_size else np.zeros(128)
    G_intra.add_node("ISEC", sector=sector, embedding=dummy_embedding)
    G_intra.add_edge("ISEC", sector, weight=1.0)
    print("✅ Force-added ISEC to graph")

# ========== SIMILARITY CALCULATION ==========
# Only stocks that have loaded embeddings AND made it into the graph take part
# (ISEC, added with a placeholder vector, is therefore excluded).
print("\nCalculating cosine similarities...")
stock_list = [s for s in stock_embeddings.keys() if s in G_intra.nodes]
if not stock_list:
    raise ValueError("No valid stocks found for similarity calculation")

try:
    # One row per stock, in stock_list order.
    embedding_matrix = np.vstack([G_intra.nodes[s]['embedding'] for s in stock_list])
    if torch.cuda.is_available():
        # Pairwise cosine similarity via broadcasting of the row axis against
        # itself; the result is brought back to NumPy.
        tensor = torch.tensor(embedding_matrix, device=device)
        sim_matrix = torch.nn.functional.cosine_similarity(
            tensor.unsqueeze(1), tensor.unsqueeze(0), dim=-1
        ).cpu().numpy()
    else:
        sim_matrix = cosine_similarity(embedding_matrix)
except Exception as e:
    # Report, then re-raise: nothing below can run without the matrix.
    print(f"⚠️ Similarity calculation failed: {str(e)}")
    raise

# Add similarity edges: each unordered pair of stocks is visited once and gets
# an edge only if both are in the same sector and their similarity is strictly
# above the threshold. The similarity becomes the edge weight.
threshold = 0.8
print(f"\nAdding edges with similarity > {threshold}")
for i, j in combinations(range(len(stock_list)), 2):
    s1, s2 = stock_list[i], stock_list[j]
    sim = sim_matrix[i, j]
    same_sector = G_intra.nodes[s1]['sector'] == G_intra.nodes[s2]['sector']
    if same_sector and sim > threshold:
        G_intra.add_edge(s1, s2, weight=sim)
# ========== OUTPUT GENERATION ==========
def save_edges(graph, csv_path):
    """Write the graph's weighted edge list to ``csv_path``.

    Args:
        graph: Object whose ``edges(data=True)`` yields ``(u, v, attrs)`` with a 'weight' entry.
        csv_path: Destination CSV; columns are source, target, weight (no index column).
    """
    rows = []
    for src, dst, attrs in graph.edges(data=True):
        rows.append((src, dst, attrs['weight']))

    edge_table = pd.DataFrame(rows, columns=["source", "target", "weight"])
    edge_table.to_csv(csv_path, index=False)
    print(f"✅ Saved {len(rows)} edges to {csv_path}")

# Persist every edge of the intra-sector graph (stock-to-sector links and
# stock-to-stock similarity links) to the configured CSV path.
print("\nSaving intra-sector graph:")
save_edges(G_intra, intra_sector_graph_path)

# ========== VISUALIZATION ==========
print("\nGenerating intra-sector visualizations...")
def plot_enhanced_graph(graph, title, filename):
    """Draw ``graph`` with a spring layout and save it under ``output_dir``.

    Sector nodes are drawn larger and are the only labelled nodes. A node
    uses its own 'color' attribute when present; otherwise the colour of its
    sector, falling back to gray. Edge width is twice the edge weight.

    Args:
        graph: networkx graph whose edges carry a 'weight' attribute.
        title: First line of the figure title (node/edge counts are appended).
        filename: File name of the image inside ``output_dir``.
    """
    plt.figure(figsize=(20, 16))
    pos = nx.spring_layout(graph, k=0.15, iterations=50)

    def _is_sector(node):
        return graph.nodes[node].get('type') == 'sector'

    # Resolve one colour per node, in node iteration order
    node_colors = [
        attrs['color'] if 'color' in attrs
        else sector_colors.get(attrs.get('sector'), 'gray')
        for _, attrs in graph.nodes(data=True)
    ]
    edge_widths = [2 * attrs['weight'] for *_, attrs in graph.edges(data=True)]
    node_sizes = [1200 if _is_sector(node) else 600 for node in graph.nodes()]
    sector_labels = {node: node for node in graph.nodes() if _is_sector(node)}

    nx.draw_networkx_edges(
        graph, pos,
        width=edge_widths,
        alpha=0.3, edge_color='gray'
    )
    nx.draw_networkx_nodes(
        graph, pos,
        node_size=node_sizes,
        node_color=node_colors,
        alpha=0.9
    )

    # Label only sectors
    nx.draw_networkx_labels(
        graph, pos,
        labels=sector_labels,
        font_size=14,
        font_weight='bold'
    )

    plt.title(f"{title}\nNodes: {graph.number_of_nodes()} | Edges: {graph.number_of_edges()}", fontsize=16)
    plt.savefig(os.path.join(output_dir, filename), bbox_inches='tight', dpi=300)
    plt.close()

plot_enhanced_graph(G_intra, "Intra-Sector Network", "Intra_Sector_Graph_enhanced.png")

# Sector-specific plots: one image per sector showing the sector node, its
# stocks and the edges between them.
print("\nGenerating sector-specific plots:")
for sector in all_sectors:
    # Stock nodes carry a 'sector' attribute; the sector node itself is matched by name.
    nodes = [n for n in G_intra.nodes if G_intra.nodes[n].get('sector') == sector or n == sector]
    if len(nodes) <= 1:
        continue

    # Create the figure only after the guard above: a figure opened for a
    # skipped sector would never be closed and would stay in memory.
    plt.figure(figsize=(16, 14))
    subgraph = G_intra.subgraph(nodes)
    pos = nx.spring_layout(subgraph, k=0.3)
    
    nx.draw(
        subgraph, pos,
        node_color=sector_colors[sector],
        node_size=800,
        with_labels=True,
        font_size=10,
        edge_color="gray",
        alpha=0.9,
        width=[2*d['weight'] for _,_,d in subgraph.edges(data=True)]
    )
    # len(nodes) - 1 because the sector node itself is part of the list.
    plt.title(f"{sector} Sector\n{len(nodes)-1} stocks", fontsize=14)
    plt.savefig(
        os.path.join(sector_graphs_dir, f"{sector}_Graph_enhanced.png"),
        bbox_inches='tight', dpi=300
    )
    plt.close()
    print(f"✓ Generated {sector} sector plot")

# ========== FINAL OUTPUT ==========
print("\n✅ Processing completed successfully")
print(f"Final counts - Intra-Sector Graph: {G_intra.number_of_nodes()} nodes, {G_intra.number_of_edges()} edges")

# Verify ISEC presence (only relevant when an ISEC file was found)
if "ISEC" in stock_sectors:
    print("\nISEC verification:")
    if "ISEC" in G_intra.nodes():
        print("✓ ISEC present in intra-sector graph")
        print(f"Connections: {list(G_intra.edges('ISEC'))}")
    else:
        print("⚠️ ISEC missing from intra-sector graph")

In [None]:
import os
import pandas as pd
import numpy as np
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity
from itertools import combinations
import matplotlib.pyplot as plt
from tqdm import tqdm

class StockGraphBuilder:
    """Build a stock-similarity graph from per-stock embedding CSV files.

    The pipeline (see :meth:`build_graphs`) loads one averaged embedding per
    stock, computes pairwise cosine similarity, connects stocks inside and
    across sectors, prints statistics and writes CSV/PNG outputs to
    ``New_Graphs``.
    """

    # File-name suffix identifying a per-stock embedding CSV.
    _EMBEDDING_SUFFIX = "_embeddings.csv"
    # Fraction of the possible stock pairs inside one sector that get an edge.
    _INTRA_EDGE_FRACTION = 0.2

    def __init__(self, embedding_dir="New_ShortTerm_Stock_Embeddings"):
        """Remember the input directory and create the output directory."""
        self.embedding_dir = embedding_dir
        self.output_dir = "New_Graphs"
        os.makedirs(self.output_dir, exist_ok=True)

    def build_graphs(self):
        """Complete graph construction pipeline; returns the built graph."""
        print("1. Loading embeddings...")
        stock_data = self._load_with_isec_handling()

        print("\n2. Computing similarities...")
        similarity_matrix = self._compute_similarities(stock_data)

        print("\n3. Building graph...")
        G = self._build_with_dynamic_thresholds(stock_data, similarity_matrix)

        print("\n4. Analyzing results...")
        self._enhanced_analysis(G)
        return G

    def _load_with_isec_handling(self):
        """Load one averaged embedding per stock, special-casing ISEC.

        Returns:
            dict: ``{stock_name: {'sector': ..., 'embedding': 1-D array}}``.
            Files that are empty, lack a ``sector`` column (except ISEC), or
            whose embedding values are all zero / contain NaN are skipped
            with a printed warning.
        """
        embeddings = {}
        valid_stocks = 0

        for file in tqdm(os.listdir(self.embedding_dir)):
            if not file.endswith(self._EMBEDDING_SUFFIX):
                continue

            # Strip the full suffix rather than splitting on the first "_", so
            # the name is derived the same way as in the GraphSAGE loader and
            # symbols containing "_" are not truncated.
            stock_name = file[:-len(self._EMBEDDING_SUFFIX)]
            df = pd.read_csv(os.path.join(self.embedding_dir, file))

            if df.empty:
                # A header-only file has no first row to read the sector from.
                print(f"⚠️ Skipping {stock_name}: Empty file")
                continue

            # ISEC special case
            if stock_name == "ISEC":
                sector = "Financial Services"
                print("✅ Manually assigned sector to ISEC")
            elif 'sector' in df.columns:
                sector = df['sector'].iloc[0]
            else:
                print(f"⚠️ Skipping {stock_name}: No sector data")
                continue

            # Validate embeddings (columns whose name starts with "emb_")
            emb = df.filter(regex='^emb_').values
            if np.isnan(emb).any() or np.all(emb == 0):
                print(f"⚠️ Skipping {stock_name}: Invalid embeddings")
                continue

            embeddings[stock_name] = {
                'sector': sector,
                # Collapse all rows into a single vector per stock.
                'embedding': np.mean(emb, axis=0)
            }
            valid_stocks += 1

        print(f"\nLoaded {valid_stocks} valid stocks (including ISEC)")
        return embeddings

    def _compute_similarities(self, stock_data):
        """Return the pairwise cosine-similarity matrix, printing diagnostics.

        Rows/columns follow the iteration order of ``stock_data``. A histogram
        of all similarity values is saved as ``similarity_distribution.png``.
        """
        stock_names = list(stock_data.keys())
        embeddings = np.array([stock_data[name]['embedding'] for name in stock_names])

        print(f"Embedding matrix shape: {embeddings.shape}")
        print(f"NaN values: {np.isnan(embeddings).sum()}")
        print(f"Zero embeddings: {np.all(embeddings == 0, axis=1).sum()}")

        similarity = cosine_similarity(embeddings)

        # Calculate similarity statistics
        print(f"\nSimilarity statistics:")
        print(f"Min: {similarity.min():.3f}")
        print(f"Max: {similarity.max():.3f}")
        print(f"Mean: {similarity.mean():.3f}")
        print(f"Median: {np.median(similarity):.3f}")

        # Plot similarity distribution
        plt.figure(figsize=(10, 5))
        plt.hist(similarity.flatten(), bins=50)
        plt.title("Cosine Similarity Distribution")
        plt.savefig(os.path.join(self.output_dir, "similarity_distribution.png"))
        plt.close()

        return similarity

    @staticmethod
    def _top_similar_pairs(sector_sim, fraction=0.2):
        """Return the most similar unordered pairs of a square similarity matrix.

        Only the strict upper triangle is ranked, so every pair is considered
        exactly once and the diagonal (self-similarity) can never be selected.

        Args:
            sector_sim: square, symmetric similarity matrix (NumPy array).
            fraction: share of the ``n*(n-1)/2`` possible pairs to keep.

        Returns:
            list of ``(row, col, similarity)`` tuples with ``row < col``,
            ordered from most to least similar; empty when the share rounds
            down to zero pairs.
        """
        rows, cols = np.triu_indices(sector_sim.shape[0], k=1)
        k = int(fraction * len(rows))
        if k <= 0:
            return []

        values = sector_sim[rows, cols]
        top = np.argpartition(values, -k)[-k:]
        # argpartition leaves the winners unordered; sort them for a stable,
        # reproducible edge order.
        top = top[np.argsort(-values[top], kind='stable')]
        return [(int(rows[t]), int(cols[t]), float(values[t])) for t in top]

    def _build_with_dynamic_thresholds(self, stock_data, similarity_matrix):
        """Build the graph: top-fraction edges inside sectors, thresholded edges across.

        Intra-sector edges connect the most similar ``_INTRA_EDGE_FRACTION``
        of the possible pairs in each sector. Inter-sector edges connect pairs
        whose similarity exceeds the 80th percentile of all cross-sector
        similarities. Every edge carries ``weight`` (similarity) and ``type``
        (``'intra'`` or ``'inter'``).
        """
        G = nx.Graph()
        stock_names = list(stock_data.keys())

        # Calculate dynamic thresholds
        intra_sims = []
        inter_sims = []

        for i, j in combinations(range(len(stock_names)), 2):
            s1, s2 = stock_names[i], stock_names[j]
            if stock_data[s1]['sector'] == stock_data[s2]['sector']:
                intra_sims.append(similarity_matrix[i, j])
            else:
                inter_sims.append(similarity_matrix[i, j])

        # NOTE: the intra-sector percentile is only reported; intra edges are
        # selected per sector by rank below, not by this threshold.
        intra_thresh = np.percentile(intra_sims, 75) if intra_sims else 0
        inter_thresh = np.percentile(inter_sims, 80) if inter_sims else 0

        print(f"\nUsing dynamic thresholds:")
        print(f"- Intra-sector (75th percentile): {intra_thresh:.3f}")
        print(f"- Inter-sector (80th percentile): {inter_thresh:.3f}")

        # Add nodes
        for name in stock_names:
            G.add_node(name, sector=stock_data[name]['sector'])

        # Track connections
        intra_edges = 0
        inter_edges = 0

        # Group matrix positions by sector once (first-seen order), instead of
        # rescanning the stock list and calling list.index() for every sector.
        sector_members = {}
        for position, name in enumerate(stock_names):
            sector_members.setdefault(stock_data[name]['sector'], []).append(position)

        # Connect top similarities within sectors. Ranking unique pairs fixes
        # the previous behaviour where both (r, c) and (c, r) of the symmetric
        # matrix consumed the budget, halving the number of distinct edges and
        # allowing zero-weight self-loops when similarities were negative.
        for positions in sector_members.values():
            sector_sim = similarity_matrix[np.ix_(positions, positions)]
            for r, c, weight in self._top_similar_pairs(sector_sim, self._INTRA_EDGE_FRACTION):
                G.add_edge(stock_names[positions[r]], stock_names[positions[c]],
                           weight=weight, type='intra')
                intra_edges += 1

        # Connect across sectors
        for i, j in combinations(range(len(stock_names)), 2):
            s1, s2 = stock_names[i], stock_names[j]
            sim = similarity_matrix[i, j]

            if stock_data[s1]['sector'] != stock_data[s2]['sector'] and sim > inter_thresh:
                G.add_edge(s1, s2, weight=sim, type='inter')
                inter_edges += 1

        print(f"\nCreated {intra_edges} intra-sector and {inter_edges} inter-sector edges")
        return G

    def _enhanced_analysis(self, G):
        """Print graph/sector/community statistics and write the output files.

        Writes ``Full_graph.csv`` (columns ``source, target, similarity``),
        ``node_metadata.csv`` and, via :meth:`_visualize_by_sector`,
        ``Full_network.png`` into the output directory.
        """
        print("\nGraph Statistics:")
        print(f"- Nodes: {len(G.nodes)}")
        print(f"- Edges: {len(G.edges)}")
        print(f"- Density: {nx.density(G):.4f}")
        print(f"- Connected Components: {nx.number_connected_components(G)}")

        # Sector analysis
        sectors = set(nx.get_node_attributes(G, 'sector').values())
        print(f"\nSectors ({len(sectors)}): {sorted(sectors)}")

        # Sector-level metrics, computed on each sector's induced sub-graph
        print("\nSector Connectivity:")
        sector_stats = []
        for sector in sorted(sectors):
            nodes = [n for n in G.nodes if G.nodes[n]['sector'] == sector]
            sg = G.subgraph(nodes)
            sector_stats.append({
                'Sector': sector,
                'Nodes': len(nodes),
                'Edges': sg.number_of_edges(),
                'Avg Degree': f"{2*sg.number_of_edges()/len(nodes):.1f}" if nodes else "0",
                'Clustering': f"{nx.average_clustering(sg):.3f}" if nodes else "0"
            })

        print(pd.DataFrame(sector_stats).to_string(index=False))

        # Community detection
        print("\nCommunity Detection:")
        communities = nx.algorithms.community.greedy_modularity_communities(G, weight='weight')
        print(f"Found {len(communities)} communities")
        for i, comm in enumerate(sorted(communities, key=len, reverse=True)[:5], 1):
            print(f"Community {i}: {len(comm)} nodes")
            sector_dist = pd.Series([G.nodes[n]['sector'] for n in comm]).value_counts()
            print(sector_dist.head(3).to_string())
            print("---")

        # Save edge list in GraphSAGE-friendly format. (The earlier
        # nx.write_weighted_edgelist call wrote to this very path and was
        # overwritten immediately, so it has been dropped.)
        pd.DataFrame(
            [(u, v, d['weight']) for u, v, d in G.edges(data=True)],
            columns=['source', 'target', 'similarity']
        ).to_csv(os.path.join(self.output_dir, "Full_graph.csv"), index=False)

        pd.DataFrame.from_dict(dict(G.nodes(data=True)), orient='index').to_csv(
            os.path.join(self.output_dir, "node_metadata.csv"))

        # Visualization
        self._visualize_by_sector(G)

    def _visualize_by_sector(self, G):
        """Save ``Full_network.png``: a spring layout with nodes coloured by sector."""
        plt.figure(figsize=(20, 12))

        # Get sectors and colors
        sectors = sorted(set(nx.get_node_attributes(G, 'sector').values()))
        colors = plt.cm.tab20(np.linspace(0, 1, len(sectors)))

        # Compute layout (fixed seed keeps the picture reproducible)
        pos = nx.spring_layout(G, k=0.15, seed=42, weight='weight')

        # Draw edges first so the nodes are painted on top of them
        nx.draw_networkx_edges(G, pos, alpha=0.1, edge_color="gray", width=0.5)

        # Draw nodes by sector; one call per sector gives one legend entry each
        for sector, color in zip(sectors, colors):
            nodes = [n for n in G.nodes if G.nodes[n]['sector'] == sector]
            nx.draw_networkx_nodes(G, pos, nodelist=nodes, node_color=[color],
                                 node_size=50, label=sector)

        plt.legend(bbox_to_anchor=(1.05, 1), title="Sectors")
        plt.title("Stock Network by Sector", fontsize=16)
        plt.tight_layout()
        plt.savefig(os.path.join(self.output_dir, "Full_network.png"), dpi=150)
        plt.close()

if __name__ == "__main__":
    builder = StockGraphBuilder()
    G = builder.build_graphs()

    # Verify ISEC. The loader can skip ISEC (e.g. invalid embeddings), in which
    # case indexing G.nodes["ISEC"] would raise KeyError after all the work is
    # done; report the absence instead.
    if "ISEC" in G:
        print("\nISEC verification:", G.nodes["ISEC"])
    else:
        print("\nISEC verification: ⚠️ ISEC missing from graph")

In [None]:
import os
import pandas as pd
import torch
import torch.nn as nn
from torch_geometric.data import Data
from torch_geometric.nn import SAGEConv
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import numpy as np


# ==== Config ====
# Directory holding the per-stock "<name>_embeddings.csv" files.
SHORT_TERM_DIR = "New_ShortTerm_Stock_Embeddings"
# Edge list with columns source/target/similarity. The name must match the file
# written by StockGraphBuilder ("Full_graph.csv", lower-case "g"); the previous
# "Full_Graph.csv" only resolved on case-insensitive filesystems.
FULL_GRAPH_CSV = "New_Graphs/Full_graph.csv"
INTRA_SECTOR_CSV = "New_Graphs/Intra_Sector_Graph.csv"
# Where the trained node embeddings are written.
OUTPUT_DIR = "New_GraphSAGE_Embeddings"
# Width of the GraphSAGE output embedding.
EMBEDDING_DIM = 128
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
EPOCHS = 300
LR = 0.005


# ==== Step 1: Load and average short-term embeddings ====
def load_average_embeddings(short_term_dir):
    """Load every ``*_embeddings.csv`` in a directory as one mean vector per stock.

    Files are visited in sorted name order. A file is skipped (with a printed
    warning) when it is empty, has no ``emb_*`` columns, has no ``sector``
    column, or its embedding width differs from the first accepted file.

    Args:
        short_term_dir: directory containing the per-stock CSV files.

    Returns:
        tuple: ``(features, stock_to_index, sector_map)`` where ``features`` is
        a float32 tensor with one row per accepted stock, ``stock_to_index``
        maps stock name to its row, and ``sector_map`` maps stock name to the
        sector found in the file's first row (``'Financial Services'`` when
        that value is missing).

    Raises:
        ValueError: if no file in the directory yields a usable embedding.
    """
    stock_to_index = {}
    sector_map = {}
    feature_list = []
    expected_dim = None

    for filename in sorted(os.listdir(short_term_dir)):
        if not filename.endswith("_embeddings.csv"):
            continue
        stock_name = filename.replace("_embeddings.csv", "")
        df = pd.read_csv(os.path.join(short_term_dir, filename))

        if df.empty:
            print(f"⚠️ Skipping {stock_name}: file is empty.")
            continue

        emb_cols = [col for col in df.columns if col.startswith('emb_')]
        if not emb_cols:
            print(f"⚠️ Skipping {stock_name}: no embedding columns.")
            continue

        if 'sector' not in df.columns:
            # Previously an unhandled KeyError that aborted the whole load.
            print(f"⚠️ Skipping {stock_name}: no sector column.")
            continue

        # Average over all rows to get one vector per stock.
        avg_embedding = df[emb_cols].mean().values.astype(np.float32)

        if len(avg_embedding) == 0:
            print(f"⚠️ Skipping {stock_name}: embedding dim is 0.")
            continue

        # The first accepted file fixes the width every later file must match.
        if expected_dim is None:
            expected_dim = len(avg_embedding)
        elif len(avg_embedding) != expected_dim:
            print(f"⚠️ Skipping {stock_name}: mismatched embedding shape {len(avg_embedding)} (expected {expected_dim})")
            continue

        # Only the first row's sector is used; a missing value falls back to
        # 'Financial Services'.
        sector = df['sector'].iloc[0]
        if pd.isna(sector):
            sector = 'Financial Services'
        sector_map[stock_name] = sector

        feature_list.append(avg_embedding)
        # Index by insertion count so rows stay dense even when files are skipped.
        stock_to_index[stock_name] = len(stock_to_index)

    if not feature_list:
        raise ValueError(f"No usable embedding files found in {short_term_dir!r}")

    features_tensor = torch.tensor(np.stack(feature_list), dtype=torch.float32)
    return features_tensor, stock_to_index, sector_map

# ==== Step 2: Load Full Graph edges ====
def load_full_graph(full_graph_csv, stock_to_index, undirected=True):
    """Read the full-graph edge list and convert it to index tensors.

    Args:
        full_graph_csv: CSV with columns ``source``, ``target``, ``similarity``.
        stock_to_index: mapping from stock name to node index; rows naming a
            stock that is not in the mapping are ignored.
        undirected: when True (default) every edge is emitted in both
            directions, skipping direction pairs already present. The CSV
            stores each undirected edge once, while message passing over
            ``edge_index`` follows edge direction, so without the reverse
            edges information would only flow from ``source`` to ``target``.
            Pass False to get the rows exactly as stored.

    Returns:
        tuple: ``(edge_index, edge_attr)`` - a ``(2, E)`` long tensor and an
        ``(E,)`` float32 tensor of similarities. ``E`` may be 0.
    """
    df = pd.read_csv(full_graph_csv)
    edges, edge_weights = [], []
    seen = set()

    for src, tgt, weight in zip(df['source'], df['target'], df['similarity']):
        if src not in stock_to_index or tgt not in stock_to_index:
            continue
        u, v = stock_to_index[src], stock_to_index[tgt]
        if not undirected:
            edges.append([u, v])
            edge_weights.append(weight)
            continue
        for pair in ((u, v), (v, u)):
            if pair not in seen:
                seen.add(pair)
                edges.append(list(pair))
                edge_weights.append(weight)

    print(f"Number of edges: {len(edges)}")

    # reshape(-1, 2) keeps the tensor two-dimensional even when no edge
    # survived the filter, so the result is always shaped (2, E).
    edge_index = torch.tensor(edges, dtype=torch.long).reshape(-1, 2).t().contiguous()
    print(f"edge_index shape: {edge_index.shape}")

    edge_attr = torch.tensor(edge_weights, dtype=torch.float32)
    return edge_index, edge_attr

# ==== Step 3: Load Intra-Sector Graph ====
def load_intra_sector_graph(intra_sector_csv, stock_to_index, sector_map):
    """Turn the intra-sector CSV into stock-to-stock edges within each sector.

    Each CSV row is read as ``source`` = sector and ``target`` = stock. For
    every such row the stock is linked to every *other* indexed stock whose
    ``sector_map`` entry equals that sector. Rows whose ``source`` matches no
    sector, or whose stock is not indexed, contribute nothing.

    Args:
        intra_sector_csv: CSV with columns ``source`` and ``target``.
        stock_to_index: mapping from stock name to node index.
        sector_map: mapping from stock name to sector.

    Returns:
        tuple: ``(edge_index, edge_attr)`` - a ``(2, E)`` long tensor and an
        all-ones ``(E,)`` float32 tensor. ``E`` may be 0.
    """
    df = pd.read_csv(intra_sector_csv)

    # Index the sector members once (in sector_map order, indexed stocks only)
    # instead of rescanning the whole sector_map for every CSV row.
    members_by_sector = {}
    for name, sector in sector_map.items():
        if name in stock_to_index:
            members_by_sector.setdefault(sector, []).append(name)

    edges = []
    for sector, stock in zip(df['source'], df['target']):
        if stock not in stock_to_index:
            continue
        stock_idx = stock_to_index[stock]
        edges.extend(
            [stock_idx, stock_to_index[other]]
            for other in members_by_sector.get(sector, ())
            if other != stock
        )

    # reshape(-1, 2) keeps the tensor (2, E) even when E == 0; previously an
    # empty edge list produced a 1-D tensor and size(1) raised IndexError.
    edge_index = torch.tensor(edges, dtype=torch.long).reshape(-1, 2).t().contiguous()
    edge_attr = torch.ones(edge_index.size(1), dtype=torch.float32)
    return edge_index, edge_attr


# ==== Step 4: Define GraphSAGE Model ====
class GraphSAGE(nn.Module):
    """Two stacked ``SAGEConv`` layers with a ReLU in between."""

    def __init__(self, in_channels, hidden_channels, out_channels):
        """Create the layers: in -> hidden, then hidden -> out."""
        super().__init__()
        widths = (in_channels, hidden_channels, out_channels)
        self.conv1 = SAGEConv(widths[0], widths[1])
        self.conv2 = SAGEConv(widths[1], widths[2])

    def forward(self, x, edge_index, edge_attr=None):
        """Return node embeddings; ``edge_attr`` is accepted but not used."""
        hidden = torch.relu(self.conv1(x, edge_index))
        return self.conv2(hidden, edge_index)


# ==== Step 5: Training ====
def train_graphsage(features, edge_index, edge_attr):
    """Train a GraphSAGE model to reconstruct its input features; return embeddings.

    The loss is the mean squared difference between the model output (width
    ``EMBEDDING_DIM``) and the input features, so both must have the same
    width.

    Args:
        features: ``(num_nodes, EMBEDDING_DIM)`` float tensor of node features.
        edge_index: ``(2, E)`` long tensor of edges.
        edge_attr: accepted for interface symmetry with the loaders; not used.

    Returns:
        numpy.ndarray: ``(num_nodes, EMBEDDING_DIM)`` embeddings from the
        trained model.

    Raises:
        ValueError: if ``features`` is not 2-D with width ``EMBEDDING_DIM``.
    """
    # Fail early and clearly: a width of 1 would silently broadcast in the
    # subtraction below, and any other mismatch surfaces as an opaque
    # broadcasting error in the first epoch.
    if features.dim() != 2 or features.size(1) != EMBEDDING_DIM:
        raise ValueError(
            f"Reconstruction loss needs input features of width EMBEDDING_DIM={EMBEDDING_DIM}, "
            f"got shape {tuple(features.shape)}"
        )

    model = GraphSAGE(features.size(1), 256, EMBEDDING_DIM).to(DEVICE)
    features = features.to(DEVICE)
    edge_index = edge_index.to(DEVICE)

    optimizer = torch.optim.Adam(model.parameters(), lr=LR)
    model.train()

    for epoch in tqdm(range(EPOCHS), desc="Training GraphSAGE"):
        optimizer.zero_grad()
        out = model(features, edge_index)
        loss = ((out - features) ** 2).mean()  # identity reconstruction loss
        loss.backward()
        optimizer.step()

    model.eval()
    with torch.no_grad():
        embeddings = model(features, edge_index).cpu().numpy()

    return embeddings


# ==== Step 6: Save Embeddings ====
def save_embeddings(embeddings, stock_to_index, path):
    """Write node embeddings to CSV, one row per stock.

    Args:
        embeddings: 2-D array; row ``i`` belongs to the stock whose index is ``i``.
        stock_to_index: mapping from stock name to row index.
        path: destination CSV path; missing parent directories are created.

    The file has a ``stock`` column followed by ``emb_0 .. emb_{d-1}``.
    """
    inv_map = {v: k for k, v in stock_to_index.items()}
    rows = [[inv_map[i]] + list(emb) for i, emb in enumerate(embeddings)]

    columns = ['stock'] + [f'emb_{i}' for i in range(embeddings.shape[1])]
    df = pd.DataFrame(rows, columns=columns)

    # A bare file name has an empty dirname, and os.makedirs("") raises
    # FileNotFoundError - only create the directory when there is one.
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    df.to_csv(path, index=False)


# ==== Step 7: Run one GraphSAGE training pass for a single graph ====
def run_graphsage_pipeline(graph_type='full', run_id=1):
    """Load features and one graph, train GraphSAGE, and save the embeddings.

    ``graph_type == 'full'`` selects the full graph; any other value selects
    the intra-sector graph. ``run_id`` only affects the output file name.
    """
    features, stock_to_index, sector_map = load_average_embeddings(SHORT_TERM_DIR)

    use_full_graph = graph_type == 'full'
    if use_full_graph:
        graph = load_full_graph(FULL_GRAPH_CSV, stock_to_index)
        file_name = f"full_graph_embeddings_{run_id}.csv"
    else:
        graph = load_intra_sector_graph(INTRA_SECTOR_CSV, stock_to_index, sector_map)
        file_name = f"intra_sector_graph_embeddings_{run_id}.csv"

    edge_index, edge_attr = graph
    trained = train_graphsage(features, edge_index, edge_attr)
    save_embeddings(trained, stock_to_index, os.path.join(OUTPUT_DIR, file_name))


# ==== Run the training pipeline ====
if __name__ == "__main__":
    # range's upper bound is exclusive, so run ids go from 1 to 308; for each
    # id the full graph is trained first, then the intra-sector graph.
    for run_id in range(1, 309):
        for graph_type in ("full", "intra"):
            run_graphsage_pipeline(graph_type, run_id)

In [None]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import r2_score, mean_squared_error
from tqdm import tqdm
from colorama import Fore, Style
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# --- Configuration ---
class Config:
    """Static settings for the long-term LSTM stage.

    Defining the class also creates ``OUTPUT_DIR`` and ``EMBEDDINGS_DIR``.
    """

    # Window length and optimisation schedule
    SEQ_LENGTH = 75
    BATCH_SIZE = 64
    EPOCHS = 100

    # Input columns and the column to predict
    TARGET_COL = 'return_ratio'
    FEATURE_COLS = [
        'Open',
        'High',
        'Low',
        'Close',
        'Volume',
        'MA_5',
        'MA_10',
        'MA_20',
        'RSI',
        'MACD',
        'MACD_signal',
    ]

    # Prefer the GPU when one is available
    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Input / output locations
    DATA_DIR = "New_NormalizedProcessedData"
    OUTPUT_DIR = "New_Predictions"
    EMBEDDINGS_DIR = "LSTM_Embeddings"
    SHORT_TERM_EMB_DIR = "New_ShortTerm_Stock_Embeddings"

    # Dimension of embeddings to save
    EMB_DIMENSION = 128

    os.makedirs(OUTPUT_DIR, exist_ok=True)
    os.makedirs(EMBEDDINGS_DIR, exist_ok=True)

# --- Volatility Dataset ---
class VolatilityDataset(Dataset):
    """Sliding-window dataset over a 2-D array whose last column is the target.

    Sample ``idx`` covers rows ``idx .. idx + seq_length - 1``. It yields the
    feature columns of the window, the target of the window's last row, and a
    volatility figure (root-mean-square of the window's target column, times
    100). When ``short_term_emb`` is supplied, row ``idx`` of it is appended
    as a fourth element.
    """

    def __init__(self, data, seq_length, short_term_emb=None):
        """Store the array, the window length and the optional embeddings."""
        self.data = data
        self.seq_length = seq_length
        self.short_term_emb = short_term_emb
        # Per-sample embedding shape, used to build a zero stand-in when a
        # sample index has no embedding row.
        self._emb_shape = (
            None if short_term_emb is None else np.asarray(short_term_emb).shape[1:]
        )

    def __len__(self):
        # Clamp at zero: an array shorter than the window has no samples, and
        # a negative length makes DataLoader fail with an unrelated error.
        return max(0, len(self.data) - self.seq_length)

    def __getitem__(self, idx):
        window = self.data[idx:idx+self.seq_length]
        x = window[:, :-1]  # Features
        y = window[-1, -1]   # Target

        # Calculate volatility from returns
        returns = window[:, -1]
        volatility = np.sqrt(np.mean(returns**2)) * 100

        sample = (
            torch.FloatTensor(x),
            torch.FloatTensor([y]),
            torch.FloatTensor([volatility]),
        )
        if self.short_term_emb is None:
            return sample

        # Always return four elements when embeddings were configured. The
        # previous fallback to a 3-tuple for out-of-range indices produced
        # samples of mixed arity, which the default DataLoader collation
        # cannot batch together. Missing rows become a zero vector.
        if idx < len(self.short_term_emb):
            short_emb = self.short_term_emb[idx]
        else:
            short_emb = np.zeros(self._emb_shape, dtype=np.float32)
        return sample + (torch.FloatTensor(short_emb),)

# --- Feature Validator ---
def validate_features(df, required_cols):
    """Return ``df`` restricted to ``required_cols``.

    Raises:
        ValueError: listing every required column that ``df`` does not have.
    """
    available = df.columns
    absent = []
    for name in required_cols:
        if name not in available:
            absent.append(name)

    if absent:
        raise ValueError(f"Missing columns: {absent}")
    return df[required_cols]

# --- Data Processor (Enhanced) ---
class DataProcessor:
    """Load one stock's normalised CSV and prepare scaled train/test arrays.

    The scalers are kept on the instance so callers can invert the target
    scaling afterwards; they are re-fitted on every call to
    :meth:`load_and_preprocess`.
    """

    def __init__(self):
        self.feature_scaler = RobustScaler()
        self.target_scaler = RobustScaler()

    def load_and_preprocess(self, filepath, short_term_emb_dir=None):
        """Read, engineer, scale and date-split the data of one stock.

        Args:
            filepath: path to a ``<stock>_norm.csv`` file with a ``Date``
                column, the ``Config.FEATURE_COLS`` and ``Config.TARGET_COL``.
            short_term_emb_dir: optional directory with
                ``<stock>_embeddings.csv`` files (``date`` + ``emb_*`` columns).

        Returns:
            tuple: ``(train_data, test_data, test_dates, short_term_emb)``.
            ``train_data`` holds the rows dated on or before 2024-01-10
            (scaled features followed by the scaled target). ``test_data`` is
            the last ``Config.SEQ_LENGTH`` training rows followed by the later
            rows, so the first test window has full context. ``test_dates``
            are the dates of the later rows. ``short_term_emb`` is an array
            with one row per CSV row (zeros where no embedding exists for the
            date), or ``None`` when no embeddings were found.

        Raises:
            ValueError: if fewer than ``Config.SEQ_LENGTH`` training rows exist.
        """
        df = pd.read_csv(filepath)
        stock_name = os.path.basename(filepath).replace("_norm.csv", "")
        # Parse dates once; they are needed for both the merge and the split.
        df['Date'] = pd.to_datetime(df['Date'])

        # Basic feature engineering
        df['Log_Volume'] = np.log1p(df['Volume'])
        df['Range'] = (df['High'] - df['Low']) / df['Close']

        # Select final features
        features = df[Config.FEATURE_COLS + ['Log_Volume', 'Range']].values
        target = df[Config.TARGET_COL].values.reshape(-1, 1)

        # Load short-term embeddings if available
        short_term_emb = None
        if short_term_emb_dir:
            emb_filepath = os.path.join(short_term_emb_dir, f"{stock_name}_embeddings.csv")
            if os.path.exists(emb_filepath):
                emb_df = pd.read_csv(emb_filepath)

                # Filter embedding columns
                emb_cols = [col for col in emb_df.columns if col.startswith('emb_')]
                if emb_cols:
                    emb_df['date'] = pd.to_datetime(emb_df['date'])

                    # One embedding row per date: a duplicated date would
                    # multiply rows in the left join and break the row-for-row
                    # alignment with the price data.
                    emb_unique = emb_df[['date'] + emb_cols].drop_duplicates(subset='date')

                    # Align embeddings with dataframe by date
                    merged_df = pd.merge(df, emb_unique,
                                         left_on='Date', right_on='date', how='left')

                    # Dates without embeddings become zeros. Done as a single
                    # non-inplace fillna: the chained
                    # ``merged_df[col].fillna(0, inplace=True)`` form does not
                    # modify ``merged_df`` under pandas copy-on-write.
                    short_term_emb = merged_df[emb_cols].fillna(0).values

                    print(f"Loaded {len(emb_cols)} short-term embedding dimensions for {stock_name}")

        # Date-based split (the literal previously contained a stray space:
        # "2024 -01-10").
        train_end_date = pd.to_datetime("2024-01-10")
        train_mask = (df['Date'] <= train_end_date).values
        test_mask = (df['Date'] > train_end_date).values

        if train_mask.sum() < Config.SEQ_LENGTH:
            raise ValueError("Not enough training data to provide sequence context.")

        # Scaling: fit on the training rows only, then apply to every row, so
        # no statistics of the test period leak into the scaled training data.
        features = self.feature_scaler.fit(features[train_mask]).transform(features)
        target = self.target_scaler.fit(target[train_mask]).transform(target)

        # Combine scaled features and target
        data = np.hstack((features, target))

        # Apply split
        train_data = data[train_mask]
        test_data_actual = data[test_mask]

        # Get padded test data with sequence context
        test_data = np.vstack([train_data[-Config.SEQ_LENGTH:], test_data_actual])

        # Prepare test dates: one date per test row that gets a prediction
        test_dates = df.loc[test_mask, 'Date'].values

        print(f"Target stats - Mean: {target.mean():.6f}, Std: {target.std():.6f}")
        print(f"Train size: {len(train_data)}, Test size (after padding): {len(test_data)}")
        print(f"Test prediction days: {len(test_dates)}")

        return train_data, test_data, test_dates, short_term_emb

# --- Enhanced Volatility-Sensitive LSTM Model ---
class VolatilityLSTM(nn.Module):
    """Two-layer LSTM with a volatility gate and optional short-term embedding input.

    Args:
        input_size: number of features per time step.
        short_term_emb_size: width of the optional per-sample short-term
            embedding; ``0`` (default) builds the model without that branch.
    """

    def __init__(self, input_size, short_term_emb_size=0):
        super().__init__()
        self.has_short_term_emb = short_term_emb_size > 0
        # Remembered so forward() can build a zero stand-in embedding.
        self.short_term_emb_size = short_term_emb_size

        # LSTM trunk
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=128,
            num_layers=2,
            batch_first=True,
            dropout=0.2
        )

        # Volatility attention gate: maps the last LSTM state to a value in (0, 1)
        self.volatility_net = nn.Sequential(
            nn.Linear(128, 64),
            nn.LeakyReLU(0.1),
            nn.Linear(64, 1),
            nn.Sigmoid()
        )

        # Short-term embedding integration
        if self.has_short_term_emb:
            self.short_term_integration = nn.Sequential(
                nn.Linear(short_term_emb_size, 64),
                nn.ReLU(),
                nn.Linear(64, 32)
            )
            head_input_size = 128 + 32 + 1  # LSTM + short-term + volatility
        else:
            head_input_size = 129  # LSTM + volatility

        # Prediction head
        self.head = nn.Sequential(
            nn.Linear(head_input_size, 64),
            nn.SiLU(),
            nn.Linear(64, 1)
        )

    def forward(self, x, short_term_emb=None):
        """Run the model on a batch.

        Args:
            x: ``(batch, seq_len, input_size)`` tensor (the LSTM is batch-first).
            short_term_emb: optional ``(batch, short_term_emb_size)`` tensor.
                Ignored when the model was built without the short-term
                branch; replaced by zeros when the branch exists but no
                tensor is given.

        Returns:
            tuple: ``(prediction, volatility, lstm_embedding)`` with shapes
            ``(batch, 1)``, ``(batch, 1)`` and ``(batch, 128)``.
        """
        lstm_out, _ = self.lstm(x)
        last_state = lstm_out[:, -1]

        # Get embedding from LSTM (this will be saved)
        lstm_embedding = last_state.clone()

        volatility = self.volatility_net(last_state)

        if self.has_short_term_emb:
            if short_term_emb is None:
                # The head was sized for the short-term branch, so skipping it
                # would feed 129 values into a 161-input layer and crash. Use
                # an all-zero embedding instead.
                short_term_emb = x.new_zeros(x.size(0), self.short_term_emb_size)
            # Process short-term embeddings
            short_term_features = self.short_term_integration(short_term_emb)
            # Combine features
            combined = torch.cat([last_state, short_term_features, volatility], dim=1)
        else:
            combined = torch.cat([last_state, volatility], dim=1)

        prediction = self.head(combined)

        return prediction, volatility, lstm_embedding

# --- Model Training (Enhanced) ---
def train_model(model, train_loader, has_short_term_emb):
    """Train ``model`` in place for ``Config.EPOCHS`` passes over ``train_loader``.

    Batches are expected as ``(x, y, vol)`` or, when short-term embeddings are
    in use, ``(x, y, vol, short_emb)``. The objective is the Huber loss on the
    prediction plus 0.1 times the MSE between the model's volatility output
    and the batch volatility.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)  # Lower learning rate
    criterion = nn.HuberLoss()  # More robust than MSE
    device = Config.DEVICE

    for _epoch in range(Config.EPOCHS):
        model.train()
        for batch in train_loader:
            with_embedding = has_short_term_emb and len(batch) == 4
            tensors = [item.to(device) for item in batch]
            optimizer.zero_grad()

            if with_embedding:
                x, y, vol, short_emb = tensors
                pred, pred_vol, _ = model(x, short_emb)
            else:
                x, y, vol = tensors
                pred, pred_vol, _ = model(x)

            # Main loss with reduced volatility weight
            return_loss = criterion(pred, y)
            volatility_loss = F.mse_loss(pred_vol, vol)
            loss = return_loss + 0.1 * volatility_loss

            loss.backward()
            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
            optimizer.step()
            
# --- Extract and Save Embeddings ---
def extract_embeddings(model, test_loader, test_dates, stock_name, has_short_term_emb):
    """Run the model over the test loader and persist its LSTM embeddings.

    Writes ``<stock>_long_term_embeddings.csv`` (date, embedding columns,
    predicted/actual return, predicted volatility) into
    ``Config.EMBEDDINGS_DIR`` and, when more than two samples exist, a 2-D PCA
    scatter plot next to it.

    Returns:
        tuple: ``(embeddings, predictions, actuals)`` as stacked NumPy arrays.
    """
    model.eval()
    emb_chunks, pred_chunks, actual_chunks, vol_chunks = [], [], [], []

    with torch.no_grad():
        for batch in test_loader:
            with_embedding = has_short_term_emb and len(batch) == 4
            moved = [item.to(Config.DEVICE) for item in batch]
            if with_embedding:
                x, y, vol, short_emb = moved
                pred, pred_vol, emb = model(x, short_emb)
            else:
                x, y, vol = moved
                pred, pred_vol, emb = model(x)

            emb_chunks.append(emb.cpu().numpy())
            pred_chunks.append(pred.cpu().numpy())
            actual_chunks.append(y.cpu().numpy())
            vol_chunks.append(pred_vol.cpu().numpy())

    # Stack the per-batch pieces into one array each
    embeddings = np.vstack(emb_chunks)
    predictions = np.vstack(pred_chunks)
    actuals = np.vstack(actual_chunks)
    volatilities = np.vstack(vol_chunks)

    # Assemble the table column by column: date, embedding dims, then outputs
    table = {'Date': test_dates[:len(embeddings)]}
    for i in range(embeddings.shape[1]):
        table[f'emb_{i}'] = embeddings[:, i]
    table['Predicted_Return'] = predictions.flatten()
    table['Actual_Return'] = actuals.flatten()
    table['Predicted_Volatility'] = volatilities.flatten()
    emb_df = pd.DataFrame(table)

    # Make sure the target directory is there before writing
    os.makedirs(Config.EMBEDDINGS_DIR, exist_ok=True)

    csv_path = os.path.join(Config.EMBEDDINGS_DIR, f"{stock_name}_long_term_embeddings.csv")
    emb_df.to_csv(csv_path, index=False)

    # Project to two PCA components for a quick visual check
    if len(embeddings) > 2:
        pca = PCA(n_components=2)
        projected = pca.fit_transform(embeddings)
        var_ratio = pca.explained_variance_ratio_

        plt.figure(figsize=(12, 8))
        sc = plt.scatter(projected[:, 0], projected[:, 1],
                         c=actuals.flatten(), cmap='coolwarm', alpha=0.7)
        plt.colorbar(sc, label='Actual Return')
        plt.title(f"Long-Term LSTM Embeddings for {stock_name}")
        plt.xlabel(f"PCA Component 1 (Var: {var_ratio[0]:.2f})")
        plt.ylabel(f"PCA Component 2 (Var: {var_ratio[1]:.2f})")
        plt.tight_layout()
        plt.savefig(os.path.join(Config.EMBEDDINGS_DIR, f"{stock_name}_long_term_embeddings_viz.png"))
        plt.close()

    print(f"Saved {len(embeddings)} embeddings for {stock_name}")
    return embeddings, predictions, actuals

# --- Main Execution (Enhanced with Embeddings) ---
# For every "<stock>_norm.csv": preprocess, train a VolatilityLSTM, save the
# test-period embeddings and predictions, and collect metrics. Afterwards write
# a combined embeddings file, a metrics summary and a top-20 bar chart.
if __name__ == "__main__":
    processor = DataProcessor()
    stock_files = [f for f in os.listdir(Config.DATA_DIR) if f.endswith("_norm.csv")]

    # Dictionary to store results for final visualization
    all_embeddings = {}
    all_metrics = []

    for stock_file in tqdm(stock_files):
        # Best-effort per stock: a failure is reported and the loop moves on.
        try:
            stock_name = stock_file.replace("_norm.csv", "")
            print(f"\n{Fore.CYAN}Processing {stock_name}{Style.RESET_ALL}")

            # Load data with validation and short-term embeddings
            train_data, test_data, test_dates, short_term_emb = processor.load_and_preprocess(
                os.path.join(Config.DATA_DIR, stock_file), Config.SHORT_TERM_EMB_DIR)

            # Skip if insufficient data
            if len(train_data) < Config.SEQ_LENGTH * 2:
                print(f"{Fore.YELLOW}Skipping {stock_name} (insufficient data){Style.RESET_ALL}")
                continue

            # Determine if we have short-term embeddings
            has_short_term_emb = short_term_emb is not None

            # Prepare datasets. The embedding array has one row per CSV row, so
            # it is sliced to line up with the first row of each window.
            train_dataset = VolatilityDataset(train_data, Config.SEQ_LENGTH,
                                              short_term_emb[:len(train_data)-Config.SEQ_LENGTH] if has_short_term_emb else None)
            test_dataset = VolatilityDataset(test_data, Config.SEQ_LENGTH,
                                            short_term_emb[len(train_data)-Config.SEQ_LENGTH:] if has_short_term_emb else None)

            train_loader = DataLoader(train_dataset, batch_size=Config.BATCH_SIZE, shuffle=True)
            test_loader = DataLoader(test_dataset, batch_size=Config.BATCH_SIZE, shuffle=False)

            # Initialize model with short-term embedding support if available
            short_term_emb_size = short_term_emb.shape[1] if has_short_term_emb else 0
            print(f"Short-term embedding size: {short_term_emb_size}")

            model = VolatilityLSTM(
                input_size=train_data.shape[1] - 1,  # features dimension
                short_term_emb_size=short_term_emb_size
            ).to(Config.DEVICE)

            # Train
            train_model(model, train_loader, has_short_term_emb)

            # Extract embeddings
            embeddings, predictions, actuals = extract_embeddings(
                model, test_loader, test_dates, stock_name, has_short_term_emb)

            # Inverse scaling for predictions and actuals
            predictions = processor.target_scaler.inverse_transform(predictions).flatten()
            actuals = processor.target_scaler.inverse_transform(actuals).flatten()

            # Save results
            results = pd.DataFrame({
                'Date': test_dates[:len(predictions)],
                'Actual': actuals,
                'Predicted': predictions
            })
            results.to_csv(f"{Config.OUTPUT_DIR}/{stock_name}_predictions.csv", index=False)

            # Calculate metrics
            mae = np.mean(np.abs(actuals - predictions))
            mse = mean_squared_error(actuals, predictions)
            rmse = np.sqrt(mse)
            r2 = r2_score(actuals, predictions)

            # Directional accuracy: share of days where the predicted sign
            # matches the actual sign
            directional_accuracy = np.mean(np.sign(actuals) == np.sign(predictions))

            # Store metrics
            metrics = {
                'stock': stock_name,
                'MAE': mae,
                'MSE': mse,
                'RMSE': rmse,
                'R²': r2,
                'Directional_Accuracy': directional_accuracy
            }
            all_metrics.append(metrics)

            print(f"{Fore.GREEN}Success:{Style.RESET_ALL}")
            print(f"MAE: {mae:.6f}")
            print(f"MSE: {mse:.6f}")
            print(f"RMSE: {rmse:.6f}")
            print(f"R²: {r2:.3f}")
            print(f"Directional Accuracy: {directional_accuracy:.3f}")
            print(f"Prediction Range: [{predictions.min():.6f}, {predictions.max():.6f}]")

            # Store embeddings for cross-stock analysis
            all_embeddings[stock_name] = {
                'embeddings': embeddings,
                'dates': test_dates[:len(embeddings)]
            }

        except Exception as e:
            print(f"{Fore.RED}Error processing {stock_file}: {str(e)}{Style.RESET_ALL}")

    # Create a combined embeddings file for all stocks
    try:
        combined_data = []
        for stock_name, stock_info in all_embeddings.items():
            for i, date in enumerate(stock_info['dates']):
                row_data = {
                    'date': date,
                    'stock': stock_name,
                    **{f'emb_{j}': stock_info['embeddings'][i, j] for j in range(stock_info['embeddings'].shape[1])}
                }
                combined_data.append(row_data)

        if combined_data:
            combined_df = pd.DataFrame(combined_data)
            combined_df.to_csv(f"{Config.EMBEDDINGS_DIR}/all_stocks_long_term_embeddings.csv", index=False)
            print(f"Created combined embeddings file with {len(combined_data)} records")

        # Save metrics summary
        if all_metrics:
            metrics_df = pd.DataFrame(all_metrics)
            metrics_df.to_csv(f"{Config.OUTPUT_DIR}/all_stocks_metrics.csv", index=False)

            # Create comparative metrics visualization. DataFrame.plot opens a
            # figure of its own unless it is handed an axes, which left the
            # explicitly sized figure empty and never closed; draw on our own
            # axes so the 14x8 size applies and the figure is released.
            fig, ax = plt.subplots(figsize=(14, 8))
            metrics_df.sort_values('Directional_Accuracy', ascending=False).head(20).plot(
                x='stock', y='Directional_Accuracy', kind='bar', color='green', alpha=0.7, ax=ax)
            plt.title("Top 20 Stocks by Directional Accuracy")
            plt.xlabel("Stock")
            plt.ylabel("Directional Accuracy")
            plt.tight_layout()
            plt.savefig(f"{Config.OUTPUT_DIR}/top_directional_accuracy.png")
            plt.close(fig)

    except Exception as e:
        print(f"{Fore.RED}Error creating summary files: {str(e)}{Style.RESET_ALL}")

In [None]:
import os
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
from scipy.stats import rankdata
from tqdm import tqdm

# Input locations: directories that hold the per-stock LSTM embedding CSVs,
# the per-stock prediction CSVs and the numbered GraphSAGE embedding CSVs.
lstm_embeddings_path, predictions_path, graphsage_dir = (
    'LSTM_Embeddings',
    'New_Predictions',
    'New_GraphSAGE_Embeddings',
)

# Cut-offs k used by the ranking metrics (MRR@k, Precision@k, IRR@k).
K_VALUES = [5, 10, 20]

# Chronological split boundaries, parsed into pandas Timestamps:
#   train : train_start .. train_end       (the 80% portion)
#   test1 : after train_end .. test1_end   (the 20% portion)
#   test2 : after test1_end .. test2_end
train_start, train_end, test1_end, test2_end = map(
    pd.to_datetime,
    ("2024-01-11", "2024-10-24", "2025-1-10", "2025-04-11"),
)

# Load the GraphSAGE embedding tables numbered 1..308: for every number there
# is one "full graph" CSV and one "intra sector graph" CSV.
def _read_graph_embeddings(csv_path):
    """Read one embeddings CSV (first column as index), discarding 'test_day' if present."""
    table = pd.read_csv(csv_path, index_col=0)
    return table.drop(columns=['test_day'], errors='ignore')


full_graphs = []
intra_graphs = []
for graph_no in range(1, 309):
    full_table = _read_graph_embeddings(
        os.path.join(graphsage_dir, f'full_graph_embeddings_{graph_no}.csv')
    )
    intra_table = _read_graph_embeddings(
        os.path.join(graphsage_dir, f'intra_sector_graph_embeddings_{graph_no}.csv')
    )

    # Position n-1 in each list holds the table read from file number n,
    # with rows ordered by index label.
    full_graphs.append(full_table.sort_index())
    intra_graphs.append(intra_table.sort_index())

# Collect data
# Build one sample per (stock, row): the stock's LSTM embedding row concatenated
# with its full-graph and intra-sector-graph embeddings, paired with the value
# in the 'Actual' column of that stock's predictions file.
X_all, y_all, stock_names_all, dates_all = [], [], [], []

for stock_file in tqdm(os.listdir(lstm_embeddings_path)):
    # Only the per-stock long-term embedding files are of interest.
    if not stock_file.endswith('_long_term_embeddings.csv'):
        continue

    # The stock name is whatever precedes the fixed file-name suffix.
    stock_name = stock_file.split('_long_term_embeddings.csv')[0]
    lstm_df = pd.read_csv(os.path.join(lstm_embeddings_path, stock_file))
    prediction_file = os.path.join(predictions_path, f"{stock_name}_predictions.csv")

    # Stocks without a predictions file are skipped entirely.
    if not os.path.exists(prediction_file):
        continue

    # Remove prediction/target columns (when present) so that only the embedding
    # columns plus 'Date' remain as features.
    lstm_features = lstm_df.drop(columns=['Predicted_Return', 'Actual_Return', 'Predicted_Volatility'], errors='ignore')
    lstm_features['Date'] = pd.to_datetime(lstm_df['Date'])

    actuals = pd.read_csv(prediction_file)
    actuals['Date'] = pd.to_datetime(actuals['Date'])

    stock_full_features, stock_returns, stock_dates = [], [], []

    # Row position `day` (0-based) is used as the index into full_graphs /
    # intra_graphs and into `actuals`; nothing in this loop matches on 'Date'.
    # NOTE(review): full_graphs holds 308 tables, so an embeddings file with more
    # than 308 rows would raise IndexError here — confirm the row counts agree.
    # NOTE(review): `actuals.iloc[day]` assumes the predictions file has the same
    # row order and dates as the embeddings file — TODO confirm, or align on 'Date'.
    for day in range(len(lstm_features)):
        date = lstm_features.iloc[day]['Date']
        # Skip rows for which the stock is missing from either graph table.
        if stock_name not in full_graphs[day].index or stock_name not in intra_graphs[day].index:
            continue

        lstm_row = lstm_features.iloc[day].drop('Date').values
        full_emb = full_graphs[day].loc[stock_name].values
        intra_emb = intra_graphs[day].loc[stock_name].values

        # Feature vector layout: [LSTM columns | full-graph | intra-sector-graph].
        combined = np.concatenate([lstm_row, full_emb, intra_emb])
        stock_full_features.append(combined)
        stock_returns.append(actuals.iloc[day]['Actual'])
        stock_dates.append(date)

    # Nothing usable for this stock.
    if len(stock_full_features) == 0:
        continue

    # The four lists grow in lockstep so that index i refers to the same sample in each.
    X_all.extend(stock_full_features)
    y_all.extend(stock_returns)
    stock_names_all.extend([stock_name] * len(stock_full_features))
    dates_all.extend(stock_dates)

# Convert the accumulated lists to NumPy arrays so they can be boolean-masked.
X_all = np.array(X_all)
y_all = np.array(y_all)
dates_all = np.array(dates_all)
stock_names_all = np.array(stock_names_all)

# Chronological split into train / test1 / test2.
# Each window excludes its lower bound (except train) and includes its upper bound.
train_mask = (dates_all >= train_start) & (dates_all <= train_end)
test1_mask = (dates_all > train_end) & (dates_all <= test1_end)
test2_mask = (dates_all > test1_end) & (dates_all <= test2_end)

# Fail early with a clear message instead of an opaque error from the scaler/model.
if not train_mask.any():
    raise ValueError(
        f"No samples fall inside the training window {train_start.date()} .. {train_end.date()}"
    )

# Normalize.
# The scaler is fitted on the training rows ONLY and then applied to every row,
# so the mean/variance of the test periods never influence the features the
# model is trained on (avoids train/test leakage).
scaler = StandardScaler()
scaler.fit(X_all[train_mask])
X_scaled = scaler.transform(X_all)

# Train on the training window only.
model = XGBRegressor(objective='reg:squarederror', eval_metric='rmse', n_estimators=1000, max_depth=8, learning_rate=0.1)
model.fit(X_scaled[train_mask], y_all[train_mask])

# Predict for every sample (train and both test windows); the evaluation code
# selects the rows it needs with the masks above.
y_pred_all = model.predict(X_scaled)

# Ranking Metrics (Revised based on baseline definitions)
def mrr_at_k(actual, pred, k=5):
    """Return the mean reciprocal predicted rank of the k best actual items.

    Items are ranked 1..n by descending predicted value. The k items with the
    highest actual value are then looked up, and the mean of ``1 / rank`` over
    those items is returned. When fewer than k items exist, all are used.
    """
    table = pd.DataFrame({'Actual': actual, 'Predicted': pred})

    # Assign each item its 1-based position in the prediction ordering.
    by_prediction = table.sort_values(by='Predicted', ascending=False).reset_index(drop=True)
    by_prediction['Pred_Rank_Index'] = by_prediction.index + 1

    # Re-order by actual value and keep the predicted ranks of the k best items.
    by_actual = by_prediction.sort_values(by='Actual', ascending=False).reset_index(drop=True)
    reciprocal_ranks = 1 / by_actual['Pred_Rank_Index'].iloc[:k]
    return reciprocal_ranks.mean()

def precision_at_k(actual, pred, k=5):
    """Return the share of the top-k predicted items that are also top-k by actual value.

    The overlap count is always divided by k, even when fewer than k items exist.
    """
    scores = pd.DataFrame({'Actual': actual, 'Predicted': pred})

    # Original row labels of the k best items under each ordering.
    predicted_leaders = set(scores.sort_values(by='Predicted', ascending=False).index[:k])
    actual_leaders = set(scores.sort_values(by='Actual', ascending=False).index[:k])

    overlap = predicted_leaders.intersection(actual_leaders)
    return len(overlap) / k

def irr_at_k(actual, pred, k=5):
    """Return the actual-value shortfall of the top-k predicted picks versus the ideal top-k.

    Computed as (sum of the k largest actual values) minus (sum of the actual
    values of the k items with the largest predictions); 0 means the picks
    achieve the best attainable total.
    """
    scores = pd.DataFrame({'Actual': actual, 'Predicted': pred})

    ideal_picks = scores.sort_values(by='Actual', ascending=False)['Actual'].iloc[:k]
    model_picks = scores.sort_values(by='Predicted', ascending=False)['Actual'].iloc[:k]

    return ideal_picks.sum() - model_picks.sum()


def evaluate_subset(mask, label):
    """Score the predictions selected by ``mask`` day by day and write CSV reports.

    Reads the module-level arrays ``dates_all``, ``y_all``, ``y_pred_all`` and
    ``stock_names_all`` and the cut-offs in ``K_VALUES``.

    For every distinct date in the subset it computes MSE, MAE and, for each k
    in ``K_VALUES``, MRR@k, Precision@k and IRR@k, and records the (up to) 20
    stocks with the highest predicted value.

    Files written to the current working directory:
        ``{label}_metrics.csv``             one row of metrics per date
        ``{label}_best_daily_metrics.csv``  best value and its date per ranking metric
        ``{label}_daily_rankings.csv``      top-20 predicted stocks per date

    Args:
        mask: Boolean index array applied to the module-level arrays.
        label: Prefix for the output file names and the printed headings.

    Returns:
        DataFrame with columns ``Date``, ``Stock``, ``Actual``, ``Predicted``
        for the rows selected by ``mask``. When the mask selects no rows a
        message is printed, no files are written and the empty frame is
        returned.
    """
    subset_dates = dates_all[mask]
    subset_actual = y_all[mask]
    subset_pred = y_pred_all[mask]
    subset_stocks = stock_names_all[mask]

    predictions_df = pd.DataFrame({
        'Date': subset_dates,
        'Stock': subset_stocks,
        'Actual': subset_actual,
        'Predicted': subset_pred
    })

    # An empty subset would yield a metrics frame without any columns and make
    # the best-metric lookups below fail with KeyError, so stop here instead.
    if len(subset_dates) == 0:
        print(f"No rows selected for {label}; skipping metric and ranking files.")
        return predictions_df

    results = []
    daily_rankings = []

    for d in sorted(set(subset_dates)):
        idx = (subset_dates == d)
        actual_day = subset_actual[idx]
        pred_day = subset_pred[idx]
        stock_names_day = subset_stocks[idx]

        # Positions of the day's stocks ordered from highest to lowest prediction.
        ranked_idx = np.argsort(pred_day)[::-1]
        top_ranked = zip(
            stock_names_day[ranked_idx][:20],
            actual_day[ranked_idx],
            pred_day[ranked_idx],
        )
        for rank_num, (stock, actual_value, predicted_value) in enumerate(top_ranked, 1):
            daily_rankings.append({
                'Date': d,
                'Rank': rank_num,
                'Stock': stock,
                'Actual': actual_value,
                'Predicted': predicted_value
            })

        result = {
            'Date': d,
            'MSE': mean_squared_error(actual_day, pred_day),
            'MAE': mean_absolute_error(actual_day, pred_day),
        }

        for k in K_VALUES:
            result[f'MRR@{k}'] = mrr_at_k(actual_day, pred_day, k)
            result[f'Precision@{k}'] = precision_at_k(actual_day, pred_day, k)
            result[f'IRR@{k}'] = irr_at_k(actual_day, pred_day, k)

        results.append(result)

    # Build daily metrics dataframe
    metrics_df = pd.DataFrame(results)
    metrics_df.to_csv(f"{label}_metrics.csv", index=False)

    # === Print Mean Metrics ===
    mean_row = metrics_df.mean(numeric_only=True).round(6)
    print(f"\n📈 [Mean Metrics for {label.upper()} Set]")
    print(mean_row.to_frame().T.to_string(index=False))

    # === Find Best Daily Metrics and Dates ===
    # MRR and Precision are best when largest; IRR is a shortfall against the
    # ideal top-k, so its best value is the smallest one.
    best_metrics_records = []

    for k in K_VALUES:
        for metric_name, lower_is_better in (('MRR', False), ('Precision', False), ('IRR', True)):
            column = metrics_df[f'{metric_name}@{k}']
            best_row = column.idxmin() if lower_is_better else column.idxmax()
            best_metrics_records.append({
                'Metric': f'{metric_name}@{k}',
                'Best_Value': round(column.loc[best_row], 6),
                'Best_Date': pd.to_datetime(metrics_df.loc[best_row, 'Date']).date()
            })

    best_metrics_df = pd.DataFrame(best_metrics_records)

    # === Print Best Daily Metrics ===
    print(f"\n🏆 [Best Daily Metrics for {label.upper()} Set]")
    print(best_metrics_df.to_string(index=False))

    # Save best metrics to file
    best_metrics_df.to_csv(f"{label}_best_daily_metrics.csv", index=False)

    # Save daily rankings (Top 20 stocks)
    daily_rankings_df = pd.DataFrame(daily_rankings)
    daily_rankings_df.to_csv(f"{label}_daily_rankings.csv", index=False)

    return predictions_df
    
# Function to save daily top-k rankings
def save_daily_topk_rankings(df_preds, label):
    """Write, for every date, the top-5/10/20 stocks by prediction and by actual value.

    ``df_preds`` must provide the columns ``Date``, ``Stock``, ``Actual`` and
    ``Predicted``. For each date and each cut-off k, the k rows with the
    highest ``Predicted`` are emitted (``Type`` = ``Predicted_Top{k}``)
    followed by the k rows with the highest ``Actual`` (``Type`` =
    ``Actual_Top{k}``), each with its 1-based ``Rank``. The result is saved
    as ``{label}_daily_topk_rankings.csv`` and a confirmation is printed.
    """
    cutoffs = [5, 10, 20]
    unique_days = pd.to_datetime(df_preds['Date'].unique())

    rows = []

    for day in tqdm(sorted(unique_days), desc=f"Saving Top-K Rankings ({label})"):
        day_slice = df_preds[df_preds['Date'] == day]

        # Both orderings are computed once per day and reused for every cut-off.
        best_by_prediction = day_slice.sort_values('Predicted', ascending=False)
        best_by_actual = day_slice.sort_values('Actual', ascending=False)

        for k in cutoffs:
            rows.extend(
                {
                    'Date': day,
                    'Type': f'Predicted_Top{k}',
                    'Rank': position,
                    'Stock': entry.Stock,
                    'Predicted': entry.Predicted,
                    'Actual': entry.Actual
                }
                for position, entry in enumerate(best_by_prediction.head(k).itertuples(), start=1)
            )
            rows.extend(
                {
                    'Date': day,
                    'Type': f'Actual_Top{k}',
                    'Rank': position,
                    'Stock': entry.Stock,
                    'Predicted': entry.Predicted,
                    'Actual': entry.Actual
                }
                for position, entry in enumerate(best_by_actual.head(k).itertuples(), start=1)
            )

    pd.DataFrame(rows).to_csv(f"{label}_daily_topk_rankings.csv", index=False)
    print(f"✅ Saved {label}_daily_topk_rankings.csv successfully.")


# Evaluate both test sets.
# Each label is used as the prefix of every output file for that set, so it is
# defined once and reused. Spelling the second label 'New_test' in one call and
# 'New_Test' in the other would split that set's files across two prefixes.
OLD_TEST_LABEL = 'Old_Test'
NEW_TEST_LABEL = 'New_Test'

preds_test1 = evaluate_subset(test1_mask, OLD_TEST_LABEL)
preds_test2 = evaluate_subset(test2_mask, NEW_TEST_LABEL)

# Save all predictions (train and both test windows) in a single file.
pd.DataFrame({
    'Date': dates_all,
    'Stock': stock_names_all,
    'Actual': y_all,
    'Predicted': y_pred_all
}).to_csv("final_predictions.csv", index=False)

print("✅ All prediction and metric files saved.")

# Save the daily Top-K rankings under the same prefixes as the metric files.
save_daily_topk_rankings(preds_test1, OLD_TEST_LABEL)
save_daily_topk_rankings(preds_test2, NEW_TEST_LABEL)