In [1]:
import os
import matplotlib.pyplot as plt
import torch
from torch import nn
from torch.optim import Adam
import numpy as np
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
from torchmetrics import MeanAbsolutePercentageError


from data.dataloader import dataloader_info
from utils.utils import load_yaml_config, instantiate_from_config

from models.predictor import GRU
from models.solver import Trainer
from data.dataloader import dataloader_info
from utils.visualize import visualize_pca, visualize_tsne, visualize_kernel

import copy

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the experiment configuration and choose the compute device.
configs = load_yaml_config("configs/stock_class.yaml")
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Mini-batch size reused by the DataLoaders created later in the notebook.
batch_size = 128

# Build the Diffusion_TS model described by the config and move it to the device.
diffusion_ts = instantiate_from_config(configs['model']).to(device)

In [3]:
# Build the default loader bundle and keep a handle on its dataset object,
# then build the `train=False` bundle.
dl_info = dataloader_info(configs)
dataset = dl_info['dataset']
dl_info_test = dataloader_info(configs, train=False)

In [4]:
# Short alias for the loader inside the default bundle; later cells iterate
# over it for training and for collecting every batch.
dl = dl_info["dataloader"]

In [5]:
class StockPricePredictor(nn.Module):
    """GRU-based sequence classifier with a sigmoid output.

    The input sequence is encoded by a (possibly stacked) GRU. The output at
    the final time step is layer-normalised, passed through dropout and a
    linear layer, and squashed by a sigmoid so every output lies in (0, 1).

    Args:
        input_dim: Number of features per time step.
        hidden_dim: Size of the GRU hidden state.
        output_dim: Number of sigmoid outputs per sequence.
        num_layers: Number of stacked GRU layers.
        dropout_rate: Dropout probability applied between GRU layers (only
            when ``num_layers > 1``) and before the final linear layer.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers, dropout_rate=0.5):
        super().__init__()
        # nn.GRU only applies dropout *between* stacked layers and warns when a
        # non-zero value is supplied for a single layer, so disable it there.
        inter_layer_dropout = dropout_rate if num_layers > 1 else 0.0
        self.gru = nn.GRU(input_dim, hidden_dim, num_layers, batch_first=True, dropout=inter_layer_dropout)
        self.layer_norm = nn.LayerNorm(hidden_dim)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return sigmoid scores of shape ``(batch, output_dim)``.

        Args:
            x: Batch-first input of shape ``(batch, seq_len, input_dim)``.
        """
        # Leaving h_0 out lets nn.GRU create a zero initial state that matches
        # the input's device and dtype (a hand-built float32 h_0 breaks
        # non-float32 models).
        gru_out, _ = self.gru(x)
        # Only the last time step feeds the classifier, so normalise and drop
        # out that slice alone instead of the whole sequence.
        last_step = gru_out[:, -1, :]
        last_step = self.dropout(self.layer_norm(last_step))
        return self.sigmoid(self.fc(last_step))

In [6]:
# class StockPricePredictor(nn.Module):
#     def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
#         super().__init__()
#         self.gru = nn.GRU(input_dim, hidden_dim, num_layers, batch_first=True)
#         self.fc = nn.Linear(hidden_dim, output_dim)
#         self.sigmoid = nn.Sigmoid()
    
#     def forward(self, x):
#         gru_out, _ = self.gru(x)
#         out = self.fc(gru_out[:, -1, :])
        
#         return self.sigmoid(out)


In [14]:
from torch import optim

# Classifier hyper-parameters.
input_dim, hidden_dim, output_dim, num_layers = 6, 64, 1, 2

model = StockPricePredictor(
    input_dim=input_dim,
    hidden_dim=hidden_dim,
    output_dim=output_dim,
    num_layers=num_layers,
).to(device)

# Step 3: Train the Model
# Binary cross-entropy on the model's sigmoid output, optimised with Adam.
criterion = nn.BCELoss()
optimizer = optim.Adam(params=model.parameters(), lr=0.001)


In [15]:
# Dataset object behind the default loader bundle; the next cells read its
# `.data` attribute to build a train/validation split.
train_dataset = dl_info['dataset']

In [16]:
# Shuffled sample indices for the train/validation split in the next cell.
# A dedicated, seeded generator keeps the split identical across kernel
# restarts (same seed as the KFold splitter used later in the notebook).
idx = np.random.default_rng(42).permutation(len(train_dataset))


In [17]:
# 70 % of the shuffled indices go to training, the remainder to validation.
train_n = int(len(train_dataset) * 0.7)
train_data, validate_data = (
    train_dataset.data[chunk, :, :] for chunk in (idx[:train_n], idx[train_n:])
)


In [18]:
# Wrap both splits in shuffling loaders with 128-sample batches.
train_loader, val_loader = (
    DataLoader(split, batch_size=128, shuffle=True)
    for split in (train_data, validate_data)
)

In [19]:
# Training function with Early Stopping
def train_model(model, train_loader, val_loader, criterion, optimizer,
                num_epochs, patience, device):
    """Train ``model`` and restore the weights with the lowest validation loss.

    Each batch is sliced as ``batch[:, :-1, :]`` (model input) and
    ``batch[:, -1:, 0]`` (target), cast to float32 and moved to ``device``.

    Args:
        model: Module to optimise; it is modified in place.
        train_loader: Iterable of training batches (must support ``len``).
        val_loader: Iterable of validation batches (must support ``len``).
        criterion: Loss called as ``criterion(outputs, targets)``.
        optimizer: Optimizer bound to ``model``'s parameters.
        num_epochs: Maximum number of epochs.
        patience: Stop after this many consecutive epochs without a new best
            validation loss.
        device: Device the batches are moved to.

    Returns:
        The same ``model`` instance, loaded with the best weights seen.

    Raises:
        ValueError: If either loader has no batches.
    """
    if len(train_loader) == 0 or len(val_loader) == 0:
        raise ValueError("train_loader and val_loader must each yield at least one batch")

    def _split(batch):
        # All steps but the last are the input; feature 0 of the last step is the target.
        inputs = batch[:, :-1, :].float().to(device)
        targets = batch[:, -1:, 0].float().to(device)
        return inputs, targets

    best_loss = float('inf')
    epochs_no_improve = 0
    # state_dict() returns references to the live parameter tensors, so a deep
    # copy is required for a real snapshot. Taking one up front also guarantees
    # there is always something to restore (e.g. if every validation loss is NaN).
    best_state = copy.deepcopy(model.state_dict())

    for epoch in range(num_epochs):
        # Training phase
        model.train()
        train_loss = 0.0
        for data in train_loader:
            x_train, y_train = _split(data)
            optimizer.zero_grad()
            loss = criterion(model(x_train), y_train)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        train_loss /= len(train_loader)

        # Validation phase
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for data in val_loader:
                x_val, y_val = _split(data)
                val_loss += criterion(model(x_val), y_val).item()
        val_loss /= len(val_loader)

        print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.6f}, Validation Loss: {val_loss:.6f}')

        # Early Stopping
        if val_loss < best_loss:
            best_loss = val_loss
            epochs_no_improve = 0
            best_state = copy.deepcopy(model.state_dict())
        else:
            epochs_no_improve += 1
            # >= so that a patience of 0 (or less) still stops the loop.
            if epochs_no_improve >= patience:
                print('Early stopping!')
                break

    model.load_state_dict(best_state)
    return model


In [20]:
# Up to 1000 epochs, stopping after 200 epochs without a better validation loss.
best_model = train_model(
    model, train_loader, val_loader, criterion, optimizer,
    num_epochs=1000, patience=200, device=device,
)

Epoch 1/1000, Train Loss: 0.741493, Validation Loss: 0.697099
Epoch 2/1000, Train Loss: 0.697969, Validation Loss: 0.690711
Epoch 3/1000, Train Loss: 0.705862, Validation Loss: 0.694034
Epoch 4/1000, Train Loss: 0.701936, Validation Loss: 0.692389
Epoch 5/1000, Train Loss: 0.708236, Validation Loss: 0.693675
Epoch 6/1000, Train Loss: 0.697888, Validation Loss: 0.697686
Epoch 7/1000, Train Loss: 0.696665, Validation Loss: 0.694752
Epoch 8/1000, Train Loss: 0.698994, Validation Loss: 0.694513
Epoch 9/1000, Train Loss: 0.693819, Validation Loss: 0.695347
Epoch 10/1000, Train Loss: 0.691421, Validation Loss: 0.692575
Epoch 11/1000, Train Loss: 0.691853, Validation Loss: 0.694508
Epoch 12/1000, Train Loss: 0.692933, Validation Loss: 0.695379
Epoch 13/1000, Train Loss: 0.693759, Validation Loss: 0.691909
Epoch 14/1000, Train Loss: 0.690400, Validation Loss: 0.694393
Epoch 15/1000, Train Loss: 0.693919, Validation Loss: 0.692712
Epoch 16/1000, Train Loss: 0.694386, Validation Loss: 0.692846
E

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Step 3: Train and Validate with K-Folds
kf = KFold(n_splits=5, shuffle=True, random_state=42)

accuracy_list = []
precision_list = []
recall_list = []
f1_list = []

# The cell used to read `train_dataset1`, a name that is never defined in this
# notebook. The windows are taken from the dataset object in `dl_info`, the same
# source the earlier train/validation split used. TensorDataset needs tensors,
# and float32 matches the model's default parameter dtype.
train_data = torch.as_tensor(dl_info['dataset'].data, dtype=torch.float32)
# All steps but the last are the input; feature 0 of the last step is the label.
x, y = train_data[:, :-1, :], train_data[:, -1:, 0]

num_epochs = 100
batch_size = 128

for fold, (train_idx, test_idx) in enumerate(kf.split(np.arange(len(train_data)))):
    print(f'Fold {fold + 1}')
    train_idx, test_idx = torch.as_tensor(train_idx), torch.as_tensor(test_idx)
    X_train, X_test = x[train_idx], x[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Fresh model and optimizer for every fold so no weights leak across folds.
    model = StockPricePredictor(input_dim, hidden_dim, output_dim, num_layers).to(device)
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Fold-local names, so the notebook-level `train_dataset` / `train_loader`
    # are not overwritten by this cell.
    fold_dataset = torch.utils.data.TensorDataset(X_train, y_train)
    fold_loader = torch.utils.data.DataLoader(dataset=fold_dataset, batch_size=batch_size, shuffle=True)

    for epoch in range(num_epochs):
        model.train()
        for sequences_batch, labels_batch in fold_loader:
            sequences_batch = sequences_batch.to(device)
            labels_batch = labels_batch.to(device)
            outputs = model(sequences_batch)
            loss = criterion(outputs, labels_batch)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Loss of the last mini-batch of the epoch.
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}')

    # Evaluation
    model.eval()
    with torch.no_grad():
        # reshape(-1) instead of squeeze(): stays 1-D even for a one-sample fold.
        y_pred = model(X_test.to(device)).reshape(-1)
        y_pred_class = (y_pred > 0.5).float()

    y_test_cpu = y_test.reshape(-1).cpu().numpy()
    y_pred_class_cpu = y_pred_class.cpu().numpy()

    accuracy = accuracy_score(y_test_cpu, y_pred_class_cpu)
    precision = precision_score(y_test_cpu, y_pred_class_cpu)
    recall = recall_score(y_test_cpu, y_pred_class_cpu)
    f1 = f1_score(y_test_cpu, y_pred_class_cpu)

    accuracy_list.append(accuracy)
    precision_list.append(precision)
    recall_list.append(recall)
    f1_list.append(f1)

    print(f'Fold {fold + 1} - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}')

# Average performance metrics
print(f'Average Accuracy: {np.mean(accuracy_list):.4f}')
print(f'Average Precision: {np.mean(precision_list):.4f}')
print(f'Average Recall: {np.mean(recall_list):.4f}')
print(f'Average F1 Score: {np.mean(f1_list):.4f}')

In [None]:
def train_model(model, dataloader, criterion, optimizer, num_epochs=100, description="", device=None):
    """Train ``model`` for a fixed number of epochs with a tqdm progress bar.

    NOTE: this definition replaces the earlier early-stopping
    ``train_model(model, train_loader, val_loader, ...)`` in the kernel
    namespace; re-running that earlier call cell after this one will fail.

    Each batch is sliced as ``batch[:, :-1, :]`` (model input) and
    ``batch[:, -1:, 0]`` (target), cast to float32 and moved to ``device``.

    Args:
        model: Module to optimise; it is modified in place.
        dataloader: Iterable of training batches.
        criterion: Loss called as ``criterion(outputs, targets)``.
        optimizer: Optimizer bound to ``model``'s parameters.
        num_epochs: Number of passes over ``dataloader``.
        description: Prefix shown in the progress bar.
        device: Device for the batches. Defaults to the device of the model's
            first parameter (CPU for a parameter-less model).

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else "cpu"

    model.train()
    with tqdm(range(num_epochs), total=num_epochs) as pbar:
        for _ in pbar:
            # Accumulate on-device and convert once per epoch, so the bar shows
            # the epoch mean rather than the loss of the last batch alone.
            running_loss, n_batches = 0.0, 0
            for data in dataloader:
                x_train = data[:, :-1, :].float().to(device)
                y_train = data[:, -1:, 0].float().to(device)
                optimizer.zero_grad()
                outputs = model(x_train)
                loss = criterion(outputs, y_train)
                loss.backward()
                optimizer.step()
                running_loss = running_loss + loss.detach()
                n_batches += 1
            if n_batches == 0:
                raise ValueError("dataloader yielded no batches")
            pbar.set_description(f"{description} loss: {float(running_loss) / n_batches:.6f}")
    

In [None]:
# Continue training the current classifier on the full loader for 2000 epochs.
train_model(model, dl, criterion, optimizer, num_epochs=2000)

In [None]:
# Take a single batch and split it the way the training loops do.
data = next(iter(dl))
x = data[:, :-1, :]
y = data[:, -1:, 0]

In [None]:
# Materialise every batch of the loader and stack them along dim 0.
data = torch.concat(list(dl))
    
    

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# The training helper leaves the model in train mode; switch to eval so
# dropout is disabled while scoring, and skip autograd bookkeeping.
model.eval()
data = data.to(device)
x, y = data[:, :-1, :], data[:, -1:, 0]
with torch.no_grad():
    y_pred = model(x.float())
y_pred_class = (y_pred > 0.5).float()

# Convert once and reuse for both metrics.
y_true_np = y.cpu().numpy()
y_pred_np = y_pred_class.cpu().numpy()

# Accuracy
accuracy = accuracy_score(y_true_np, y_pred_np)
print(f'Accuracy: {accuracy:.4f}')


# Classification Report
class_report = classification_report(y_true_np, y_pred_np)
print(f'Classification Report:\n{class_report}')

In [None]:
# test
# Loader from the bundle built with `train=False`; the next cells collect its
# batches and score the classifier on them.
dl_test = dl_info_test["dataloader"]



In [None]:
# Materialise every batch of the test loader and stack them along dim 0.
data = torch.concat(list(dl_test))
    
    

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Score in eval mode (dropout disabled) and without autograd bookkeeping.
model.eval()
data = data.to(device)
x, y = data[:, :-1, :], data[:, -1:, 0]
with torch.no_grad():
    y_pred = model(x.float())
y_pred_class = (y_pred > 0.5).float()

# Convert once and reuse for both metrics.
y_true_np = y.cpu().numpy()
y_pred_np = y_pred_class.cpu().numpy()

# Accuracy
accuracy = accuracy_score(y_true_np, y_pred_np)
print(f'Accuracy: {accuracy:.4f}')


# Classification Report
class_report = classification_report(y_true_np, y_pred_np)
print(f'Classification Report:\n{class_report}')

In [None]:
# Fit the Diffusion_TS model with the project's Trainer on the default loader.
trainer = Trainer(
    config_solver=configs["solver"],
    model=diffusion_ts,
    dataloader=dl_info["dataloader"],
)
trainer.train_decomp()

In [None]:
# Generate adversarial (synthetic) data: request 6000 samples from the
# Diffusion_TS model.
# NOTE(review): a later cell passes the result to np.concatenate together with
# gt_data, so it is presumably a NumPy array — confirm.
fake_data = diffusion_ts.generate_mts(batch_size=6000)


In [None]:
# Load the saved original training windows and draw a random subset of at
# most 3000 of them for the comparison plots.
seq_length = dataset.window
feature_dim = dataset.feature_dim
gt_data = np.load(
    os.path.join(dataset.dir, f"stock_origin_data_{seq_length}_train.npy")
)
idx = np.random.permutation(len(gt_data))[:3000]
ori_data = gt_data[idx]

In [None]:
# Compare original and generated samples with the three project visualisers,
# each given the same analysis size.
analysis_sample_size = 3000
visualize_pca(ori_data, fake_data, analysis_sample_size)
visualize_tsne(ori_data, fake_data, analysis_sample_size)
visualize_kernel(ori_data, fake_data, analysis_sample_size)

In [None]:
def _load_test_array(filename):
    """Read ``filename`` from the dataset directory as a tensor on ``device``."""
    return torch.from_numpy(np.load(os.path.join(dataset.dir, filename))).to(device)


# Normalised test windows plus the matching mean / std arrays.
test_data_norm_origin = _load_test_array(f"stock_origin_data_{seq_length}_test.npy")
test_mean = _load_test_array(f"stock_origin_mean_{seq_length}_test.npy")
test_std = _load_test_array(f"stock_origin_std_{seq_length}_test.npy")

# Keep the three tensors aligned sample-by-sample; no shuffling for evaluation.
test_dataset = TensorDataset(test_data_norm_origin, test_mean, test_std)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

# Regression loss for the predictor (L1; MSE is the commented alternative).
lossfn = nn.L1Loss()
# lossfn = nn.MSELoss()


In [None]:
def train_model(model, dataloader, criterion, optimizer, num_epochs=100, description="", device=None):
    """Train ``model`` for a fixed number of epochs with a tqdm progress bar.

    NOTE: a function with this name and signature is already defined in an
    earlier cell of this notebook; this cell redefines it.

    Each batch is sliced as ``batch[:, :-1, :]`` (model input) and
    ``batch[:, -1:, 0]`` (target), cast to float32 and moved to ``device``.

    Args:
        model: Module to optimise; it is modified in place.
        dataloader: Iterable of training batches.
        criterion: Loss called as ``criterion(outputs, targets)``.
        optimizer: Optimizer bound to ``model``'s parameters.
        num_epochs: Number of passes over ``dataloader``.
        description: Prefix shown in the progress bar.
        device: Device for the batches. Defaults to the device of the model's
            first parameter (CPU for a parameter-less model).

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else "cpu"

    model.train()
    with tqdm(range(num_epochs), total=num_epochs) as pbar:
        for _ in pbar:
            # Accumulate on-device and convert once per epoch, so the bar shows
            # the epoch mean rather than the loss of the last batch alone.
            running_loss, n_batches = 0.0, 0
            for data in dataloader:
                x_train = data[:, :-1, :].float().to(device)
                y_train = data[:, -1:, 0].float().to(device)
                optimizer.zero_grad()
                outputs = model(x_train)
                loss = criterion(outputs, y_train)
                loss.backward()
                optimizer.step()
                running_loss = running_loss + loss.detach()
                n_batches += 1
            if n_batches == 0:
                raise ValueError("dataloader yielded no batches")
            pbar.set_description(f"{description} loss: {float(running_loss) / n_batches:.6f}")
    

In [None]:
def evaluate_model(model, dataloader):
    """Evaluate a predictor on de-normalised values.

    Every batch from ``dataloader`` must be a ``(data_norm, data_mean,
    data_std)`` triple. The first ``seq_length - 1`` steps of ``data_norm`` are
    the model input; feature 0 of the remaining step(s) is the target.
    Predictions and targets are mapped back with ``value * std + mean`` (feature
    0 of the statistics) before the metrics are computed.

    Relies on the notebook-level globals ``seq_length`` and ``device``.

    Args:
        model: Trained predictor; switched to eval mode.
        dataloader: Loader over the triples; ``dataloader.dataset`` must
            support ``len``.

    Returns:
        ``(l1, l2, mape, predictions, true_vals)``: the first three are Python
        floats (sample-weighted means of the per-batch L1, MSE and MAPE), the
        last two are squeezed NumPy arrays of de-normalised values.

    Raises:
        ValueError: If the dataset is empty.
    """
    n_data = len(dataloader.dataset)
    if n_data == 0:
        raise ValueError("cannot evaluate on an empty dataset")

    model.eval()
    # define loss for comparison
    l1loss = nn.L1Loss()
    l2loss = nn.MSELoss()
    mapeloss = MeanAbsolutePercentageError().to(device)

    total_l1 = 0.0
    total_l2 = 0.0
    total_mape = 0.0

    predictions, true_vals = [], []
    with torch.no_grad():
        for data_norm, data_mean, data_std in dataloader:
            n_batch = len(data_norm)
            x_test = data_norm[:, :(seq_length - 1), :].float().to(device)
            y_test = data_norm[:, (seq_length - 1):, :1].float().to(device)
            y_pred = model(x_test).view(-1, 1, 1)

            # Feature-0 statistics, moved to the compute device so loaders
            # backed by CPU tensors work as well.
            std0 = data_std[:, :, :1].to(device)
            mean0 = data_mean[:, :, :1].to(device)
            y_pred_unnorm = y_pred * std0 + mean0
            y_test_unnorm = y_test * std0 + mean0

            # .item() on all three metrics so the totals are plain floats
            # (previously L1/L2 stayed device tensors while MAPE was a float).
            total_l1 += l1loss(y_pred_unnorm, y_test_unnorm).item() * n_batch
            total_l2 += l2loss(y_pred_unnorm, y_test_unnorm).item() * n_batch
            total_mape += mapeloss(y_pred_unnorm, y_test_unnorm).item() * n_batch

            predictions.append(y_pred_unnorm.cpu().numpy())
            true_vals.append(y_test_unnorm.cpu().numpy())

    total_l1 /= n_data
    total_l2 /= n_data
    total_mape /= n_data

    predictions = np.concatenate(predictions).squeeze()
    true_vals = np.concatenate(true_vals).squeeze()

    return total_l1, total_l2, total_mape, predictions, true_vals

In [None]:
# Baseline predictor from models.predictor, trained on the default loader
# with the L1 loss defined above.
gt_predictor = GRU(6, 50, 1, 2).to(device)
gt_optimizer = Adam(params=gt_predictor.parameters(), lr=0.001)

train_model(
    model=gt_predictor,
    dataloader=dl_info["dataloader"],
    criterion=lossfn,
    optimizer=gt_optimizer,
    num_epochs=3000,
)


In [None]:
def _load_train_array(filename):
    """Read ``filename`` from the dataset directory as a tensor on ``device``."""
    return torch.from_numpy(np.load(os.path.join(dataset.dir, filename))).to(device)


# Normalised training windows plus the matching mean / std arrays.
train_data_norm_origin = _load_train_array(f"stock_origin_data_{seq_length}_train.npy")
train_mean = _load_train_array(f"stock_origin_mean_{seq_length}_train.npy")
train_std = _load_train_array(f"stock_origin_std_{seq_length}_train.npy")

# NOTE: rebinds `train_dataset` / `train_loader`, names that earlier cells
# used for different objects.
train_dataset = TensorDataset(train_data_norm_origin, train_mean, train_std)
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=False)

# Regression loss for the predictor (L1; MSE is the commented alternative).
lossfn = nn.L1Loss()
# lossfn = nn.MSELoss()


In [None]:
# Score the predictor on the training windows.
# NOTE(review): the printed label says "Adv_synthetic" although the predictor has
# only been trained on the original data at this point — confirm the label.
(l1, l2, mape, pre_y, true_y) = evaluate_model(model=gt_predictor, dataloader=train_loader)
print(f"Adv_synthetic : L1 loss: {l1:0.5f} \t L2 Loss : {l2:0.5f} \t MAPE loss : {mape:0.5f} ")


In [None]:
# Overlay the last 200 predictions and the matching true values.
last_200 = slice(-200, None)
plt.plot(pre_y[last_200])
plt.plot(true_y[last_200])

In [None]:
# Score the predictor on the test windows.
# NOTE(review): the printed label says "Adv_synthetic" although the predictor has
# only been trained on the original data at this point — confirm the label.
(l1, l2, mape, pre_y, true_y) = evaluate_model(model=gt_predictor, dataloader=test_loader)
print(f"Adv_synthetic : L1 loss: {l1:0.5f} \t L2 Loss : {l2:0.5f} \t MAPE loss : {mape:0.5f} ")


In [None]:
# Residuals (true minus predicted) for the last 200 samples.
a = np.subtract(true_y[-200:], pre_y[-200:])

In [None]:
# Plot the residuals computed in the previous cell, one marker per sample.
plt.plot(a, marker="o")


In [None]:
# Overlay the last 200 predictions and true values, one marker per sample.
last_200 = slice(-200, None)
plt.plot(pre_y[last_200], marker="o")
plt.plot(true_y[last_200], marker="o")

In [None]:
# Stack the original windows and the generated ones along the sample axis.
syn_data = np.concatenate((gt_data, fake_data), axis=0)

In [None]:
# torch.as_tensor accepts both a NumPy array and an existing tensor, so this
# cell can be re-run safely (torch.from_numpy raises when handed the tensor
# produced by a previous run of the same cell).
syn_data = torch.as_tensor(syn_data)

In [None]:
# Shuffling loader over the combined original + generated windows.
syn_dl = DataLoader(dataset=syn_data, batch_size=batch_size, shuffle=True)


In [None]:
# Keep training the same predictor (and optimizer state) on the combined data.
train_model(gt_predictor, syn_dl, lossfn, gt_optimizer, num_epochs=3000)



In [None]:
# Score the predictor on the test windows after training on the combined data.
(l1, l2, mape, pre_y, true_y) = evaluate_model(model=gt_predictor, dataloader=test_loader)
print(f"Adv_synthetic : L1 loss: {l1:0.5f} \t L2 Loss : {l2:0.5f} \t MAPE loss : {mape:0.5f} ")


In [None]:
# Recorded result from an earlier run:
# Adv_synthetic : L1 loss: 0.81854 	 L2 Loss : 1.96458 	 MAPE loss : 0.00976

In [None]:
# Overlay the last 200 predictions and true values, one marker per sample.
last_200 = slice(-200, None)
plt.plot(pre_y[last_200], marker="o")
plt.plot(true_y[last_200], marker="o")

In [None]:
# Display the sampled original windows (NumPy abbreviates large arrays in its repr).
ori_data

In [None]:
# Shape of the 1-D sample fed to the normality checks below:
# the mean over axis 2, flattened.
ori_data.mean(axis=2).reshape(-1).shape

In [None]:
from scipy import stats

# Shapiro-Wilk normality test on the mean over axis 2 of ori_data, flattened to 1-D.
stat, p_value = stats.shapiro(ori_data.mean(axis=2).reshape(-1))
print(f'Statistic: {stat}, p-value: {p_value}')
print(
    'The data is normally distributed (fail to reject H0)'
    if p_value > 0.05
    else 'The data is not normally distributed (reject H0)'
)


In [None]:
from scipy import stats

# Kolmogorov-Smirnov test for normality. Passing only 'norm' compares the sample
# with the *standard* normal N(0, 1), which rejects any sample whose mean/scale
# differ from 0/1 regardless of its shape, so the sample's own mean and standard
# deviation are supplied as the distribution's loc/scale.
# Caveat: estimating loc/scale from the same sample makes the reported p-value
# conservative (the Lilliefors situation).
ks_sample = ori_data.mean(2).reshape(-1,)
stat, p_value = stats.kstest(ks_sample, 'norm', args=(ks_sample.mean(), ks_sample.std(ddof=1)))
print(f'Statistic: {stat}, p-value: {p_value}')
if p_value > 0.05:
    print('The data is normally distributed (fail to reject H0)')
else:
    print('The data is not normally distributed (reject H0)')


In [None]:
import matplotlib.pyplot as plt
import scipy.stats as stats

# Q-Q plot of the flattened axis-2 means against a normal distribution.
qq_sample = ori_data.mean(axis=2).reshape(-1)
stats.probplot(qq_sample, dist="norm", plot=plt)
plt.title('Q-Q Plot')
plt.show()
