# Changing to the Project Root Directory So Local Modules Can Be Imported

In [9]:
import os

# Project root that holds the local modules imported further down.
# Set the FINAL_PROJECT_DIR environment variable to point at your own checkout
# instead of editing the notebook; the original author's path is kept only as
# the fallback so existing setups keep working unchanged.
PARENT_DIR = os.environ.get(
    'FINAL_PROJECT_DIR',
    '/Users/henrygilbert/GitHub/CS-6362/final_project')
os.chdir(PARENT_DIR)

# Importing Needed Libraries

In [10]:
import torch
import numpy as np
import pandas as pd
import importlib
import copy
from torch.utils.data import DataLoader
from matplotlib import pyplot as plt
import mlflow

import utilities
import factor_data_loader
import market_data_loader
from model import CVAE

# optional reload for libraries if needed
# (re-executes the already-imported `utilities` module so edits made to it on
# disk are picked up without restarting the kernel)
importlib.reload(utilities)
# Select the MLflow experiment that the run started below is recorded under.
mlflow.set_experiment("Experiment 1")

# End any run still active from a previous execution of this cell before
# starting a fresh one; the order of these two calls matters.
mlflow.end_run()
# Last expression of the cell: its return value is what the notebook displays.
mlflow.start_run()

<ActiveRun: >

# Loading Data 

In [11]:


# --- Fetch SPY end-of-day prices, grouped by month, for the study window ---
mdl = market_data_loader.MarketDataLoader()
start_ts = pd.Timestamp('2016-01-01')
end_ts = pd.Timestamp('2021-02-01')

mlflow.log_param("start_data_date", start_ts.strftime("%Y-%m-%d"))
mlflow.log_param("end_data_date", end_ts.strftime("%Y-%m-%d"))

monthly_eod_prices = mdl.get_eod_price_data_grouped(
    'SPY', start_ts, end_ts, market_data_loader.GroupPeriod.MONTHLY)
# Stitch the per-month arrays back into one continuous price series.
all_eod_prices = np.concatenate(list(monthly_eod_prices.values()))

# --- Day-over-day fractional change and its summary statistics ---
price_deltas = np.diff(all_eod_prices)
percent_change = price_deltas / all_eod_prices[:-1]
std_change = np.std(percent_change)
mean_change = np.mean(percent_change)
print(f"Mean: {mean_change}, Std: {std_change}")
mlflow.log_metric("percent_change_std", std_change)
mlflow.log_metric("percent_change_mean", mean_change)

# --- Chop the return series into consecutive chunks of `week_size` days ---
week_size = 5
week_starts = range(0, len(percent_change), week_size)
weekly_data = [percent_change[start:start + week_size] for start in week_starts]

# only condition on previous week, no external factor
# Each sample is (previous week, current week); pairs touching an incomplete
# trailing chunk are dropped.
weekly_trainng_data = [
    (previous_week, current_week)
    for previous_week, current_week in zip(weekly_data[:-1], weekly_data[1:])
    if len(current_week) == week_size and len(previous_week) == week_size]

# --- Group consecutive weekly samples into "monthly" batches ---
month_batch_size = 4
batch_starts = range(0, len(weekly_trainng_data), month_batch_size)
monthly_batches = [
    weekly_trainng_data[start:start + month_batch_size]
    for start in batch_starts]

mlflow.log_param("monthly_batch_size", month_batch_size)
mlflow.log_param("weekly_size", week_size)


Mean: 0.0005577222568962418, Std: 0.011583481180388372


  group.index[0].to_period(group_by.value).to_timestamp(): group['close'].to_numpy()


5

# Training/Evaluation

In [12]:
# Leave-one-batch-out evaluation: for every monthly batch, train a fresh CVAE on
# all other batches, generate synthetic weeks conditioned on the held-out
# batch, and log error metrics plus diagnostic plots to MLflow.
batch_size = 4
num_synthetic_samples = 1000
RANDOM_SEED = 0
PREDICTION_GRAPH_PATH = "experiments/experiment_1/graphs/synthetic_data_prediction_batch.png"
RMSE_GRAPH_PATH = "experiments/experiment_1/graphs/batch_rmse.png"
batch_to_rmse = {}

# Seed both RNGs used below (NumPy for the input noise, torch for shuffling and
# model initialisation) so a Restart & Run All reproduces the same numbers.
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Loop-invariant parameters are logged once instead of once per fold.
mlflow.log_param("training_batch_size", batch_size)
mlflow.log_param("num_synthetic_samples", num_synthetic_samples)
mlflow.log_param("random_seed", RANDOM_SEED)

for i in range(len(monthly_batches)):

    print(f"Training on batch {i+1}/{len(monthly_batches)}")
    # NOTE(review): the two 5s presumably correspond to week_size (input and
    # condition widths) -- confirm against the CVAE constructor.
    cvae = CVAE(5, 5).to(utilities.DEVICE)

    # Hold out batch i; everything else becomes training data. The deepcopy
    # keeps the datasets from sharing arrays with `monthly_batches`.
    training_batches = copy.deepcopy(monthly_batches)
    test_batch = training_batches.pop(i)

    training_weeks = [week for batch in training_batches for week in batch]
    training_data = utilities.ConditionedMarketDataset(training_weeks)
    testing_data = utilities.ConditionedMarketDataset(test_batch)

    train_dataset = DataLoader(training_data, batch_size=batch_size, shuffle=True)
    # Evaluation data is not shuffled so the held-out weeks are scored and
    # plotted in their original chronological order.
    testing_dataset = DataLoader(testing_data, batch_size=batch_size, shuffle=False)

    history = utilities.train_model(cvae, train_dataset, testing_dataset, epochs=10)
    mlflow.pytorch.log_model(cvae, f"cvae_{i}")
    # Log each validation loss at its own step so MLflow keeps the full curve.
    for epoch, val_loss in enumerate(history):
        mlflow.log_metric(f"validation_loss_{i}", val_loss, step=epoch)

    synthetic_mean_val_returns = []
    synthetic_val_returns = []
    actual_val_returns = []

    # Inference only: no gradients are needed for generation.
    with torch.no_grad():
        for batch in testing_dataset:

            price_batch = batch['price_data']
            conditioned_batch = batch['factor_data'].float().to(utilities.DEVICE)

            # Feed Gaussian noise matched to the historical return statistics
            # through the model, once per synthetic sample.
            sample_synthetic_returns = []
            for _ in range(num_synthetic_samples):
                noise = np.random.normal(
                    loc=mean_change, scale=std_change,
                    size=(len(price_batch), week_size))
                noise = torch.as_tensor(noise, dtype=torch.float32).to(utilities.DEVICE)
                # .cpu() is required before .numpy() when DEVICE is not the CPU.
                generated = cvae(noise, conditioned_batch).detach().cpu().numpy()
                sample_synthetic_returns.append(generated)
            mean_synthetic_returns = np.mean(sample_synthetic_returns, axis=0)

            synthetic_val_returns += sample_synthetic_returns
            synthetic_mean_val_returns += list(mean_synthetic_returns)
            actual_val_returns += list(price_batch.detach().cpu().numpy())

    synthetic_mean_val_returns = np.array(synthetic_mean_val_returns).flatten()
    actual_val_returns = np.array(actual_val_returns).flatten()
    mean_rmse = np.sqrt(np.mean((synthetic_mean_val_returns - actual_val_returns)**2))
    # Both standard deviations are scalars, so the original sqrt(mean(diff**2))
    # is simply the absolute difference.
    std_rmse = np.abs(np.std(synthetic_mean_val_returns) - np.std(actual_val_returns))

    batch_to_rmse[i] = std_rmse + mean_rmse
    mlflow.log_metric("batch_rmse", batch_to_rmse[i], step=i)

    # NOTE(review): the plotted values derive from `percent_change`, so the
    # "week number" / "price in USD" axis labels below look inaccurate
    # (probably day index / return) -- confirm before changing the strings.
    fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(10, 10))
    ax1.plot(synthetic_mean_val_returns, label=f"synthetic mean over {num_synthetic_samples} samples")
    # Use the accumulated held-out values rather than only the last loader batch.
    ax1.plot(actual_val_returns, label="historical realization")
    ax1.legend()
    ax1.set_xlabel("week number")
    ax1.set_ylabel("price in USD")

    ax2.plot(actual_val_returns, label="historical realization", color='red')
    for synthetic_sample in synthetic_val_returns:
        ax2.plot(np.array(synthetic_sample).flatten(), color='blue', alpha=0.01)
    ax2.legend()
    ax2.set_xlabel("week number")
    ax2.set_ylabel("price in USD")

    # Concatenating raveled samples also works if the last loader batch has a
    # different size than the others.
    synthetic_returns = np.concatenate(
        [np.asarray(sample).ravel() for sample in synthetic_val_returns])
    # At least one bin, so small sample counts cannot produce bins=0.
    num_bins = max(1, len(synthetic_returns) // 100)

    synthetic_mean = np.mean(synthetic_returns)
    synthetic_std = np.std(synthetic_returns)

    historical_mean = np.mean(actual_val_returns)
    historical_std = np.std(actual_val_returns)

    mlflow.log_metric("batch_synthetic_mean", synthetic_mean, step=i)
    mlflow.log_metric("batch_synthetic_std", synthetic_std, step=i)
    mlflow.log_metric("batch_historical_mean", historical_mean, step=i)
    mlflow.log_metric("batch_historical_std", historical_std, step=i)

    ax3.hist(synthetic_returns, bins=num_bins, color='blue', edgecolor='black', label=f"std: {np.round(synthetic_std, 6)}")
    ax3.axvline(x=synthetic_mean, color='r', label=f'synthetic mean: {np.round(synthetic_mean, 6)}')
    ax3.axvline(x=historical_mean, color='#FF00FF', label=f'historical mean: {np.round(historical_mean, 6)}')
    ax3.legend()
    # These labels describe the histogram, so they belong on ax3 (the original
    # set them on ax2, overwriting the middle plot's labels).
    ax3.set_xlabel("Return")
    ax3.set_ylabel("Synthetic Frequency")

    # The same file is overwritten each fold; MLflow stores a copy per fold
    # under its own artifact directory.
    fig.savefig(PREDICTION_GRAPH_PATH)
    mlflow.log_artifact(PREDICTION_GRAPH_PATH, f"batch_{i}")
    plt.close(fig)

# Summary across folds: one bar per held-out batch.
rmse_fig, rmse_ax = plt.subplots()
rmse_ax.bar(list(batch_to_rmse.keys()), list(batch_to_rmse.values()))
rmse_ax.set_xlabel("Test Batch Number")
rmse_ax.set_ylabel("RMSE between averaged synthetic and actual prices")
rmse_fig.savefig(RMSE_GRAPH_PATH)
mlflow.log_artifact(RMSE_GRAPH_PATH, "batch_rmse")
mlflow.log_metric("average_rmse", np.mean(list(batch_to_rmse.values())))
plt.close(rmse_fig)
mlflow.end_run()
    


Training on batch 1/64
Training on batch 2/64
Training on batch 3/64
Training on batch 4/64
Training on batch 5/64
Training on batch 6/64
Training on batch 7/64
Training on batch 8/64
Training on batch 9/64
Training on batch 10/64
Training on batch 11/64
Training on batch 12/64
Training on batch 13/64
Training on batch 14/64
Training on batch 15/64
Training on batch 16/64
Training on batch 17/64
Training on batch 18/64
Training on batch 19/64
Training on batch 20/64
Training on batch 21/64
Training on batch 22/64
Training on batch 23/64
Training on batch 24/64
Training on batch 25/64
Training on batch 26/64
Training on batch 27/64
Training on batch 28/64
Training on batch 29/64
Training on batch 30/64
Training on batch 31/64
Training on batch 32/64
Training on batch 33/64
Training on batch 34/64
Training on batch 35/64
Training on batch 36/64
Training on batch 37/64
Training on batch 38/64
Training on batch 39/64
Training on batch 40/64
Training on batch 41/64
Training on batch 42/64
T