# Changing to Parent Directory to Import Needed Libraries

In [2]:
import os

# Change this relative to your own directory structure
PARENT_DIR = '/Users/henrygilbert/GitHub/CS-6362/final_project'
os.chdir(PARENT_DIR)

# Importing Needed Libraries

In [3]:
import torch
import numpy as np
import pandas as pd
import importlib
import copy
from typing import Tuple, List
from torch.utils.data import DataLoader
from matplotlib import pyplot as plt
import mlflow
from scipy.stats import norm
import seaborn as sns


import utilities
import factor_data_loader
import market_data_loader
from model import CVAE


# optional reload for libraries if needed
importlib.reload(utilities)
mlflow.set_experiment("Experiment 3")

mlflow.end_run()
mlflow.start_run()

  from .autonotebook import tqdm as notebook_tqdm


<ActiveRun: >

# Loading Data 

In [34]:


mdl = market_data_loader.MarketDataLoader()
fdl = factor_data_loader.FactorDataLoader()

start_ts = pd.Timestamp('2016-01-01')
end_ts = pd.Timestamp('2021-02-01')

mlflow.log_param("start_data_date", start_ts.strftime("%Y-%m-%d"))
mlflow.log_param("end_data_date", end_ts.strftime("%Y-%m-%d"))


# Load SPY Price data
monthly_eod_prices = mdl.get_eod_price_data_grouped('SPY', start_ts, end_ts, market_data_loader.GroupPeriod.MONTHLY)
monthly_percent_change_prices = {k: np.diff(v)/v[:-1] for k, v in monthly_eod_prices.items()}
monthly_means_to_std = {np.mean(v): np.std(v) for k, v in monthly_percent_change_prices.items()}

mean_price_change = np.mean(np.nan_to_num(list(monthly_means_to_std.keys())))
std_price_change = np.mean(np.nan_to_num(list(monthly_means_to_std.values())))

week_size = 5
weekly_data = {k: np.array([v[i:i+week_size] for i in range(0, len(v), week_size)]) for k, v in monthly_percent_change_prices.items()}

# Load Factor Data
conditioning_factors = [factor for factor in factor_data_loader.Factor]
factors_data_by_month = {factor: fdl.get_factor_data_by_month(factor, start_ts, end_ts) for factor in conditioning_factors}

# Removes auto-correlation in the data - get's rid of first month
for factor in factors_data_by_month:
   
    months = list(factors_data_by_month[factor].keys())
    month_data = np.array(list(factors_data_by_month[factor].values()))
    
    percent_diff = np.diff(month_data)/month_data[:-1]
    percent_diff[percent_diff == -np.inf] = 0
    percent_diff[percent_diff == np.inf] = 0
    percent_diff = np.nan_to_num(percent_diff)
    months.pop(0)
    
    assert len(percent_diff) == len(months)
    factors_data_by_month[factor] = {months[i]: percent_diff[i] for i in range(len(months))}

weekly_data = {k: v for k, v in weekly_data.items() if k in factors_data_by_month[conditioning_factors[0]]}
weekly_training_data = []
for month, weekly_prices in weekly_data.items():
    
    factor_values = [factors_data_by_month[factor][month] for factor in conditioning_factors]
    for i in range(1, len(weekly_prices)):
        if len(weekly_prices[i]) != 5 or len(weekly_prices[i-1]) != 5:
            continue

        conditioning_data = np.concatenate((factor_values, weekly_prices[i-1]))
        weekly_training_data.append((conditioning_data, weekly_prices[i]))

month_batch_size = 4
monthly_batches = [
    weekly_training_data[i:i+month_batch_size] 
    for i in range(0, len(weekly_training_data), month_batch_size)]

mlflow.log_param("monthly_batch_size", month_batch_size)
mlflow.log_param("weekly_size", week_size)


  group.index[0].to_period(group_by.value).to_timestamp(): group['close'].to_numpy()
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
  ret = ret.dtype.type(ret / rcount)
  weekly_data = {k: np.array([v[i:i+week_size] for i in range(0, len(v), week_size)]) for k, v in monthly_percent_change_prices.items()}
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contain

5

# Base Synthetic Generation Evaluation

In [35]:

batch_size = 4
batch_to_rmse = {}
synthetic_means = []
synthetic_stds = []
mlflow.log_param("training_batch_size", batch_size)

for i in range(len(monthly_batches)):
    
    print(f"Training on batch {i+1}/{len(monthly_batches)}")
    cvae = CVAE(5, 12).to(utilities.DEVICE)
    training_batches = copy.deepcopy(monthly_batches)
    test_batch = training_batches.pop(i)
    
    training_weeks = [week for batch in training_batches for week in batch]
    training_data = utilities.ConditionedMarketDataset(training_weeks)
    testing_data = utilities.ConditionedMarketDataset(test_batch)
   
    train_dataset = DataLoader(training_data, batch_size=batch_size, shuffle=True)
    testing_dataset = DataLoader(testing_data, batch_size=batch_size, shuffle=True)
    
    history = utilities.train_model(cvae, train_dataset, testing_dataset, epochs=10)
    mlflow.pytorch.log_model(cvae, f"cvae_{i}")    
    [mlflow.log_metric(f"validation_loss_{i}", val_loss) for val_loss in history]
    
    synthetic_mean_val_returns = []
    synthetic_val_returns = []
    actual_val_returns = []
    num_synthetic_samples = 1000
    mlflow.log_param("num_synthetic_samples", num_synthetic_samples)
  
    for batch in testing_dataset:
        
        price_batch = batch['price_data']
        synthetic_price_batches = [
            torch.FloatTensor(np.array([np.random.normal(loc=mean_price_change, scale=std_price_change, size=week_size) for _ in price_batch])) 
            for _ in range(num_synthetic_samples)]
        
        conditioned_batch = batch['factor_data']
        price_batch = price_batch.to(utilities.DEVICE)
        sample_synthetic_returns = [
            cvae(synthetic_b.float(), conditioned_batch.float()).detach().numpy() 
            for synthetic_b in synthetic_price_batches]
        mean_synthetic_returns = np.mean(sample_synthetic_returns, axis=0)
       
        synthetic_val_returns += sample_synthetic_returns
        synthetic_mean_val_returns += list(mean_synthetic_returns)
        actual_val_returns += list(price_batch.detach().numpy())
    
    synthetic_mean_val_returns = np.array(synthetic_mean_val_returns).flatten()
    actual_val_returns = np.array(actual_val_returns).flatten()
    mean_rmse = np.sqrt(np.mean((synthetic_mean_val_returns - actual_val_returns)**2))
    std_rmse = np.sqrt(np.mean((np.std(synthetic_mean_val_returns) - np.std(actual_val_returns))**2))
    
    batch_to_rmse[i] = std_rmse + mean_rmse
    mlflow.log_metric(f"batch_rmse", batch_to_rmse[i], step=i)
    
    fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(10, 10))
    ax1.plot(synthetic_mean_val_returns, label=f"synthetic mean over {num_synthetic_samples} samples")
    ax1.plot(price_batch.detach().numpy().flatten(), label="historical realization")
    ax1.legend()
    ax1.set_xlabel("week number")
    ax1.set_ylabel("price in USD")

    ax2.plot(price_batch.detach().numpy().flatten(), label="historical realization", color='red')
    [ax2.plot(np.array(synthetic_sample).flatten(), color='blue', alpha=0.01) for synthetic_sample in synthetic_val_returns]
    ax2.legend()
    ax2.set_xlabel("week number")
    ax2.set_ylabel("price in USD")
    
    synthetic_returrns = np.array(synthetic_val_returns).flatten().flatten()
    num_bins = int(len(synthetic_returrns)/100)
    bin_size = (np.max(synthetic_val_returns) - np.min(synthetic_val_returns))/num_bins
    
    synthetic_mean = np.mean(synthetic_returrns)
    synthetic_std = np.std(synthetic_returrns)
    
    historical_mean = np.mean(price_batch.detach().numpy().flatten()) 
    historical_std = np.std(price_batch.detach().numpy().flatten())
    
    mlflow.log_metric(f"batch_synthetic_mean", synthetic_mean, step=i)
    mlflow.log_metric(f"batch_synthetic_std", synthetic_std, step=i)
    mlflow.log_metric(f"batch_historical_mean", historical_mean, step=i)
    mlflow.log_metric(f"batch_historical_std", historical_std, step=i)
    
    ax3.hist(synthetic_returrns, bins=num_bins, color='blue', edgecolor='black', label=f"std: {np.round(synthetic_std, 6)}")
    ax3.axvline(x=synthetic_mean, color='r', label=f'synthetic mean: {np.round(synthetic_mean, 6)}')
    ax3.axvline(x=historical_mean, color='#FF00FF', label=f'historical mean: {np.round(historical_mean, 6)}')
    ax3.legend()
    ax2.set_xlabel("Return")
    ax2.set_ylabel("Synthetic Frequency")
    
    plt.savefig(f"experiments/experiment_3/graphs/synthetic_data_prediction_batch.png")
    mlflow.log_artifact(f"experiments/experiment_3/graphs/synthetic_data_prediction_batch.png", f"batch_{i}")
    plt.clf()
    plt.close()
    
plt.bar(batch_to_rmse.keys(), batch_to_rmse.values())
plt.xlabel("Test Batch Number")
plt.ylabel("RMSE between averaged synthetic and actual prices")
plt.savefig(f"experiments/experiment_3/graphs/batch_rmse.png")
mlflow.log_artifact(f"experiments/experiment_3/graphs/batch_rmse.png", f"batch_rmse")
mlflow.log_metric("average_rmse", np.mean(list(batch_to_rmse.values())))
plt.clf()
plt.close()
mlflow.end_run()


Training on batch 1/41
Training on batch 2/41
Training on batch 3/41
Training on batch 4/41
Training on batch 5/41
Training on batch 6/41
Training on batch 7/41
Training on batch 8/41
Training on batch 9/41
Training on batch 10/41
Training on batch 11/41
Training on batch 12/41
Training on batch 13/41
Training on batch 14/41
Training on batch 15/41
Training on batch 16/41
Training on batch 17/41
Training on batch 18/41
Training on batch 19/41
Training on batch 20/41
Training on batch 21/41
Training on batch 22/41
Training on batch 23/41
Training on batch 24/41
Training on batch 25/41
Training on batch 26/41
Training on batch 27/41
Training on batch 28/41
Training on batch 29/41
Training on batch 30/41
Training on batch 31/41
Training on batch 32/41
Training on batch 33/41
Training on batch 34/41
Training on batch 35/41
Training on batch 36/41
Training on batch 37/41
Training on batch 38/41
Training on batch 39/41
Training on batch 40/41
Training on batch 41/41


# Conditional Synthetic Data Generation Data Loading

In [4]:
mdl = market_data_loader.MarketDataLoader()
fdl = factor_data_loader.FactorDataLoader()

start_ts = pd.Timestamp('2016-01-01')
end_ts = pd.Timestamp('2021-02-01')

mlflow.log_param("start_data_date", start_ts.strftime("%Y-%m-%d"))
mlflow.log_param("end_data_date", end_ts.strftime("%Y-%m-%d"))


# Load SPY Price data
monthly_eod_prices = mdl.get_eod_price_data_grouped('SPY', start_ts, end_ts, market_data_loader.GroupPeriod.MONTHLY)
monthly_percent_change_prices = {k: np.diff(v)/v[:-1] for k, v in monthly_eod_prices.items()}
monthly_means_to_std = {np.mean(v): np.std(v) for k, v in monthly_percent_change_prices.items()}

mean_price_change = np.mean(np.nan_to_num(list(monthly_means_to_std.keys())))
std_price_change = np.mean(np.nan_to_num(list(monthly_means_to_std.values())))

week_size = 5
weekly_data = {k: np.array([v[i:i+week_size] for i in range(0, len(v), week_size)]) for k, v in monthly_percent_change_prices.items()}

# Load Factor Data
conditioning_factors = [factor_data_loader.Factor.CONSUMER_PRICE_INDEX]
factors_data_by_month = {factor: fdl.get_factor_data_by_month(factor, start_ts, end_ts) for factor in conditioning_factors}

# Removes auto-correlation in the data - get's rid of first month
for factor in factors_data_by_month:
   
    months = list(factors_data_by_month[factor].keys())
    month_data = np.array(list(factors_data_by_month[factor].values()))
    
    percent_diff = np.diff(month_data)/month_data[:-1]
    percent_diff[percent_diff == -np.inf] = 0
    percent_diff[percent_diff == np.inf] = 0
    percent_diff = np.nan_to_num(percent_diff)
    months.pop(0)
    
    assert len(percent_diff) == len(months)
    factors_data_by_month[factor] = {months[i]: percent_diff[i] for i in range(len(months))}

weekly_data = {k: v for k, v in weekly_data.items() if k in factors_data_by_month[conditioning_factors[0]]}
weekly_factor_conditioned_training_data = []
weekly_price_conditioned_training_data = []

for month, weekly_prices in weekly_data.items():
    
    factor_values = [factors_data_by_month[factor][month] for factor in conditioning_factors]
    for i in range(1, len(weekly_prices)):
        if len(weekly_prices[i]) != 5 or len(weekly_prices[i-1]) != 5:
            continue

        factor_conditioning_data = np.concatenate((factor_values, weekly_prices[i-1]))
        weekly_factor_conditioned_training_data.append((factor_conditioning_data, weekly_prices[i]))
        weekly_price_conditioned_training_data.append((weekly_prices[i-1], weekly_prices[i]))

month_batch_size = 4
monthly_factor_conditioned_batches = [
    weekly_factor_conditioned_training_data[i:i+month_batch_size] 
    for i in range(0, len(weekly_factor_conditioned_training_data), month_batch_size)]

monthly_price_conditioned_batches = [
    weekly_factor_conditioned_training_data[i:i+month_batch_size] 
    for i in range(0, len(weekly_factor_conditioned_training_data), month_batch_size)]

mlflow.log_param("monthly_batch_size", month_batch_size)
mlflow.log_param("weekly_size", week_size)

  group.index[0].to_period(group_by.value).to_timestamp(): group['close'].to_numpy()
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean, casting='unsafe',
  ret = ret.dtype.type(ret / rcount)
  weekly_data = {k: np.array([v[i:i+week_size] for i in range(0, len(v), week_size)]) for k, v in monthly_percent_change_prices.items()}
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  percent_diff = np.diff(month_data)/month_data[:-1]
  percent_diff = np.diff(month_data)/month_data[:-1]


5

# Conditional Evaluation

In [6]:
batch_size = 4
batch_to_rmse = {}
synthetic_means = []
synthetic_stds = []
mlflow.log_param("training_batch_size", batch_size)

monthly_factor_conditioned_training_data = monthly_factor_conditioned_batches[:40]
monthly_factor_conditioned_testing_data = monthly_factor_conditioned_batches[40:]

monthly_price_conditioned_training_data = monthly_price_conditioned_batches[:40]
monthly_price_conditioned_testing_data = monthly_price_conditioned_batches[40:]

factor_conditioned_training_weeks = [week for batch in monthly_factor_conditioned_training_data for week in batch]
factor_conditioned_testing_weeks = [week for batch in monthly_factor_conditioned_testing_data for week in batch]

price_conditioned_training_weeks = [week for batch in monthly_price_conditioned_training_data for week in batch]
price_conditioned_testing_weeks = [week for batch in monthly_price_conditioned_testing_data for week in batch]

factor_training_data = utilities.ConditionedMarketDataset(factor_conditioned_training_weeks)
factor_testing_data = utilities.ConditionedMarketDataset(factor_conditioned_testing_weeks)

price_training_data = utilities.ConditionedMarketDataset(price_conditioned_training_weeks)
price_testing_data = utilities.ConditionedMarketDataset(price_conditioned_testing_weeks)

factor_train_dataset = DataLoader(factor_training_data, batch_size=batch_size, shuffle=True)
factor_testing_dataset = DataLoader(factor_testing_data, batch_size=batch_size, shuffle=True)

price_train_dataset = DataLoader(price_training_data, batch_size=batch_size, shuffle=True)
price_testing_dataset = DataLoader(price_testing_data, batch_size=batch_size, shuffle=True)

factor_cvae = CVAE(5, 6).to(utilities.DEVICE)
factor_history = utilities.train_model(factor_cvae, factor_train_dataset, factor_testing_dataset, epochs=10)
[mlflow.log_metric(f"factor_validation_loss", val_loss) for val_loss in factor_history]
print(f"factor validation loss: {factor_history[-1]}")

price_cvae = CVAE(5, 6).to(utilities.DEVICE)
price_history = utilities.train_model(price_cvae, price_train_dataset, price_testing_dataset, epochs=10)
[mlflow.log_metric(f"price_validation_loss", val_loss) for val_loss in price_history]
print(f"price validation loss: {price_history[-1]}")


mlflow.end_run()

factor validation loss: 0.01795613393187523
price validation loss: 0.02567167952656746
