In [None]:
import os
import json
import pandas as pd
import numpy as np
import curvlearn as cv
from curvlearn.manifolds.manifold import Manifold
import torch
import torch.nn as nn
from sklearn.feature_selection import mutual_info_regression
from sklearn.preprocessing import MinMaxScaler
from diffpool_helpers.model.diffpool_continuous import TSDiffPool
import argparse
import time
import random
from functools import lru_cache
import networkx as nx
import yfinance
import numpy as np
import pandas as pd
import yfinance as yf
from datetime import timedelta
import datetime
import requests
from math import floor
import dgl
from eodhd import APIClient
from sklearn.preprocessing import StandardScaler

# Import X and Y

### X

In [2]:
# Load every per-symbol CSV from the data directory into `dt`, keyed by the
# part of the file name before the first underscore.
DATA_DIR = 'filtered_dt_428'

dt = {}
all_cols = None
Y = {}
# os.listdir returns entries in arbitrary order. The iteration order of `dt`
# later determines each symbol's index and the positional train/test split,
# so sort the listing to make runs reproducible.
for file in sorted(os.listdir(DATA_DIR)):
    if not file.endswith('.csv'):
        continue
    # NOTE(review): two files sharing the same prefix silently overwrite each
    # other here; with the sorted listing the last one alphabetically wins.
    k = file.split('_')[0]
    frame = pd.read_csv(os.path.join(DATA_DIR, file))
    # Keep only the first occurrence of any repeated column label.
    frame = frame.loc[:, ~frame.columns.duplicated()]
    # Store columns in alphabetical order.
    dt[k] = frame.reindex(sorted(frame.columns), axis=1)

In [3]:
import json

# Read the label mapping stored in Y.json into the dictionary Y.
with open('Y.json', 'r') as label_file:
    Y = json.loads(label_file.read())

In [5]:
# Sanity check: report every DataFrame that still carries repeated column labels.
for k, df in dt.items():
    duplicate_columns = df.columns[df.columns.duplicated()]
    if len(duplicate_columns) == 0:
        continue
    print(f"Duplicate columns found in DataFrame '{k}': {', '.join(duplicate_columns)}")

In [6]:
def parse_columns(df):
    """
    Return the column names of `df` with any leading "{ticker}:" prefix removed.

    Only the text up to the first ":" is treated as the prefix, so a label
    containing several colons keeps everything after the first one. Labels
    without a ":" and non-string labels are returned unchanged.

    Args:
        df: Object exposing a `.columns` iterable (e.g. a pandas DataFrame).

    Returns:
        list: One entry per column, in the original column order.
    """
    new_columns = []
    for col in df.columns:
        if isinstance(col, str) and ":" in col:
            # maxsplit=1 keeps the two-name unpacking valid for labels such
            # as "TICKER:a:b" (a plain split would yield three parts).
            _ticker, real_col_name = col.split(":", 1)
            new_columns.append(real_col_name)
        else:
            new_columns.append(col)
    return new_columns

# Union of all prefix-stripped column names across the DataFrames.
# It is materialized as a sorted list rather than a set: a set's iteration
# order for strings can change between interpreter runs, which would make the
# feature order (and everything derived from it) non-reproducible.
all_cols = sorted(set().union(*(parse_columns(df) for df in dt.values())))

# Strip the ticker prefix from every DataFrame and align all of them to the
# same column list; columns a DataFrame lacks are created and filled with 0.
for k in list(dt.keys()):
    dt[k].columns = parse_columns(dt[k])
    try:
        dt[k] = dt[k].reindex(columns=all_cols, fill_value=0)
    except Exception as e:
        # Best effort: report the failing key and keep going. The affected
        # DataFrame keeps its own (unaligned) columns.
        print(k, e)

In [None]:
# Distinct column counts across all DataFrames (a single value means they are aligned).
np.unique([frame.shape[1] for frame in dt.values()])

## Y

In [8]:
# Parse the contents of Y.json into the Python dictionary Y.
with open('Y.json', 'r') as file:
    Y = json.loads(file.read())

# Autoencoder

# Drop range-index columns and standardize numeric features

In [9]:
# Reference series 0..max_len-1, kept for any later cell that reads it.
range_list = pd.Series(range(max(len(df) for df in dt.values())))

for k in list(dt.keys()):
    frame = dt[k]

    # A "range column" holds exactly 0, 1, ..., n-1 (e.g. a row index that
    # was written to the CSV). Comparing raw values instead of Series avoids
    # any dependence on the DataFrame's index labels.
    expected = np.arange(len(frame))
    range_like = [
        col for col in frame.columns
        if len(frame) > 0
        and pd.api.types.is_numeric_dtype(frame[col])
        and np.array_equal(frame[col].to_numpy(), expected)
    ]
    frame = frame.drop(columns=range_like)

    # Select numeric columns from the frame AFTER the drop, so the dropped
    # columns are not written back by the assignment below.
    numeric_cols = frame.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) > 0:
        # Per-column z-score.
        column_means = frame[numeric_cols].mean(axis=0)
        column_stds = frame[numeric_cols].std(axis=0)
        # Constant columns (such as those zero-filled during column
        # alignment) have std 0, and single-row frames have std NaN; divide
        # by 1 instead so they become 0 rather than NaN/inf.
        column_stds = column_stds.replace(0, 1).fillna(1)
        frame[numeric_cols] = (frame[numeric_cols] - column_means) / column_stds

    dt[k] = frame

# Create Matrices

In [10]:
# nodes
# One (max_length x n_numeric_features) tensor per symbol, in dt order.
padded_tensors = []
max_length = 7  # number of trailing rows (time steps) kept per symbol
for df in dt.values():
    num_cols = df.columns[df.dtypes != 'object']
    df = df[num_cols]
    # Keep the LAST `max_length` rows. No padding is applied, so a DataFrame
    # with fewer rows yields a shorter tensor.
    padded_tensor = torch.tensor(df.iloc[-max_length:].values.tolist())
    padded_tensors.append(padded_tensor)

In [11]:
# T x batch size x features
X = torch.stack(padded_tensors, dim=1)

# Standardize each feature over all time steps and symbols.
mean_tensor = torch.mean(X, dim=(0, 1), keepdim=True)
std_tensor = torch.std(X, dim=(0, 1), unbiased=False, keepdim=True)
# A feature that is constant everywhere has std 0; dividing by it would turn
# the whole feature into NaN, so divide by 1 instead (the feature becomes 0).
std_tensor = torch.where(std_tensor == 0, torch.ones_like(std_tensor), std_tensor)
X = (X - mean_tensor) / std_tensor

# NOTE(review): resetting padded_tensors means this cell cannot be re-run
# without first re-running the cell that fills it.
padded_tensors = []
max_length = max(len(df) for df in dt.values())

In [12]:
# Feature-by-feature adjacency: adj[i, j] is the estimated mutual information
# between feature i (as predictor) and feature j (as target), pooled over all
# time steps and symbols.
#
# X is (T, symbols, features), so the feature axis is the LAST one; indexing
# X[:, i] would select a symbol instead of a feature.
MI_RANDOM_STATE = 0  # fixed seed: the estimator adds random noise to its inputs

ncol = X.shape[2]
samples = X.reshape(-1, ncol).numpy()  # (T * symbols, features)

adj = torch.zeros(ncol, ncol, dtype=torch.float32)
for j in range(ncol):
    # One call scores every feature against target j, filling column j.
    mi_to_j = mutual_info_regression(samples, samples[:, j], random_state=MI_RANDOM_STATE)
    adj[:, j] = torch.as_tensor(mi_to_j, dtype=adj.dtype)

In [28]:
# pickle is imported here because this cell uses it before the later cell
# that imports it; without this the cell fails on a fresh kernel.
import pickle

# Persist the normalized feature tensor X to disk.
with open('X.pkl', 'wb') as f:
    pickle.dump(X, f)

In [13]:
# Rescale the adjacency weights by their overall standard deviation.
adj = torch.div(adj, adj.std())

## Create Graphs

In [15]:
from sklearn.feature_selection import mutual_info_regression
max_t, num_symbols, num_features = X.shape

# Temporal edge weights. Only the diagonal [i, i, t-1] is filled: it links
# feature i at time t-1 to the same feature at time t.
temporal_edge_weights = torch.zeros((num_features, num_features, max_t - 1))

for t in range(1, max_t):
    for i in range(num_features):
        # Estimated mutual information, across symbols, between feature i at
        # time t and the same feature at time t-1. A fixed random_state keeps
        # the estimate reproducible.
        mi = mutual_info_regression(
            X[t, :, i].unsqueeze(-1).numpy(),
            X[t - 1, :, i].numpy(),
            random_state=0,
        )
        # The estimator returns one value per input feature; there is one here.
        temporal_edge_weights[i, i, t - 1] = float(mi[0])

In [17]:
def create_graph(symbol_idx, k, dt=dt):
    """
    Build the spatio-temporal feature graph for one symbol.

    Every (time step, feature) pair is a node with id ``t * n_feats + i``.
    Within each time step, every ordered pair of distinct features (i, j) with
    ``adj[i, j] > -1`` is joined by an edge weighted ``adj[i, j]``. Each feature
    is also linked to itself at the next time step with weight
    ``temporal_edge_weights[i, i, t]``.

    Reads the module-level ``X``, ``adj`` and ``temporal_edge_weights``.

    Args:
       symbol_idx (int): Index of the symbol along dimension 1 of ``X``.
       k (str): Key of the symbol's DataFrame in ``dt``; its non-object
           columns define the feature set.
       dt (dict, optional): Mapping from key to DataFrame. Defaults to the
           global ``dt`` as bound when this function was defined.

    Returns:
       dgl.DGLGraph: Graph whose ``ndata['feat']`` is a float64 tensor of shape
       (n_steps * n_feats, 2) holding ``[feature value, time step]`` per node
       and whose ``edata['x']`` is a float64 tensor of edge weights.

    Raises:
       ValueError: If the number of non-object columns of ``dt[k]`` differs
           from the feature dimension of ``X``.
    """
    df = dt[k]
    feature_cols = df.columns[df.dtypes != 'object']
    n_feats = len(feature_cols)
    n_steps = X.shape[0]
    if n_feats != X.shape[2]:
        raise ValueError(
            f"DataFrame '{k}' has {n_feats} numeric columns but X has {X.shape[2]} features"
        )

    # Node features for EVERY time step, including the last one. Flattening
    # the (n_steps, n_feats) slice row-major matches the t * n_feats + i ids.
    values = X[:, symbol_idx, :].to(torch.float64).reshape(-1)
    steps = torch.arange(n_steps, dtype=torch.float64).repeat_interleave(n_feats)
    node_data = torch.stack((values, steps), dim=1)

    # The within-step edge pattern is the same at every time step, so build
    # it once. Self-pairs are skipped.
    spatial_edges = [
        (i, j, float(adj[i, j]))
        for i in range(n_feats)
        for j in range(n_feats)
        if i != j and adj[i, j] > -1
    ]

    n1 = []
    n2 = []
    weights = []
    for t in range(n_steps):
        offset = t * n_feats
        for i, j, w in spatial_edges:
            n1.append(offset + i)
            n2.append(offset + j)
            weights.append(w)
        # Temporal edges exist only between consecutive steps.
        if t + 1 < n_steps:
            for i in range(n_feats):
                n1.append(offset + i)
                n2.append(offset + n_feats + i)
                weights.append(float(temporal_edge_weights[i, i, t]))

    # num_nodes is passed explicitly so the node count never depends on
    # which edges happen to exist.
    G = dgl.graph(
        (torch.tensor(n1, dtype=torch.int64), torch.tensor(n2, dtype=torch.int64)),
        num_nodes=n_steps * n_feats,
    )
    # set node data
    G.ndata['feat'] = node_data
    # set edge data
    G.edata['x'] = torch.tensor(weights, dtype=torch.float64)
    return G

In [18]:
max_t = X.size(0)
graphs = []

# Walk the keys of dt in order; the position of a key is the symbol's index.
# Every key that has a label in Y contributes one (graph, label) pair.
batched_graphs = [
    (create_graph(symbol_idx, symbol), Y[symbol])
    for symbol_idx, symbol in enumerate(dt)
    if symbol in Y
]

# The training data is the plain list of pairs (no DataLoader wrapper).
train_data = batched_graphs
# train_data = torch.utils.data.DataLoader(batched_graphs)

In [None]:
# Shape of the node-feature tensor of the first graph.
batched_graphs[0][0].ndata['feat'].size()

# Train/test split

In [19]:
# Positional split: the leading 10% of the pairs (rounded down) form the test
# set and the remainder forms the train set.
test_size = int(0.1 * len(batched_graphs))
test_set, train_set = batched_graphs[:test_size], batched_graphs[test_size:]

In [20]:
import pickle

# Write each split to its own pickle file: the test set first, then the train set.
for path, split in (('test_set.pkl', test_set), ('train_set.pkl', train_set)):
    with open(path, 'wb') as f:
        pickle.dump(split, f)