In [10]:
import numpy as np
import pandas as pd
import torch
import sklearn.model_selection
import sklearn.preprocessing

### Load Data

In [2]:
df = pd.read_pickle('data/processed.pkl')
df = df.set_index('DATE')  # Index for timeseries is datetime

df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 592 entries, 1963-01-01 to 2012-04-01
Data columns (total 20 columns):
 #   Column                                                                     Non-Null Count  Dtype  
---  ------                                                                     --------------  -----  
 0   MSACSR                                                                     592 non-null    float64
 1   MSPUS                                                                      592 non-null    float64
 2   S&P Comp.                                                                  592 non-null    float64
 3   Dividend                                                                   592 non-null    float64
 4   Earnings                                                                   592 non-null    float64
 5   Consumer Price Index CPI                                                   592 non-null    float64
 6   Long Interest Rate GS10                

### Data Interpolation

In [None]:
# Todo: Data interpolation if necessary

### Data Preprocessing

In [12]:
# Split into input and target variables

X = df.drop('MSPUS', axis='columns')
y = df['MSPUS'].values
y = y.reshape(-1, 1)  # Put target variable into column vector shape

print(X.shape)
print(y.shape)

(592, 19)
(592, 1)


In [6]:
# Scale data

standard_scaler = sklearn.preprocessing.StandardScaler()  # Scales to mean of 0 and var of 1
minmax_scaler = sklearn.preprocessing.MinMaxScaler()  # Scale to between 0 and 1

X_scaled = standard_scaler.fit_transform(X)  
y_scaled = minmax_scaler.fit_transform(y)    

print("X mean:", X_scaled.mean())
print("X var:", X_scaled.var())
print("y min:", y_scaled.min())
print("y max:", y_scaled.max())

X mean: -3.916576246188319e-16
X var: 1.0
y min: 0.0
y max: 1.0000000000000002


In [19]:
# NOTE: These values are for monthly data
#       Change these if interpolating!
IN_SEQ_LENGTH = 10
OUT_SEQ_LENGTH = 5

def split_sequences(input_sequences, output_sequence, n_steps_in, n_steps_out):
    X = []
    y = []
    for i in range(len(input_sequences)):
        end_ix = i + n_steps_in
        out_end_ix = end_ix + n_steps_out - 1
        if out_end_ix > len(input_sequences):
            break
        seq_x, seq_y = input_sequences[i:end_ix], output_sequence[end_ix-1:out_end_ix, -1]
        X.append(seq_x), y.append(seq_y)
        
    return np.array(X), np.array(y)

X_seq, y_seq = split_sequences(X_scaled, y_scaled, IN_SEQ_LENGTH, OUT_SEQ_LENGTH)

# Make 
print(X_seq.shape, y_seq.shape)

assert y_seq[0].all() == y_scaled[IN_SEQ_LENGTH-1:IN_SEQ_LENGTH-1+OUT_SEQ_LENGTH].squeeze(1).all()

(579, 10, 19) (579, 5)


In [24]:
# Split data into train and test sets
X_train, X_val, y_train, y_val = sklearn.model_selection.train_test_split(X_seq, y_seq, test_size=0.05)

# Convert datasets to nodes in computational graph
X_train_tensors = torch.Tensor(X_train)
y_train_tensors = torch.Tensor(y_train)
X_val_tensors = torch.Tensor(X_val)
y_val_tensors = torch.Tensor(y_val)

# Convert to sequential data for pytorch
X_train_tensors = torch.reshape(X_train_tensors, 
                                (X_train_tensors.shape[0], IN_SEQ_LENGTH, X_train_tensors.shape[2])
                               )
X_val_tensors = torch.reshape(X_val_tensors, (X_val_tensors.shape[0], IN_SEQ_LENGTH, X_val_tensors.shape[2]))


print(X_train_tensors.shape)
print(X_val_tensors.shape)

torch.Size([550, 10, 19])
torch.Size([29, 10, 19])


In [30]:
# Sanity Check data
X_check, y_checkk = split_sequences(X, y.reshape(-1, 1), IN_SEQ_LENGTH, OUT_SEQ_LENGTH)
print(X_check[-1][0:4])

start_ix = IN_SEQ_LENGTH + OUT_SEQ_LENGTH

assert X.iloc[-start_ix + 1: -start_ix + 5].all() == X_check[1][0:4].all()

[[7.2000000e+00 1.3044900e+03 2.3430000e+01 8.1310000e+01 2.2347000e+02
  3.4100000e+00 1.6948900e+03 3.0440000e+01 8.8240620e+05 1.0564000e+02
  5.5001150e+04 2.2900000e+01 2.5270000e+01 3.3600000e+00 1.0000000e+00
  4.3690000e+01 1.1900000e+01 1.9500000e+00 9.9500000e+00]
 [6.7000000e+00 1.3315100e+03 2.3730000e+01 8.2160000e+01 2.2491000e+02
  3.4600000e+00 1.7189300e+03 3.0640000e+01 8.9625005e+05 1.0607000e+02
  5.5304800e+04 2.3140000e+01 2.5530000e+01 3.2900000e+00 1.0300000e+00
  4.3350000e+01 1.2290000e+01 1.9400000e+00 1.0360000e+01]
 [6.6000000e+00 1.3383100e+03 2.4040000e+01 8.3020000e+01 2.2596000e+02
  3.1700000e+00 1.7196200e+03 3.0890000e+01 8.9795133e+05 1.0667000e+02
  5.5700790e+04 2.3060000e+01 2.5420000e+01 3.6000000e+00 1.0200000e+00
  4.4330000e+01 1.2270000e+01 1.6600000e+00 1.0610000e+01]
 [6.6000000e+00 1.2872900e+03 2.4340000e+01 8.3870000e+01 2.2572000e+02
  3.0000000e+00 1.6558300e+03 3.1310000e+01 8.6600739e+05 1.0788000e+02
  5.6422440e+04 2.2100000e+01 2

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().