In [11]:
import os
import math
import time
import argparse
import numpy as np
from sklearn import metrics
import torch
import torch.optim as optim
torch.set_num_threads(6)

from model import MPA, TotalLoss
from utlis import get_base_company, get_features_n_labels, get_windows_mpa

In [2]:
# Experiment configuration, expressed as argparse options. The values are
# parsed from an explicit token list below, so sys.argv is never consulted.
parser = argparse.ArgumentParser(description='Stock Forecasting')

# (flag, type, default), registered in this order.
_OPTION_TABLE = (
    ('--prestart_time', str, '2000-01-01'),
    ('--start_time', str, '2004-10-31'),
    ('--end_time', str, '2020-01-01'),
    ('--lagend_time', str, '2020-10-31'),
    ('--save_path', str, './output'),
    ('--lr', float, 0.0001),
    ('--weight_decay', float, 5e-4),
    ('--epochs', int, 400),
    ('--device', str, 'cpu'),
    ('--window_size', int, 12),
    ('--unmask_num', int, 1),
    ('--pred_acc', float, 0.6),
)
for _flag, _kind, _fallback in _OPTION_TABLE:
    parser.add_argument(_flag, type=_kind, default=_fallback)

# Overrides applied for this run; every option not listed keeps its default.
_RUN_OVERRIDES = (
    ('--save_path', './output'),
    ('--prestart_time', '2015-06-01'),
    ('--start_time', '2016-01-01'),
    ('--end_time', '2020-01-01'),
    ('--lagend_time', '2020-10-30'),
    ('--unmask_num', '1'),
    ('--pred_acc', '0.75'),  # mutation probability = 0.25
)
args = parser.parse_args(args=[token for pair in _RUN_OVERRIDES for token in pair])

In [1]:
# Each run gets its own sub-directory of args.save_path, named after the
# local wall-clock time at which this cell executes.
output_path = '/'.join((args.save_path, time.strftime(r'%Y-%m-%d_%H-%M-%S')))
output_filename = '/'.join((output_path, 'gdat.pt'))      # model checkpoint
train_log_filename = '/'.join((output_path, 'gdat.txt'))  # text training log
if os.path.exists(output_path):
    print("Output dir '{}' is existed.".format(output_path))
else:
    os.makedirs(output_path)
    print("Output dir '{}' has been created.".format(output_path))

## Load Data

In [2]:
# Pick the tickers for the configured run, then build the feature/label
# tensors (plus the company list and timestamps returned alongside them).
selected_tickers = get_base_company(args)
(features,
 labels,
 company_final,
 final_timestamps) = get_features_n_labels(selected_tickers=selected_tickers, args=args)
# Two-class target: 1 where the raw label is strictly positive, 0 otherwise,
# stored in the same dtype as `labels`.
binary_labels = (labels > 0).to(labels.dtype)

## Train and Evaluation

In [5]:
def train(mode, args):
    """Run one optimisation pass over the training windows.

    Uses the notebook-level ``model``, ``criterion``, ``optimizer``,
    ``features``, ``binary_labels``, ``device`` and ``rmv_feature_num``.

    Args:
        mode: Only ``'pre-train'`` is handled; for any other value the model
            is still switched to training mode but nothing else happens.
        args: Parsed run configuration, forwarded to ``get_windows_mpa``.

    Returns:
        The mean of the per-batch loss values, or ``None`` when ``mode`` is
        not ``'pre-train'``.
    """
    model.train()
    if mode != 'pre-train':
        return None

    batch_losses = []
    batches = get_windows_mpa(features, binary_labels, mode=mode, dataset='train',
                              device=device, args=args, shuffle=True)
    for x, x_tag, y, mask in batches:
        # The first rmv_feature_num entries of the last axis are not model inputs.
        model_input = x[:, :, rmv_feature_num:]
        y_hat, h_pmt, l_ort, l_pol, V1, V0 = model(model_input, x_tag)
        loss = criterion(y_hat, h_pmt, l_ort, l_pol, y.long(), V1, V0)

        optimizer.zero_grad()
        loss.backward()
        # Clamp every gradient element to [-3, 3] before the parameter update.
        torch.nn.utils.clip_grad_value_(model.parameters(), 3.)
        optimizer.step()

        batch_losses.append(loss.item())

    return np.array(batch_losses).mean()

In [6]:
def test(model, dataset, mode, args, cls_report=False):
    """Evaluate ``model`` on one data split and return its masked accuracy.

    Uses the notebook-level ``features``, ``binary_labels``, ``device`` and
    ``rmv_feature_num``. Gradients are disabled for the whole pass.

    Args:
        model: Network to evaluate; it is switched to eval mode.
        dataset: Split name forwarded to ``get_windows_mpa`` (the notebook
            uses ``'train'``, ``'valid'`` and ``'test'``).
        mode: Only ``'pre-train'`` is handled; any other value returns ``None``.
        args: Parsed run configuration, forwarded to ``get_windows_mpa``.
        cls_report: When truthy, also return the labels and predictions that
            the accuracy was computed from.

    Returns:
        ``accuracy`` (0-d numpy array) when ``cls_report`` is falsy, otherwise
        the tuple ``(accuracy, ys, predictions)`` with both tensors on the CPU.
        Only entries selected by each batch's ``mask`` are scored.
    """
    model.eval()
    if mode != 'pre-train':
        return None

    # Seed both lists with an empty float tensor so the concatenated result has
    # the same dtype as repeated concatenation onto torch.Tensor([]) would give,
    # and so an empty iterator still yields an (empty) tensor.
    prediction_chunks = [torch.Tensor([])]
    label_chunks = [torch.Tensor([])]
    with torch.no_grad():
        for x, x_tag, y, mask in get_windows_mpa(features, binary_labels, mode=mode,
                                                 dataset=dataset, args=args, device=device):
            y_hat, h_pmt, _, _, V1, V0 = model(x[:, :, rmv_feature_num:], x_tag)
            # Rows selected by V0 take their scores from h_pmt instead of y_hat.
            y_hat[V0] = h_pmt[V0]
            prediction = y_hat.max(1)[1]  # index of the largest score along dim 1
            prediction_chunks.append(prediction[mask].cpu())
            label_chunks.append(y[mask].cpu())

        # Concatenate once instead of re-copying the running tensors per batch.
        predictions = torch.cat(prediction_chunks, dim=0)
        ys = torch.cat(label_chunks, dim=0)
        accuracy = torch.eq(predictions, ys).float().mean().cpu().numpy()

    if cls_report:
        return accuracy, ys, predictions
    return accuracy

In [7]:
def main(pt=True):
    """Train for ``total_epoch`` epochs, logging metrics and checkpointing.

    Uses the notebook-level ``train``, ``test``, ``model``, ``optimizer``,
    ``scheduler``, ``args``, ``total_epoch``, ``train_log_filename`` and
    ``output_filename``.

    The log file is truncated and given a header first. Each epoch then:

    1. runs one ``train`` pass and steps the LR scheduler;
    2. evaluates on the 'train', 'valid' and 'test' splits and logs one epoch
       line and one test line;
    3. on every 5th epoch, or whenever validation accuracy beats the best seen
       so far, evaluates 'valid' and 'test' again and logs a second test line;
       when validation improved, the model ``state_dict`` is saved to
       ``output_filename`` beforehand.

    Args:
        pt: When True, every logged line is also printed, together with the
            sklearn classification reports of step 3. The log file is written
            regardless.

    Returns:
        None.
    """
    # TrainAcc uses the same fixed-point format as the other metrics.
    epoch_fmt = ("| Epoch {:3d} | TrainLoss {:6.4f} | TrainAcc {:6.4f} | "
                 "ValAcc {:6.4f} | ValMCC {:6.4f} | lr {:6.8f} |")
    test_fmt = "| TestAcc {:6.4f} | TestMCC {:6.4f} |"

    def _append_to_log(line):
        """Append one line to the training log file."""
        with open(train_log_filename, 'a', encoding='utf-8') as f:
            f.write(line + '\n')

    with open(train_log_filename, 'w', encoding='utf-8') as f:
        f.write('Train Log:' + '\n')

    best_val = -math.inf
    mode = 'pre-train'
    for epoch in range(1, total_epoch + 1):
        # ---------training------------
        train_loss = train(mode=mode, args=args)
        # Read the learning rate before the scheduler changes it, so the
        # logged value is the one this epoch was trained with.
        lr_temp = optimizer.param_groups[-1]['lr']
        scheduler.step()

        # --------evaluation-----------
        train_acc = test(mode=mode, model=model, args=args, dataset='train')
        val_acc, ys, preds = test(mode=mode, model=model, args=args, dataset='valid', cls_report=True)
        epoch_line = epoch_fmt.format(epoch, train_loss, train_acc, val_acc,
                                      metrics.matthews_corrcoef(ys, preds), lr_temp)
        if pt:
            print(epoch_line)
        _append_to_log(epoch_line)

        test_acc, ys, preds = test(mode=mode, model=model, dataset='test', args=args, cls_report=True)
        test_line = test_fmt.format(test_acc, metrics.matthews_corrcoef(ys, preds))
        if pt:
            print(test_line)
        _append_to_log(test_line)

        # -----------------------------
        improved = val_acc > best_val
        if not (epoch % 5 == 0 or improved):
            continue
        if improved:
            torch.save(model.state_dict(), output_filename)
            best_val = val_acc

        # NOTE(review): both splits are evaluated a second time here. The calls
        # are kept as-is because it is not visible from this notebook whether
        # get_windows_mpa is deterministic - confirm before de-duplicating.
        _, ys, preds = test(mode=mode, model=model, dataset='valid', args=args, cls_report=True)
        if pt:
            print('VALID CLASSIFICATION: ')
            print(metrics.classification_report(ys, preds))

        test_acc, ys, preds = test(mode=mode, model=model, dataset='test', args=args, cls_report=True)
        test_line = test_fmt.format(test_acc, metrics.matthews_corrcoef(ys, preds))
        if pt:
            print(test_line)
            print('TEST CLASSIFICATION: ')
            print(metrics.classification_report(ys, preds))
        _append_to_log(test_line)

In [3]:
# Training hyper-parameters used by the cells below.
learning_rate = 0.0001
total_epoch = 200
# Number of leading entries on the last feature axis that train()/test()
# slice off before calling the model.
rmv_feature_num = 6

# Prefer the second GPU, as before, but only when it actually exists:
# "cuda:1" is an invalid device on a single-GPU machine even though
# torch.cuda.is_available() is True there.
if torch.cuda.is_available():
    device = "cuda:1" if torch.cuda.device_count() > 1 else "cuda:0"
else:
    device = "cpu"
print("Device: '{}'.".format(device))

In [4]:
print("Creating model...")
# The model input width is the feature width minus the rmv_feature_num
# entries that train()/test() slice off the last axis.
n_feat = features.shape[2] - rmv_feature_num
num_nodes = features.shape[1]

model = MPA(n_feat=n_feat)
model = model.to(device)
criterion = TotalLoss()
criterion = criterion.to(device)

# NOTE(review): args.weight_decay and args.lr are parsed but not passed to the
# optimizer here - confirm whether that is intentional.
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# Multiply the learning rate by 0.9 after every 10 scheduler steps.
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.9)
print("Done.")

# Rename the log after the mutation percentage, (1 - pred_acc) * 100.
# The product is rounded rather than truncated: binary floating point gives
# e.g. (1 - 0.8) * 100 == 19.999999999999996, which int() alone turns into 19.
train_log_filename = output_path + '/' + 'mutation_{}.txt'.format(int(round((1 - args.pred_acc) * 100)))
# pt=False: metrics go to the log file only, nothing is printed.
main(pt=False)