In [5]:
import math
import numbers
import os
import pandas as pd
import platform
import shutil
import time
import torch
import torch.cuda
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision.transforms as transforms

from collections import Counter
from datetime import date
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, random_split
from torch.utils.data.sampler import WeightedRandomSampler
from torchvision.datasets import ImageFolder
from tqdm.notebook import tqdm


In [6]:
CUDA_ENABLED = torch.cuda.is_available()
if CUDA_ENABLED:
    print(torch.cuda.get_device_name())
else:
    print(platform.processor())
device = torch.device('cuda' if CUDA_ENABLED else 'cpu')

NVIDIA GeForce RTX 4090


In [7]:
def create_training_space(market_index: str, lookback_days: int, forecast_days: int, ma_line: int, margins: tuple):
    workspace = os.path.abspath(f'../../../training/{market_index}/i{lookback_days}-r{forecast_days}-ma{ma_line}')
    os.makedirs(workspace, exist_ok=True)
    
    train_folder_path = os.path.join(workspace, 'input/train')
    test_folder_path = os.path.join(workspace, 'input/test')
    for cls_label in (0, 1, 2):
        os.makedirs(os.path.join(train_folder_path, str(cls_label)), exist_ok=True)
        os.makedirs(os.path.join(test_folder_path, str(cls_label)), exist_ok=True)
    
    graph_data_folder = f'../../../dataset/ohlc_graphs/{market_index}/i{lookback_days}-ma{ma_line}'
    train_files, test_files = train_test_split(os.listdir(graph_data_folder), test_size=0.2)
    
    return_data = None
    for item in os.scandir(f'../../../dataset/returns/{market_index}'):
        if item.path.endswith('.pkl'):
            df = pd.read_pickle(os.path.abspath(item.path))
            return_data = pd.concat([return_data, df], ignore_index=True) if isinstance(return_data, pd.DataFrame) else df
    
    return_data = return_data.set_index(['date', 'ticker'])
    for file in tqdm(train_files):
        ticker, dt = file.replace('.png', '').split('_')
        return_val = return_data.loc[dt, ticker][f'r{forecast_days}']
        if isinstance(return_val, numbers.Number) and not math.isnan(return_val):
            if return_val < margins[0]:
                label = 0
            elif return_val > margins[1]:
                label = 2
            else:
                label = 1
            src_path = os.path.join(graph_data_folder, file)
            target_path = os.path.join(train_folder_path, str(label), file)
            shutil.copy(src_path, target_path)

    for file in tqdm(test_files):
        ticker, dt = file.replace('.png', '').split('_')
        return_val = return_data.loc[dt, ticker][f'r{forecast_days}']
        if isinstance(return_val, numbers.Number) and not math.isnan(return_val):
            if return_val < margins[0]:
                label = 0
            elif return_val > margins[1]:
                label = 2
            else:
                label = 1
            src_path = os.path.join(graph_data_folder, file)
            target_path = os.path.join(test_folder_path, str(label), file)
            shutil.copy(src_path, target_path)

# Moving graphs to the /training folder
create_training_space('nikkei_mid_small_cap', 20, 5, 50, (-0.033320, 0.044304))
create_training_space('nikkei_mid_small_cap', 60, 5, 50, (-0.054173, 0.092287))


  0%|          | 0/594426 [00:00<?, ?it/s]

  0%|          | 0/148607 [00:00<?, ?it/s]

  0%|          | 0/588557 [00:00<?, ?it/s]

  0%|          | 0/147140 [00:00<?, ?it/s]

In [16]:
class I5_CNNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 64, (5, 3), padding=(2, 1), stride=(1, 1), dilation=(1, 1))
        self.conv2 = nn.Conv2d(64, 128, (5, 3), padding=(2, 1), stride=(1, 1), dilation=(1, 1))
        self.bn1 = nn.BatchNorm2d(64, affine=True)
        self.bn2 = nn.BatchNorm2d(128, affine=True)
        self.pool = nn.MaxPool2d((2, 1))
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(15360, 3)
        self.init_weights(self.conv1)
        self.init_weights(self.conv2)
        self.init_weights(self.fc)
        
    def init_weights(self, m):
        if isinstance(m, nn.Linear) or isinstance(m, nn.Conv2d):
            nn.init.xavier_uniform_(m.weight)
            m.bias.data.fill_(0.01)
            
    def forward(self, x):
        x = self.pool(F.leaky_relu(self.bn1(self.conv1(x))))
        x = self.pool(F.leaky_relu(self.bn2(self.conv2(x))))
        x = self.dropout(x.view(x.shape[0], -1))
        x = self.fc(x)
        x = F.softmax(x, dim=1)
        return x

class I20_CNNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 64, (5, 3), padding=(3, 1), stride=(3, 1), dilation=(2, 1))
        self.conv2 = nn.Conv2d(64, 128, (5, 3), padding=(3, 1), stride=(1, 1), dilation=(1, 1))
        self.conv3 = nn.Conv2d(128, 256, (5, 3), padding=(2, 1), stride=(1, 1), dilation=(1, 1))
        self.bn1 = nn.BatchNorm2d(64, affine=True)
        self.bn2 = nn.BatchNorm2d(128, affine=True)
        self.bn3 = nn.BatchNorm2d(256, affine=True)
        self.pool = nn.MaxPool2d((2, 1))
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(46080, 3)
        self.init_weights(self.conv1)
        self.init_weights(self.conv2)
        self.init_weights(self.conv3)
        self.init_weights(self.fc)
        
    def init_weights(self, m):
        if isinstance(m, nn.Linear) or isinstance(m, nn.Conv2d):
            nn.init.xavier_uniform_(m.weight)
            m.bias.data.fill_(0.01)
            
    def forward(self, x):
        x = self.pool(F.leaky_relu(self.bn1(self.conv1(x))))
        x = self.pool(F.leaky_relu(self.bn2(self.conv2(x))))
        x = self.pool(F.leaky_relu(self.bn3(self.conv3(x))))
        x = self.dropout(x.view(x.shape[0], -1))
        x = self.fc(x)
        x = F.softmax(x, dim=1)
        return x
    
class I60_CNNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 64, (5, 3), padding=(3, 1), stride=(3, 1), dilation=(3, 1))
        self.conv2 = nn.Conv2d(64, 128, (5, 3), padding=(2, 1), stride=(1, 1), dilation=(1, 1))
        self.conv3 = nn.Conv2d(128, 256, (5, 3), padding=(2, 1), stride=(1, 1), dilation=(1, 1))
        self.conv4 = nn.Conv2d(256, 512, (5, 3), padding=(2, 1), stride=(1, 1), dilation=(1, 1))
        self.bn1 = nn.BatchNorm2d(64, affine=True)
        self.bn2 = nn.BatchNorm2d(128, affine=True)
        self.pool = nn.MaxPool2d((2, 1))
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(15360, 3)
        self.init_weights(self.conv1)
        self.init_weights(self.conv2)
        self.init_weights(self.fc)
        
    def init_weights(self, m):
        if isinstance(m, nn.Linear) or isinstance(m, nn.Conv2d):
            nn.init.xavier_uniform_(m.weight)
            m.bias.data.fill_(0.01)
            
    def forward(self, x):
        x = self.pool(F.leaky_relu(self.bn1(self.conv1(x))))
        x = self.pool(F.leaky_relu(self.bn2(self.conv2(x))))
        x = self.dropout(x.view(x.shape[0], -1))
        x = self.fc(x)
        x = F.softmax(x, dim=1)
        return x



In [18]:
BATCH_SIZE = 128
EPOCH = 100
LEARNING_RATE = 10e-6

def train_model(training_workspace: str, model_name: str):
    cnn_model = I5_CNNet().to(device) # Careful: hard-coded the model here
    
    #optimizer 
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(cnn_model.parameters(), lr=LEARNING_RATE)
    
    # data source (TODO: refactor the sampler to handle both train and test data)
    transform = transforms.Compose([transforms.Grayscale(), transforms.ToTensor()])
    dataset_folder = os.path.join(training_workspace, 'input/train')
    dataset = ImageFolder(dataset_folder, transform=transform)
    train_data, valid_data = random_split(dataset, [0.75, 0.25])
    train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
    valid_loader = DataLoader(valid_data, batch_size=BATCH_SIZE, shuffle=False)
    
    time_now = int(time.time())
    stat = []
    
    patience = 10
    best_val_loss = float('inf')
    best_model_state = None
    no_improvement_count = 0
    for epoch in range(EPOCH):
        # training
        cnn_model.train()
        print(f'Start epoch {epoch}')
        y_true, y_pred = [], []
        train_total_loss = 0.0
        train_correct = 0
        train_total = 0
        for inputs, labels in train_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            optimizer.zero_grad()
            outputs = cnn_model(inputs)
            
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            
            _, predicted = torch.max(outputs.data, 1)
            y_true.extend(labels.cpu().numpy())
            y_pred.extend(predicted.cpu().numpy())
            train_total_loss += loss.item()
            train_total += labels.size(0)
            train_correct += (predicted == labels).sum().item()
        train_f1 = f1_score(y_true, y_pred, average='macro')
        print(f'[Epoch Train {epoch + 1}] - Avg Loss = {train_total_loss / len(train_loader)} - Correct = {train_correct} - Accuracy = {train_correct / train_total} - f1 = {train_f1}')
        
        # validating
        cnn_model.eval()
        y_true, y_pred = [], []
        val_total_loss = 0.0
        val_correct = 0
        val_total = 0
        with torch.no_grad():
            for inputs, labels in valid_loader:
                inputs = inputs.to(device)
                labels = labels.to(device)
                outputs = cnn_model(inputs)
                
                loss = criterion(outputs, labels)
                
                _, predicted = torch.max(outputs.data, 1)
                y_true.extend(labels.cpu().numpy())
                y_pred.extend(predicted.cpu().numpy())
                val_total_loss += loss.item()
                val_total += labels.size(0)
                val_correct += (predicted == labels).sum().item()
            val_f1 = f1_score(y_true, y_pred, average='macro')
            print(f'[Epoch Valid {epoch + 1}] - Avg Loss = {val_total_loss / len(valid_loader)} - Correct = {val_correct} - Accuracy = {val_correct / val_total} - f1 = {val_f1}')
            
        stat.append({
            'epoch': epoch,
            'train_loss': train_total_loss,
            'train_accuracy': train_correct / train_total,
            'train_f1': train_f1,
            'val_loss': val_total_loss,
            'val_accuracy': val_correct / val_total,
            'val_f1': val_f1,
        })
        
        # early stopping
        if val_total_loss < best_val_loss:
            best_val_loss = val_total_loss
            best_model_state = cnn_model.state_dict()
            no_improvement_count = 0
            torch.save({
                'epoch': epoch,
                'model_state_dict': best_model_state,
                'optimizer_state_dict': optimizer.state_dict(),
            }, f'{training_workspace}/{model_name}-{time_now}.pth')
        else:
            no_improvement_count += 1
            if no_improvement_count >= patience:
                print('Early stopping triggered.')
                break
            
    pd.DataFrame(stat).to_csv(f'{training_workspace}/out-{time_now}.csv', index=False)
    

# Please check cnn_model variable in train_model() before you run it!
train_model('../../../training/nikkei_mid_small_cap/i5-r5-ma50', 'nikkei_mid_small_cap-i5r5ma50')

Start epoch 0
[Epoch Train 1] - Avg Loss = 1.1201624910629124 - Correct = 163201 - Accuracy = 0.36558544835465157 - f1 = 0.3652343779070073
[Epoch Valid 1] - Avg Loss = 1.0949459677611766 - Correct = 56560 - Accuracy = 0.3800998635780193 - f1 = 0.37010105240739843
Start epoch 1
[Epoch Train 2] - Avg Loss = 1.1139425636363138 - Correct = 165446 - Accuracy = 0.3706144575614346 - f1 = 0.3703868195063156
[Epoch Valid 2] - Avg Loss = 1.091649971229704 - Correct = 56734 - Accuracy = 0.38126919484150185 - f1 = 0.3778803780078035
Start epoch 2
[Epoch Train 3] - Avg Loss = 1.1098263080855575 - Correct = 166317 - Accuracy = 0.3725655787280751 - f1 = 0.37234475517126153
[Epoch Valid 3] - Avg Loss = 1.0902618649491747 - Correct = 56764 - Accuracy = 0.38147080368003333 - f1 = 0.37957820432962425
Start epoch 3
[Epoch Train 4] - Avg Loss = 1.1080335620961606 - Correct = 165755 - Accuracy = 0.3713066463564884 - f1 = 0.3711368457328217
[Epoch Valid 4] - Avg Loss = 1.0900471156153921 - Correct = 56956 -

[Epoch Train 32] - Avg Loss = 1.083924655349703 - Correct = 175748 - Accuracy = 0.39369189758293943 - f1 = 0.39334513661712506
[Epoch Valid 32] - Avg Loss = 1.0853565430087666 - Correct = 57401 - Accuracy = 0.38575163135151846 - f1 = 0.37165155074626854
Start epoch 32
[Epoch Train 33] - Avg Loss = 1.0836390617790572 - Correct = 175639 - Accuracy = 0.3934477274254609 - f1 = 0.39300912525645565
[Epoch Valid 33] - Avg Loss = 1.0855135552758097 - Correct = 57314 - Accuracy = 0.38516696571977715 - f1 = 0.3819702391384429
Start epoch 33
[Epoch Train 34] - Avg Loss = 1.0836893042224809 - Correct = 175664 - Accuracy = 0.3935037297551578 - f1 = 0.3931856251538384
[Epoch Valid 34] - Avg Loss = 1.0851394446472207 - Correct = 57357 - Accuracy = 0.38545593838833897 - f1 = 0.3747159200507831
Start epoch 34
[Epoch Train 35] - Avg Loss = 1.0831727583063853 - Correct = 176399 - Accuracy = 0.39515019824824715 - f1 = 0.3946768197043203
[Epoch Valid 35] - Avg Loss = 1.0856955330226306 - Correct = 57223 - 