In [None]:
import math
import numbers
import os
import pandas as pd
import platform
import shutil
import time
import torch
import torch.cuda
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision.transforms as transforms

from collections import Counter
from datetime import date
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, random_split
from torch.utils.data.sampler import WeightedRandomSampler
from torchvision.datasets import ImageFolder
from tqdm.notebook import tqdm


In [None]:
# Select the compute device once; every later cell moves models and batches to `device`.
CUDA_ENABLED = torch.cuda.is_available()
device = torch.device('cuda') if CUDA_ENABLED else torch.device('cpu')
# Report the hardware in use: the GPU name when CUDA is available, otherwise the CPU string.
print(torch.cuda.get_device_name() if CUDA_ENABLED else platform.processor())

## CNN Models

In [None]:
class I5_CNNet(nn.Module):
    """Two-stage CNN that maps a single-channel image to 3 class probabilities.

    Each stage is Conv2d -> BatchNorm2d -> leaky ReLU -> MaxPool2d((2, 1)),
    so only the height is halved per stage. The flattened features go through
    dropout and a linear layer that expects exactly 15360 inputs (for example
    a 1x32x15 image). ``forward`` returns softmax probabilities, not logits.
    """

    def __init__(self):
        super().__init__()
        kernel = (5, 3)
        unit = (1, 1)
        # Sub-modules are registered in a fixed order so parameter ordering is stable.
        self.conv1 = nn.Conv2d(1, 64, kernel, stride=unit, padding=(2, 1), dilation=unit)
        self.conv2 = nn.Conv2d(64, 128, kernel, stride=unit, padding=(2, 1), dilation=unit)
        self.bn1 = nn.BatchNorm2d(64, affine=True)
        self.bn2 = nn.BatchNorm2d(128, affine=True)
        self.pool = nn.MaxPool2d((2, 1))
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(15360, 3)
        for layer in (self.conv1, self.conv2, self.fc):
            self.init_weights(layer)

    def init_weights(self, m):
        """Xavier-uniform the weight and set every bias to 0.01 for Linear/Conv2d layers.

        Any other module type is left untouched.
        """
        if not isinstance(m, (nn.Linear, nn.Conv2d)):
            return
        nn.init.xavier_uniform_(m.weight)
        nn.init.constant_(m.bias, 0.01)

    def forward(self, x):
        """Return a (batch, 3) tensor of class probabilities for input ``x``."""
        for conv, norm in ((self.conv1, self.bn1), (self.conv2, self.bn2)):
            x = self.pool(F.leaky_relu(norm(conv(x))))
        flat = x.view(x.size(0), -1)
        scores = self.fc(self.dropout(flat))
        return F.softmax(scores, dim=1)

class I20_CNNet(nn.Module):
    """Three-stage CNN that maps a single-channel image to 3 class probabilities.

    Each stage is Conv2d -> BatchNorm2d -> leaky ReLU -> MaxPool2d((2, 1)).
    The first convolution uses vertical stride 3 and vertical dilation 2. The
    flattened features go through dropout and a linear layer that expects
    exactly 46080 inputs (for example a 1x64x60 image). ``forward`` returns
    softmax probabilities, not logits.
    """

    def __init__(self):
        super().__init__()
        kernel = (5, 3)
        unit = (1, 1)
        # Sub-modules are registered in a fixed order so parameter ordering is stable.
        self.conv1 = nn.Conv2d(1, 64, kernel, stride=(3, 1), padding=(3, 1), dilation=(2, 1))
        self.conv2 = nn.Conv2d(64, 128, kernel, stride=unit, padding=(3, 1), dilation=unit)
        self.conv3 = nn.Conv2d(128, 256, kernel, stride=unit, padding=(2, 1), dilation=unit)
        self.bn1 = nn.BatchNorm2d(64, affine=True)
        self.bn2 = nn.BatchNorm2d(128, affine=True)
        self.bn3 = nn.BatchNorm2d(256, affine=True)
        self.pool = nn.MaxPool2d((2, 1))
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(46080, 3)
        for layer in (self.conv1, self.conv2, self.conv3, self.fc):
            self.init_weights(layer)

    def init_weights(self, m):
        """Xavier-uniform the weight and set every bias to 0.01 for Linear/Conv2d layers.

        Any other module type is left untouched.
        """
        if not isinstance(m, (nn.Linear, nn.Conv2d)):
            return
        nn.init.xavier_uniform_(m.weight)
        nn.init.constant_(m.bias, 0.01)

    def forward(self, x):
        """Return a (batch, 3) tensor of class probabilities for input ``x``."""
        stages = (
            (self.conv1, self.bn1),
            (self.conv2, self.bn2),
            (self.conv3, self.bn3),
        )
        for conv, norm in stages:
            x = self.pool(F.leaky_relu(norm(conv(x))))
        flat = x.view(x.size(0), -1)
        scores = self.fc(self.dropout(flat))
        return F.softmax(scores, dim=1)
    
class I60_CNNet(nn.Module):
    """Four-stage CNN that maps a single-channel image to 3 class probabilities.

    Each stage is Conv2d -> BatchNorm2d -> leaky ReLU -> MaxPool2d((2, 1)).
    The first convolution uses vertical stride 3 and vertical dilation 3. The
    flattened features go through dropout and a linear layer that expects
    exactly 184320 inputs (for example a 1x96x180 image). ``forward`` returns
    softmax probabilities, not logits.
    """

    def __init__(self):
        super().__init__()
        kernel = (5, 3)
        unit = (1, 1)
        # Sub-modules are registered in a fixed order so parameter ordering is stable.
        self.conv1 = nn.Conv2d(1, 64, kernel, stride=(3, 1), padding=(3, 1), dilation=(3, 1))
        self.conv2 = nn.Conv2d(64, 128, kernel, stride=unit, padding=(3, 1), dilation=unit)
        self.conv3 = nn.Conv2d(128, 256, kernel, stride=unit, padding=(2, 1), dilation=unit)
        self.conv4 = nn.Conv2d(256, 512, kernel, stride=unit, padding=(2, 1), dilation=unit)
        self.bn1 = nn.BatchNorm2d(64, affine=True)
        self.bn2 = nn.BatchNorm2d(128, affine=True)
        self.bn3 = nn.BatchNorm2d(256, affine=True)
        self.bn4 = nn.BatchNorm2d(512, affine=True)
        self.pool = nn.MaxPool2d((2, 1))
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(184320, 3)
        for layer in (self.conv1, self.conv2, self.conv3, self.conv4, self.fc):
            self.init_weights(layer)

    def init_weights(self, m):
        """Xavier-uniform the weight and set every bias to 0.01 for Linear/Conv2d layers.

        Any other module type is left untouched.
        """
        if not isinstance(m, (nn.Linear, nn.Conv2d)):
            return
        nn.init.xavier_uniform_(m.weight)
        nn.init.constant_(m.bias, 0.01)

    def forward(self, x):
        """Return a (batch, 3) tensor of class probabilities for input ``x``."""
        stages = (
            (self.conv1, self.bn1),
            (self.conv2, self.bn2),
            (self.conv3, self.bn3),
            (self.conv4, self.bn4),
        )
        for conv, norm in stages:
            x = self.pool(F.leaky_relu(norm(conv(x))))
        flat = x.view(x.size(0), -1)
        scores = self.fc(self.dropout(flat))
        return F.softmax(scores, dim=1)

## Training & Validation

In [None]:
BATCH_SIZE = 16
EPOCH = 100
LEARNING_RATE = 1e-5
# Lower bound applied to predicted probabilities before taking the log, so a
# probability that underflowed to exactly 0 cannot produce an infinite loss.
_PROB_FLOOR = 1e-12


def _run_epoch(model, loader, criterion, optimizer=None):
    """Run one pass over ``loader``: train when ``optimizer`` is given, else evaluate.

    Args:
        model: network whose forward pass returns class probabilities.
        loader: iterable of ``(inputs, labels)`` batches.
        criterion: loss taking ``(log_probabilities, labels)``.
        optimizer: optimizer to step after each batch; ``None`` switches the
            model to eval mode and disables gradient tracking.

    Returns:
        Tuple ``(total_loss, correct, seen, macro_f1)`` where ``total_loss`` is
        the sum of per-batch losses and ``seen`` is the number of samples.
    """
    training = optimizer is not None
    model.train(training)
    y_true, y_pred = [], []
    total_loss = 0.0
    correct = 0
    seen = 0
    with torch.set_grad_enabled(training):
        for inputs, labels in loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            if training:
                optimizer.zero_grad()
            probs = model(inputs)

            # The model output is already softmax-normalised, so take its log
            # and use a negative log-likelihood criterion.
            loss = criterion(torch.log(probs.clamp_min(_PROB_FLOOR)), labels)
            if training:
                loss.backward()
                optimizer.step()

            predicted = probs.detach().argmax(dim=1)
            y_true.extend(labels.cpu().numpy())
            y_pred.extend(predicted.cpu().numpy())
            total_loss += loss.item()
            seen += labels.size(0)
            correct += (predicted == labels).sum().item()
    return total_loss, correct, seen, f1_score(y_true, y_pred, average='macro')


def train_model(training_workspace: str, model_name: str, model_cls=None):
    """Train a CNN on ``<training_workspace>/input/train`` with early stopping.

    The image folder is split 75/25 into training and validation subsets.
    Whenever the summed validation loss improves, a checkpoint holding the
    epoch, model state and optimizer state is written to
    ``<training_workspace>/<model_name>-<unix_time>.pth``. Training stops after
    ``patience`` (5) consecutive epochs without improvement or after ``EPOCH``
    epochs. Per-epoch statistics are written to
    ``<training_workspace>/out-<unix_time>.csv`` when training ends.

    Args:
        training_workspace: directory containing ``input/train`` and receiving
            the checkpoint and CSV outputs.
        model_name: prefix of the checkpoint file name.
        model_cls: zero-argument callable building the network. Its forward
            pass must return class probabilities (softmax applied), as the
            networks in this notebook do. Defaults to ``I60_CNNet``.
    """
    if model_cls is None:
        model_cls = I60_CNNet  # default kept from the previously hard-coded choice
    cnn_model = model_cls().to(device)

    # nn.CrossEntropyLoss expects raw logits and applies log-softmax itself.
    # The networks here already return softmax probabilities, so that loss
    # would apply softmax twice. NLLLoss on log-probabilities is the matching
    # cross-entropy for probability outputs.
    criterion = nn.NLLLoss()
    optimizer = optim.Adam(cnn_model.parameters(), lr=LEARNING_RATE)

    # data source (TODO: refactor the sampler to handle both train and test data)
    transform = transforms.Compose([transforms.Grayscale(), transforms.ToTensor()])
    dataset_folder = os.path.join(training_workspace, 'input/train')
    dataset = ImageFolder(dataset_folder, transform=transform)
    # NOTE: no generator is passed, so the split is re-drawn on every call.
    train_data, valid_data = random_split(dataset, [0.75, 0.25])
    train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
    valid_loader = DataLoader(valid_data, batch_size=BATCH_SIZE, shuffle=False)

    time_now = int(time.time())
    stat = []

    patience = 5
    best_val_loss = float('inf')
    no_improvement_count = 0
    for epoch in range(EPOCH):
        print(f'Start epoch {epoch}')

        # training
        train_total_loss, train_correct, train_total, train_f1 = _run_epoch(
            cnn_model, train_loader, criterion, optimizer)
        print(f'[Epoch Train {epoch + 1}] - Avg Loss = {train_total_loss / len(train_loader)} - Correct = {train_correct} - Accuracy = {train_correct / train_total} - f1 = {train_f1}')

        # validating
        val_total_loss, val_correct, val_total, val_f1 = _run_epoch(
            cnn_model, valid_loader, criterion)
        print(f'[Epoch Valid {epoch + 1}] - Avg Loss = {val_total_loss / len(valid_loader)} - Correct = {val_correct} - Accuracy = {val_correct / val_total} - f1 = {val_f1}')

        # The CSV stores summed per-batch losses (the prints above show the averages).
        stat.append({
            'epoch': epoch,
            'train_loss': train_total_loss,
            'train_accuracy': train_correct / train_total,
            'train_f1': train_f1,
            'val_loss': val_total_loss,
            'val_accuracy': val_correct / val_total,
            'val_f1': val_f1,
        })

        # early stopping
        if val_total_loss < best_val_loss:
            best_val_loss = val_total_loss
            no_improvement_count = 0
            # Persist the best weights immediately; the in-memory model keeps training.
            torch.save({
                'epoch': epoch,
                'model_state_dict': cnn_model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
            }, f'{training_workspace}/{model_name}-{time_now}.pth')
        else:
            no_improvement_count += 1
            if no_improvement_count >= patience:
                print('Early stopping triggered.')
                break

    pd.DataFrame(stat).to_csv(f'{training_workspace}/out-{time_now}.csv', index=False)
    

# Before running this cell, check which network train_model() instantiates
# and confirm it matches the image size of the dataset below.
train_model(
    training_workspace='../../../training/nikkei_mid_small_cap/i250-r5-ma50',
    model_name='nikkei_mid_small_cap-i250r5ma50',
)

## Model Evaluation

In [None]:
from collections import defaultdict

market_index = 'nikkei_mid_small_cap'
batch_size = 128

# NOTE(review): machine-specific absolute paths; point these at your own folders.
MODELS_DIR = 'C:/Users/User/Downloads/models'
TRAINING_ROOT = 'C:/Users/User/OneDrive/OneDrive - The University of Hong Kong - Connect/Documents/HKU MScCS/COMP7705 MSc(CompSc) Project/training'

# Image-window token (the text before the first 'r' in the model name, e.g.
# 'i20' in 'i20r5ma20') -> network class able to load that checkpoint.
MODEL_CLASSES = {'i5': I5_CNNet, 'i20': I20_CNNet, 'i60': I60_CNNet}


def _mean_or_nan(values):
    """Return the mean of ``values``, or NaN when the sequence is empty."""
    return sum(values) / len(values) if values else float('nan')


transform = transforms.Compose([transforms.Grayscale(), transforms.ToTensor()])

# Evaluate every checkpoint of the selected market index on its own test folder.
for item in os.scandir(MODELS_DIR):
    if market_index not in item.path or not item.is_file():
        continue

    # Parse the file name rather than the full path, so a '-' in a directory
    # name cannot shift the fields.
    name_parts = item.name.split('-')
    if len(name_parts) < 2:
        print(f'Skipping {item.name}: expected "<index>-<model>-<timestamp>" naming')
        continue
    model_name = name_parts[1]

    # Pick the network explicitly. An unknown window is skipped instead of
    # raising NameError or silently reusing the previous iteration's model.
    window = model_name.split('r', 1)[0]
    model_cls = MODEL_CLASSES.get(window)
    if model_cls is None:
        print(f'Skipping {item.name}: no network registered for image window "{window}"')
        continue
    model = model_cls().to(device)

    # map_location lets a checkpoint saved on a GPU load on a CPU-only machine.
    # NOTE(review): torch.load unpickles the file; only load checkpoints you trust.
    checkpoint = torch.load(item.path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()

    dataset_folder = os.path.join(TRAINING_ROOT, market_index, model_name.replace('r', '-r').replace('ma', '-ma'), 'input/test')
    dataset = ImageFolder(dataset_folder, transform=transform)
    test_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    label_accuracies = defaultdict(list)  # true label -> list of 0/1 hit flags
    y_true, y_pred = [], []
    test_correct = 0
    test_total = 0

    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = model(inputs)

            predicted = outputs.argmax(dim=1)
            # One device-to-host copy per batch instead of one per sample.
            batch_true = labels.cpu().tolist()
            batch_pred = predicted.cpu().tolist()
            for truth, guess in zip(batch_true, batch_pred):
                label_accuracies[truth].append(int(truth == guess))

            y_true.extend(batch_true)
            y_pred.extend(batch_pred)
            test_total += len(batch_true)
            test_correct += (predicted == labels).sum().item()

    labels_f1 = f1_score(y_true, y_pred, average=None)
    avg_f1 = f1_score(y_true, y_pred, average='macro')
    # A class with no test samples reports NaN instead of raising TypeError.
    print({
        'index': f'{market_index}-{model_name}',
        'accuracy': test_correct / test_total if test_total else float('nan'),
        'acuracy_0': _mean_or_nan(label_accuracies.get(0, [])),
        'acuracy_1': _mean_or_nan(label_accuracies.get(1, [])),
        'acuracy_2': _mean_or_nan(label_accuracies.get(2, [])),
        'f1': avg_f1,
        'f1_labels': labels_f1
    })