In [202]:
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn.impute as im
%matplotlib inline

In [29]:
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.optim as optim

In [255]:
class StockDataset(torch.utils.data.Dataset):
    def __init__(self, csv_file):
        self.data = pd.read_csv(csv_file)
    
    def __len__(self):
        return len(self.data)
    
    def __impute__(self):
        fill_NaN = im.SimpleImputer(missing_values=np.nan, strategy='mean')
        print(type(self.data))
        for col in self.data.columns:
            imputed_column = fill_NaN.fit_transform(self.data[col].values.reshape(-1, 1)).T
            self.data[col] = pd.Series(imputed_column.flatten())
    
    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        
        return (torch.Tensor(self.data.iloc[idx,1:-2]), self.data.iloc[idx,-1])

In [256]:
training_data = StockDataset('data/train.csv')
StockDataset.__impute__(training_data)
testing_data = StockDataset('data/test.csv')
training_data_loader = torch.utils.data.DataLoader(training_data, batch_size=32, shuffle=False)
testing_data_loader = torch.utils.data.DataLoader(testing_data, batch_size=32, shuffle=False)

In [257]:
StockDataset.__impute__(training_data)

<class 'pandas.core.frame.DataFrame'>


In [258]:
training_data.data

Unnamed: 0,id,last_price,mid,opened_position_qty,closed_position_qty,transacted_qty,d_open_interest,bid1,bid2,bid3,...,bid2vol,bid3vol,bid4vol,bid5vol,ask1vol,ask2vol,ask3vol,ask4vol,ask5vol,y
0,0.0,3842.4,3842.6,1.400583,1.964277,103.0,0.0,3842.4,3842.0,3841.8,...,1.0,6.0,14.0,6.0,6.0,1.0,1.0,10.0,2.0,1.0
1,1.0,3842.8,3843.4,6.000000,49.000000,55.0,-43.0,3843.0,3842.8,3842.4,...,6.0,11.0,1.0,6.0,1.0,4.0,4.0,1.0,13.0,0.0
2,2.0,3844.0,3844.3,7.000000,77.000000,84.0,-69.0,3843.8,3843.6,3843.2,...,1.0,4.0,21.0,12.0,1.0,16.0,10.0,4.0,9.0,0.0
3,3.0,3843.8,3843.4,3.000000,34.000000,37.0,-30.0,3843.0,3842.8,3842.4,...,13.0,12.0,2.0,4.0,2.0,7.0,1.0,2.0,11.0,1.0
4,4.0,3843.2,3843.1,3.000000,38.000000,41.0,-35.0,3842.8,3842.4,3842.0,...,12.0,2.0,2.0,4.0,1.0,3.0,1.0,11.0,15.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
592375,592375.0,4110.2,4110.3,1.000000,1.000000,2.0,1.0,4110.2,4110.0,4109.4,...,2.0,1.0,3.0,2.0,2.0,3.0,10.0,7.0,7.0,1.0
592376,592376.0,4109.4,4110.5,6.000000,5.000000,11.0,1.0,4109.2,4109.0,4108.6,...,2.0,2.0,6.0,5.0,3.0,9.0,7.0,7.0,5.0,0.0
592377,592377.0,4109.4,4110.5,1.400583,1.964277,0.0,0.0,4109.2,4109.0,4108.6,...,2.0,2.0,6.0,5.0,3.0,9.0,7.0,7.0,5.0,0.0
592378,592378.0,4109.4,4110.5,1.400583,1.964277,0.0,0.0,4109.2,4109.0,4108.6,...,2.0,2.0,6.0,5.0,3.0,9.0,7.0,7.0,5.0,0.0


In [259]:
training_data.data.columns

Index(['id', 'last_price', 'mid', 'opened_position_qty ',
       'closed_position_qty', 'transacted_qty', 'd_open_interest', 'bid1',
       'bid2', 'bid3', 'bid4', 'bid5', 'ask1', 'ask2', 'ask3', 'ask4', 'ask5',
       'bid1vol', 'bid2vol', 'bid3vol', 'bid4vol', 'bid5vol', 'ask1vol',
       'ask2vol', 'ask3vol', 'ask4vol', 'ask5vol', 'y'],
      dtype='object')

In [260]:
for col in training_data.data.columns:
    print(col)

id
last_price
mid
opened_position_qty 
closed_position_qty
transacted_qty
d_open_interest
bid1
bid2
bid3
bid4
bid5
ask1
ask2
ask3
ask4
ask5
bid1vol
bid2vol
bid3vol
bid4vol
bid5vol
ask1vol
ask2vol
ask3vol
ask4vol
ask5vol
y


In [261]:
def train(model, use_cuda, n_epochs, training_data, test_data):
    if use_cuda:
        model.cuda()
    else:
        model.cpu()
    # optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    optimizer = optim.RMSprop(model.parameters())
    criterion = nn.CrossEntropyLoss()
    
    # Train the model for n_epochs epochs, iterating on the data in batches

    # store metrics
    training_accuracy_history = np.zeros([n_epochs, 1])
    training_loss_history = np.zeros([n_epochs, 1])
    validation_accuracy_history = np.zeros([n_epochs, 1])
    validation_loss_history = np.zeros([n_epochs, 1])
    
    start_time = time.time()
    for epoch in range(n_epochs):
        print(f'Epoch {epoch+1}/10:', end='')
        train_total = 0
        train_correct = 0
        # train
        model.train()
        for i, data in enumerate(training_data):
            #print(len(data))
            images, labels = data
            #print(type(images), type(labels))
            if use_cuda:
                images, labels = images.cuda(), labels.cuda()
            else:
                images, labels = images.cpu(), labels.cpu()
            optimizer.zero_grad()
            # forward pass
            output = model(images)
            # calculate categorical cross entropy loss
            loss = criterion(output, labels)
            # backward pass
            loss.backward()
            optimizer.step()

            # track training accuracy
            _, predicted = torch.max(output.data, 1)
            train_total += labels.size(0)
            train_correct += (predicted == labels).sum().item()
            # track training loss
            training_loss_history[epoch] += loss.item()
            # progress update after 180 batches (~1/10 epoch for batch size 32)
            if i % 180 == 0: print('.',end='')
        training_loss_history[epoch] /= len(training_data)
        training_accuracy_history[epoch] = train_correct / train_total
        print(f'\n\tloss: {training_loss_history[epoch,0]:0.4f}, acc: {training_accuracy_history[epoch,0]:0.4f}',end='')

        # validate
        test_total = 0
        test_correct = 0
        with torch.no_grad():
            model.eval()
            for i, data in enumerate(test_data):
                images, labels = data
                if use_cuda:
                    images, labels = images.cuda(), labels.cuda()
                else:
                    images, labels = images.cpu(), labels.cpu()
                # forward pass
                output = model(images)
                # find accuracy
                _, predicted = torch.max(output.data, 1)
                test_total += labels.size(0)
                test_correct += (predicted == labels).sum().item()
                # find loss
                loss = criterion(output, labels)
                validation_loss_history[epoch] += loss.item()
            validation_loss_history[epoch] /= len(test_data)
            validation_accuracy_history[epoch] = test_correct / test_total
        print(f', val loss: {validation_loss_history[epoch,0]:0.4f}, val acc: {validation_accuracy_history[epoch,0]:0.4f}')
    elapsed_time = time.time() - start_time
    print(elapsed_time, "seconds elapsed")

In [262]:
model = nn.Sequential(
    nn.Linear(25, 15),
    nn.ReLU(),
    nn.Dropout(0.5),
    nn.Linear(15, 10)
)

In [263]:
train(model, True, 10, training_data_loader, testing_data_loader)

Epoch 1/10:

RuntimeError: Expected object of scalar type Long but got scalar type Float for argument #2 'target' in call to _thnn_nll_loss_forward