# VGGNet Training

In [1]:
# Mount to Google Drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Move to your current working directory
%cd drive/MyDrive/image_processing

Mounted at /content/drive
/content/drive/MyDrive/image_processing


In [2]:
import torch
if torch.cuda.is_available():
  device = torch.device('cuda')
else:
  device = torch.device('cpu')
print('Device', device)

Device cuda


In [3]:
import os
import pandas as pd
import numpy as np
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as T
import torch.optim as optim
import matplotlib.pyplot as plt
from PIL import Image
from torch.utils.data import Dataset, DataLoader

## Load Images

In [4]:
class CustomDataset(Dataset):
  def __init__(self, image_dir, transform):
    super().__init__()
    self.image_dir = image_dir
    self.transform = transform
    self.images = os.listdir(self.image_dir)

  def __len__(self):
    return len(self.images)

  def __getitem__(self, i):
    image_path = os.path.join(self.image_dir, self.images[i])
    image = Image.open(image_path).convert('L')
    image = self.transform(image)
    label = int(self.images[i][1])
    return (image, label)

In [5]:
training_data = CustomDataset('img_span12/training', T.Compose([T.Resize((224, 224)), T.ToTensor()]))
testing_data = CustomDataset('img_span12/testing', T.Compose([T.Resize((224, 224)), T.ToTensor()]))

In [6]:
print('training images:', len(training_data))
print('training images:', len(testing_data))

training images: 7746
training images: 2350


In [7]:
print(training_data[0])
print(testing_data[0])

(tensor([[[0.0000, 0.7059, 0.4902,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.7059, 0.4902,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.7059, 0.4902,  ..., 0.0000, 0.0000, 0.0000],
         ...,
         [0.0000, 0.7059, 0.4902,  ..., 0.4902, 0.7059, 0.0000],
         [0.0000, 0.7059, 0.4902,  ..., 0.4902, 0.7059, 0.0000],
         [0.0000, 0.7059, 0.4902,  ..., 0.4902, 0.7059, 0.0000]]]), 0)
(tensor([[[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         ...,
         [0.0000, 0.7059, 0.4902,  ..., 0.2118, 0.3020, 0.0000],
         [0.0000, 0.7059, 0.4902,  ..., 0.4196, 0.6039, 0.0000],
         [0.0000, 0.7059, 0.4902,  ..., 0.4902, 0.7059, 0.0000]]]), 0)


## Build VGGNet Model

In [8]:
class VGGNetModel(nn.Module):
  def __init__(self):
    super().__init__()

    self.features = nn.Sequential(
        # Conv Layer block 1
        nn.Conv2d(in_channels=1, out_channels=64, kernel_size=3, padding=1),
        nn.ReLU(),
        nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, padding=1),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=2, stride=2),

        # Conv Layer block 2
        nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, padding=1),
        nn.ReLU(),
        nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, padding=1),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=2, stride=2),

        # Conv Layer block 3
        nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, padding=1),
        nn.ReLU(),
        nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, padding=1),
        nn.ReLU(),
        nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, padding=1),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=2, stride=2),

        # Conv Layer block 4
        nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, padding=1),
        nn.ReLU(),
        nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, padding=1),
        nn.ReLU(),
        nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, padding=1),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=2, stride=2),

        # Conv Layer block 5
        nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, padding=1),
        nn.ReLU(),
        nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, padding=1),
        nn.ReLU(),
        nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, padding=1),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=2, stride=2),
    )

    self.classifier = nn.Sequential(
      nn.Linear(512*7*7, 4096),
      nn.ReLU(),
      nn.Dropout(p=0.5),
      nn.Linear(4096, 4096),
      nn.ReLU(),
      nn.Dropout(p=0.5),
      nn.Linear(4096, 1000),
    )
    self.dropout = nn.Dropout(p=0.5)
    self.fc1 = nn.Linear(1000, 10)
    self.fc2 = nn.Linear(1000, 2)

  def forward(self, x):
    x = self.features(x)
    x = x.view(x.size(0), -1)  # Flatten
    x = self.classifier(x)
    x = self.dropout(x)
    out1 = self.fc1(x)
    out2 = self.fc2(x)
    return out1, out2

In [9]:
model = VGGNetModel()
print(model)

model = model.cuda()

VGGNetModel(
  (features): Sequential(
    (0): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU()
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU()
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU()
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU()
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU()
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU()
    (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (17): Conv2d(256, 512, kernel_size=(3, 3), stride=(1,

## Training

In [10]:
# Constants for model training process
BATCH_SIZE = 128
NUM_EPOCHS = 10
PRINT_EVERY = 10

In [11]:
train_loader = DataLoader(training_data, batch_size=BATCH_SIZE, shuffle=True)

In [12]:
next(iter(train_loader))

[tensor([[[[0.0000, 0.0000, 0.0000,  ..., 0.4902, 1.0000, 1.0000],
           [0.0000, 0.0000, 0.0000,  ..., 0.4902, 0.9569, 0.8588],
           [0.0000, 0.0000, 0.0000,  ..., 0.4902, 0.8314, 0.4275],
           ...,
           [0.0000, 0.7059, 0.4902,  ..., 0.4902, 0.7059, 0.0000],
           [0.0000, 0.7059, 0.4902,  ..., 0.4902, 0.7059, 0.0000],
           [0.0000, 0.7059, 0.4902,  ..., 0.4902, 0.7059, 0.0000]]],
 
 
         [[[0.0000, 0.0000, 0.0000,  ..., 0.4902, 0.7059, 0.0000],
           [0.0000, 0.0000, 0.0000,  ..., 0.4902, 0.7490, 0.1412],
           [0.0000, 0.0000, 0.0000,  ..., 0.4902, 0.8745, 0.5725],
           ...,
           [0.0000, 0.7059, 0.4902,  ..., 0.4902, 0.7059, 0.0000],
           [0.0000, 0.7059, 0.4902,  ..., 0.4902, 0.7059, 0.0000],
           [0.0000, 0.7059, 0.4902,  ..., 0.4902, 0.7059, 0.0000]]],
 
 
         [[[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
           [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
           [0.0000

In [13]:
# Define loss function and optimizer
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

In [14]:
for epoch in range(NUM_EPOCHS):
  training_acc = 0
  total = 0
  for counter, (x, y) in enumerate(train_loader):
    model.train()
    x, y = x.to(device), y.to(device)
    _, scores = model(x)
    loss = loss_function(scores, y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    training_acc += scores.max(1)[1].eq(y).sum().item()
    total += y.size(0)
    if counter % PRINT_EVERY == 0:
      print(f'Epoch[{epoch+1}], Batch[{counter}], Loss: {loss.item()}, Training acc: {training_acc/total}')
  print(f'Epoch[{epoch+1}] completed. Training acc: {training_acc/total}')

Epoch[1], Batch[0], Loss: 0.6925040483474731, Training acc: 0.546875
Epoch[1], Batch[10], Loss: 0.6987624168395996, Training acc: 0.5142045454545454
Epoch[1], Batch[20], Loss: 0.6902292370796204, Training acc: 0.49069940476190477
Epoch[1], Batch[30], Loss: 0.6964388489723206, Training acc: 0.4967237903225806
Epoch[1], Batch[40], Loss: 0.705978512763977, Training acc: 0.5066692073170732
Epoch[1], Batch[50], Loss: 0.6929206252098083, Training acc: 0.5131740196078431
Epoch[1], Batch[60], Loss: 0.6879047751426697, Training acc: 0.5151045701006971
Epoch[1] completed. Training acc: 0.5151045701006971
Epoch[2], Batch[0], Loss: 0.6888668537139893, Training acc: 0.5390625
Epoch[2], Batch[10], Loss: 0.6858114004135132, Training acc: 0.5241477272727273
Epoch[2], Batch[20], Loss: 0.6934009790420532, Training acc: 0.5279017857142857
Epoch[2], Batch[30], Loss: 0.6902215480804443, Training acc: 0.5262096774193549
Epoch[2], Batch[40], Loss: 0.7016856074333191, Training acc: 0.53125
Epoch[2], Batch[50]

## Output Result

In [15]:
class OutputDataset(Dataset):
  def __init__(self, image_dir, transform):
    super().__init__()
    self.image_dir = image_dir
    self.transform = transform
    self.images = os.listdir(self.image_dir)

  def __len__(self):
    return len(self.images)

  def __getitem__(self, i):
    image_path = os.path.join(self.image_dir, self.images[i])
    image = Image.open(image_path).convert('L')
    image = self.transform(image)
    label = int(self.images[i][1])
    stock = self.images[i].split('_')[1]
    date = self.images[i].split('_')[2].replace('.png', '')
    return (image, label, stock, date)

In [16]:
training_data = OutputDataset('img_span12/training', T.Compose([T.Resize((224, 224)), T.ToTensor()]))
testing_data = OutputDataset('img_span12/testing', T.Compose([T.Resize((224, 224)), T.ToTensor()]))

In [17]:
train_loader = DataLoader(training_data, batch_size=32, shuffle=False)
test_loader = DataLoader(testing_data, batch_size=32, shuffle=False)

In [18]:
def extract_vectors(model, dataloader):
  model.eval()
  all_records = []
  total_correct = 0
  total_samples = 0

  with torch.no_grad():
    for x, y, stocks, dates in dataloader:
      x, y = x.to(device), y.to(device)
      outputs, scores = model(x)
      outputs = outputs.cpu().numpy()

      predictions = scores.max(1)[1]
      total_correct += predictions.eq(y).sum().item()
      total_samples += y.size(0)

      for i in range(len(dates)):
        record = {
            'stock': stocks[i],
            'date': dates[i],
            'vector': ','.join(map(str, outputs[i])),
            'label': y[i].item()
        }
        all_records.append(record)
    print('Acc:', total_correct/total_samples)
  return all_records

In [19]:
# training datasets
records = extract_vectors(model, train_loader)

df = pd.DataFrame(records)
df.sort_values(by=['stock', 'date'])
print(df)

# Save the DataFrame to a CSV file
df.to_csv('output/VGGNet_output_vectors_training.csv', index=True)

Acc: 0.5325329202168861
     stock        date                                             vector  \
0     2912  2017-02-10  -0.0469331,-0.009767657,-0.012199484,-0.015377...   
1     2912  2017-03-02  -0.0469331,-0.009767657,-0.012199484,-0.015377...   
2     2912  2017-03-20  -0.0469331,-0.009767657,-0.012199484,-0.015377...   
3     2912  2017-04-07  -0.0469331,-0.009767657,-0.012199484,-0.015377...   
4     2912  2017-04-25  -0.0469331,-0.009767657,-0.012199484,-0.015377...   
...    ...         ...                                                ...   
7741  1303  2018-05-24  -0.0469331,-0.009767657,-0.012199484,-0.015377...   
7742  1303  2018-06-11  -0.0469331,-0.009767657,-0.012199484,-0.015377...   
7743  1303  2018-06-28  -0.0469331,-0.009767657,-0.012199484,-0.015377...   
7744  1303  2018-07-16  -0.046933096,-0.009767646,-0.01219948,-0.01537...   
7745  1303  2018-08-01  -0.046933096,-0.009767646,-0.01219948,-0.01537...   

      label  
0         0  
1         1  
2        

In [20]:
# testing datasets
records = extract_vectors(model, test_loader)

df = pd.DataFrame(records)
df.sort_values(by=['stock', 'date'])
print(df)

# Save the DataFrame to a CSV file
df.to_csv('output/VGGNet_output_vectors_testing.csv', index=True)

Acc: 0.5114893617021277
     stock        date                                             vector  \
0     2615  2022-01-10  -0.0469331,-0.009767657,-0.012199484,-0.015377...   
1     2615  2022-01-26  -0.0469331,-0.009767657,-0.012199484,-0.015377...   
2     2615  2022-02-22  -0.0469331,-0.009767657,-0.012199484,-0.015377...   
3     2615  2022-03-11  -0.0469331,-0.009767657,-0.012199484,-0.015377...   
4     2615  2022-03-29  -0.0469331,-0.009767657,-0.012199484,-0.015377...   
...    ...         ...                                                ...   
2345  1402  2024-04-09  -0.046933092,-0.009767648,-0.012199477,-0.0153...   
2346  1402  2024-04-25  -0.046933092,-0.009767648,-0.012199477,-0.0153...   
2347  1402  2024-05-14  -0.046933092,-0.009767648,-0.012199477,-0.0153...   
2348  1402  2024-05-30  -0.046933092,-0.009767648,-0.012199477,-0.0153...   
2349  1402  2024-06-18  -0.046933092,-0.009767648,-0.012199477,-0.0153...   

      label  
0         0  
1         1  
2        