### Setup

In [None]:
# Import Libraries
import os
import getpass
import pandas as pd
import re
from io import StringIO
from sklearn.model_selection import train_test_split
from torch.utils import data
import time
import sklearn
import copy
import random
from datetime import datetime

import torch
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.models as models
import torch.nn as nn
import torch.optim as optim
import numpy as np
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt

from pytorch_pretrained_bert import BertTokenizer
import torch.nn.functional as F
import math

In [None]:
## Directory
# Resolve the project root. The two known collaborators keep their historical
# locations; any other user can point the COMP5329_HOME environment variable
# at their checkout, and the current working directory is the last resort, so
# `dir_home` is always defined.
user = getpass.getuser()
if user == 'scgst':
    dir_home = "C:\\Users\\scgst\\Documents\\Git\\COMP5329\\Assignment_2\\Code\\"
elif user == 'mgup6878':
    dir_home = "C:\\Users\\mgup6878\\Desktop\\Deep Learning\\COMP5329 Assignment 2-20200513T155933Z-001\\COMP5329 Assignment 2\\Code\\"
else:
    dir_home = os.environ.get('COMP5329_HOME', os.getcwd())

# Inputs (csv tables and the image folder) and outputs live below the root.
dir_input = os.path.join(dir_home, 'Input')
dir_output = os.path.join(dir_home, 'Output')

dir_data = os.path.join(dir_input, 'data')
train_csv = os.path.join(dir_input,'train.csv')
test_csv = os.path.join(dir_input,'test.csv')

In [None]:
def seed_all(seed = 27):
    """Seed every random number generator this notebook relies on.

    Seeds Python's `random`, NumPy's global generator and PyTorch (CPU and
    every CUDA device), and configures cuDNN for deterministic behaviour.
    See https://pytorch.org/docs/stable/notes/randomness.html

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all GPUs, not only the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Auto-tuning may select different convolution algorithms between runs,
    # so it is switched off alongside deterministic mode.
    torch.backends.cudnn.benchmark = False

seed_all(28)

In [None]:
# Training hyper-parameters
BATCH_SIZE = 50  # 30 was the previous value
NUM_EPOCHS = 20
LEARNING_RATE = 1e-4

# Run on the GPU when CUDA is usable, otherwise on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

### Loading Data from CSV

In [None]:
## Read in train and test tables
def _read_quoted_csv(csv_path):
    """Parse a csv file whose text may contain unescaped double quotes.

    Every line is rewritten first: a double quote that is not directly
    preceded by a comma and still has a non-newline character after it
    (optionally behind whitespace) gets a '/' prefix. pandas then reads the
    rewritten text with '/' as the escape character.
    """
    with open(csv_path) as handle:
        escaped_text = ''.join(
            re.sub(r'([^,])"(\s*[^\n])', r'\1/"\2', raw_line) for raw_line in handle
        )
    return pd.read_csv(StringIO(escaped_text), escapechar="/")


train_df_full = _read_quoted_csv(train_csv)

print(train_df_full.head())
print(train_df_full.shape)
print("")

test_df = _read_quoted_csv(test_csv)

print(test_df.head())
print(test_df.shape)

### Encoding

In [None]:
def get_encoding(labels):
    """Build the label vocabulary from space-separated integer label strings.

    Args:
        labels: Object exposing `.tolist()` (e.g. a pandas Series) whose
            items are strings of integers separated by single spaces.

    Returns:
        Tuple `(n_classes, label_dict, label_dict_revert)`: the number of
        distinct labels, a mapping from label to its index in sorted order,
        and the inverse mapping from index back to label.
    """
    distinct = set()
    for entry in labels.tolist():
        distinct.update(int(token) for token in entry.split(' '))

    ordered = sorted(distinct)

    label_dict = {}
    label_dict_revert = {}
    for position, label in enumerate(ordered):
        label_dict[label] = position
        label_dict_revert[position] = label

    return (len(ordered), label_dict, label_dict_revert)

def encode_target(labels, label_dict, n_classes):
    """Turn space-separated label strings into multi-hot lists.

    Args:
        labels: Object exposing `.tolist()` whose items are strings of
            integers separated by single spaces.
        label_dict: Mapping from label value to its position in the vector.
        n_classes: Length of every produced vector.

    Returns:
        A list with one list of 0/1 integers per input item; position
        `label_dict[l]` is 1 for every label `l` present in the item.
    """
    parsed = [list(map(int, entry.split(' '))) for entry in labels.tolist()]

    encoded_rows = []
    for present in parsed:
        row = [0] * n_classes
        for label in present:
            row[label_dict[label]] = 1
        encoded_rows.append(row)

    return encoded_rows
# labels_expanded = encode_target(labels, label_dict, n_classes)

def revert_encoding(labels_expanded, label_dict_revert):
    """Convert multi-hot rows back into space-separated label strings.

    Args:
        labels_expanded: Iterable of rows; a position counts as set when its
            value compares equal to 1.
        label_dict_revert: Mapping from position to the original label.

    Returns:
        One string per row containing the labels of all set positions, in
        position order, joined by single spaces (empty when nothing is set).
    """
    return [
        " ".join(
            str(label_dict_revert[position])
            for position, flag in enumerate(row)
            if flag == 1
        )
        for row in labels_expanded
    ]

# encode_reverted = revert_encoding(labels_expanded, label_dict_revert)

In [None]:
# Derive the label vocabulary from the raw training labels and show it
labels = train_df_full['Labels']
n_classes, label_dict, label_dict_revert = get_encoding(labels)
print(n_classes, label_dict, label_dict_revert, sep = "\n")

In [None]:
# Encode labels: one multi-hot list per training row
labels_expanded = encode_target(labels = labels, label_dict = label_dict, n_classes = n_classes)

# Keep the encoded vectors next to the raw labels in the train table
train_df_full['Expanded_Labels'] = labels_expanded
train_df_full.head()

### Data Partition

In [None]:
# Split train and validation set
# An explicit random_state makes the 70/30 split identical every time this
# cell runs, regardless of how much of the global NumPy random state earlier
# cells have consumed (28 matches the value passed to seed_all above).
train_df, val_df = train_test_split(train_df_full, test_size = 0.30, random_state = 28)
# Renumber the rows 0..n-1 so that row positions line up with the caption
# token lists built from these frames further down.
train_df = train_df.reset_index(drop = True)
val_df = val_df.reset_index(drop = True)

### BERT

In [None]:
def _tokenize_captions(captions):
    """Wrap every caption in '[CLS] ' / ' [SEP]' and split it into tokens."""
    return [tokenizer.tokenize('[CLS] ' + caption + ' [SEP]') for caption in captions]


def _tokens_to_ids(tokenized_captions):
    """Map every token list to the tokenizer's vocabulary ids."""
    return [tokenizer.convert_tokens_to_ids(tokens) for tokens in tokenized_captions]


tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# The captions sit in the third column of the train/validation tables and in
# the second column of the test table.
X_train_captions = _tokenize_captions(train_df.iloc[:, 2])
X_train_tokens = _tokens_to_ids(X_train_captions)

X_val_captions = _tokenize_captions(val_df.iloc[:, 2])
X_val_tokens = _tokens_to_ids(X_val_captions)

X_test_captions = _tokenize_captions(test_df.iloc[:, 1])
X_test_tokens = _tokens_to_ids(X_test_captions)



In [None]:
# Largest vocabulary id found in any split (train, validation or test);
# remains -inf only when all three splits are empty.
max_token = max(
    (max(ids) for ids in X_train_tokens + X_val_tokens + X_test_tokens),
    default = -math.inf
)

# Rescale every token id by that maximum, updating the three lists in place
for token_lists in (X_train_tokens, X_val_tokens, X_test_tokens):
    token_lists[:] = [[token_id / max_token for token_id in ids] for ids in token_lists]

In [None]:
# Length of the longest token sequence over the three splits
MAX_LEN = max(
    max(map(len, split)) for split in (X_train_tokens, X_val_tokens, X_test_tokens)
)


def _pad_to_tensor(token_lists):
    """Right-pad every sequence with zeros to MAX_LEN and stack them into one tensor."""
    return torch.tensor([ids + [0] * (MAX_LEN - len(ids)) for ids in token_lists])


X_train_tokens = _pad_to_tensor(X_train_tokens)
X_val_tokens = _pad_to_tensor(X_val_tokens)
X_test_tokens = _pad_to_tensor(X_test_tokens)

for padded in (X_train_tokens, X_val_tokens, X_test_tokens):
    print(padded.shape)

### Data Extraction

In [None]:
# Extract Data
class ImageData(data.Dataset):
    """Dataset yielding a transformed image and, outside test mode, its label vector.

    Args:
        df: Table whose first column holds the image file names. Unless
            `test` is True, its fourth column must hold the label vector of
            each row.
        dirpath: Directory that contains the image files.
        transform: Callable applied to the loaded PIL image.
        test: When True no labels are read and items are images only.
    """

    def __init__(self, df, dirpath, transform, test = False):
        self.df = df
        self.test = test
        self.dirpath = dirpath
        self.transform = transform

        # image data: full path of every image, in row order
        self.image_arr = np.asarray(str(self.dirpath) + '/' + self.df.iloc[:, 0])

        # labels data
        if not self.test:
             self.label_df = self.df.iloc[:, 3]

        # Calculate length of df
        self.data_len = len(self.df.index)

    def __len__(self):
        """Return the number of rows in the table."""
        return self.data_len

    def _load_image(self, idx):
        """Open the image at row position `idx` as a three-channel RGB image."""
        # convert('RGB') gives every image three channels, so greyscale or
        # RGBA files no longer break three-channel normalisation; the file
        # handle is closed as soon as the pixel data has been copied.
        with Image.open(self.image_arr[idx]) as img:
            return img.convert('RGB')

    def _load_label(self, idx):
        """Return the label vector at row position `idx` as a float32 tensor."""
        # Positional lookup: `idx` is a row position, so this stays correct
        # even when the table's index is not 0..n-1.
        image_labels = self.label_df.iloc[idx]
        return torch.tensor(image_labels, dtype = torch.float32).squeeze()

    def __getitem__(self, idx):
        """Return `(image, labels)` for row `idx`, or just the image in test mode."""
        img_tensor = self.transform(self._load_image(idx))
        if self.test:
            return img_tensor
        return (img_tensor, self._load_label(idx))

In [None]:
# Image transformation
# Both pipelines end with the same tensor conversion and channel normalisation.
_to_normalized_tensor = [
    transforms.ToTensor(),
    transforms.Normalize(mean = [0.485, 0.456, 0.406], std = [0.229, 0.224, 0.225]),
]

data_transforms = {
    # Training: random crop resized to 224 plus random horizontal flip
    'train': transforms.Compose(
        [transforms.RandomResizedCrop(224), transforms.RandomHorizontalFlip()] + _to_normalized_tensor
    ),
    # Evaluation: fixed resize followed by a centre crop of 224
    'test': transforms.Compose(
        [transforms.Resize(255), transforms.CenterCrop(224)] + _to_normalized_tensor
    ),
}


def _make_loader(df, transform, test = False):
    """Create an ImageData dataset over `df` and a DataLoader that keeps row order."""
    dataset = ImageData(df, dir_data, transform, test = test)
    # shuffle stays off: the caption token tensors are sliced by batch
    # position elsewhere in the notebook, so batches must follow row order.
    loader = data.DataLoader(dataset = dataset, batch_size = BATCH_SIZE, shuffle = False)
    return dataset, loader


# Loading data (one sample batch is pulled from each loader for the shape report)
train_dataset, train_loader = _make_loader(train_df, data_transforms['train'])
features_train, labels_train = next(iter(train_loader))

val_dataset, val_loader = _make_loader(val_df, data_transforms['test'])
features_val, labels_val = next(iter(val_loader))

train_full_dataset, train_full_loader = _make_loader(train_df_full, data_transforms['train'])
features_train_full, labels_train_full = next(iter(train_full_loader))

test_dataset, test_loader = _make_loader(test_df, data_transforms['test'], test = True)
features_test = next(iter(test_loader))

In [None]:
# Report, for each split, the number of rows, the batch size, the number of
# batches and the tensor shapes of the sample batch drawn from its loader.
print(f"Train Data Length: {len(train_df)}\nMini Batch Size: {BATCH_SIZE}\nBatch Numbers: {len(train_loader)}\nTrain Features: {features_train.shape}\nTrain Labels: {labels_train.shape}")
print()
print(f"Validation Data Length: {len(val_df)}\nMini Batch Size: {BATCH_SIZE}\nBatch Numbers: {len(val_loader)}\nValidation Features: {features_val.shape}\nValidation Labels: {labels_val.shape}")
print()
print(f"Full Train Data Length: {len(train_df_full)}\nMini Batch Size: {BATCH_SIZE}\nBatch Numbers: {len(train_full_loader)}\nFull Train Features: {features_train_full.shape}\nFull Train Labels: {labels_train_full.shape}")
print()
# The test loader yields images only, so there is no label shape to report.
print(f"Test Data Length: {len(test_df)}\nMini Batch Size: {BATCH_SIZE}\nBatch Numbers: {len(test_loader)}\nTest Features: {features_test.shape}")

### Model Development

In [None]:
# torch.cuda.empty_cache()

In [None]:
# Get pretrained model using torchvision.models as models library
model = models.densenet161(pretrained = True)
# Freeze every pretrained parameter; only the replacement classifier created
# below (whose parameters are new) will be trained.
for param in model.parameters():
    param.requires_grad = False
    
total_params = sum(p.numel() for p in model.parameters())
print(f'{total_params:,} total parameters.')
total_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'{total_trainable_params:,} training parameters.')
print()

# Create new classifier for model using torch.nn as nn library
classifier_input = model.classifier.in_features
print('Number of Outputs from densenet161 features: ' + str(classifier_input))
print()
num_labels = n_classes #PUT IN THE NUMBER OF LABELS IN YOUR DATA
# The first layer takes the pooled image features concatenated with the MAX_LEN
# caption features, so the head must be fed that concatenation explicitly;
# calling model(inputs) on images alone would not match this input width.
classifier = nn.Sequential(
    nn.Linear(classifier_input + MAX_LEN, 1024),
    nn.ReLU(),
    nn.Linear(1024, 512),
    nn.ReLU(),
    nn.Linear(512, 300),
    nn.ReLU(),
    nn.Linear(300, 200),
    nn.ReLU(),
    nn.Linear(200, 100),
    nn.ReLU(),
    # No final activation: the head outputs raw logits because the
    # BCEWithLogitsLoss criterion below applies the sigmoid itself. A
    # LogSoftmax here would cap every sigmoid probability at 0.5 and couple
    # the labels, which is wrong for a multi-label target.
    nn.Linear(100, num_labels)
)
# Replace default classifier with new classifier
model.classifier = classifier

total_params = sum(p.numel() for p in model.parameters())
print(f'{total_params:,} total parameters.')
total_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'{total_trainable_params:,} training parameters.')

# Move model to the device specified above
model.to(device)

# Set the error function using torch.nn as nn library
# (multi-label target: independent sigmoid + binary cross-entropy per label)
criterion = nn.BCEWithLogitsLoss()

# Set the optimizer function using torch.optim as optim library
# Only the new classifier's parameters are optimised.
optimizer = optim.Adam(model.classifier.parameters(), lr = LEARNING_RATE)

### Training Model

In [None]:
running_train_loss = []
running_val_loss = []
best_loss = np.inf
# Defined before the loop so the summary and the weight restore at the end
# work even if no epoch ever improves on best_loss (NUM_EPOCHS == 0 or a
# validation loss that is never below infinity, e.g. NaN).
best_epoch = None
best_model_wts = copy.deepcopy(model.state_dict())


def _fused_forward(image_batch, text_batch):
    """Score one mini-batch from its images and caption features.

    The images pass through the DenseNet feature extractor, are ReLU
    activated, average-pooled to one vector per image and flattened; the
    caption features are appended and the result is fed to the classifier.

    Args:
        image_batch: Image tensor that is already on `device`.
        text_batch: The matching rows of the padded caption tensor.

    Returns:
        The classifier output, one row per sample.
    """
    features = model.features(image_batch)
    pooled = F.relu(features, inplace = True)
    pooled = F.adaptive_avg_pool2d(pooled, (1, 1))
    pooled = torch.flatten(pooled, 1)
    fused = torch.cat((pooled.to(device), text_batch.float().to(device)), 1)
    return model.classifier(fused)


for epoch in range(NUM_EPOCHS):
    start_time = time.time()
    train_loss = 0
    val_loss = 0
    
    # Training the model
    model.train()
    # The feature extractor is frozen (its parameters do not require grad);
    # keeping it in eval mode also stops its BatchNorm running statistics
    # from being updated, so training and validation see the same features.
    model.features.eval()
    mini_batch_counter = 0
    # The loop variables are not named `labels` on purpose: that name holds
    # the raw label Series used by the encoding cells earlier in the notebook.
    for batch_inputs, batch_labels in train_loader:
        # Print the progress of our training
        if (mini_batch_counter % 50) == 0:
            print("Epoch: {}/{} | Phase: 'Train' | Batch: {}/{} | Time: {}".format(
              epoch + 1,
              NUM_EPOCHS, 
              mini_batch_counter + 1,
              len(train_loader),
              datetime.now()
            ))
        
        # Text mini batch: rows are selected by batch position, which relies
        # on train_loader iterating in row order (shuffle = False).
        text_train_mini_batch = X_train_tokens[mini_batch_counter * BATCH_SIZE : (mini_batch_counter + 1) * BATCH_SIZE]
        # Move to device
        batch_inputs, batch_labels = batch_inputs.to(device), batch_labels.to(device)
        # Clear optimizers
        optimizer.zero_grad()
        # Forward pass
        output = _fused_forward(batch_inputs, text_train_mini_batch)
        # Loss
        loss = criterion(output, batch_labels)
        # Calculate gradients (backpropogation)
        loss.backward()
        # Adjust parameters based on gradients
        optimizer.step()
        # Add the loss to the training set's running loss (weighted by batch size)
        train_loss += loss.item() * batch_inputs.size(0)
        
        mini_batch_counter += 1
    
    # Get the average loss for the entire epoch
    train_loss = train_loss / len(train_loader.dataset)   
    running_train_loss.append(train_loss)
    elapsed_train_time = time.time() - start_time
    
    print('Epoch: {} / {} \tTraining Loss: {:.6f} \tTrain Time: {:.6f}mins'.format(
        epoch + 1, NUM_EPOCHS, train_loss, elapsed_train_time / 60
    ))

    # Evaluating the model
    model.eval()
    mini_batch_counter = 0
    # Tell torch not to calculate gradients
    with torch.no_grad():
        for batch_inputs, batch_labels in val_loader:
            # Print the progress of our training
            if (mini_batch_counter % 50) == 0:
                print("Epoch: {}/{} | Phase: 'Test' | Batch: {}/{} | Time: {}".format(
                  epoch + 1,
                  NUM_EPOCHS, 
                  mini_batch_counter + 1,
                  len(val_loader),
                  datetime.now()
                ))
                
            # Text mini batch (same positional slicing as in training)
            text_val_mini_batch = X_val_tokens[mini_batch_counter * BATCH_SIZE : (mini_batch_counter + 1) * BATCH_SIZE]
            # Move to device
            batch_inputs, batch_labels = batch_inputs.to(device), batch_labels.to(device)
            # Forward pass
            output = _fused_forward(batch_inputs, text_val_mini_batch)
            # Calculate Loss
            valloss = criterion(output, batch_labels)
            # Add loss to the validation set's running loss
            val_loss += valloss.item() * batch_inputs.size(0)

            mini_batch_counter += 1
            
    # Get the average loss for the entire epoch
    valid_loss = val_loss/len(val_loader.dataset)
    running_val_loss.append(valid_loss)
    elapsed_test_time = time.time() - start_time - elapsed_train_time
    
    # Keep a copy of the weights of the epoch with the lowest validation loss
    if valid_loss < best_loss:
        best_loss = valid_loss
        best_epoch = epoch
        best_model_wts = copy.deepcopy(model.state_dict())
    
    # Print out the information
    print('Epoch: {} / {} \tValidation Loss: {:.6f} \tValidation Time: {:.6f}mins'.format(
        epoch + 1, NUM_EPOCHS, valid_loss, elapsed_test_time/60
    ))
    
    # plot the cost
    plt.plot(running_val_loss)
    plt.ylabel('cost')
    plt.xlabel('epochs')
    plt.show()

# best_epoch is the zero-based loop index (the progress lines above are one-based)
print('Best Epoch is ' + str(best_epoch))
model.load_state_dict(best_model_wts)

### Loss vs. Epochs

In [None]:
# Training loss per epoch
fig, ax = plt.subplots()
ax.plot(running_train_loss)
ax.set_ylabel('cost')
ax.set_xlabel('epochs')
plt.show()

In [None]:
# Validation loss per epoch
fig, ax = plt.subplots()
ax.plot(running_val_loss)
ax.set_ylabel('cost')
ax.set_xlabel('epochs')
plt.show()

### Prediction and Scoring on Validation Set

In [None]:
model.eval()

# Get output
start_time = time.time()
whole_val_outputs = np.zeros((len(val_dataset), n_classes))
whole_val_labels = np.zeros((len(val_dataset), n_classes))

mini_batch_counter = 0
# Inference only: no autograd graph is needed for the classifier.
with torch.no_grad():
    for val_batch_input, val_batch_labels in val_loader:
        if ((mini_batch_counter) % 50 == 0):
            print(str(mini_batch_counter + 1) + '/' + str(len(val_loader)))

        # Text mini batch: rows are picked by batch position, which relies on
        # val_loader iterating in row order (shuffle = False).
        text_val_mini_batch = X_val_tokens[mini_batch_counter * BATCH_SIZE : (mini_batch_counter + 1) * BATCH_SIZE]
        text_val_mini_batch = text_val_mini_batch.float()

        # Move to device
        val_batch_input = val_batch_input.to(device)
        # Forward pass: image features -> pooled vector -> concat with text -> classifier
        features = model.features(val_batch_input)
        out = F.relu(features, inplace = True)
        out = F.adaptive_avg_pool2d(out, (1, 1))
        out = torch.flatten(out, 1)
        concatenated_embeddings_torch = torch.cat((out.to(device), text_val_mini_batch.to(device)), 1)
        val_batch_scores = model.classifier(concatenated_embeddings_torch)

        # The criterion is BCEWithLogitsLoss, which applies a sigmoid to every
        # output independently; the same sigmoid is applied here to obtain
        # per-label probabilities. It is evaluated in float64 so that large
        # scores do not all round to exactly 1.0 and produce ties.
        val_batch_output = torch.sigmoid(val_batch_scores.double()).cpu().numpy()
        val_batch_labels = val_batch_labels.cpu().numpy()

        whole_val_outputs[mini_batch_counter * BATCH_SIZE:(mini_batch_counter + 1) * BATCH_SIZE, :] = val_batch_output
        whole_val_labels[mini_batch_counter * BATCH_SIZE:(mini_batch_counter + 1) * BATCH_SIZE, :] = val_batch_labels
        mini_batch_counter += 1
    
elapsed_time = time.time() - start_time
print(elapsed_time)

In [None]:
# Spot check: ground-truth label row of the first validation sample
whole_val_labels[0]

In [None]:
# Spot check: model scores of the first validation sample
whole_val_outputs[0]

In [None]:
# Get Prediction on Validation
PERCENTILE = 99.7
# One cut-off per sample: the PERCENTILE-th percentile of that row's scores
val_cutoffs = np.percentile(whole_val_outputs, PERCENTILE, axis = 1, keepdims = True)
# A label is predicted (1) when its score is strictly above the row's cut-off
whole_val_predictions = np.where(whole_val_outputs > val_cutoffs, 1.0, 0.0)

# Calculate F1 Score on validation set (weighted average over labels)
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html
print(sklearn.metrics.f1_score(y_true = whole_val_labels, y_pred = whole_val_predictions, average = 'weighted'))

In [None]:
# Spot check: thresholded 0/1 prediction row of the first validation sample
whole_val_predictions[0]

### Prediction on Test Set

In [None]:
# Final Prediction
# Put the model in evaluation mode here as well, so this cell does not depend
# on the validation cell having been run immediately before it.
model.eval()

# Get output
start_time = time.time()
whole_test_outputs = np.zeros((len(test_dataset), n_classes))
mini_batch_counter = 0
# Inference only: no autograd graph is needed for the classifier.
with torch.no_grad():
    for test_batch_input in test_loader:
        if ((mini_batch_counter) % 50 == 0):
            print(str(mini_batch_counter + 1) + '/' + str(len(test_loader)))
        
        # Text mini batch: rows are picked by batch position, which relies on
        # test_loader iterating in row order (shuffle = False).
        text_test_mini_batch = X_test_tokens[mini_batch_counter * BATCH_SIZE : (mini_batch_counter + 1) * BATCH_SIZE]
        text_test_mini_batch = text_test_mini_batch.float()     

        test_batch_input = test_batch_input.to(device)
        
        # Forward: image features -> pooled vector -> concat with text -> classifier
        features = model.features(test_batch_input)
        out = F.relu(features, inplace = True)
        out = F.adaptive_avg_pool2d(out, (1, 1))
        out = torch.flatten(out, 1)
        concatenated_embeddings_torch = torch.cat((out.to(device), text_test_mini_batch.to(device)), 1)

        test_batch_scores = model.classifier(concatenated_embeddings_torch)

        # The criterion is BCEWithLogitsLoss, which applies a sigmoid to every
        # output independently; the same sigmoid is applied here to obtain
        # per-label probabilities. It is evaluated in float64 so that large
        # scores do not all round to exactly 1.0 and produce ties.
        test_batch_output = torch.sigmoid(test_batch_scores.double()).cpu().numpy()

        whole_test_outputs[mini_batch_counter * BATCH_SIZE:(mini_batch_counter + 1) * BATCH_SIZE, :] = test_batch_output
        mini_batch_counter += 1
    
elapsed_time = time.time() - start_time
print(elapsed_time)

In [None]:
# Get Prediction on Test
PERCENTILE = 99.7
# One cut-off per sample: the PERCENTILE-th percentile of that row's scores
test_cutoffs = np.percentile(whole_test_outputs, PERCENTILE, axis = 1, keepdims = True)
# A label is predicted (1) when its score is strictly above the row's cut-off
whole_test_predictions = np.where(whole_test_outputs > test_cutoffs, 1.0, 0.0)

### Submission

In [None]:
# Submission: turn every 0/1 prediction row back into its space-separated label string
submission = revert_encoding(
    labels_expanded = whole_test_predictions,
    label_dict_revert = label_dict_revert
)

In [None]:
# Sanity check: there should be one label string per test row
np.array(submission).shape

In [None]:
# Build the submission table: add the predicted labels, drop the captions.
# assign() returns a new frame instead of modifying the one the test dataset
# still references, and errors = 'ignore' lets this cell be re-run after
# 'Caption' has already been removed.
test_df = test_df.assign(Labels = submission).drop(columns = 'Caption', errors = 'ignore')
test_df

In [None]:
# Create the output directory if needed, so the write cannot fail on a
# missing folder after the whole pipeline has already run.
os.makedirs(dir_output, exist_ok = True)
test_df.to_csv(os.path.join(dir_output, 'Submission_Model_V8_densenet161_BERB.csv'), index = False)

### Save the Model

In [None]:
# Save the model
# Only the state dict (parameters and buffers) is stored; the output directory
# is created first so the save cannot fail on a missing folder.
os.makedirs(dir_output, exist_ok = True)
PATH = os.path.join(dir_output, 'Model_V8_densenet161_BERB.pth')
torch.save(model.state_dict(), PATH)