# Setup

In [1]:
%load_ext autoreload
%autoreload 2

## Definitions

In [51]:
from utils import GPTVectorizedDataset
from torch.utils.data import IterableDataset, Dataset, DataLoader, random_split
import os
import sys
import torch
from pathlib import Path
import gc
import utils
import time
import math
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

class ConcatenatingDataset(IterableDataset):
    def __init__(self, ensemble_dataset, gpt_dataset, start=0.1, end=1):
        super().__init__()
#       assert len(ensemble_dataset) == len(gpt_dataset)
        self.start = start
        self.end = end
        self.ensemble_dataset = ensemble_dataset
        self.gpt_dataset = gpt_dataset
        self.idx = 0

    def __len__(self):
        return len(self.gpt_dataset) if len(self.gpt_dataset) < len(self.ensemble_dataset) else len(self.ensemble_dataset)

    def __iter__(self):
        length = len(self)
        start = math.floor(self.start * length)
        end = math.floor(self.end * length)
        for ensemble_X, gpt in zip(self.ensemble_dataset, self.gpt_dataset):
            self.idx += 1
            yield torch.cat((ensemble_X, gpt[0]), 0), gpt[1]

            
class ChunkDataset(IterableDataset):
    def __init__(self, file_paths):
        super().__init__()
        self.file_paths = file_paths
        self.total_length = None

    def __len__(self):
        if self.total_length is not None:
            return self.total_length
        
        self.total_length = 0
        for fp in self.file_paths:
            print(f'Loading for len {fp}')
            self.total_length += len(torch.load(fp).X)
            gc.collect()
        return self.total_length

    def __iter__(self):
        for fp in self.file_paths:
            print(f'Loading for iter {fp}')
            dataset = torch.load(fp)
            gc.collect()
            if hasattr(dataset, 'y'):
                for x, y in zip(dataset.X, dataset.y):
                    yield x, y
            else:
                for x in dataset.X:
                    yield x


def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs


def validation_split(dataset, validation_subset, seed=42):

    if validation_subset > 0:
        n_total_samples = len(dataset)
        n_train_samples = math.floor(n_total_samples * (1-validation_subset))
        n_valid_samples = n_total_samples - n_train_samples

        train_subset, valid_subset = random_split(
            dataset,
            [n_train_samples, n_valid_samples],
            generator=torch.Generator().manual_seed(seed)
        )  # reproducible results

    else:
        train_subset = dataset
        valid_subset = None

    return train_subset, valid_subset


def multi_label_scores(correct_labels, predicted_labels):

    accuracy = accuracy_score(correct_labels, predicted_labels)
    precision = precision_score(correct_labels, predicted_labels, average='weighted', zero_division=0)
    recall = recall_score(correct_labels, predicted_labels, average='weighted', zero_division=0)
    f_1_score = f1_score(correct_labels, predicted_labels, average='weighted', zero_division=0)
    
    return accuracy, precision, recall, f_1_score

def gettags(head_model, features, eval=False):
    head_model.eval()
    features = features.unsqueeze(0).to(device)
    with torch.no_grad():
        logits = head_model(features)
        multi_label_sigmoids = head_model.act(logits)

    preds = multi_label_sigmoids > 0.5
    preds = preds.detach().cpu()

    return mlb.inverse_transform(preds)

In [21]:
import torch
import torch.optim as optim
import torch.nn as nn
from tqdm.notebook import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    epoch_precision = 0
    epoch_recall = 0
    epoch_f_score = 0
    
    model.train()
    

    for i, batch in enumerate(iterator):

        article_embeddings, labels, idx  = batch
        article_embeddings = article_embeddings.to(device)
        labels = labels.type(torch.float).to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(article_embeddings)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # calculate metrics
        preds = model.act(outputs) > 0.5

        acc, precision, recall, f1 = multi_label_scores(labels.detach().cpu(), preds.detach().cpu())
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        epoch_precision += precision.item()
        epoch_recall += recall.item()
        epoch_f_score += f1.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator), \
        epoch_precision / len(iterator), epoch_recall / len(iterator), \
        epoch_f_score / len(iterator)


def evaluate(model, iterator, criterion, scheduler=None):
    
    epoch_loss = 0
    epoch_acc = 0
    epoch_precision = 0
    epoch_recall = 0
    epoch_f_score = 0
    
    model.eval()
    
    with torch.no_grad():
        for i, batch in enumerate(iterator):

            article_embeddings, labels, idx  = batch
            article_embeddings = article_embeddings.to(device)
            labels = labels.type(torch.float).to(device)

            outputs = model(article_embeddings)
            loss = criterion(outputs, labels)

            # calculate metrics
            preds = model.act(outputs) > 0.5

            acc, precision, recall, f1 = multi_label_scores(labels.detach().cpu(), preds.detach().cpu())
            epoch_loss += loss.item()
            epoch_acc += acc.item()
            epoch_precision += precision.item()
            epoch_recall += recall.item()
            epoch_f_score += f1.item()
            
    # lr scheduling
    if scheduler:
        scheduler.step(epoch_loss / len(iterator))
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator), \
        epoch_precision / len(iterator), epoch_recall / len(iterator), \
        epoch_f_score / len(iterator)

## Load training data

In [22]:
gpt_vectorized_chunks_path = Path('/gpfs/space/projects/stud_nlp_share/cutoff/GPT/vectorized/train')
gpt_file_paths = os.listdir(gpt_vectorized_chunks_path)
sorted_gpt_filenames = sorted(gpt_file_paths, key=lambda fn: int(fn.split('chunk')[1].split('of')[0]))
sorted_gpt_filepaths = [gpt_vectorized_chunks_path / Path(p) for p in sorted_gpt_filenames]

In [23]:
ensemble_vectorized_chunks_path = '/gpfs/space/projects/stud_nlp_share/cutoff/ensemble/vectorized/train'
ensemble_file_paths = os.listdir(ensemble_vectorized_chunks_path)
sorted_ensemble_filenames = sorted(ensemble_file_paths, key=lambda fn: int(fn.split('chunk')[1].split('of')[0]))
sorted_ensemble_filepaths = [ensemble_vectorized_chunks_path / Path(p) for p in sorted_ensemble_filenames]

In [24]:
sorted_gpt_filenames

['train_1195k_min500_cutoff_replace_chunk1of4.pt',
 'train_1195k_min500_cutoff_replace_chunk2of4.pt',
 'train_1195k_min500_cutoff_replace_chunk3of4.pt',
 'train_1195k_min500_cutoff_replace_chunk4of4.pt']

In [52]:
ensemble_dataset = ChunkDataset(sorted_ensemble_filepaths)
gpt_dataset_first_chunk = ChunkDataset(sorted_gpt_filepaths)

concat_dataset = ConcatenatingDataset(ensemble_dataset, gpt_dataset_first_chunk)

In [53]:
# splitting train/validation
batch_size = 256
seed = 42

train_subset, valid_subset = validation_split(concat_dataset, 0.1, seed)
train_loader = DataLoader(train_subset, batch_size=batch_size)
valid_loader = DataLoader(valid_subset, batch_size=batch_size)

n_training_samples = len(concat_dataset)
n_training_samples

Loading for len /gpfs/space/projects/stud_nlp_share/cutoff/GPT/vectorized/train/train_1195k_min500_cutoff_replace_chunk1of4.pt
Loading for len /gpfs/space/projects/stud_nlp_share/cutoff/GPT/vectorized/train/train_1195k_min500_cutoff_replace_chunk2of4.pt
Loading for len /gpfs/space/projects/stud_nlp_share/cutoff/GPT/vectorized/train/train_1195k_min500_cutoff_replace_chunk3of4.pt
Loading for len /gpfs/space/projects/stud_nlp_share/cutoff/GPT/vectorized/train/train_1195k_min500_cutoff_replace_chunk4of4.pt
Loading for len /gpfs/space/projects/stud_nlp_share/cutoff/ensemble/vectorized/train/ensemble_vect_train_chunk1of24.pt
Loading for len /gpfs/space/projects/stud_nlp_share/cutoff/ensemble/vectorized/train/ensemble_vect_train_chunk2of24.pt
Loading for len /gpfs/space/projects/stud_nlp_share/cutoff/ensemble/vectorized/train/ensemble_vect_train_chunk3of24.pt
Loading for len /gpfs/space/projects/stud_nlp_share/cutoff/ensemble/vectorized/train/ensemble_vect_train_chunk4of24.pt
Loading for len 

1195938

In [32]:
len(train_loader)

4205

In [54]:
for i in train_loader:
    break

Loading for iter /gpfs/space/projects/stud_nlp_share/cutoff/ensemble/vectorized/train/ensemble_vect_train_chunk1of24.pt
Loading for iter /gpfs/space/projects/stud_nlp_share/cutoff/GPT/vectorized/train/train_1195k_min500_cutoff_replace_chunk1of4.pt


## Define model

In [28]:
class HeadClassifier(nn.Module):

    def __init__(self, hidden_dim, num_classes=538):
        super().__init__()
        self.relu = nn.ReLU()

        self.lin1 = nn.Linear(hidden_dim, 1600)
        self.lin2 = nn.Linear(1600, 900)
        self.lin3 = nn.Linear(900, num_classes)
        
        self.act = nn.Sigmoid()  # used manually after `forward()` to compute preds/accuracies

    def forward(self, x):
        x = self.relu(self.lin1(inputs))
        x = self.relu(self.lin2(inputs))
        x = self.relu(self.lin3(inputs))

        return x

## Training!

In [29]:
import wandb

# Start a new run
run = wandb.init(project='ensemble-plus-gpt2', entity='ut-mit-news-classify')

#hyperparams
epochs = 200
patience = 50
# `batch_size` is defined earlier
# `seed` is defined earlier
learning_rate = 1e-3
validation_split = 0.1

# reduce learning rate callback params
factor = 0.2
reduce_lr_patience = 5
min_lr = 1e-6

# Save hyperparameters
config = wandb.config
config.batch_size = batch_size
config.epochs = epochs
config.early_stopping_patience = patience
config.learning_rate = learning_rate
config.validation_split = validation_split

config.reduce_lr = f'fac{factor}_patience{reduce_lr_patience}_min_lr{min_lr}'

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

In [30]:
# model
model = HeadClassifier(1768, 538).to(device)

criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(),
                    lr = 5e-1, # default is 5e-5, our notebook had 2e-5
                  )
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=factor, patience=reduce_lr_patience, min_lr=min_lr, verbose=True)

# training
epochs_of_no_improvement = 0
best_valid_loss = float('inf')

best_val_loss_model_filename = 'models/ensemble-gpt-vectors-with-filtered-and-cut-off-ends-full.h5'
wandb.save(best_val_loss_model_filename)

for epoch in range(epochs):

    start_time = time.time()
    
    train_loss, train_acc, train_precision, train_recall, train_f_score \
        = train(model, train_loader, optimizer, criterion)
    valid_loss, valid_acc, valid_precision, valid_recall, valid_f_score \
        = evaluate(model, valid_loader, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        print(f'New validation loss {valid_loss} is better than the best validation loss {best_valid_loss} so far.')
        best_valid_loss = valid_loss
        torch.save(model, best_val_loss_model_filename)
        epochs_of_no_improvement = 0
    else: 
        epochs_of_no_improvement += 1
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | ' +
          f'Train Precision: {train_precision*100:.2f}% | Train Recall: {train_recall*100:.2f}% | ' +
          f'Train F1-score: {train_f_score*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}% | ' +
          f'Val. Precision: {valid_precision*100:.2f}% | Val. Recall: {valid_recall*100:.2f}% | ' +
          f'Val. F1-score: {valid_f_score*100:.2f}%')
    
    wandb.log({"train_loss": train_loss, 
                "train_precision": train_precision, 
                "train_f_score": train_f_score, 
                "train_acc": train_acc,
                "train_recall": train_recall,
               "valid_loss": valid_loss,
               "valid_acc": valid_acc,
               "valid_precision": valid_precision,
               "valid_recall": valid_recall,
               "valid_f_score": valid_f_score,
               "epoch": epoch+1,
                })
    # check if the training should be stopped and then stop the training
    if epochs_of_no_improvement == patience : 
        print(f'Early stopping, on epoch: {epoch+1}.')
        break

torch.cuda.empty_cache()

NotImplementedError: 

## Evaluate on test (same wandb run)

In [16]:
gpt_vectorized_chunks_path = Path('/gpfs/space/projects/stud_nlp_share/cutoff/GPT/vectorized/test')
gpt_file_paths = os.listdir(gpt_vectorized_chunks_path)
sorted_gpt_filenames = sorted(gpt_file_paths, key=lambda fn: int(fn.split('chunk')[1].split('of')[0]))
sorted_gpt_filepaths = [gpt_vectorized_chunks_path / Path(p) for p in sorted_gpt_filenames]

In [17]:
ensemble_vectorized_chunks_path = '/gpfs/space/projects/stud_nlp_share/cutoff/ensemble/vectorized/test'
ensemble_file_paths = os.listdir(ensemble_vectorized_chunks_path)
sorted_ensemble_filenames = sorted(ensemble_file_paths, key=lambda fn: int(fn.split('chunk')[1].split('of')[0]))
sorted_ensemble_filepaths = [ensemble_vectorized_chunks_path / Path(p) for p in sorted_ensemble_filenames]

In [20]:
test_ensemble_dataset = ChunkDataset(sorted_ensemble_filepaths)
test_gpt_dataset = ChunkDataset(sorted_gpt_filepaths)

test_dataset = ConcatenatingDataset(test_ensemble_dataset, test_gpt_dataset)

In [21]:
len(test_dataset)

Loading for len /gpfs/space/projects/stud_nlp_share/cutoff/GPT/vectorized/test/test_1195k_min500_cutoff_replace_chunk1of4.pt
Loading for len /gpfs/space/projects/stud_nlp_share/cutoff/GPT/vectorized/test/test_1195k_min500_cutoff_replace_chunk2of4.pt
Loading for len /gpfs/space/projects/stud_nlp_share/cutoff/GPT/vectorized/test/test_1195k_min500_cutoff_replace_chunk3of4.pt
Loading for len /gpfs/space/projects/stud_nlp_share/cutoff/GPT/vectorized/test/test_1195k_min500_cutoff_replace_chunk4of4.pt
Loading for len /gpfs/space/projects/stud_nlp_share/cutoff/ensemble/vectorized/test/ensemble_vect_test_chunk1of3.pt
Loading for len /gpfs/space/projects/stud_nlp_share/cutoff/ensemble/vectorized/test/ensemble_vect_test_chunk2of3.pt
Loading for len /gpfs/space/projects/stud_nlp_share/cutoff/ensemble/vectorized/test/ensemble_vect_test_chunk3of3.pt


133032

In [None]:
# evaluate the model on the test set

test_model = torch.load(best_val_loss_model_filename)
test_loader = DataLoader(test_dataset, batch_size=256)
criterion = nn.BCEWithLogitsLoss()

start_time = time.time()
test_loss, test_acc, test_precision, test_recall, test_f_score \
    = evaluate(test_model, test_loader, criterion)
end_time = time.time()
epoch_mins, epoch_secs = epoch_time(start_time, end_time)

wandb.run.summary[f'test_acc'] = test_acc
wandb.run.summary[f'test_precision'] = test_precision
wandb.run.summary[f'test_recall'] = test_recall
wandb.run.summary[f'test_f_score'] = test_f_score

print(f'Epoch: test | Epoch Time: {epoch_mins}m {epoch_secs}s')
print(f'\tTest Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}% | ' +
      f'Test Precision: {test_precision*100:.2f}% | Test Recall: {test_recall*100:.2f}% | ' +
      f'Test F1-score: {test_f_score*100:.2f}%')

wandb.finish()

# Save model to Wandb

In [None]:
artifact = wandb.Artifact('ensemble-plus-gpt2-bigger-layers-full-train', type='keras-model')
artifact.add_file(best_val_loss_model_filename)
run.log_artifact(artifact)

<wandb.sdk.wandb_artifacts.Artifact at 0x7f7dd9f269d0>

# Load model and test on test set

In [None]:
run.finish()

VBox(children=(Label(value=' 96.71MB of 96.71MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.…

0,1
epoch,145.0
loss,0.00742
accuracy,0.53805
recall,0.68013
precision,0.84985
f1,0.75536
val_loss,0.00855
val_accuracy,0.52053
val_recall,0.64804
val_precision,0.82058


0,1
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
loss,█▄▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
accuracy,▃█▇▆▅▄▃▃▂▂▂▂▂▂▂▂▂▁▂▁▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▃▃
recall,▁▄▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇█████████████████████
precision,█▃▂▁▁▁▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆
f1,▁▅▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇████████████████████
val_loss,█▄▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_accuracy,▆█▇▇▇█▇▆▅▅▄▅▄▃▄▃▃▃▃▂▃▄▃▃▄▅▆▁▄▅▅▅▃▄▆▃▄▂▂▁
val_recall,▁▄▆▆▇▇▇▇▇▇▇█▇▇█▇██▇█▇██▇████████████████
val_precision,█▅▅▄▂▄▄▃▄▄▄▃▄▄▃▄▂▃▄▂▃▂▃▃▂▃▁▃▃▃▃▁▂▂▁▃▂▃▃▂


In [None]:
del test_y
del test_X
gc.collect()

NameError: ignored