# Setup

In [1]:
from google.colab import drive
drive.mount('/content/drive')
%cd "/content/drive/MyDrive/Colab Notebooks/NLP/project/"


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Colab Notebooks/NLP/project


# Load training data

In [2]:
from torch.utils.data import IterableDataset, DataLoader
import os
import sys
import torch
from pathlib import Path
import gc
import numpy as np

module_path = "ut-mit-news-classify/NYT/"
if module_path not in sys.path:
    sys.path.append(module_path)
import utils


class ConcatenatingDataset(IterableDataset):
    def __init__(self, ensemble_dataset, gpt_dataset):
#       assert len(ensemble_dataset) == len(gpt_dataset)
        self.ensemble_dataset = ensemble_dataset
        self.gpt_dataset = gpt_dataset

    def __len__(self):
        return len(self.gpt_dataset) if len(self.gpt_dataset) < len(self.ensemble_dataset) else len(self.ensemble_dataset)

    def __iter__(self):       
        for ensemble_X, gpt in zip(self.ensemble_dataset, self.gpt_dataset):
            yield torch.cat((ensemble_X, gpt[0]), 0), gpt[1]

            
class ChunkDataset(IterableDataset):
    def __init__(self, file_paths):
        self.file_paths = file_paths
        self.total_length = None

    def __len__(self):
        if self.total_length is not None:
            return self.total_length
        
        self.total_length = 0
        for fp in self.file_paths:
            print(f'Loading for len {fp}')
            self.total_length += len(torch.load(fp).X)
            gc.collect()
        return self.total_length

    def __iter__(self):
        for fp in self.file_paths:
            print(f'Loading for iter {fp}')
            dataset = torch.load(fp)
            gc.collect()
            if hasattr(dataset, 'y'):
                for x, y in zip(dataset.X, dataset.y):
                    yield x, y
            else:
                for x in dataset.X:
                    yield x


In [3]:
gpt_vectorized_chunks_path = Path('gpt2-vectors/train')
gpt_file_paths = os.listdir(gpt_vectorized_chunks_path)
sorted_gpt_filenames = sorted(gpt_file_paths, key=lambda fn: int(fn.split('chunk')[1].split('of')[0]))
sorted_gpt_filepaths = [gpt_vectorized_chunks_path / Path(p) for p in sorted_gpt_filenames]


In [4]:
ensemble_vectorized_chunks_path = 'vectorized-fixed/train'
ensemble_file_paths = os.listdir(ensemble_vectorized_chunks_path)
sorted_ensemble_filenames = sorted(ensemble_file_paths, key=lambda fn: int(fn.split('chunk')[1].split('of')[0]))
sorted_ensemble_filepaths = [ensemble_vectorized_chunks_path / Path(p) for p in sorted_ensemble_filenames]


In [7]:
concat_dataset = ConcatenatingDataset(
    ChunkDataset(sorted_ensemble_filepaths),
    ChunkDataset(sorted_gpt_filepaths))

train_X, train_y = [], []
gc.collect()
for x, y in concat_dataset:
    train_X.append(x.numpy())
    train_y.append(y.numpy())

train_X = np.asarray(train_X)
train_y = np.asarray(train_y)
gc.collect()

Loading for iter vectorized-fixed/train/ensemble_vect_train_chunk1of24.pt
Loading for iter gpt2-vectors/train/train_1195k_min500_cutoff_replace_chunk1of4.pt
Loading for iter vectorized-fixed/train/ensemble_vect_train_chunk2of24.pt
Loading for iter vectorized-fixed/train/ensemble_vect_train_chunk3of24.pt
Loading for iter vectorized-fixed/train/ensemble_vect_train_chunk4of24.pt
Loading for iter vectorized-fixed/train/ensemble_vect_train_chunk5of24.pt
Loading for iter vectorized-fixed/train/ensemble_vect_train_chunk6of24.pt
Loading for iter gpt2-vectors/train/train_1195k_min500_cutoff_replace_chunk2of4.pt
Loading for iter vectorized-fixed/train/ensemble_vect_train_chunk7of24.pt
Loading for iter vectorized-fixed/train/ensemble_vect_train_chunk8of24.pt
Loading for iter vectorized-fixed/train/ensemble_vect_train_chunk9of24.pt
Loading for iter vectorized-fixed/train/ensemble_vect_train_chunk10of24.pt
Loading for iter vectorized-fixed/train/ensemble_vect_train_chunk11of24.pt
Loading for iter v

In [10]:
train_X.shape, train_y.shape

((1195938, 1768), (1195938, 538))

In [11]:
del concat_dataset
gc.collect()

176

# Configure and start Wandb job

In [12]:
%%capture
pip install --upgrade wandb

In [13]:
!wandb login 

[34m[1mwandb[0m: Currently logged in as: [33mkristjan[0m (use `wandb login --relogin` to force relogin)


In [14]:
import wandb

# Start a new run
run = wandb.init(project='ensemble-plus-gpt2', entity='ut-mit-news-classify')

#hyperparams
epochs = 200
patience = 50
batch_size = 256
learning_rate = 1e-5
validation_split = 0.2

#reduce learning rate callback params
factor = 0.2
reduce_lr_patience = 5
min_lr = 0.001

# Save hyperparameters
config = wandb.config
config.batch_size = batch_size
config.epochs = epochs
config.early_stopping_patience = patience
config.learning_rate = learning_rate
config.validation_split = validation_split

config.reduce_lr = f'factor-{factor}__patience-{reduce_lr_patience}__min_lr-{min_lr}'


[34m[1mwandb[0m: Currently logged in as: [33mkristjan[0m (use `wandb login --relogin` to force relogin)


# Metrics functions for training with Keras

In [15]:
from keras import backend as K

def recall(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1(y_true, y_pred):
    p = precision(y_true, y_pred)
    r = recall(y_true, y_pred)
    return 2*((p*r)/(p+r+K.epsilon()))


# Create and Compile Model

In [16]:
import keras
from keras import layers

model = keras.Sequential()
model.add(layers.Dense(1400, activation="relu", input_shape=(1768,)))
model.add(layers.Dense(900, activation="relu", input_shape=(1768,)))
model.add(layers.Dense(538, activation="sigmoid", input_shape=(1768,)))

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 1400)              2476600   
_________________________________________________________________
dense_1 (Dense)              (None, 900)               1260900   
_________________________________________________________________
dense_2 (Dense)              (None, 538)               484738    
Total params: 4,222,238
Trainable params: 4,222,238
Non-trainable params: 0
_________________________________________________________________


In [17]:
best_val_loss_model_filename = 'ensemble-gpt-vectors-with-filtered-and-cut-off-ends-298984-articles.h5'


In [18]:
from wandb.keras import WandbCallback
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau


mc = ModelCheckpoint(best_val_loss_model_filename, monitor='val_loss', mode='min')
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=patience)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=factor, patience=reduce_lr_patience, min_lr=min_lr)
callbacks = [mc, es, reduce_lr, WandbCallback()]

opt = keras.optimizers.Adam(learning_rate=learning_rate)

model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy', recall, precision, f1])


# Train!

In [19]:

model.fit(train_X, train_y, epochs=epochs, callbacks=callbacks, validation_split=validation_split)


Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x7f298f0e2cd0>

In [20]:
config.train_set_shape = str(train_X.shape)

del train_X
del train_y
gc.collect()

2133

# Save model to Wandb

In [21]:
artifact = wandb.Artifact('ensemble-plus-gpt2-bigger-layers-full-train-filtered-and-cut-off-ends', type='keras-model')
artifact.add_file(best_val_loss_model_filename)
run.log_artifact(artifact)

<wandb.sdk.wandb_artifacts.Artifact at 0x7f298e68acd0>

# Load model and test on test set

In [22]:

model = keras.models.load_model(best_val_loss_model_filename, custom_objects={"f1": f1, "recall": recall, "precision": precision})
model.save('ensemble-gpt-vectors-with-filtered-and-cut-off-ends-all-train-articles.h5')

In [24]:
gpt_vectorized_chunks_path = Path('gpt2-vectors/test')
gpt_file_paths = os.listdir(gpt_vectorized_chunks_path)
sorted_gpt_filenames = sorted(gpt_file_paths, key=lambda fn: int(fn.split('chunk')[1].split('of')[0]))
sorted_gpt_filepaths = [gpt_vectorized_chunks_path / Path(p) for p in sorted_gpt_filenames]

ensemble_vectorized_chunks_path = 'vectorized-fixed/test'
ensemble_file_paths = os.listdir(ensemble_vectorized_chunks_path)
sorted_ensemble_filenames = sorted(ensemble_file_paths, key=lambda fn: int(fn.split('chunk')[1].split('of')[0]))
sorted_ensemble_filepaths = [ensemble_vectorized_chunks_path / Path(p) for p in sorted_ensemble_filenames]

concat_dataset = ConcatenatingDataset(
    ChunkDataset(sorted_ensemble_filepaths),
    ChunkDataset(sorted_gpt_filepaths))

test_X, test_y = [], []
gc.collect()
for x, y in concat_dataset:
    test_X.append(x.numpy())
    test_y.append(y.numpy())

test_X = np.asarray(test_X)
test_y = np.asarray(test_y)
gc.collect()

Loading for iter vectorized-fixed/test/ensemble_vect_test_chunk1of3.pt
Loading for iter gpt2-vectors/test/test_1195k_min500_cutoff_replace_chunk1of4.pt
Loading for iter gpt2-vectors/test/test_1195k_min500_cutoff_replace_chunk2of4.pt
Loading for iter vectorized-fixed/test/ensemble_vect_test_chunk2of3.pt
Loading for iter gpt2-vectors/test/test_1195k_min500_cutoff_replace_chunk3of4.pt
Loading for iter gpt2-vectors/test/test_1195k_min500_cutoff_replace_chunk4of4.pt
Loading for iter vectorized-fixed/test/ensemble_vect_test_chunk3of3.pt


0

In [25]:
config.test_set_shape = str(test_X.shape)


In [26]:
%%time
test_preds = model.predict(test_X)

CPU times: user 4.42 s, sys: 972 ms, total: 5.39 s
Wall time: 3.17 s


In [27]:
import torch

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

def multi_label_accuracy(correct_labels, predicted_labels, average):
    predicted_labels = torch.round(torch.tensor(predicted_labels)).numpy()
    accuracy = accuracy_score(correct_labels, predicted_labels)
    precision = precision_score(correct_labels, predicted_labels, average=average, zero_division=0)
    recall = recall_score(correct_labels, predicted_labels, average=average, zero_division=0)
    f_1_score = f1_score(correct_labels, predicted_labels, average=average, zero_division=0)
    
    return accuracy, precision, recall, f_1_score


In [28]:
for average in ['weighted', 'micro', 'macro', 'samples']:
    accuracy, precision, recall, f_score = multi_label_accuracy(test_y, test_preds, average)
    print(average + ' ' + str(round(f_score,3)))
    metrics = { 
            "test_precision"+'_'+ average: precision, 
            "test_f1"+'_'+ average: f_score, 
            "test_recall"+'_'+ average: recall,
            }
    wandb.log(metrics)
    
wandb.log({ 
        "test_accuracy": accuracy,
        })

weighted 0.666
micro 0.679
macro 0.569
samples 0.697


In [29]:
run.finish()
len(test_X)

VBox(children=(Label(value=' 96.71MB of 96.71MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.…

0,1
epoch,79.0
loss,0.00669
accuracy,0.5332
recall,0.71521
precision,0.866
f1,0.7821
val_loss,0.0093
val_accuracy,0.5164
val_recall,0.64582
val_precision,0.80199


0,1
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
loss,█▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
accuracy,▁▇▆▅▅▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇█▇█████████
recall,▁▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇███████████
precision,▁▁▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇███
f1,▁▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇████████████
val_loss,█▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▂
val_accuracy,▆█▅▄▃▁▅▄▃▄▄▃▄▄▄▃▄▄▃▃▄▄▄▃▄▃▄▅▃▃▃▄▄▄▃▄▄▄▃▃
val_recall,▁▅▆▆▆▆▇▆▆▇▇▇▇▇█▇▇█▇▇▇▇██▇▇▇███████▇█▇▇▇▇
val_precision,█▅▅▇▅▆▆█▇▅▆▇▆▄▄▅▅▃▄▅▆▄▃▃▅▅▄▂▂▃▂▁▁▂▄▂▃▃▃▂


In [30]:
del test_y
del test_X
gc.collect()

222