# Setup

In [1]:
from google.colab import drive
drive.mount('/content/drive')
%cd "/content/drive/MyDrive/Colab Notebooks/NLP/project/"


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Colab Notebooks/NLP/project


In [2]:
%%capture
pip install --upgrade wandb

In [3]:
!wandb login 

[34m[1mwandb[0m: Currently logged in as: [33mkristjan[0m (use `wandb login --relogin` to force relogin)


# Load training data

In [4]:
%%time
import os
from tqdm.notebook import tqdm
import torch
from torch.utils.data import Dataset


class GPTEmbeddedDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx], idx


def load_all(directory):
    sorted_filenames = sorted(os.listdir(directory), key=lambda fn: int(fn.split('of')[0].split('chunk')[1]))
    all_X = None
    all_y = None
    for filename in tqdm(sorted_filenames):
        saved_dataset = torch.load(directory + filename)

        if all_X is not None:
            all_X = torch.cat([all_X, saved_dataset.X])
            all_y = torch.cat([all_y, saved_dataset.y])
        else:
            all_X = saved_dataset.X
            all_y = saved_dataset.y
    all_X = all_X.numpy()
    all_y = all_y.numpy()
    
    return all_X, all_y


drive_dir = 'nikita-vectors/tfidf/train/'
X, y = load_all(drive_dir)


HBox(children=(FloatProgress(value=0.0, max=25.0), HTML(value='')))


CPU times: user 33 s, sys: 30.2 s, total: 1min 3s
Wall time: 2min 59s


In [5]:
X.shape

(1298479, 1000)

# Configure and start Wandb job

In [19]:
import wandb

# Start a new run
run = wandb.init(project='mit-ensemble', entity='ut-mit-news-classify')

#hyperparams
epochs = 200
patience = 10
batch_size = 512
learning_rate = 1e-5
validation_split = 0.2

#reduce learning rate callback params
factor = 0.2
reduce_lr_patience = 5
min_lr = 0.001

# Save hyperparameters
config = wandb.config
config.batch_size = batch_size
config.epochs = epochs
config.early_stopping_patience = patience
config.learning_rate = learning_rate
config.validation_split = validation_split
config.train_set_shape = str(X.shape)

config.reduce_lr = f'factor-{factor}__patience-{reduce_lr_patience}__min_lr-{min_lr}'


VBox(children=(Label(value=' 36.80MB of 36.80MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.…

0,1
epoch,199.0
loss,0.00827
accuracy,0.53287
recall,0.64623
precision,0.83318
f1,0.72782
val_loss,0.0085
val_accuracy,0.52873
val_recall,0.64024
val_precision,0.82834


0,1
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
loss,█▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
accuracy,▁█████████▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇
recall,▁▅▇▇▇▇██████████████████████████████████
precision,█▃▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
f1,▁▆▇▇████████████████████████████████████
val_loss,█▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_accuracy,▁████████▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇
val_recall,▁▅▇▇▇▇██████████████████████████████████
val_precision,█▅▂▃▂▂▂▁▁▂▂▂▂▂▁▂▂▂▂▁▂▁▁▂▂▂▂▂▂▁▂▂▂▂▂▂▁▁▁▂


# Metrics functions for training with Keras

In [20]:
from keras import backend as K

def recall(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1(y_true, y_pred):
    p = precision(y_true, y_pred)
    r = recall(y_true, y_pred)
    return 2*((p*r)/(p+r+K.epsilon()))


# Create and Compile Model

In [21]:
import keras
from keras import layers

model = keras.Sequential()
model.add(layers.Dense(800, activation="relu", input_shape=(1000,)))
model.add(layers.Dense(600, activation="relu", input_shape=(1000,)))
model.add(layers.Dense(538, activation="sigmoid", input_shape=(1000,)))

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 800)               800800    
_________________________________________________________________
dense_4 (Dense)              (None, 600)               480600    
_________________________________________________________________
dense_5 (Dense)              (None, 538)               323338    
Total params: 1,604,738
Trainable params: 1,604,738
Non-trainable params: 0
_________________________________________________________________


In [22]:
from wandb.keras import WandbCallback
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau


best_val_loss_model_filename = 'best_val_loss_model_with_reduce_lr.h5'
mc = ModelCheckpoint(best_val_loss_model_filename, monitor='val_loss', mode='min')
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=patience)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=factor, patience=reduce_lr_patience, min_lr=min_lr)
callbacks = [mc, es, reduce_lr, WandbCallback()]

opt = keras.optimizers.Adam(learning_rate=learning_rate)

model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy', recall, precision, f1])


# Train!

In [23]:

model.fit(X, y, epochs=epochs, validation_split=validation_split, callbacks=callbacks, batch_size=batch_size)


Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x7f78cd6658d0>

# Save model to Wandb

In [24]:
artifact = wandb.Artifact('simple-mit-ensemble', type='keras-model')
artifact.add_file(best_val_loss_model_filename)
run.log_artifact(artifact)

<wandb.sdk.wandb_artifacts.Artifact at 0x7f7785712510>

# Load model and test on test set

In [25]:

model = keras.models.load_model(best_val_loss_model_filename, custom_objects={"f1": f1, "recall": recall, "precision": precision})


In [26]:
%%time
drive_dir = 'nikita-vectors/tfidf/test/'
test_X, test_y = load_all(drive_dir)


HBox(children=(FloatProgress(value=0.0, max=25.0), HTML(value='')))


CPU times: user 4.62 s, sys: 2.1 s, total: 6.72 s
Wall time: 17.9 s


In [27]:
config.test_set_shape = str(test_X.shape)


In [28]:
%%time
test_preds = model.predict(test_X)

CPU times: user 4.57 s, sys: 833 ms, total: 5.4 s
Wall time: 3.1 s


In [29]:
import torch

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

def multi_label_accuracy(correct_labels, predicted_labels, average):
    predicted_labels = torch.round(torch.tensor(predicted_labels)).numpy()
    accuracy = accuracy_score(correct_labels, predicted_labels)
    precision = precision_score(correct_labels, predicted_labels, average=average, zero_division=0)
    recall = recall_score(correct_labels, predicted_labels, average=average, zero_division=0)
    f_1_score = f1_score(correct_labels, predicted_labels, average=average, zero_division=0)
    
    return accuracy, precision, recall, f_1_score


In [30]:
for average in ['weighted', 'micro', 'macro', 'samples']:
    accuracy, precision, recall, f_score = multi_label_accuracy(test_y, test_preds, average)
    print(average + ' ' + str(round(f_score,3)))
    metrics = { 
            "test_precision"+'_'+ average: precision, 
            "test_f1"+'_'+ average: f_score, 
            "test_recall"+'_'+ average: recall,
            }
    wandb.log(metrics)
    
wandb.log({ 
        "test_accuracy": accuracy,
        })

weighted 0.673
micro 0.688
macro 0.567
samples 0.698


In [31]:
metrics

{'test_f1_samples': 0.6984067592643254,
 'test_precision_samples': 0.7870070214478407,
 'test_recall_samples': 0.6890209643692411}