# Definitions (run first!)

In [3]:
import gzip
import pickle
import requests
import csv
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import GPT2Tokenizer, GPT2Model
import torch
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np
import wandb
import gc

In [5]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

In [6]:
class GPTEmbeddedDataset(Dataset):
    """Dataset that serves items from two parallel indexable containers, ``X`` and ``y``.

    Each item is the triple ``(X[idx], y[idx], idx)``; the index is returned as
    well so a caller can trace an item back to its position.
    """

    def __init__(self, X, y):
        # Only references are stored; nothing is copied or converted.
        self.X, self.y = X, y

    def __len__(self):
        # The dataset length is defined by the feature container alone.
        n_samples = len(self.X)
        return n_samples

    def __getitem__(self, idx):
        features = self.X[idx]
        target = self.y[idx]
        return features, target, idx

In [7]:
import csv

def loadcsv(filename):
    """Read the UTF-8 encoded CSV file ``filename`` and return its rows.

    Returns a list with one entry per CSV record, each a list of strings
    (a blank line in the file yields an empty list).
    """
    rows = []
    with open(filename, newline='', encoding='utf-8') as csv_file:
        for record in csv.reader(csv_file):
            rows.append(record)
    return rows

def load_label_map(out2id_path, id2label_path):
    """Build the list of label names ordered by model output index.

    Args:
        out2id_path: CSV file whose rows hold an integer output index in the
            first column and a label id in the second.
        id2label_path: CSV file whose rows hold a label id in the second
            column and the label name in the third.

    Returns:
        list: label names, sorted by ascending output index.

    Raises:
        KeyError: if a label id from ``out2id_path`` has no entry in
            ``id2label_path``.
    """
    # csv.reader yields an empty list for a blank line. Skip those rows in
    # BOTH files so a stray blank line cannot raise IndexError in either one.
    out2id = {int(row[0]): row[1] for row in loadcsv(out2id_path) if row}
    id2label = {row[1]: row[2] for row in loadcsv(id2label_path) if row}

    return [id2label[out2id[out]] for out in sorted(out2id)]

# Label names ordered by model output index.
out2label = load_label_map('../data/labels_dict_gpt.csv', '../data/nyt-theme-tags.csv')

# Passing `classes=` pins the binarizer's column order to `out2label`.
mlb = MultiLabelBinarizer(classes=out2label)
# fit() takes an iterable of label collections (one collection per sample).
# Wrap the list so it is a single sample of labels rather than one "sample"
# of characters per label string; the fitted classes are the same either way
# because `classes=` is given.
mlb.fit([out2label])

MultiLabelBinarizer(classes=['suspensions, dismissals and resignations',
                             'education and schools',
                             'colleges and universities', 'blacks',
                             'population', 'economic conditions and trends',
                             'labor',
                             'office buildings and commercial properties',
                             'architecture', 'medicine and health',
                             'awards, decorations and honors',
                             'diseases and conditions', 'research', 'cancer',
                             'basketball', 'design', 'interior design',
                             'real estate', 'trades (sports)',
                             'demonstrations and riots', 'dancing',
                             'hockey, ice', 'games', 'playoff games',
                             'baseball', 'travel and vacations', 'finances',
                             'books and literature',
   

In [8]:
import time
import math

def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into whole minutes and seconds.

    Returns:
        tuple: ``(minutes, seconds)`` as integers, both truncated.
    """
    duration = end_time - start_time
    minutes = int(duration / 60)
    # Whatever is left after removing the whole minutes, truncated to an int.
    seconds = int(duration - 60 * minutes)
    return minutes, seconds

def validation_split(dataset, validation_subset, seed=42):
    """Randomly partition ``dataset`` into a training part and a validation part.

    Args:
        dataset: sized dataset to split.
        validation_subset: fraction of the samples to hold out for validation.
            When it is not greater than 0, no split is made.
        seed: seed for the generator that drives the random partition.

    Returns:
        tuple: ``(train_subset, valid_subset)``. Without a split this is
        ``(dataset, None)``.
    """
    # Nothing to hold out: hand the dataset back untouched.
    if not (validation_subset > 0):
        return dataset, None

    total = len(dataset)
    train_count = math.floor(total * (1 - validation_subset))
    sizes = [train_count, total - train_count]

    # A dedicated, seeded generator keeps the partition reproducible.
    rng = torch.Generator().manual_seed(seed)
    train_subset, valid_subset = random_split(dataset, sizes, generator=rng)
    return train_subset, valid_subset

In [9]:
def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over ``iterator`` and return batch-averaged metrics.

    Args:
        model: network called on each batch of inputs. It must also expose
            ``act``, which is applied to the raw outputs before they are
            thresholded at 0.5 to obtain predictions.
        iterator: sized iterable yielding ``(inputs, labels, idx)`` batches.
        optimizer: optimizer whose ``zero_grad``/``step`` are called once per batch.
        criterion: loss applied to the raw model outputs and the float labels.

    Returns:
        tuple: ``(loss, accuracy, precision, recall, f1)``, each the mean of
        the per-batch values.

    Raises:
        ZeroDivisionError: if ``iterator`` contains no batches.
    """
    epoch_loss = 0
    epoch_acc = 0
    epoch_precision = 0
    epoch_recall = 0
    epoch_f_score = 0

    model.train()

    # The third element of every batch (the sample indices) is not needed here.
    for article_embeddings, labels, _ in iterator:
        article_embeddings = article_embeddings.to(device)
        labels = labels.type(torch.float).to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(article_embeddings)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # calculate metrics
        preds = model.act(outputs) > 0.5

        acc, precision, recall, f1 = multi_label_scores(labels.detach().cpu(), preds.detach().cpu())
        epoch_loss += loss.item()
        # float() works for NumPy scalars and plain Python floats alike, whereas
        # `.item()` only exists on the former.
        epoch_acc += float(acc)
        epoch_precision += float(precision)
        epoch_recall += float(recall)
        epoch_f_score += float(f1)

    n_batches = len(iterator)
    return epoch_loss / n_batches, epoch_acc / n_batches, \
        epoch_precision / n_batches, epoch_recall / n_batches, \
        epoch_f_score / n_batches

In [10]:
def evaluate(model, iterator, criterion):
    """Score ``model`` on every batch of ``iterator`` without updating it.

    Args:
        model: network called on each batch of inputs. It must also expose
            ``act``, which is applied to the raw outputs before they are
            thresholded at 0.5 to obtain predictions.
        iterator: sized iterable yielding ``(inputs, labels, idx)`` batches.
        criterion: loss applied to the raw model outputs and the float labels.

    Returns:
        tuple: ``(loss, accuracy, precision, recall, f1)``, each the mean of
        the per-batch values.

    Raises:
        ZeroDivisionError: if ``iterator`` contains no batches.
    """
    epoch_loss = 0
    epoch_acc = 0
    epoch_precision = 0
    epoch_recall = 0
    epoch_f_score = 0

    model.eval()

    # Gradients are not needed for evaluation.
    with torch.no_grad():
        # The third element of every batch (the sample indices) is not needed here.
        for article_embeddings, labels, _ in iterator:
            article_embeddings = article_embeddings.to(device)
            labels = labels.type(torch.float).to(device)

            outputs = model(article_embeddings)
            loss = criterion(outputs, labels)

            # calculate metrics
            preds = model.act(outputs) > 0.5

            acc, precision, recall, f1 = multi_label_scores(labels.detach().cpu(), preds.detach().cpu())
            epoch_loss += loss.item()
            # float() works for NumPy scalars and plain Python floats alike,
            # whereas `.item()` only exists on the former.
            epoch_acc += float(acc)
            epoch_precision += float(precision)
            epoch_recall += float(recall)
            epoch_f_score += float(f1)

    n_batches = len(iterator)
    return epoch_loss / n_batches, epoch_acc / n_batches, \
        epoch_precision / n_batches, epoch_recall / n_batches, \
        epoch_f_score / n_batches

# Training

In [11]:
import io
import os
import torch
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm.notebook import tqdm
from mitnewsclassify2.gpt_model import GPTModel as GPTHead2

%load_ext autoreload
%autoreload 2

In [12]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

def multi_label_scores(correct_labels, predicted_labels):
    """Compute accuracy plus weighted precision, recall and F1 for a set of predictions.

    Precision, recall and F1 use ``average='weighted'`` and ``zero_division=0``.

    Returns:
        tuple: ``(accuracy, precision, recall, f1)``.
    """
    # Settings shared by the three averaged metrics.
    averaging = dict(average='weighted', zero_division=0)

    return (
        accuracy_score(correct_labels, predicted_labels),
        precision_score(correct_labels, predicted_labels, **averaging),
        recall_score(correct_labels, predicted_labels, **averaging),
        f1_score(correct_labels, predicted_labels, **averaging),
    )

def gettags(head_model, features, eval=False):
    """Predict the labels for a single feature tensor.

    The model is switched to eval mode, ``features`` gets a leading dimension
    added before being passed through it, and every output whose activation
    exceeds 0.5 is mapped back to a label name via the module-level ``mlb``.

    Note: ``eval`` is accepted but never read; the model is always put in
    eval mode and is left in that mode afterwards.
    """
    head_model.eval()
    batch = features.unsqueeze(0).to(device)

    with torch.no_grad():
        scores = head_model.act(head_model(batch))

    mask = (scores > 0.5).detach().cpu()
    return mlb.inverse_transform(mask)

In [17]:
%%time

# for ARTICLE_TYPE in ['cutoff', 'complete']:
for ARTICLE_TYPE in ['complete']:

    train_dataset = torch.load(f'vectorized/train_150k_{ARTICLE_TYPE}.pt')

    print(f'X_train_{ARTICLE_TYPE}', train_dataset.X.shape)
    print(f'y_train_{ARTICLE_TYPE}', train_dataset.y.shape)

    # splitting train/validation
    batch_size = 256
    seed = 42

    train_subset, valid_subset = validation_split(train_dataset, 0.1, seed)
    train_loader = DataLoader(train_subset, batch_size=batch_size)
    valid_loader = DataLoader(valid_subset, batch_size=batch_size)

    n_training_samples = train_dataset.X.shape[0]

    # hyperparams
    max_epochs = 1000

    patience = 10

    # model
    model_path = f'models/{ARTICLE_TYPE}.pt'
    model = GPTHead2(768, 538).to(device)
    criterion = nn.BCEWithLogitsLoss()

    optimizer = optim.Adam(model.parameters(),
                        lr = 1e-3, # default is 5e-5, our notebook had 2e-5
                      )

    wandb.init(
        entity='ut-mit-news-classify',
        project="NYT Multilabeling",
        tags=[ARTICLE_TYPE, 'cutoff-experiment'],
    )

    # training
    epochs_of_no_improvement = 0
    best_valid_loss = float('inf')

    wandb.config.early_stopping_patience = patience
    wandb.config.training_samples=n_training_samples
    wandb.config.model_path = model_path

    wandb.save(model_path)  # this will make wandb upload the model after call to `wandb.finish()`
    
    print('Starting training...')

    for epoch in range(max_epochs):

        start_time = time.time()

        train_loss, train_acc, train_precision, train_recall, train_f_score \
            = train(model, train_loader, optimizer, criterion)
        valid_loss, valid_acc, valid_precision, valid_recall, valid_f_score \
            = evaluate(model, valid_loader, criterion)

        end_time = time.time()

        epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        if valid_loss < best_valid_loss:
            print(f'New validation loss {valid_loss} is better than the best validation loss {best_valid_loss} so far.')
            best_valid_loss = valid_loss
            torch.save(model, model_path)
            epochs_of_no_improvement = 0
        else: 
            epochs_of_no_improvement += 1

        print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | ' +
              f'Train Precision: {train_precision*100:.2f}% | Train Recall: {train_recall*100:.2f}% | ' +
              f'Train F1-score: {train_f_score*100:.2f}%')
        print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}% | ' +
              f'Val. Precision: {valid_precision*100:.2f}% | Val. Recall: {valid_recall*100:.2f}% | ' +
              f'Val. F1-score: {valid_f_score*100:.2f}%')

        wandb.log({"train_loss": train_loss, 
                    "train_precision": train_precision, 
                    "train_f_score": train_f_score, 
                    "train_acc": train_acc,
                    "train_recall": train_recall,
                   "valid_loss": valid_loss,
                   "valid_acc": valid_acc,
                   "valid_precision": valid_precision,
                   "valid_recall": valid_recall,
                   "valid_f_score": valid_f_score,
                   "epoch": epoch+1,
                    })
        # check if the training should be stopped and then stop the training
        if epochs_of_no_improvement == patience : 
            print(f'Early stopping, on epoch: {epoch+1}.')
            break

    del train_dataset
    del train_subset
    del valid_subset
    del train_loader
    del valid_loader
    del model
    gc.collect()
    
    
    ####### TESTING #######
    
    print('Testing...')
    
    
    model = torch.load(model_path)

    test_dataset_names = [('cutoff', 'vectorized/test_150k_cutoff.pt'), ('complete', 'vectorized/test_150k_complete.pt')]

    for article_type, dataset_path in test_dataset_names:
        test_dataset = torch.load(dataset_path)

        test_loader = DataLoader(test_dataset, batch_size=128)

        start_time = time.time()
        test_loss, test_acc, test_precision, test_recall, test_f_score \
            = evaluate(model, test_loader, criterion)
        end_time = time.time()
        epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        wandb.run.summary[f'test_{article_type}_acc'] = test_acc
        wandb.run.summary[f'test_{article_type}_precision'] = test_precision
        wandb.run.summary[f'test_{article_type}_recall'] = test_recall
        wandb.run.summary[f'test_{article_type}_f_score'] = test_f_score

        print(f'Epoch: test | Epoch Time: {epoch_mins}m {epoch_secs}s | Dataset: {article_type}')
        print(f'\tTest Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}% | ' +
              f'Test Precision: {test_precision*100:.2f}% | Test Recall: {test_recall*100:.2f}% | ' +
              f'Test F1-score: {test_f_score*100:.2f}%')

    wandb.finish()

[34m[1mwandb[0m: Currently logged in as: [33mut-mit-news-classify[0m (use `wandb login --relogin` to force relogin)


X_train_complete torch.Size([150000, 768])
y_train_complete torch.Size([150000, 538])


Starting training...
New validation loss 0.5034549271656295 is better than the best validation loss inf so far.
Epoch: 01 | Epoch Time: 0m 46s
	Train Loss: 0.608 | Train Acc: 3.20% | Train Precision: 7.38% | Train Recall: 19.70% | Train F1-score: 8.26%
	 Val. Loss: 0.503 |  Val. Acc: 7.48% | Val. Precision: 12.42% | Val. Recall: 18.04% | Val. F1-score: 13.45%
New validation loss 0.3402182116346844 is better than the best validation loss 0.5034549271656295 so far.
Epoch: 02 | Epoch Time: 0m 40s
	Train Loss: 0.410 | Train Acc: 8.53% | Train Precision: 15.68% | Train Recall: 19.49% | Train F1-score: 16.50%
	 Val. Loss: 0.340 |  Val. Acc: 9.68% | Val. Precision: 17.37% | Val. Recall: 19.07% | Val. F1-score: 17.51%
New validation loss 0.2437568887815637 is better than the best validation loss 0.3402182116346844 so far.
Epoch: 03 | Epoch Time: 0m 41s
	Train Loss: 0.288 | Train Acc: 11.23% | Train Precision: 21.04% | Train Recall: 19.48% | Train F1-score: 19.54%
	 Val. Loss: 0.244 |  Val. Acc

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,0.0105
train_precision,0.73088
train_f_score,0.61764
train_acc,0.34537
train_recall,0.56424
valid_loss,0.02128
valid_acc,0.22927
valid_precision,0.50132
valid_recall,0.32183
valid_f_score,0.37105


0,1
train_loss,█▆▄▃▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_precision,▁▂▂▃▄▄▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇███████
train_f_score,▁▂▂▃▃▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇█████
train_acc,▁▂▃▄▄▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇██████
train_recall,▁▁▁▁▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
valid_loss,█▆▄▃▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
valid_acc,▁▂▃▅▆▆▇▇▇▇▇▇████████████▇███████
valid_precision,▁▂▃▄▅▅▆▆▆▇▇▇▇▇▇▇████████████████
valid_recall,▁▁▂▃▄▅▅▅▆▆▅▅▆▆▆▆▇▇█▇█▇██▆▇█▆█▇█▇
valid_f_score,▁▂▃▄▅▅▆▆▆▆▆▆▇▇▇▇▇██▇████▇██▇████


CPU times: user 21min 14s, sys: 1min 48s, total: 23min 3s
Wall time: 22min 44s


In [16]:
# Stand-alone re-evaluation of the saved checkpoint on both test sets.
# Relies on `model_path` and `criterion` left behind by the training cell.
model = torch.load(model_path)

test_dataset_names = [('cutoff', 'vectorized/test_150k_cutoff.pt'), ('complete', 'vectorized/test_150k_complete.pt')]

# The training cell ends with `wandb.finish()`, so there may be no active run
# when this cell executes; only touch the run summary when one exists.
active_run = wandb.run

for article_type, dataset_path in test_dataset_names:
    test_dataset = torch.load(dataset_path)

    test_loader = DataLoader(test_dataset, batch_size=128)

    start_time = time.time()
    test_loss, test_acc, test_precision, test_recall, test_f_score \
        = evaluate(model, test_loader, criterion)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    if active_run is not None:
        active_run.summary[f'test_{article_type}_acc'] = test_acc
        active_run.summary[f'test_{article_type}_precision'] = test_precision
        active_run.summary[f'test_{article_type}_recall'] = test_recall
        active_run.summary[f'test_{article_type}_f_score'] = test_f_score

    print(f'Epoch: test | Epoch Time: {epoch_mins}m {epoch_secs}s | Dataset: {article_type}')
    print(f'\tTest Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}% | ' +
          f'Test Precision: {test_precision*100:.2f}% | Test Recall: {test_recall*100:.2f}% | ' +
          f'Test F1-score: {test_f_score*100:.2f}%')

if active_run is not None:
    wandb.finish()

Epoch: test | Epoch Time: 0m 4s | Dataset: cutoff
	Test Loss: 0.023 | Test Acc: 17.66% | Test Precision: 39.19% | Test Recall: 27.00% | Test F1-score: 30.26%
Epoch: test | Epoch Time: 0m 4s | Dataset: complete
	Test Loss: 0.021 | Test Acc: 20.41% | Test Precision: 42.71% | Test Recall: 31.10% | Test F1-score: 34.10%


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_loss,0.01194
train_precision,0.68204
train_f_score,0.55809
train_acc,0.29066
train_recall,0.50251
valid_loss,0.02432
valid_acc,0.1849
valid_precision,0.40368
valid_recall,0.2731
valid_f_score,0.30717


0,1
train_loss,█▆▄▃▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_precision,▁▁▂▂▃▃▄▄▄▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇███████
train_f_score,▁▁▂▂▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇██████
train_acc,▁▂▂▃▃▄▄▄▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇██████
train_recall,▃▁▂▂▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇█████
valid_loss,█▆▄▃▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
valid_acc,▁▂▂▃▄▅▆▆▇▇▇▇▇█████████████████████
valid_precision,▁▂▃▄▄▅▆▆▇▇▇▇█▇████████████████████
valid_recall,▁▂▃▄▄▅▅▅▆▆▇▆▇▇▇▇▇▇███████▇████▇▇██
valid_f_score,▁▂▃▄▅▅▅▆▆▇▇▇▇▇▇▇▇█████████████████


Epoch: test | Epoch Time: 0m 1s
	Test Loss: 0.018 | Test Acc: 23.51% | Test Precision: 45.93% | Test Recall: 31.37% | Test F1-score: 35.50%


In [81]:
# The training cell deletes `train_dataset` once training is over, so load it
# again here; `ARTICLE_TYPE` still holds the last article type that was trained.
train_dataset = torch.load(f'vectorized/train_150k_{ARTICLE_TYPE}.pt')

# Compare the predicted tags with the gold tags for a single training sample.
idx = 15
print('predicted', gettags(model, train_dataset.X[idx]))
print('gold:', mlb.inverse_transform(train_dataset.y[idx].unsqueeze(0)))

predicted [('politics and government',)]
gold: [('armament, defense and military forces',)]
