# Definitions (run first!)

In [69]:
!pip install wandb
!wandb login

Collecting wandb
  Downloading wandb-0.10.30-py2.py3-none-any.whl (1.8 MB)
[K     |████████████████████████████████| 1.8 MB 1.6 MB/s 
[?25hCollecting sentry-sdk>=0.4.0
  Downloading sentry_sdk-1.1.0-py2.py3-none-any.whl (131 kB)
[K     |████████████████████████████████| 131 kB 10.7 MB/s 
[?25hCollecting subprocess32>=3.5.3
  Using cached subprocess32-3.5.4-py3-none-any.whl
Collecting PyYAML
  Using cached PyYAML-5.4.1-cp38-cp38-manylinux1_x86_64.whl (662 kB)
Collecting shortuuid>=0.5.0
  Using cached shortuuid-1.0.1-py3-none-any.whl (7.5 kB)
Collecting GitPython>=1.0.0
  Downloading GitPython-3.1.14-py3-none-any.whl (159 kB)
[K     |████████████████████████████████| 159 kB 10.1 MB/s 
[?25hCollecting configparser>=3.8.1
  Downloading configparser-5.0.2-py3-none-any.whl (19 kB)
Collecting promise<3,>=2.0
  Using cached promise-2.3-py3-none-any.whl
Collecting psutil>=5.0.0
  Using cached psutil-5.8.0-cp38-cp38-manylinux2010_x86_64.whl (296 kB)
Collecting pathtools
  Using cached pat

In [1]:
import gzip
import pickle
import random
import requests
import csv
from torch.utils.data import Dataset, DataLoader, random_split
from mitnewsclassify.gpt_model import GPTModel as GPTHead
from transformers import GPT2Tokenizer, GPT2Model
import torch
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np
import wandb

In [2]:
# One seed for the whole notebook: it is passed to validation_split and also
# seeds every global RNG below so that model initialisation is repeatable.
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
class GPTEmbeddedDataset(Dataset):
    """Map-style dataset pairing pre-computed feature rows with their labels.

    ``X`` and ``y`` are stored exactly as given; they are only ever indexed
    and measured with ``len``, never copied or converted.
    """

    def __init__(self, X, y):
        self.X, self.y = X, y

    def __len__(self):
        # The number of feature rows defines the size of the dataset.
        n_samples = len(self.X)
        return n_samples

    def __getitem__(self, idx):
        """Return ``(features, label, idx)`` for position ``idx``."""
        features = self.X[idx]
        label = self.y[idx]
        return features, label, idx

In [4]:
import csv

def loadcsv(filename):
    """Read a UTF-8 CSV file and return its rows as a list of lists of strings.

    Blank lines in the file come back as empty lists.
    """
    rows = []
    # newline='' lets the csv module handle line endings itself.
    with open(filename, newline='', encoding='utf-8') as handle:
        for record in csv.reader(handle):
            rows.append(record)
    return rows

def load_label_map(out2id_path, id2label_path):
    """Build the list of label names ordered by model output index.

    ``out2id_path`` is read as rows whose first column is an integer output
    index and whose second column is a tag id. ``id2label_path`` is read as
    rows whose second column is a tag id and whose third column is the label
    text; empty rows in that file are skipped.

    Returns a list where position ``k`` holds the label text of the tag id
    mapped to the ``k``-th smallest output index. Raises ``KeyError`` if a
    tag id from the first file is missing from the second.
    """
    # output index (int) -> tag id (str)
    id_by_output = {int(cols[0]): cols[1] for cols in loadcsv(out2id_path)}

    # tag id (str) -> label text; blank CSV lines arrive as [] and are dropped
    label_by_id = {cols[1]: cols[2] for cols in loadcsv(id2label_path) if cols != []}

    return [label_by_id[id_by_output[output]] for output in sorted(id_by_output)]

# Label names in model-output order; passing them as `classes` pins the
# column order of the multi-hot label matrix.
out2label = load_label_map('labels_dict_gpt.csv', 'nyt-theme-tags.csv')
mlb = MultiLabelBinarizer(classes=out2label)
# fit() expects an iterable of label *collections* (one per sample). Wrap the
# label list as a single sample instead of passing bare strings, each of which
# would otherwise be read as a sequence of characters.
mlb.fit([out2label])

MultiLabelBinarizer(classes=['suspensions, dismissals and resignations',
                             'education and schools',
                             'colleges and universities', 'blacks',
                             'population', 'economic conditions and trends',
                             'labor',
                             'office buildings and commercial properties',
                             'architecture', 'medicine and health',
                             'awards, decorations and honors',
                             'diseases and conditions', 'research', 'cancer',
                             'basketball', 'design', 'interior design',
                             'real estate', 'trades (sports)',
                             'demonstrations and riots', 'dancing',
                             'hockey, ice', 'games', 'playoff games',
                             'baseball', 'travel and vacations', 'finances',
                             'books and literature',
   

In [5]:
# temporary dataset for storing tokenized articles & transformed labels
class NYTDataset(Dataset):
    """Tokenized articles paired with multi-hot encoded labels.

    Construction tokenizes every article with the pretrained ``"gpt2"``
    tokenizer (padded/truncated to 1024 tokens, returned as PyTorch tensors)
    and converts ``labels`` with the notebook-level ``mlb`` binarizer.

    Args:
        articles: article texts handed straight to the tokenizer.
        labels: per-article label collections handed to ``mlb.transform``.

    Each item is ``(input_ids, attention_mask, labels)`` for one article.
    """

    def __init__(self, articles, labels):

        self.tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
        # Pad with the end-of-sequence token so "max_length" padding works.
        self.tokenizer.pad_token = self.tokenizer.eos_token

        print('Tokenizing...')
        self.articles = self.tokenizer(articles, add_special_tokens=True, padding="max_length", truncation=True,
                                       max_length=1024, return_tensors="pt", return_attention_mask=True)

        self.input_ids = self.articles['input_ids']
        self.attention_mask = self.articles['attention_mask']

        print('Preprocessing labels...')
        self.labels = mlb.transform(labels)
        print('Done')

    def __len__(self):
        # self.articles is the tokenizer's dict-like output, so len() of it
        # counts its keys rather than the articles. The first dimension of
        # input_ids is the real number of samples.
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attention_mask[idx], self.labels[idx]

In [6]:
import time
import math

def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into minutes and seconds.

    Returns ``(minutes, seconds)`` as integers, each truncated towards zero.
    """
    total = end_time - start_time
    minutes = int(total / 60)
    seconds = int(total - 60 * minutes)
    return minutes, seconds

def validation_split(dataset, validation_subset, seed=42):
    """Randomly hold out a fraction of ``dataset`` for validation.

    Args:
        dataset: any object accepted by ``random_split`` (needs ``len``).
        validation_subset: fraction of samples to hold out; a value that is
            not greater than 0 disables the split.
        seed: seed of the private generator that drives the split.

    Returns:
        ``(train_subset, valid_subset)``. Without a split this is
        ``(dataset, None)``.
    """
    # Guard clause: nothing is held out for a non-positive fraction.
    if not validation_subset > 0:
        return dataset, None

    total = len(dataset)
    train_size = math.floor(total * (1 - validation_subset))
    sizes = [train_size, total - train_size]

    # A dedicated, seeded generator keeps the split reproducible.
    rng = torch.Generator().manual_seed(seed)
    train_subset, valid_subset = random_split(dataset, sizes, generator=rng)
    return train_subset, valid_subset

In [7]:
def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over ``iterator`` and return averaged metrics.

    Args:
        model: callable on a batch of embeddings; must expose ``train()`` and
            an ``act`` callable applied to its outputs before thresholding.
        iterator: sized iterable yielding ``(embeddings, labels, idx)`` batches.
        optimizer: optimizer stepped once per batch.
        criterion: loss called as ``criterion(outputs, labels)``.

    Returns:
        ``(loss, accuracy, precision, recall, f1)``, each the mean of the
        per-batch values (sum divided by ``len(iterator)``).
    """
    epoch_loss = 0
    epoch_acc = 0
    epoch_precision = 0
    epoch_recall = 0
    epoch_f_score = 0

    model.train()

    for article_embeddings, labels, _ in iterator:
        article_embeddings = article_embeddings.to(device)
        labels = labels.type(torch.float).to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(article_embeddings)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Metrics use the outputs of the forward pass above, i.e. the
        # predictions made before this batch's parameter update.
        preds = model.act(outputs) > 0.5

        acc, precision, recall, f1 = multi_label_scores(labels.detach().cpu(), preds.detach().cpu())
        epoch_loss += loss.item()
        # float() accepts both numpy scalars and plain Python floats, whereas
        # .item() only exists on the former.
        epoch_acc += float(acc)
        epoch_precision += float(precision)
        epoch_recall += float(recall)
        epoch_f_score += float(f1)

    n_batches = len(iterator)
    return epoch_loss / n_batches, epoch_acc / n_batches, \
        epoch_precision / n_batches, epoch_recall / n_batches, \
        epoch_f_score / n_batches

In [8]:
def evaluate(model, iterator, criterion):
    """Score ``model`` on ``iterator`` without updating it.

    Args:
        model: callable on a batch of embeddings; must expose ``eval()`` and
            an ``act`` callable applied to its outputs before thresholding.
        iterator: sized iterable yielding ``(embeddings, labels, idx)`` batches.
        criterion: loss called as ``criterion(outputs, labels)``.

    Returns:
        ``(loss, accuracy, precision, recall, f1)``, each the mean of the
        per-batch values (sum divided by ``len(iterator)``).

    The model is left in evaluation mode when the function returns.
    """
    epoch_loss = 0
    epoch_acc = 0
    epoch_precision = 0
    epoch_recall = 0
    epoch_f_score = 0

    model.eval()

    # No gradients are needed for scoring.
    with torch.no_grad():
        for article_embeddings, labels, _ in iterator:
            article_embeddings = article_embeddings.to(device)
            labels = labels.type(torch.float).to(device)

            outputs = model(article_embeddings)
            loss = criterion(outputs, labels)

            # calculate metrics
            preds = model.act(outputs) > 0.5

            acc, precision, recall, f1 = multi_label_scores(labels.detach().cpu(), preds.detach().cpu())
            epoch_loss += loss.item()
            # float() accepts both numpy scalars and plain Python floats,
            # whereas .item() only exists on the former.
            epoch_acc += float(acc)
            epoch_precision += float(precision)
            epoch_recall += float(recall)
            epoch_f_score += float(f1)

    n_batches = len(iterator)
    return epoch_loss / n_batches, epoch_acc / n_batches, \
        epoch_precision / n_batches, epoch_recall / n_batches, \
        epoch_f_score / n_batches

# Training

In [9]:
import io
import os
import torch
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from mitnewsclassify.gpt_model import GPTModel as GPTHead
from tqdm.notebook import tqdm

# Enable IPython's autoreload extension in mode 2, so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [10]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

def multi_label_scores(correct_labels, predicted_labels):
    """Score predicted label matrices against the reference labels.

    Returns ``(accuracy, precision, recall, f1)`` as computed by the
    scikit-learn metric functions. Precision, recall and F1 use
    ``average='weighted'`` and report 0 (without warning) where a score
    would involve a division by zero.
    """
    # Options shared by the three averaged metrics.
    averaged = dict(average='weighted', zero_division=0)

    accuracy = accuracy_score(correct_labels, predicted_labels)
    precision, recall, f_1_score = (
        metric(correct_labels, predicted_labels, **averaged)
        for metric in (precision_score, recall_score, f1_score)
    )

    return accuracy, precision, recall, f_1_score

def gettags(head_model, features, eval=False):
    """Predict the label names for a single feature vector.

    ``features`` gets a leading batch dimension, is moved to ``device`` and
    passed through ``head_model`` followed by ``head_model.act``; every
    output above 0.5 counts as a predicted label. The result is whatever
    ``mlb.inverse_transform`` returns for that one-row prediction.

    ``eval`` is accepted but not used. The model is switched to evaluation
    mode and stays in it after the call.
    """
    head_model.eval()
    batch = features.unsqueeze(0).to(device)

    with torch.no_grad():
        probabilities = head_model.act(head_model(batch))

    predicted = (probabilities > 0.5).detach().cpu()
    return mlb.inverse_transform(predicted)

In [11]:
# Load the pre-embedded splits. torch.load unpickles whole Python objects
# here (they expose .X and .y), so the class they were saved with must already
# be defined in this kernel, and only trusted files should ever be loaded.
train_dataset = torch.load('gpt_embedded_train_100k.pt')
test_dataset = torch.load('gpt_embedded_test_100k.pt')

# Sanity check: show the shape of every feature/label tensor.
for shape_tag, part in (
    ('X_train', train_dataset.X),
    ('y_train', train_dataset.y),
    ('X_test', test_dataset.X),
    ('y_test', test_dataset.y),
):
    print(shape_tag, part.shape)

X_train torch.Size([100000, 768])
y_train torch.Size([100000, 538])
X_test torch.Size([10000, 768])
y_test torch.Size([10000, 538])


In [14]:
# Hold out 10% of the training data for validation and wrap both parts in
# loaders. Neither loader shuffles, so batches arrive in the same order every epoch.
batch_size = 128

train_subset, valid_subset = validation_split(train_dataset, validation_subset=0.1, seed=seed)
train_loader, valid_loader = (
    DataLoader(part, batch_size=batch_size) for part in (train_subset, valid_subset)
)

# Size of the full training file, i.e. including the rows held out above.
n_training_samples = len(train_dataset.X)

In [15]:
from mitnewsclassify2.gpt_model import GPTModel as GPTHead2

# Multi-label loss that takes raw model outputs (logits): the sigmoid is
# applied inside the loss, independently for every label.
criterion = nn.BCEWithLogitsLoss()

In [16]:
%%time
# Train the classification head with early stopping on the validation loss,
# logging per-epoch metrics to Weights & Biases and checkpointing the best model.

# hyperparams
max_epochs = 1000  # upper bound; early stopping below usually ends the run sooner

patience = 10  # epochs in a row without a new best validation loss before stopping

# model
# presumably (input size, number of labels), matching the 768 / 538 tensor widths printed when the data was loaded — TODO confirm
model = GPTHead2(768, 538).to(device)

optimizer = optim.Adam(model.parameters(),
                    lr = 5e-1, # NOTE(review): 0.5 is far above the values the original note cites ("default is 5e-5, our notebook had 2e-5") — confirm this is intended
                  )

wandb.init(
    entity='ut-mit-news-classify',
    project="NYT Multilabeling",
)
# Optional wandb model tracking, currently disabled:
# wandb.watch(model)

# training
epochs_of_no_improvement = 0
best_valid_loss = float('inf')  # guarantees the first epoch registers as an improvement

wandb.config.early_stopping_patience = patience
# n_training_samples is the size of the full train_dataset, so it also counts
# the rows that were split off for validation.
wandb.config.training_samples=n_training_samples

# The best model (by validation loss) is written to this file.
model_file_name = f'nyt_gtp2_prevectorized_on_{n_training_samples}_samples.pt'

for epoch in range(max_epochs):

    start_time = time.time()
    
    # One full pass over the training loader, then one over the validation loader.
    train_loss, train_acc, train_precision, train_recall, train_f_score \
        = train(model, train_loader, optimizer, criterion)
    valid_loss, valid_acc, valid_precision, valid_recall, valid_f_score \
        = evaluate(model, valid_loader, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # Checkpoint on every new best validation loss and reset the patience
    # counter; otherwise count one more epoch without improvement.
    if valid_loss < best_valid_loss:
        print(f'New validation loss {valid_loss} is better than the best validation loss {best_valid_loss} so far.')
        best_valid_loss = valid_loss
        # Saves the entire model object (not just a state dict).
        torch.save(model, model_file_name)
        epochs_of_no_improvement = 0
    else: 
        epochs_of_no_improvement += 1
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | ' +
          f'Train Precision: {train_precision*100:.2f}% | Train Recall: {train_recall*100:.2f}% | ' +
          f'Train F1-score: {train_f_score*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}% | ' +
          f'Val. Precision: {valid_precision*100:.2f}% | Val. Recall: {valid_recall*100:.2f}% | ' +
          f'Val. F1-score: {valid_f_score*100:.2f}%')
    
    # Send this epoch's metrics to the active wandb run.
    wandb.log({"train_loss": train_loss, 
                "train_precision": train_precision, 
                "train_f_score": train_f_score, 
                "train_acc": train_acc,
                "train_recall": train_recall,
               "valid_loss": valid_loss,
               "valid_acc": valid_acc,
               "valid_precision": valid_precision,
               "valid_recall": valid_recall,
               "valid_f_score": valid_f_score,
               "epoch": epoch+1,
                })
    # Stop once `patience` consecutive epochs have failed to improve the
    # best validation loss.
    if epochs_of_no_improvement == patience : 
        print(f'Early stopping, on epoch: {epoch+1}.')
        break


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mut-mit-news-classify[0m (use `wandb login --relogin` to force relogin)


New validation loss 0.02632288663070413 is better than the best validation loss inf so far.
Epoch: 01 | Epoch Time: 0m 16s
	Train Loss: 0.032 | Train Acc: 3.90% | Train Precision: 7.77% | Train Recall: 4.80% | Train F1-score: 5.46%
	 Val. Loss: 0.026 |  Val. Acc: 4.84% | Val. Precision: 11.35% | Val. Recall: 5.18% | Val. F1-score: 6.44%
New validation loss 0.023372430354356766 is better than the best validation loss 0.02632288663070413 so far.
Epoch: 02 | Epoch Time: 0m 16s
	Train Loss: 0.025 | Train Acc: 7.87% | Train Precision: 16.27% | Train Recall: 10.44% | Train F1-score: 11.83%
	 Val. Loss: 0.023 |  Val. Acc: 8.77% | Val. Precision: 18.98% | Val. Recall: 10.29% | Val. F1-score: 12.35%
Epoch: 03 | Epoch Time: 0m 16s
	Train Loss: 0.024 | Train Acc: 9.89% | Train Precision: 21.46% | Train Recall: 13.81% | Train F1-score: 15.67%
	 Val. Loss: 0.024 |  Val. Acc: 9.22% | Val. Precision: 16.41% | Val. Recall: 7.16% | Val. F1-score: 8.81%
New validation loss 0.022612555683413638 is better

In [81]:
# Spot-check one example from the training file: predicted tags vs. gold tags.
# Note that gettags() switches `model` to evaluation mode.
idx = 15
sample_features, sample_labels = train_dataset.X[idx], train_dataset.y[idx]

print('predicted', gettags(model, sample_features))
print('gold:', mlb.inverse_transform(sample_labels.unsqueeze(0)))

predicted [('politics and government',)]
gold: [('armament, defense and military forces',)]


In [19]:
# Evaluate a saved checkpoint on the held-out test set.
# NOTE: torch.load unpickles an entire model object — only load checkpoint
# files you trust. map_location places the weights on the same device that
# evaluate() moves each batch to, so a checkpoint saved on a GPU machine also
# loads (and runs) on a CPU-only one.
test_model = torch.load('nyt_gtp2_prevectorized_on_100000_samples.pt', map_location=device)
test_loader = DataLoader(test_dataset, batch_size=128)

start_time = time.time()
test_loss, test_acc, test_precision, test_recall, test_f_score \
    = evaluate(test_model, test_loader, criterion)
end_time = time.time()
epoch_mins, epoch_secs = epoch_time(start_time, end_time)

print(f'Epoch: test | Epoch Time: {epoch_mins}m {epoch_secs}s')
print(f'\tTest Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}% | ' +
      f'Test Precision: {test_precision*100:.2f}% | Test Recall: {test_recall*100:.2f}% | ' +
      f'Test F1-score: {test_f_score*100:.2f}%')

Epoch: test | Epoch Time: 0m 1s
	Test Loss: 0.018 | Test Acc: 23.51% | Test Precision: 45.93% | Test Recall: 31.37% | Test F1-score: 35.50%
