# Definitions (run first!)

In [1]:
import gzip
import pickle
import random
import requests
import csv
from torch.utils.data import Dataset, DataLoader
from mitnewsclassify.gpt_model import GPTModel as GPTHead
from transformers import GPT2Tokenizer, GPT2Model
import torch
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np

In [2]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# One seed for every RNG imported above. It is applied here, in the
# "run first" section, so that weight initialisation and any sampling done
# later are repeatable after a kernel restart.
seed = 42
torch.manual_seed(seed)
random.seed(seed)
np.random.seed(seed)  # kept last: returns None, so the cell shows no output

In [3]:
class GPTEmbeddedDataset(Dataset):
    """Map-style dataset that pairs a feature container with a label container.

    Indexing yields the triple ``(X[idx], y[idx], idx)``; the index is handed
    back alongside the data so a consumer can tell which sample it received.
    Presumably ``X`` holds precomputed GPT embeddings and ``y`` the binarized
    labels -- confirm against the code that builds the saved datasets.
    """

    def __init__(self, X, y):
        # Both containers are stored as given; nothing is copied or validated.
        self.X, self.y = X, y

    def __len__(self):
        # The feature container alone determines the dataset size.
        return len(self.X)

    def __getitem__(self, idx):
        features = self.X[idx]
        target = self.y[idx]
        return features, target, idx

In [4]:
import csv

def loadcsv(filename):
    """Read a UTF-8 encoded CSV file and return all of its rows.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with one entry per CSV record, each a list of string fields.
    """
    # newline='' lets the csv module handle line endings itself.
    with open(filename, newline='', encoding='utf-8') as handle:
        rows = [record for record in csv.reader(handle)]
    return rows

def load_label_map(out2id_path, id2label_path):
    """Build the list of label names ordered by model output index.

    Two CSV files are joined:

    * ``out2id_path``: column 0 is an integer output index, column 1 is a
      tag id.
    * ``id2label_path``: column 1 is a tag id, column 2 is the label text
      (column 0 is not used).

    Blank rows are ignored in both files, so a trailing empty line no longer
    raises ``IndexError`` while reading the output-index file.

    Args:
        out2id_path: Path of the output-index -> tag-id CSV.
        id2label_path: Path of the tag-id -> label CSV.

    Returns:
        A list of label strings, sorted by ascending output index.

    Raises:
        KeyError: If a tag id from ``out2id_path`` has no entry in
            ``id2label_path``.
        ValueError: If an output index cannot be parsed as an integer.
    """
    # csv.reader yields [] for an empty line; `if row` filters those out.
    out2id = {int(row[0]): row[1] for row in loadcsv(out2id_path) if row}
    # When a tag id occurs more than once, the last row wins.
    id2label = {row[1]: row[2] for row in loadcsv(id2label_path) if row}

    return [id2label[out2id[out]] for out in sorted(out2id)]

# Label names in model-output order; this list fixes the column order of every
# binarized label matrix produced below.
out2label = load_label_map('labels_dict_gpt.csv', 'nyt-theme-tags.csv')

mlb = MultiLabelBinarizer(classes=out2label)
# fit() expects an iterable of label *collections*. The flat list of strings
# passed before only worked because `classes=` is given explicitly; wrapping it
# as a single sample states the intent without changing the fitted classes.
# NOTE(review): relies on sklearn using `classes` verbatim when it is provided.
mlb.fit([out2label])

MultiLabelBinarizer(classes=['suspensions, dismissals and resignations',
                             'education and schools',
                             'colleges and universities', 'blacks',
                             'population', 'economic conditions and trends',
                             'labor',
                             'office buildings and commercial properties',
                             'architecture', 'medicine and health',
                             'awards, decorations and honors',
                             'diseases and conditions', 'research', 'cancer',
                             'basketball', 'design', 'interior design',
                             'real estate', 'trades (sports)',
                             'demonstrations and riots', 'dancing',
                             'hockey, ice', 'games', 'playoff games',
                             'baseball', 'travel and vacations', 'finances',
                             'books and literature',
   

In [5]:
# temporary dataset for storing tokenized articles & transformed labels
class NYTDataset(Dataset):
    """Tokenized articles paired with their binarized labels.

    Construction tokenizes every article with the pretrained "gpt2" tokenizer
    (padded/truncated to 1024 tokens) and converts the label collections with
    the notebook-level ``mlb`` binarizer.

    Args:
        articles: Texts to tokenize (passed to the tokenizer as one batch).
        labels: One collection of label names per article, given to
            ``mlb.transform``.

    Indexing returns ``(input_ids[idx], attention_mask[idx], labels[idx])``.
    """

    def __init__(self, articles, labels):

        self.tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
        # Reuse the end-of-text token for padding so padding="max_length" has a
        # pad token to work with.
        self.tokenizer.pad_token = self.tokenizer.eos_token

        print('Tokenizing...')
        self.articles = self.tokenizer(articles, add_special_tokens=True, padding="max_length", truncation=True,
                                       max_length=1024, return_tensors="pt", return_attention_mask=True)

        self.input_ids = self.articles['input_ids']
        self.attention_mask = self.articles['attention_mask']

        print('Preprocessing labels...')
        self.labels = mlb.transform(labels)
        print('Done')

    def __len__(self):
        # The tokenizer output is dict-like, so len(self.articles) counts its
        # keys (input_ids, attention_mask), not the articles. The row count of
        # input_ids is the real number of samples.
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attention_mask[idx], self.labels[idx]

# Training

In [15]:
import io
import os
import torch
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from mitnewsclassify.gpt_model import GPTModel as GPTHead
from tqdm.notebook import tqdm

%load_ext autoreload
%autoreload 2

In [135]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

def multi_label_scores(correct_labels, predicted_labels):
    """Score predictions against gold labels with four sklearn metrics.

    Precision, recall and F1 use ``average='weighted'`` and report 0 (instead
    of warning) where a denominator would be zero.
    NOTE(review): for multilabel indicator inputs sklearn's ``accuracy_score``
    is exact-match (subset) accuracy -- confirm this is the intended metric.

    Args:
        correct_labels: Gold labels, in a form accepted by sklearn.metrics.
        predicted_labels: Predicted labels, same layout as ``correct_labels``.

    Returns:
        The tuple ``(accuracy, precision, recall, f1)``.
    """
    # Options shared by the three averaged metrics.
    weighted = dict(average='weighted', zero_division=0)

    return (
        accuracy_score(correct_labels, predicted_labels),
        precision_score(correct_labels, predicted_labels, **weighted),
        recall_score(correct_labels, predicted_labels, **weighted),
        f1_score(correct_labels, predicted_labels, **weighted),
    )

def gettags(head_model, features, eval=False):
    """Predict the label names for a single feature vector.

    The model is switched to evaluation mode (and left there), the features
    get a leading batch dimension, and every output whose activation exceeds
    0.5 is reported as a predicted label.

    Args:
        head_model: Classification head exposing ``__call__`` and ``act``.
        features: Feature tensor for one sample, without a batch dimension.
        eval: Unused; kept so existing calls keep working.

    Returns:
        The result of ``mlb.inverse_transform`` on the thresholded
        predictions: a list holding one tuple of label names.
    """
    head_model.eval()
    batch = features.unsqueeze(0).to(device)

    with torch.no_grad():
        # presumably `act` is a sigmoid over the logits -- verify in GPTHead
        activations = head_model.act(head_model(batch))

    predicted = (activations > 0.5).detach().cpu()
    return mlb.inverse_transform(predicted)

In [16]:
# Load the saved train / test datasets.
# NOTE(review): these files appear to hold whole pickled dataset objects;
# recent PyTorch releases default torch.load to weights_only=True, which may
# refuse them -- confirm against the installed version.
train_dataset = torch.load('gpt_embedded_train_v1.1.pt')
test_dataset = torch.load('gpt_embedded_test_v1.1.pt')

# Report feature and label shapes for each split as a quick sanity check.
for split_name, split in (('train', train_dataset), ('test', test_dataset)):
    print(f'X_{split_name}', split.X.shape)
    print(f'y_{split_name}', split.y.shape)

X_train torch.Size([10000, 768])
y_train torch.Size([10000, 538])
X_test torch.Size([1000, 768])
y_test torch.Size([1000, 538])


In [97]:
from torch.utils.data import Subset

# Indices 0..49: the first fifty training samples make up the toy subset.
toy_samples = torch.arange(50)
toy_dataset = Subset(train_dataset, indices=toy_samples)

### Overfit to a 50-sample toy dataset

In [191]:
# Overfitting sanity check: train the classification head on the toy subset
# only and watch loss / accuracy / F1 in the progress-bar description.
epochs = 300
batch_size = 128

# train_loader = DataLoader(train_dataset, batch_size=batch_size)
# No `shuffle` argument is passed, so batches follow the subset's own order.
toy_loader = DataLoader(toy_dataset, batch_size=batch_size)
# 768 / 538 match the feature and label widths printed when the datasets were loaded.
model = GPTHead(768, 538).to(device)

# The loss takes raw logits; thresholded predictions further down go through model.act.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(),
                    lr = 2e-2, # default is 5e-5, our notebook had 2e-5
                  )

model.train()
progress_bar = tqdm(range(epochs))
for epoch in progress_bar:  # loop over the dataset multiple times

    running_loss = 0.0
    # progress_bar = tqdm(toy_loader)
    for i, batch in enumerate(toy_loader):

        # Each batch is (features, labels, sample indices) as produced by the dataset.
        article_embeddings, labels, idx  = batch

        article_embeddings = article_embeddings.to(device)
        # BCEWithLogitsLoss needs floating-point targets.
        labels = labels.type(torch.float).to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(article_embeddings)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # accumulate the batch loss for the per-epoch average
        running_loss += loss.item()

    # NOTE: `outputs` and `labels` are the values left over from the LAST batch
    # of the epoch (computed before that batch's optimizer step), so the
    # metrics below describe that batch only, not the whole loader.
    preds = model.act(outputs) > 0.5
    preds = preds.detach().cpu()

    acc, precision, recall, f1 = multi_label_scores(labels.detach().cpu(), preds)
    # i + 1 is the number of batches seen this epoch, giving the mean batch loss.
    progress_bar.set_description(f'loss: {running_loss/(i+1):.4f} acc: {acc:.3f} f1-score: {f1:.3f}')

print('Finished Training')

  0%|          | 0/300 [00:00<?, ?it/s]

Finished Training


In [196]:
idx = 2

# Take features AND gold labels from the dataset itself. Reading the gold
# labels from the training loop's leftover `labels` batch only lines up with
# train_dataset[idx] while the loader is unshuffled, single-batch and starts
# at sample 0.
sample_features, sample_labels, _ = train_dataset[idx]

print('predicted:', gettags(model, sample_features))
# inverse_transform works on a 2-D indicator matrix, so add a batch dimension
# and unwrap the single result.
print('gold:', mlb.inverse_transform(sample_labels.unsqueeze(0).cpu().numpy())[0])

predicted: [('animals', 'food', 'birds')]
gold: ('animals', 'food', 'birds')


('animals', 'food', 'birds')