# Description

This notebook fixes a mistake made in `align_ensemble_with_gpt2_large` (and `_test`): the labels were saved as lists of strings, but they should have been saved as sparse vectors instead.

# Definitions

In [4]:
from torch.utils.data import IterableDataset, Dataset, DataLoader
import os
import sys
import torch
from pathlib import Path
import gc
import numpy as np
from tqdm.autonotebook import tqdm
import random

# Directory that is added to the import search path so the `utils` module
# imported below can be resolved.
module_path = "/gpfs/space/home/mykyta/nlp/ut-mit-news-classify/NYT/"
# Only append when missing, so re-running this cell does not add duplicates.
if module_path not in sys.path:
    sys.path.append(module_path)
# Must come after the sys.path tweak above.
import utils

In [6]:
from utils import labels2vec

class EmbeddedDataset(Dataset):
    """Map-style dataset that pairs stored inputs ``X`` with labels ``y``.

    Indexing returns a triple ``(X[idx], y[idx], idx)`` so that consumers
    also receive the position of the sample they were handed.
    """

    def __init__(self, X, y):
        """Keep references to the input container ``X`` and label container ``y``."""
        self.X, self.y = X, y

    def __len__(self):
        """Number of samples, taken from the input container."""
        n_items = len(self.X)
        return n_items

    def __getitem__(self, idx):
        """Return the input, its label and the index itself for position ``idx``."""
        features = self.X[idx]
        target = self.y[idx]
        return (features, target, idx)

# Load data

In [22]:
# Root of the shared project storage that holds the aligned datasets.
shared_dir = Path('/gpfs/space/projects/stud_nlp_share')

# Aligned ensemble datasets (train / test splits).
ensemble_train_path = shared_dir.joinpath('ensemble-aligned', 'train', 'aligned_filtered.pt')
ensemble_test_path = shared_dir.joinpath('ensemble-aligned', 'test', 'aligned_filtered.pt')

# Aligned single GPT-2 large datasets (train / test splits).
gpt2large_train_path = shared_dir.joinpath('single-gpt2-large-aligned', 'train', 'aligned_filtered.pt')
gpt2large_test_path = shared_dir.joinpath('single-gpt2-large-aligned', 'test', 'aligned_filtered.pt')

In [27]:
# Snapshot of the four dataset files (size, timestamp) before they are rewritten.
!ls -lah $ensemble_train_path
!ls -lah $ensemble_test_path
!ls -lah $gpt2large_train_path
!ls -lah $gpt2large_test_path

-rw-r--r-- 1 mykyta users 4.5G Jun  5 00:39 /gpfs/space/projects/stud_nlp_share/ensemble-aligned/train/aligned_filtered.pt
-rw-r--r-- 1 mykyta users 510M Jun  5 13:41 /gpfs/space/projects/stud_nlp_share/ensemble-aligned/test/aligned_filtered.pt
-rw-r--r-- 1 mykyta users 5.8G Jun  5 00:48 /gpfs/space/projects/stud_nlp_share/single-gpt2-large-aligned/train/aligned_filtered.pt
-rw-r--r-- 1 mykyta users 652M Jun  5 13:42 /gpfs/space/projects/stud_nlp_share/single-gpt2-large-aligned/test/aligned_filtered.pt


# Vectorize labels & overwrite old files

In [41]:
paths = [ensemble_train_path, ensemble_test_path, gpt2large_train_path, gpt2large_test_path]

# Replace the label container of every saved dataset with the output of
# `labels2vec` and write the dataset back to the same location.
#
# NOTE(review): this cell rewrites its own inputs. Re-running it feeds the
# already-converted labels to `labels2vec` again; whether that is harmless
# depends on `labels2vec` - confirm before re-executing.
for dataset_path in paths:
    # torch.load unpickles the file; only use it on trusted data.
    dataset = torch.load(dataset_path)

    dataset.y = labels2vec(dataset.y)

    # Write to a sibling temp file first and swap it in afterwards, so that a
    # failed or interrupted save never truncates the only copy of the dataset.
    # The temp file lives in the same directory, so the rename cannot cross
    # filesystems.
    tmp_path = dataset_path.with_name(dataset_path.name + '.tmp')
    try:
        torch.save(dataset, tmp_path)
        os.replace(tmp_path, dataset_path)
    finally:
        # After a successful replace the temp file is gone; this only cleans
        # up a partial file left behind by a failed save.
        if tmp_path.exists():
            tmp_path.unlink()

    # The datasets are several GB each; release one before loading the next.
    del dataset
    gc.collect()

In [42]:
# List the same four files again after the rewrite to compare size and timestamp.
!ls -lah $ensemble_train_path
!ls -lah $ensemble_test_path
!ls -lah $gpt2large_train_path
!ls -lah $gpt2large_test_path

-rw-r--r-- 1 mykyta users 5.1G Jun  5 14:41 /gpfs/space/projects/stud_nlp_share/ensemble-aligned/train/aligned_filtered.pt
-rw-r--r-- 1 mykyta users 576M Jun  5 14:41 /gpfs/space/projects/stud_nlp_share/ensemble-aligned/test/aligned_filtered.pt
-rw-r--r-- 1 mykyta users 6.4G Jun  5 14:42 /gpfs/space/projects/stud_nlp_share/single-gpt2-large-aligned/train/aligned_filtered.pt
-rw-r--r-- 1 mykyta users 718M Jun  5 14:42 /gpfs/space/projects/stud_nlp_share/single-gpt2-large-aligned/test/aligned_filtered.pt


In [46]:
# Reload the two rewritten test splits to inspect their label containers.
a = torch.load(ensemble_test_path)
b = torch.load(gpt2large_test_path)

# Report the shape of each label container, tagged with its variable name.
for tag, ds in (('a', a), ('b', b)):
    print(tag, ds.y.shape)

a (133032, 538)
b (133032, 538)


In [51]:
# Element-wise comparison of the two test label containers; the result is
# True only if every entry matches.
(a.y == b.y).all()

True