# Description

This notebook takes as input:

**ensemble** chunks (EmbeddedDataset):
- non-filtered
- non-shuffled

**gpt2-large** chunks (GPTEmbeddedDataset):
- non-filtered
- shuffled (seed 42)

And outputs the following *single-file whole datasets*:

**ensemble** (EmbeddedDataset):
- filtered
- shuffled (seed 42)

**gpt2-large** (EmbeddedDataset):
- filtered
- shuffled (seed 42)

Along the way it verifies:
- properties of the *input* chunks claimed above
- alignment of *ensemble* and *gpt2-large* articles, by comparing their label sets position by position

# Definitions

In [1]:
from torch.utils.data import IterableDataset, Dataset, DataLoader
import os
import sys
import torch
from pathlib import Path
import gc
import numpy as np
from tqdm.autonotebook import tqdm
import random

# Put the project's NYT directory on the import path before importing `utils`
# (presumably that is where the `utils` module lives — confirm).
# NOTE(review): this is an absolute, machine-specific path; it has to be
# adjusted when the notebook is run on another machine.
module_path = "/home/mbaliesnyi/code/nlp-project/ut-mit-news-classify/NYT"
if module_path not in sys.path:
    sys.path.append(module_path)
import utils

In [2]:
from utils import vec2labels

class GPTEmbeddedDataset(Dataset):
    """Map-style dataset that pairs an indexable `X` with an indexable `y`.

    The notebook description lists this as the type of the gpt2-large input
    chunks; presumably the class must stay defined under this name so that
    `torch.load` can unpickle those chunks.
    """

    def __init__(self, X, y):
        # Both containers are stored as given; nothing is copied or checked.
        self.X, self.y = X, y

    def __len__(self):
        """Number of items, taken from `X`."""
        n_items = len(self.X)
        return n_items

    def __getitem__(self, idx):
        """Return `(X[idx], y[idx], idx)`."""
        features = self.X[idx]
        target = self.y[idx]
        return features, target, idx


class EmbeddedDataset(Dataset):
    """Map-style dataset over features `X` and labels `y`.

    Used in this notebook both as the type of the ensemble input chunks (per
    the description) and as the container for the two saved output datasets.
    """

    def __init__(self, X, y):
        # Keep references to the caller's containers; no copy is made.
        self.X, self.y = X, y

    def __len__(self):
        """Length of the dataset, defined as `len(X)`."""
        size = len(self.X)
        return size

    def __getitem__(self, idx):
        """Return the triple `(X[idx], y[idx], idx)` for position `idx`."""
        sample = (self.X[idx], self.y[idx], idx)
        return sample


class ChunkMixDataset(Dataset):
    """Several saved dataset chunks concatenated into one in-memory dataset.

    Every path in `file_paths` is read with `torch.load` and must yield an
    object with an `X` attribute; the `X` of all chunks are stacked row-wise
    with `torch.vstack`. If a chunk also has a `y` attribute, it is converted
    with `vec2labels` and appended to `self.y` in chunk order.

    Attributes:
        file_paths: chunk paths, in the order they are concatenated.
        X: row-wise stack of all chunk `X`; `None` if no chunk was loaded.
        y: concatenated `vec2labels` output; `None` if no chunk had a `y`.
    """

    def __init__(self, file_paths):
        self.file_paths = file_paths
        self.X = None
        self.y = None
        self._load()

    def _load(self):
        '''Load & concatenate chunks defined in `self.file_paths`.
        Convert vectorized labels to strings.'''

        print(f'Loading {len(self.file_paths)} chunks')

        # Collect the per-chunk tensors and stack them once at the end, rather
        # than re-stacking (and re-copying) the growing result for every chunk.
        feature_chunks = []
        for fp in tqdm(self.file_paths):
            # NOTE(security): torch.load unpickles arbitrary Python objects;
            # only point this at chunk files from a trusted source.
            dataset = torch.load(fp)
            feature_chunks.append(dataset.X)

            # NOTE(review): a chunk without `y` is skipped silently here, which
            # would leave `self.y` shorter than `self.X`.
            if hasattr(dataset, 'y'):
                labels = vec2labels(dataset.y)
                if self.y is None:
                    self.y = labels
                else:
                    self.y += labels
            del dataset
            gc.collect()

        if len(feature_chunks) == 1:
            # A single chunk is used as-is (no copy).
            self.X = feature_chunks[0]
        elif feature_chunks:
            self.X = torch.vstack(feature_chunks)
        del feature_chunks

        print(f'Loaded all chunks. Total length: {len(self)}.')

    def __len__(self):
        """Number of rows in `X`; 0 when nothing has been loaded."""
        if self.X is None:
            return 0
        return len(self.X)

    def __getitem__(self, idx):
        """Return `(X[idx], y[idx])`.

        Raises whatever indexing `X`/`y` raises; in particular `TypeError`
        when `y` is `None` because no chunk carried labels.
        """
        return self.X[idx], self.y[idx]

    # The method used to be named `__item__`, which Python never calls for
    # `dataset[idx]`. The old name is kept as an alias for any direct callers.
    __item__ = __getitem__

# Load & verify data

In [3]:
from utils import load_nyt_data

# Only the test split is used below; the train split is dropped immediately
# and garbage-collected to release its memory.
(train_articles,
 train_labels_lists,
 test_articles,
 test_labels_lists) = load_nyt_data()

del train_articles, train_labels_lists
gc.collect()

Train data loaded.
Test data loaded.


0

In [4]:
# Size of the raw test split, before the length filter applied further down.
n_test_articles = len(test_articles)
print('Test articles:', n_test_articles)

Test articles: 144279


In [18]:
# Input roots, given relative to the notebook's working directory.
data_dir = Path('../../../data/')
gpt2_dir = Path('../GPT')

# Directories holding the chunk files of the two embedding sources.
ensemble_dir = data_dir.joinpath('ensemble', 'test')
gpt2large_dir = gpt2_dir.joinpath('large')

In [21]:
# Stated properties of the ensemble chunks:
# - non-shuffled
# - non-filtered
# - non-cutoff
# - with transformed paths
#
# Chunk files are ordered by the integer that sits between 'chunk' and 'of'
# in the file name, so they are concatenated in chunk order.
ensemble_file_paths = os.listdir(ensemble_dir)
sorted_ensemble_filenames = list(ensemble_file_paths)
sorted_ensemble_filenames.sort(key=lambda name: int(name.split('chunk')[1].split('of')[0]))
sorted_ensemble_filepaths = [ensemble_dir / name for name in sorted_ensemble_filenames]


In [22]:
# Stated properties of the gpt2-large chunks:
# - seed-42-shuffled
# - non-filtered
# - non-cutoff
# - with transformed labels
#
# Same ordering rule as for the ensemble chunks: sort numerically by the
# integer between 'chunk' and 'of' in the file name.
gpt_file_paths = os.listdir(gpt2large_dir)
sorted_gpt_filenames = list(gpt_file_paths)
sorted_gpt_filenames.sort(key=lambda name: int(name.split('chunk')[1].split('of')[0]))
sorted_gpt_filepaths = [gpt2large_dir / name for name in sorted_gpt_filenames]


In [23]:
# Load every ensemble chunk and concatenate them in chunk-index order.
ensemble_dataset = ChunkMixDataset(file_paths=sorted_ensemble_filepaths)


Loading 3 chunks


  0%|          | 0/3 [00:00<?, ?it/s]

Loaded all chunks. Total length: 144279.


In [24]:
# Load every gpt2-large chunk and concatenate them in chunk-index order.
gpt2large_dataset = ChunkMixDataset(file_paths=sorted_gpt_filepaths)


Loading 2 chunks


  0%|          | 0/2 [00:00<?, ?it/s]

Loaded all chunks. Total length: 144279.


In [26]:
# --- ensemble chunks ---
# Non-filtered: exactly one label set per raw test article.
assert len(ensemble_dataset.y) == len(test_labels_lists)

# Non-shuffled: label sets equal the raw test labels position by position.
for ensemble_labels, raw_labels in zip(ensemble_dataset.y, test_labels_lists):
    assert set(ensemble_labels) == set(raw_labels), (ensemble_labels, raw_labels)

# --- gpt2-large chunks ---
# Non-filtered: exactly one label set per raw test article.
assert len(gpt2large_dataset.y) == len(test_labels_lists)

# Seed-42-shuffled: label sets equal the raw test labels after those have been
# shuffled with random.Random(42).
seed_42_rng = random.Random(42)
shuffled_test_labels_lists = test_labels_lists.copy()
seed_42_rng.shuffle(shuffled_test_labels_lists)
for gpt_labels, raw_labels in zip(gpt2large_dataset.y, shuffled_test_labels_lists):
    assert set(gpt_labels) == set(raw_labels), (gpt_labels, raw_labels)

# Align & Filter

In [27]:
# align
# Reorder the ensemble data into the same seed-42 order as the gpt2-large
# chunks. The permutation is drawn once, on a plain list of indices, and then
# applied to both X and y so that the two can never drift apart.
#
# Do NOT call `random.shuffle` on the tensor itself: it swaps items with
# `x[i], x[j] = x[j], x[i]`, and because indexing a tensor row returns a view,
# that "swap" overwrites row i with row j and then writes row j onto itself.
# Rows end up duplicated/lost and X no longer corresponds to y.
#
# NOTE: this cell is not idempotent; re-running it shuffles the already
# shuffled data again.
n_ensemble = len(ensemble_dataset.y)
assert len(ensemble_dataset.X) == n_ensemble, (len(ensemble_dataset.X), n_ensemble)

# random.shuffle's draws depend only on the RNG state and the sequence length,
# so shuffling range(n) gives the order that Random(42).shuffle applies to any
# n-item list: position k of the result holds original item seed_42_order[k].
seed_42_order = list(range(n_ensemble))
random.Random(42).shuffle(seed_42_order)

ensemble_dataset.X = ensemble_dataset.X[torch.as_tensor(seed_42_order, dtype=torch.long)]
ensemble_dataset.y = [ensemble_dataset.y[i] for i in seed_42_order]

In [28]:
# filter
# Keep only the articles whose raw text satisfies len(article) >= min_len.
min_len = 500

# Bring the raw articles into the same seed-42 order as the datasets, so that
# position i refers to the same article in all three sequences.
shuffled_test_articles = test_articles.copy()
random.Random(42).shuffle(shuffled_test_articles)

# The indices below are only valid on the unfiltered, aligned datasets. This
# fails early (with the three lengths in the message) if the cell is run a
# second time or the inputs are not the same size.
_lengths = (len(shuffled_test_articles), len(ensemble_dataset.X), len(gpt2large_dataset.X))
assert len(set(_lengths)) == 1, _lengths

filtered_test_indices = [i for i, article in enumerate(shuffled_test_articles) if len(article) >= min_len]

# One indexed gather per tensor instead of stacking one row view per article.
# Assumes X is 2-D (one row per article), in which case this equals the
# per-row vstack it replaces.
keep_rows = torch.as_tensor(filtered_test_indices, dtype=torch.long)

ensemble_dataset.X = ensemble_dataset.X[keep_rows]
ensemble_dataset.y = [ensemble_dataset.y[i] for i in filtered_test_indices]

gpt2large_dataset.X = gpt2large_dataset.X[keep_rows]
gpt2large_dataset.y = [gpt2large_dataset.y[i] for i in filtered_test_indices]

print('Teset articles after filtering:', len(ensemble_dataset.X))

Teset articles after filtering: 133032


# Verify

In [29]:
# verify alignment

# zip() stops at the shorter input, so a length mismatch would go unnoticed by
# the loop below; check all lengths explicitly first.
assert len(gpt2large_dataset.y) == len(ensemble_dataset.y), (len(gpt2large_dataset.y), len(ensemble_dataset.y))
assert len(ensemble_dataset.X) == len(ensemble_dataset.y), (len(ensemble_dataset.X), len(ensemble_dataset.y))
assert len(gpt2large_dataset.X) == len(gpt2large_dataset.y), (len(gpt2large_dataset.X), len(gpt2large_dataset.y))

# Label sets must agree position by position. This is a necessary condition
# only: it compares labels, so it cannot detect X rows that are out of order.
for label_set_gpt2large, label_set_ensemble in zip(gpt2large_dataset.y, ensemble_dataset.y):
    assert set(label_set_gpt2large) == set(label_set_ensemble), (label_set_gpt2large, label_set_ensemble)

# Save

In [30]:
# Re-wrap the aligned, filtered data as plain EmbeddedDataset objects; both
# outputs use the same class.
aligned_ensemble_dataset = EmbeddedDataset(X=ensemble_dataset.X, y=ensemble_dataset.y)
aligned_gpt2large_dataset = EmbeddedDataset(X=gpt2large_dataset.X, y=gpt2large_dataset.y)

In [31]:
# Output root (absolute path on the author's machine).
outdir = Path('/home/mbaliesnyi/code/nlp-project/data')

ensemble_path = outdir.joinpath('ensemble-aligned')
gpt2large_path = outdir.joinpath('single-gpt2-large-aligned')

# Each dataset goes into a `test` sub-directory of its own output directory.
ensemble_test_dir = ensemble_path / 'test'
gpt2large_test_dir = gpt2large_path / 'test'

# Create both target directories first, then write the two files.
for split_dir in (ensemble_test_dir, gpt2large_test_dir):
    os.makedirs(split_dir, exist_ok=True)

torch.save(aligned_ensemble_dataset, ensemble_test_dir / 'aligned_filtered.pt')
torch.save(aligned_gpt2large_dataset, gpt2large_test_dir / 'aligned_filtered.pt')

In [32]:
# List the ensemble output directory to confirm the file was written.
!ls -lah '{ensemble_path}/test'

total 510M
drwxr-xr-x 2 mbaliesnyi users 4.0K Jun  5 13:38 .
drwxr-xr-x 4 mbaliesnyi users 4.0K Jun  5 13:38 ..
-rw-r--r-- 1 mbaliesnyi users 510M Jun  5 13:38 aligned_filtered.pt


In [33]:
# List the gpt2-large output directory to confirm the file was written.
!ls -lah '{gpt2large_path}/test'

total 652M
drwxr-xr-x 2 mbaliesnyi users 4.0K Jun  5 13:38 .
drwxr-xr-x 4 mbaliesnyi users 4.0K Jun  5 13:38 ..
-rw-r--r-- 1 mbaliesnyi users 652M Jun  5 13:38 aligned_filtered.pt


# Copy the aligned datasets to HPC

In [36]:
# Recursively copy the ensemble `test` directory to the shared project space
# on the `hpc` host (presumably an SSH host alias — confirm).
!scp -r $ensemble_path/test hpc:/gpfs/space/projects/stud_nlp_share/ensemble-aligned/

aligned_filtered.pt                           100%  510MB  11.3MB/s   00:45    


In [37]:
# Recursively copy the gpt2-large `test` directory to the shared project space
# on the `hpc` host (presumably an SSH host alias — confirm).
!scp -r $gpt2large_path/test hpc:/gpfs/space/projects/stud_nlp_share/single-gpt2-large-aligned/

aligned_filtered.pt                           100%  652MB  11.3MB/s   00:57    
