In [24]:
import gzip
import pickle
import csv
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np
from tqdm.notebook import tqdm
from mitnewsclassify2 import tfidf, tfidf_bi, download
import gc
import torch
import sys
import time
import os
from tqdm.autonotebook import tqdm

def print_f(*values):
    """Print ``values`` exactly like the built-in ``print``, flushing the stream immediately."""
    print(*values, flush=True)

# Sanity message: reaching this line means every import above succeeded.
print_f('All imports seem good!')

All imports seem good!


In [25]:
# Number of samples handed to the vectorizers and saved per output file
# (used as the DataLoader batch size further down).
chunk_size = 50_000
# Optional caps on how many train/test samples are processed. They are used
# as slice bounds, so None keeps the full split.
train_size = None
test_size = None
# Local directory that receives the vectorized chunk files.
output_dir = 'vectorized-fixed'

# Create the output directory up front; exist_ok makes re-running this cell safe.
os.makedirs(output_dir, exist_ok=True)

In [None]:
# print_f('Downloading mitnewsclassify stuff...')
# download.download('tfidf')
# download.download('tfidf_bi')

# print_f('Flushing the buffer to let logs from package appear...')
# sys.stdout.flush()

In [3]:
class EmbeddedDataset(Dataset):
    """Dataset wrapper around a feature container ``X`` and a label container ``y``.

    Indexing with ``idx`` yields the triple ``(X[idx], y[idx], idx)``.
    """

    def __init__(self, X, y):
        # Only references are stored; nothing is copied or validated here.
        self.X, self.y = X, y

    def __len__(self):
        # The size is taken from the feature container alone.
        return len(self.X)

    def __getitem__(self, idx):
        features = self.X[idx]
        target = self.y[idx]
        return features, target, idx


def loadcsv(filename):
    """Read the UTF-8 encoded CSV file at ``filename`` and return its rows as a list of lists."""
    with open(filename, encoding='utf-8', newline='') as handle:
        rows = [record for record in csv.reader(handle)]
    return rows


def load_label_map(out2id_path, id2label_path):
    """Build the list of label strings ordered by output index.

    Args:
        out2id_path: CSV file whose rows hold an integer output index in
            column 0 and a tag id in column 1.
        id2label_path: CSV file whose rows hold a tag id in column 1 and the
            label text in column 2.

    Returns:
        A list in which position ``i`` is the label belonging to the ``i``-th
        smallest output index found in ``out2id_path``.

    Raises:
        KeyError: if a tag id from ``out2id_path`` is missing from
            ``id2label_path``.
        ValueError: if an output index cannot be parsed as an integer.
    """
    # csv.reader yields an empty list for a blank line. Skip such rows in
    # BOTH files so a stray empty line cannot raise IndexError.
    out2id = {int(row[0]): row[1] for row in loadcsv(out2id_path) if row}
    id2label = {row[1]: row[2] for row in loadcsv(id2label_path) if row}

    # Iterate output indices in ascending order so list position follows them.
    return [id2label[out2id[out]] for out in sorted(out2id)]

In [4]:
# Label names ordered by output index, as produced by load_label_map.
out2label = load_label_map('../data/labels_dict_gpt.csv', '../data/nyt-theme-tags.csv')

# The class list is fixed explicitly so the indicator columns follow the
# order of out2label.
mlb = MultiLabelBinarizer(classes=out2label)
# fit() expects an iterable of label collections, so pass one sample holding
# every label instead of the flat list of strings (each string would itself
# be read as a collection of characters).
mlb.fit([out2label])

MultiLabelBinarizer(classes=['suspensions, dismissals and resignations',
                             'education and schools',
                             'colleges and universities', 'blacks',
                             'population', 'economic conditions and trends',
                             'labor',
                             'office buildings and commercial properties',
                             'architecture', 'medicine and health',
                             'awards, decorations and honors',
                             'diseases and conditions', 'research', 'cancer',
                             'basketball', 'design', 'interior design',
                             'real estate', 'trades (sports)',
                             'demonstrations and riots', 'dancing',
                             'hockey, ice', 'games', 'playoff games',
                             'baseball', 'travel and vacations', 'finances',
                             'books and literature',
   

In [5]:
print_f('Loading data...')

# NOTE(security): pickle.load can execute arbitrary code while deserializing;
# only run this cell on corpus files obtained from a trusted source.

# open the train data given to us by Max
with gzip.open('../data/NYTcorpus_train.p.gz', mode='r') as f:
    train_data = pickle.load(f)

# open the test data given to us by Max
with gzip.open('../data/NYTcorpus_test.p.gz', mode='r') as f:
    test_data = pickle.load(f)

print_f('Data loaded.')

# train and test data labels are coded in numbers,
# but the models predict human-readable labels,
# so we need to re-map these.
# Let's use one of the files downloaded by the mitnewsclassify package.
# The file is read as UTF-8 (the same encoding loadcsv uses) rather than the
# platform default, so the tag text decodes identically on every machine.
with open('../data/nyt-theme-tags.csv', newline='', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile)
    tags_dict = {row['tags_id']: row['tag'] for row in reader}

# extract actual article texts from data samples (field 2 of each sample)
train_articles = [d[2] for d in train_data]
test_articles = [d[2] for d in test_data]

# map the number-coded labels (fields 3 and onwards) to human-readable labels;
# dict.get yields None for any id that is absent from tags_dict
train_labels_lists = [list(map(tags_dict.get, d[3:])) for d in train_data]
test_labels_lists = [list(map(tags_dict.get, d[3:])) for d in test_data]

# train_size / test_size are slice bounds; None keeps the whole split
X_train, y_train = train_articles[:train_size], train_labels_lists[:train_size]
X_test, y_test = test_articles[:test_size], test_labels_lists[:test_size]

print_f('X_train', len(X_train))
print_f('y_train', len(y_train))
print_f('X_test', len(X_test))
print_f('y_test', len(y_test))

Loading data...
Data loaded.
X_train 1298504
y_train 1298504
X_test 144279
y_test 144279


In [33]:
# One entry per split: (articles, label lists, output path prefix).
runs = [
    (X_train, y_train, f'{output_dir}/embedded_train_FULL_ensemble'),
    (X_test, y_test, f'{output_dir}/embedded_test_FULL_ensemble'),
]

for X, y, output_path in runs:
    # Ceiling division, so a length that is an exact multiple of chunk_size
    # does not count an extra, non-existent chunk in the file names.
    total_chunks = (len(X) + chunk_size - 1) // chunk_size
    print_f('Total chunks:', total_chunks)

    print_f('Transforming labels...')
    y_embedded = mlb.transform(y)
    print_f('Done!')

    dataset = EmbeddedDataset(X, y_embedded)
    # The DataLoader is used only to cut the dataset into batches of
    # chunk_size samples; each batch becomes one saved file.
    iterator = DataLoader(dataset, batch_size=chunk_size)

    for chunk_id, chunk in enumerate(tqdm(iterator)):
        # Each batch is (articles, labels, indices); the indices are not needed here.
        X_chunk, y_chunk, _ = chunk

        chunk_path = f'{output_path}_chunk{chunk_id+1}of{total_chunks}.pt'
        print_f('Vectorizing chunk: ', chunk_path)
        print_f('Chunk size:', len(X_chunk))

        start = time.time()
        # Concatenate the features of both vectorizers column-wise.
        tfidf_vec = tfidf.getfeatures(X_chunk)
        tfidf_bi_vec = tfidf_bi.getfeatures(X_chunk)
        X_embedded = np.concatenate((tfidf_vec, tfidf_bi_vec), axis=1)

        # The DataLoader's collation is expected to hand y_chunk back as a
        # tensor already; torch.as_tensor reuses it instead of copy-constructing
        # a new tensor from a tensor (which triggers a UserWarning).
        saved_dataset = EmbeddedDataset(torch.tensor(X_embedded), torch.as_tensor(y_chunk))
        torch.save(saved_dataset, chunk_path, pickle_protocol=4)

        print_f(f'Time taken: {int(time.time() - start)/60:.1f}min')

        print_f()

        # Release the large per-chunk arrays before the next chunk is built.
        del tfidf_vec
        del tfidf_bi_vec
        del X_embedded
        del saved_dataset
        gc.collect()

    # Release the per-split objects before moving on to the next split.
    del y_embedded
    del dataset
    del iterator
    gc.collect()

print_f('Done!')

Total chunks: 26
Transforming labels...
Done!


  0%|          | 0/26 [00:00<?, ?it/s]

Vectorizing chunk:  vectorized-fixed/embedded_train_FULL_ensemble_chunk1of26.pt
Chunk size: 50000


  saved_dataset = EmbeddedDataset(torch.tensor(X_embedded), torch.tensor(y_chunk))


Time taken: 1.9min

Vectorizing chunk:  vectorized-fixed/embedded_train_FULL_ensemble_chunk2of26.pt
Chunk size: 50000
Time taken: 1.8min

Vectorizing chunk:  vectorized-fixed/embedded_train_FULL_ensemble_chunk3of26.pt
Chunk size: 50000
Time taken: 1.8min

Vectorizing chunk:  vectorized-fixed/embedded_train_FULL_ensemble_chunk4of26.pt
Chunk size: 50000
Time taken: 1.7min

Vectorizing chunk:  vectorized-fixed/embedded_train_FULL_ensemble_chunk5of26.pt
Chunk size: 50000
Time taken: 1.7min

Vectorizing chunk:  vectorized-fixed/embedded_train_FULL_ensemble_chunk6of26.pt
Chunk size: 50000
Time taken: 1.8min

Vectorizing chunk:  vectorized-fixed/embedded_train_FULL_ensemble_chunk7of26.pt
Chunk size: 50000
Time taken: 1.8min

Vectorizing chunk:  vectorized-fixed/embedded_train_FULL_ensemble_chunk8of26.pt
Chunk size: 50000
Time taken: 1.7min

Vectorizing chunk:  vectorized-fixed/embedded_train_FULL_ensemble_chunk9of26.pt
Chunk size: 50000
Time taken: 1.7min

Vectorizing chunk:  vectorized-fixed

  0%|          | 0/3 [00:00<?, ?it/s]

Vectorizing chunk:  vectorized-fixed/embedded_test_FULL_ensemble_chunk1of3.pt
Chunk size: 50000
Time taken: 1.7min

Vectorizing chunk:  vectorized-fixed/embedded_test_FULL_ensemble_chunk2of3.pt
Chunk size: 50000
Time taken: 1.7min

Vectorizing chunk:  vectorized-fixed/embedded_test_FULL_ensemble_chunk3of3.pt
Chunk size: 44279
Time taken: 1.5min

Done!


In [34]:
!mv vectorized-fixed/* /gpfs/space/projects/stud_nlp_share/ensemble/

In [39]:
# Show what is stored in the shared train directory.
!ls -lah /gpfs/space/projects/stud_nlp_share/ensemble/train

total 11G
d--------- 2 mykyta users 4.0K May 21 17:07 .
d--------- 4 mykyta users 4.0K May 21 17:07 ..
-rw-r--r-- 1 mykyta users 396M May 21 15:54 embedded_train_FULL_ensemble_chunk10of26.pt
-rw-r--r-- 1 mykyta users 396M May 21 15:56 embedded_train_FULL_ensemble_chunk11of26.pt
-rw-r--r-- 1 mykyta users 396M May 21 15:58 embedded_train_FULL_ensemble_chunk12of26.pt
-rw-r--r-- 1 mykyta users 396M May 21 16:00 embedded_train_FULL_ensemble_chunk13of26.pt
-rw-r--r-- 1 mykyta users 396M May 21 16:02 embedded_train_FULL_ensemble_chunk14of26.pt
-rw-r--r-- 1 mykyta users 396M May 21 16:04 embedded_train_FULL_ensemble_chunk15of26.pt
-rw-r--r-- 1 mykyta users 396M May 21 16:06 embedded_train_FULL_ensemble_chunk16of26.pt
-rw-r--r-- 1 mykyta users 396M May 21 16:08 embedded_train_FULL_ensemble_chunk17of26.pt
-rw-r--r-- 1 mykyta users 396M May 21 16:10 embedded_train_FULL_ensemble_chunk18of26.pt
-rw-r--r-- 1 mykyta users 396M May 21 16:11 embedded_train_FULL_ensemble_chunk19of26.pt
-rw-r--r-- 1 myky

In [37]:
!mkdir /gpfs/space/projects/stud_nlp_share/ensemble/train
!mkdir /gpfs/space/projects/stud_nlp_share/ensemble/test
!mv /gpfs/space/projects/stud_nlp_share/ensemble/*train*.pt /gpfs/space/projects/stud_nlp_share/ensemble/train/
!mv /gpfs/space/projects/stud_nlp_share/ensemble/*test*.pt /gpfs/space/projects/stud_nlp_share/ensemble/test/