# Definitions (run first!)

In [1]:
import gzip
import pickle
import random
import requests
import csv
from torch.utils.data import Dataset, DataLoader
import torch
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np

In [2]:
seed = 42

In [3]:
class EmbeddedDataset(Dataset):
    """Dataset that serves (features, labels, index) triples.

    ``X`` and ``y`` are stored exactly as passed in; both only need to
    support indexing, and ``X`` additionally needs ``len()``.
    """

    def __init__(self, X, y):
        # Keep references only: no copying or type conversion happens here.
        self.X, self.y = X, y

    def __len__(self):
        """Number of samples, measured on the feature container."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return ``(X[idx], y[idx], idx)`` so callers can trace a sample back."""
        features = self.X[idx]
        labels = self.y[idx]
        return features, labels, idx

In [4]:
import csv

def loadcsv(filename):
    """Read a UTF-8 CSV file and return all of its rows.

    Each row is a list of strings as produced by ``csv.reader``; a blank
    line in the file shows up as an empty list.
    """
    with open(filename, newline='', encoding='utf-8') as handle:
        rows = [record for record in csv.reader(handle)]
    return rows


def load_label_map(out2id_path, id2label_path):
    """Build the list of label names ordered by model output index.

    Args:
        out2id_path: CSV file whose rows hold an integer output index in
            column 0 and a tag id in column 1.
        id2label_path: CSV file whose rows hold a tag id in column 1 and the
            label text in column 2. No header handling is done: a header
            row, if present, is stored like any other row.

    Returns:
        A list of label strings; position ``i`` holds the label of the
        ``i``-th smallest output index found in ``out2id_path``.

    Raises:
        ValueError: if column 0 of an ``out2id_path`` row is not an integer.
        IndexError: if a non-blank row has fewer columns than described.
        KeyError: if a tag id from ``out2id_path`` is missing from
            ``id2label_path``.
    """
    # csv.reader yields [] for blank lines (e.g. a trailing newline); skip
    # them in BOTH files so a stray blank line cannot crash on row[0].
    out2id = {int(row[0]): row[1] for row in loadcsv(out2id_path) if row}

    # Later rows win if a tag id appears more than once.
    id2label = {row[1]: row[2] for row in loadcsv(id2label_path) if row}

    # Walk output indices in ascending order: index -> tag id -> label text.
    out2label = [id2label[out2id[out]] for out in sorted(out2id)]

    return out2label

# Label names ordered by model output index; this order is handed to the
# binarizer via `classes`.
out2label = load_label_map('../labels_dict_gpt.csv', '../nyt-theme-tags.csv')
mlb = MultiLabelBinarizer(classes=out2label)
# fit() expects an iterable of label collections. Passing the flat list of
# strings would present each label string as a collection of characters, so
# wrap it as one sample that carries every label.
mlb.fit([out2label])

MultiLabelBinarizer(classes=['suspensions, dismissals and resignations',
                             'education and schools',
                             'colleges and universities', 'blacks',
                             'population', 'economic conditions and trends',
                             'labor',
                             'office buildings and commercial properties',
                             'architecture', 'medicine and health',
                             'awards, decorations and honors',
                             'diseases and conditions', 'research', 'cancer',
                             'basketball', 'design', 'interior design',
                             'real estate', 'trades (sports)',
                             'demonstrations and riots', 'dancing',
                             'hockey, ice', 'games', 'playoff games',
                             'baseball', 'travel and vacations', 'finances',
                             'books and literature',
   

In [5]:
# # temporary dataset for storing tokenized articles & transformed labels
# class Dataset(Dataset):
#     def __init__(self, articles, labels):

#         self.tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
#         self.tokenizer.pad_token = self.tokenizer.eos_token

#         print('Tokenizing...')
#         self.articles = self.tokenizer(articles, add_special_tokens=True, padding="max_length", truncation=True,
#                                        max_length=1024, return_tensors="pt", return_attention_mask=True)

#         self.input_ids = self.articles['input_ids']
#         self.attention_mask = self.articles['attention_mask']

#         print('Preprocessing labels...')
#         self.labels = mlb.transform(labels)
#         print('Done')

#     def __len__(self):
#         return len(self.input_ids)

#     def __getitem__(self, idx):
#         # return self.articles[idx], self.labels[idx]
#         return self.input_ids[idx], self.attention_mask[idx], self.labels[idx]

# Load, Pre-process & Embed data with Ensemble & Save to disk

In [5]:
# SECURITY: pickle.load can execute arbitrary code stored in the file.
# Only load corpus files that come from a trusted source.

# open the train data given to us by Max
with gzip.open('../../../NYTcorpus_train.p.gz', mode='rb') as f:
    train_data = pickle.load(f)

# open the test data given to us by Max
with gzip.open('../../../NYTcorpus_test.p.gz', mode='rb') as f:
    test_data = pickle.load(f)

# shuffle just in case the test and train data were not shuffled before -
# we will only measure model's accuracy on a few thousand samples.
# Each shuffle uses its own Random(seed), so the order is reproducible;
# note that both lists are shuffled in place.
random.Random(seed).shuffle(train_data)
random.Random(seed).shuffle(test_data)

# train and test data labels are coded in numbers,
# but the models predict human-readable labels,
# so we need to re-map these.
# Let's use one of the files downloaded by the mitnewsclassify package.
# Read it as UTF-8 explicitly, matching loadcsv(), instead of relying on the
# platform's default encoding.
with open('../nyt-theme-tags.csv', newline='', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile)
    tags_dict = {row['tags_id']: row['tag'] for row in reader}

# extract actual article texts from data samples
# (element 2 of each sample is used as the text)
train_articles = [d[2] for d in train_data]
test_articles = [d[2] for d in test_data]

# map the number-coded labels to human-readable labels
# (elements 3.. of each sample are used as tag ids; an id that is missing
# from tags_dict becomes None because dict.get is used)
train_labels_lists = [[tags_dict.get(tag_id) for tag_id in d[3:]] for d in train_data]
test_labels_lists = [[tags_dict.get(tag_id) for tag_id in d[3:]] for d in test_data]

In [6]:
# takes 2-3 minutes

# Optional caps on how many samples to keep; None keeps every sample.
# The trailing comments show example caps.
train_size = None  # 100_000
test_size = None  # 10_000

# Articles and their label lists are cut with the same cap so they stay aligned.
X_train = train_articles[:train_size]
y_train = train_labels_lists[:train_size]
X_test = test_articles[:test_size]
y_test = test_labels_lists[:test_size]

In [7]:
# Sanity check: each X/y pair should report the same number of samples.
print(f'X_train {len(X_train)}')
print(f'y_train {len(y_train)}')
print(f'X_test {len(X_test)}')
print(f'y_test {len(y_test)}')

X_train 1298504
y_train 1298504
X_test 144279
y_test 144279


In [8]:
%%time

from tqdm.notebook import tqdm
from mitnewsclassify2 import tfidf, tfidf_bi
import gc

# Each entry: (article texts, label lists, filename prefix for the saved arrays).
runs = [
    (X_train, y_train, 'embedded_train_full'),
    (X_test, y_test, 'embedded_test_full'),
]
# Alternative run kept for reference:
# runs = [(X_test, y_test, 'embedded_test_100k')]

for X, y, output_path in runs:
    print('Preprocessing dataset for ', output_path)

    # Get features from both mitnewsclassify2 models for the same articles,
    # then join them column-wise: tfidf columns first, tfidf_bi columns second.
    tfidf_vec = tfidf.getfeatures(X)
    tfidf_bi_vec = tfidf_bi.getfeatures(X)
    X_embedded = np.concatenate([tfidf_vec, tfidf_bi_vec], axis=1)

    print('Encoding labels...')
    y_embedded = mlb.transform(y)

    # np.save appends ".npy", giving <prefix>_X.npy and <prefix>_y.npy.
    print('Saving to disk...')
    np.save(output_path + '_X', X_embedded)
    np.save(output_path + '_y', y_embedded)

print('Done!')

Preprocessing dataset for  embedded_train_full
Initializing...
Model...
Count Vectorizer...
TF-IDF Transformer...
Miscellaneous...


In [None]:
1+1  # scratch expression; binds nothing. NOTE(review): looks like a leftover kernel check - confirm it can be deleted

In [40]:
# Reload the embedded test split from disk. This rebinds X_test / y_test,
# which the earlier cells used for the raw article texts and label lists.
# NOTE(review): the embedding cell, as configured, writes files with the
# 'embedded_train_full' / 'embedded_test_full' prefixes; the
# 'embedded_test_100k' prefix only appears in its commented-out `runs` line.
# Confirm these files exist before a fresh Restart & Run All.
X_test = np.load('embedded_test_100k_X.npy')
y_test = np.load('embedded_test_100k_y.npy')

In [41]:
# Report the shapes of the reloaded arrays (rows should match between X and y).
print(f'X_test.shape {X_test.shape}')
print(f'y_test.shape {y_test.shape}')

X_test.shape (10000, 1000)
y_test.shape (10000, 538)
