In [5]:
import pickle
from spacy.lang.en import English

import random
import numpy as np

import torch
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import SubsetRandomSampler
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


from pathlib import Path
import time



In [6]:
# Special vocabulary entries. PAD_ID / UNK_ID are the positions of PAD / UNK
# inside VOCAB_PREFIX, which is prepended to every pretrained word vocabulary.
PAD, PAD_ID = '<PAD>', 0
UNK, UNK_ID = '<UNK>', 1
VOCAB_PREFIX = [PAD, UNK]

# Input locations and the vocabulary-size cap handed to Pretrain.
VEC_PATH = Path('wiki-news-300d-1M.vec')
DATA_PATH = Path('.')
MAX_VOCAB = 25000

# Data-loading / splitting settings.
# NOTE(review): not referenced by the cells visible here - presumably used by
# a later training section; confirm before removing.
batch_size = 64
validation_split = 0.3
shuffle_dataset = True
random_seed = 42

In [7]:
class BaseVocab:
    """Base class for a two-way mapping between units and integer ids.

    Subclasses must override `build_vocab` to populate:
      * `self._id2unit` - a sequence indexed by id, and
      * `self._unit2id` - a dict mapping each unit to its id.
    """

    def __init__(self, data, lower=False):
        """Store the raw units and build the lookup tables immediately.

        Args:
            data: the units handed to `build_vocab` (via `self.data`).
            lower: when True, units are lower-cased before every lookup.
        """
        self.data = data
        self.lower = lower
        self.build_vocab()

    def normalize_unit(self, unit):
        """Return `unit` lower-cased when `self.lower` is set, else unchanged."""
        return unit.lower() if self.lower else unit

    def unit2id(self, unit):
        """Return the id of `unit`, falling back to the id of UNK.

        Raises:
            KeyError: if `unit` is unknown and the vocabulary has no UNK entry.
        """
        unit = self.normalize_unit(unit)
        if unit in self._unit2id:
            return self._unit2id[unit]
        # Looked up lazily on purpose: vocabularies without an UNK entry must
        # still work for known units.
        return self._unit2id[UNK]

    def id2unit(self, id):
        """Return the unit stored at position `id`."""
        return self._id2unit[id]

    def map(self, units):
        """Return the list of ids for an iterable of units."""
        return [self.unit2id(unit) for unit in units]

    def build_vocab(self):
        """Populate `_id2unit` and `_unit2id`; must be overridden.

        Raises:
            NotImplementedError: always, in the base class.
        """
        # The original built the exception without raising it, so a subclass
        # that forgot to override this only failed later with AttributeError.
        raise NotImplementedError(
            f"{type(self).__name__} must implement build_vocab()"
        )

    def __len__(self):
        """Return the number of distinct units in the vocabulary."""
        return len(self._unit2id)

In [8]:
class PretrainedWordVocab(BaseVocab):
    """Word vocabulary whose first entries are the VOCAB_PREFIX specials."""

    def build_vocab(self):
        """Prepend VOCAB_PREFIX to the word list and index every entry by position."""
        units = VOCAB_PREFIX + self.data
        lookup = {}
        for position, unit in enumerate(units):
            # A repeated unit keeps the id of its last occurrence.
            lookup[unit] = position
        self._id2unit = units
        self._unit2id = lookup

In [9]:
class LabelVocab(BaseVocab):
    """Label vocabulary: ids are the positions of the labels in `data`.

    No PAD/UNK entries are added, so `unit2id` on a label that is not in the
    vocabulary raises KeyError when it falls back to UNK.
    """

    def build_vocab(self):
        """Use `self.data` as the id -> label table and invert it for label -> id."""
        labels = self.data
        self._id2unit = labels
        self._unit2id = dict((label, position) for position, label in enumerate(labels))

In [10]:
class Pretrain:
    """Lazily loads pretrained word vectors from a text-format vector file.

    Nothing is read at construction time; the file is parsed on the first
    access to `vocab` or `emb`, and both results are cached on the instance.
    """

    def __init__(self, vec_filename, max_vocab=-1):
        """
        Args:
            vec_filename: path of the vector file (first line: "<rows> <cols>").
            max_vocab: cap on the total vocabulary size, *including* the
                VOCAB_PREFIX entries. Values not greater than
                len(VOCAB_PREFIX) (such as the default -1) disable the cap.
        """
        self._vec_filename = vec_filename
        self._max_vocab = max_vocab

    @property
    def vocab(self):
        """The PretrainedWordVocab for the loaded words (loads on first use)."""
        if not hasattr(self, '_vocab'):
            self._vocab, self._emb = self.read()
        return self._vocab

    @property
    def emb(self):
        """The embedding matrix, one row per vocabulary id (loads on first use)."""
        if not hasattr(self, '_emb'):
            self._vocab, self._emb = self.read()
        return self._emb

    def read(self):
        """Parse the vector file and return `(vocab, emb)`.

        Returns:
            vocab: PretrainedWordVocab over the (possibly truncated) word list,
                with lower-cased lookups.
            emb: float32 array whose first len(VOCAB_PREFIX) rows are zeros and
                whose remaining rows line up with the words.

        Raises:
            Exception: if no filename was given, or the number of vectors read
                does not match the number of words.
        """
        if self._vec_filename is None:
            raise Exception("Vector file is not provided.")
        print(f"Reading pretrained vectors from {self._vec_filename}...")

        words, emb, failed = self.read_from_file(self._vec_filename, open_func=open)

        if failed > 0:
            # Skipped lines leave that many unused rows at the end of the matrix.
            emb = emb[:-failed]
        if len(emb) - len(VOCAB_PREFIX) != len(words):
            raise Exception("Loaded number of vectors does not match number of words.")

        # Use a fixed vocab size. The cap counts the prefix rows too, so compare
        # against the total row count. The original compared against
        # len(words) and ignored caps of len(words) or len(words) + 1.
        if len(VOCAB_PREFIX) < self._max_vocab < len(emb):
            words = words[:self._max_vocab - len(VOCAB_PREFIX)]
            # Copy so the full-size matrix can be garbage collected instead of
            # being kept alive by a view.
            emb = emb[:self._max_vocab].copy()

        vocab = PretrainedWordVocab(words, lower=True)
        print("Done Reading")

        return vocab, emb

    def read_from_file(self, filename, open_func=open):
        """Open a vector file using the provided function and read from it.

        The file is read as bytes and decoded line by line. Lines that fail to
        decode are counted and skipped. The first decodable line must hold the
        number of vectors and their dimensionality. Every later line is a word
        (which may itself contain spaces) followed by `cols` floats.

        Returns:
            words: list of words in file order.
            emb: float32 array of shape (rows + len(VOCAB_PREFIX), cols). The
                prefix rows and the trailing `failed` rows stay zero.
            failed: number of lines that could not be decoded.

        Raises:
            ValueError: if the file contains no decodable header line.
        """
        first = True
        words = []
        failed = 0
        emb = None
        with open_func(filename, 'rb') as f:
            for i, line in enumerate(f):
                try:
                    line = line.decode()
                except UnicodeDecodeError:
                    failed += 1
                    continue
                if first:
                    # the first line contains the number of word vectors and the dimensionality
                    first = False
                    rows, cols = [int(x) for x in line.strip().split(' ')]
                    emb = np.zeros((rows + len(VOCAB_PREFIX), cols), dtype=np.float32)
                    continue

                fields = line.rstrip().split(' ')
                # i counts the header line, hence the -1. Skipped lines shift
                # later rows up so the matrix has no gaps.
                emb[i + len(VOCAB_PREFIX) - 1 - failed, :] = [float(x) for x in fields[-cols:]]
                words.append(' '.join(fields[:-cols]))
        if emb is None:
            # Previously surfaced as an UnboundLocalError at the return below.
            raise ValueError(f"No header line found in vector file {filename}.")
        return words, emb, failed

In [11]:
# Create the lazy pretrained-vector loader; the vector file is only parsed on
# the first access to `pretrain.vocab` or `pretrain.emb`.
pretrain = Pretrain(vec_filename=VEC_PATH, max_vocab=MAX_VOCAB)

In [25]:

class Vectorizer():
    """Turns raw records into (token-id tensor, multi-hot label tensor) pairs."""

    def __init__(self, pretrain, data, tokenzier, label_vocab):
        """
        Args:
            pretrain: object whose `.vocab` provides `map(tokens) -> list of ids`.
            data: iterable of records. `record[2]` is used as the text and
                `record[3:]` as that record's labels.
            tokenzier: callable that takes a text and yields tokens with a
                `.text` attribute. (The misspelled parameter name is kept so
                existing keyword callers continue to work.)
            label_vocab: vocabulary providing `map(labels)` and `len()`.
        """
        self.pretrain_vocab = pretrain.vocab
        self.data = data
        # Use the tokenizer that was passed in. The original read a global
        # named `tokenizer` here, silently ignoring the argument and raising
        # NameError when no such global existed.
        self.tokenizer = tokenzier
        self.label_vocab = label_vocab

    def vectorize(self):
        """Vectorize every record in `self.data`.

        Returns:
            A list of `(text, label)` tuples sorted by descending token count:
              * text: LongTensor of word ids, one per token;
              * label: float32 tensor of length `len(label_vocab)` with 1.0 at
                the index of each of the record's labels and 0.0 elsewhere.
        """
        # Loop-invariant, so computed once.
        n_labels = len(self.label_vocab)

        vectorized_data = []
        for record in self.data:
            article, label_words = record[2], record[3:]

            tokens = [t.text for t in self.tokenizer(article)]
            text = torch.LongTensor(self.pretrain_vocab.map(tokens))

            label_indices = torch.LongTensor(self.label_vocab.map(label_words))
            # Multi-hot encoding of the record's labels.
            label = torch.zeros(n_labels, dtype=torch.float32)
            label.scatter_(0, label_indices, 1.0)

            vectorized_data.append((text, label))

        # Longest texts first; the sort is stable, so ties keep input order.
        vectorized_data.sort(key=lambda pair: len(pair[0]), reverse=True)

        return vectorized_data


In [13]:
# Re-creates the lazy loader with the same arguments as the earlier
# `pretrain = Pretrain(...)` cell; the new instance starts with nothing loaded.
pretrain = Pretrain(vec_filename=VEC_PATH, max_vocab=MAX_VOCAB)

In [14]:

# Pick the compute device: CUDA when PyTorch reports one available, else CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cpu')

In [15]:
%%time 
# Load the pickled training records from DATA_PATH.
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
training_data_path = DATA_PATH / 'NYTcorpus_train.p'
with training_data_path.open(mode='rb') as f:
    data = pickle.load(f)


CPU times: user 8.27 s, sys: 8.34 s, total: 16.6 s
Wall time: 20.6 s


In [17]:

        
def get_labels(data):
    """Build a LabelVocab from every distinct label found in `data`.

    Each record's labels are everything from index 3 onward (`record[3:]`).

    Labels are de-duplicated in first-seen order, so the label -> id mapping
    depends only on the order of `data`. The original used `list(set(...))`,
    whose order for strings changes between interpreter sessions because of
    hash randomization. That made the label columns of saved tensors
    impossible to reproduce.

    Args:
        data: iterable of indexable records.

    Returns:
        LabelVocab over the distinct labels.
    """
    distinct_labels = {}
    for record in data:
        for label in record[3:]:
            # dict keys preserve insertion order; the values are unused.
            distinct_labels.setdefault(label, None)
    return LabelVocab(list(distinct_labels))
    
# Label vocabulary built from the labels present in the training records.
label_vocab = get_labels(data=data)


In [28]:
%%time

# Only the tokenizer of the spaCy English pipeline is used.
nlp = English()
tokenizer = nlp.tokenizer

# The training set is vectorized and saved in N_TRAIN_CHUNKS separate files.
N_TRAIN_CHUNKS = 20

# Create the output directory up front so torch.save cannot fail on a missing
# directory after a chunk has already been vectorized.
Path('all_train_vect').mkdir(parents=True, exist_ok=True)

# Slice the record list directly instead of calling np.array_split: that call
# first coerces the whole corpus into a NumPy array, which costs memory and
# raises ValueError on ragged records in NumPy >= 1.24. Chunk sizes match
# array_split: the first `n_larger` chunks get one extra record.
chunk_size, n_larger = divmod(len(data), N_TRAIN_CHUNKS)
chunk_start = 0
for i in range(N_TRAIN_CHUNKS):
    chunk_end = chunk_start + chunk_size + (1 if i < n_larger else 0)
    data_to_vectorize = data[chunk_start:chunk_end]
    chunk_start = chunk_end

    start = time.time()

    train_vectorizer = Vectorizer(pretrain, data_to_vectorize, tokenizer, label_vocab)

    vectorized_data = train_vectorizer.vectorize()

    torch.save(vectorized_data, f'all_train_vect/{i}_{len(vectorized_data)}.pt')

    print(f"{i} train vect time taken: {int(time.time() - start)} seconds")

# Free the raw training records; cells that use `data` need the pickle
# re-loaded before they can be re-run.
del data
    


12 train vect time taken: 141 seconds
13 train vect time taken: 145 seconds
14 train vect time taken: 123 seconds
15 train vect time taken: 133 seconds
16 train vect time taken: 120 seconds
17 train vect time taken: 130 seconds
18 train vect time taken: 123 seconds
19 train vect time taken: 126 seconds
CPU times: user 15min 58s, sys: 41.7 s, total: 16min 40s
Wall time: 17min 26s


In [23]:
%%time 

# Load the pickled test records from DATA_PATH.
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
test_data_path = DATA_PATH / 'NYTcorpus_test.p'
with test_data_path.open(mode='rb') as f:
    test_data = pickle.load(f)


CPU times: user 608 ms, sys: 377 ms, total: 985 ms
Wall time: 994 ms


In [26]:

# Create the output directory before the slow vectorization so torch.save
# cannot fail at the very end because the directory is missing.
Path('all_test_vect').mkdir(parents=True, exist_ok=True)

# NOTE(review): `tokenizer` is defined in the training vectorization cell,
# which must have been run before this one.
start = time.time()
test_vectorizer = Vectorizer(pretrain, test_data, tokenizer, label_vocab)

vectorized_data = test_vectorizer.vectorize()

torch.save(vectorized_data, f'all_test_vect/all_{len(vectorized_data)}.pt')
print(f"test vect time taken: {int(time.time() - start)} seconds")

test vect time taken: 287 seconds
