In [79]:
import os
import torch
import os
from tqdm.notebook import tqdm

from torch.utils.data import Dataset

class GPTEmbeddedDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx], idx
    
class EmbeddedDataset(Dataset):
    def __init__(self, X):
        self.X = X

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], idx

def load_all(directory):
    sorted_filenames = sorted(os.listdir(directory), key=lambda fn: int(fn.split('of')[0].split('chunk')[1]))
    all_X = None
    all_y = None
    for filename in tqdm(sorted_filenames):
        saved_dataset = torch.load(directory + filename)

        if all_X is not None:
            all_X = torch.cat([all_X, saved_dataset.X])
#             all_y = torch.cat([all_y, saved_dataset.y])
        else:
            all_X = saved_dataset.X
#             all_y = saved_dataset.y
    all_X = all_X.numpy()
#     all_y = all_y.numpy()
    return all_X #, all_y

In [73]:
!ls /gpfs/space/projects/stud_nlp_share/kristjan/gpt2-vectors/

embedded_train_FULL_gpt2_chunk1of9.pt  embedded_train_FULL_gpt2_chunk6of9.pt
embedded_train_FULL_gpt2_chunk2of9.pt  embedded_train_FULL_gpt2_chunk7of9.pt
embedded_train_FULL_gpt2_chunk3of9.pt  embedded_train_FULL_gpt2_chunk8of9.pt
embedded_train_FULL_gpt2_chunk4of9.pt  embedded_train_FULL_gpt2_chunk9of9.pt
embedded_train_FULL_gpt2_chunk5of9.pt


In [81]:
directory = '/gpfs/space/projects/stud_nlp_share/kristjan/gpt2-vectors/'

gpt2_kristjan_X = load_all(directory)

  0%|          | 0/9 [00:00<?, ?it/s]

In [82]:
gpt2_kristjan_X.shape

(1298504, 768)

In [90]:
n_chunks = 25

split_indices = np.array_split(np.arange(len(gpt2_kristjan_X)), n_chunks)

In [94]:
len(split_indices[0])

51941

In [95]:
split_indices[0][-1]

51940

In [96]:
split_indices[1][0]

51941

In [4]:
directory = '/gpfs/space/projects/stud_nlp_share/single-gpt2/'

gpt2_X, gpt2_y = load_all(directory)

  0%|          | 0/7 [00:00<?, ?it/s]

In [44]:
!ls /gpfs/space/projects/stud_nlp_share

kristjan  NYTcorpus_test.p.gz  NYTcorpus_train.p.gz  single-gpt2  tfidf


In [46]:
directory = '/gpfs/space/projects/stud_nlp_share/tfidf/train/'

tfidf_X, tfidf_y = load_all(directory)

  0%|          | 0/25 [00:00<?, ?it/s]

In [47]:
gpt2_X.shape, gpt2_y.shape, tfidf_X.shape, tfidf_y.shape

((1298505, 768), (1298505, 538), (1298479, 1000), (1298479, 538))

In [22]:
import numpy as np

last_article = np.zeros((768,))

for i, article in enumerate(gpt2_X):    
    if np.equal(article, last_article).all():
        print('duplicate!', i)
    last_article = article

duplicate! 600000


In [26]:
gpt2_X[600000-1][:10]

array([-0.07917378, -0.12252154,  0.21342368,  0.0502246 ,  0.08611257,
       -0.36935362, 21.566942  ,  0.74405277,  0.9342644 , -0.6718916 ],
      dtype=float32)

In [27]:
gpt2_X[600000][:10]

array([-0.07917378, -0.12252154,  0.21342368,  0.0502246 ,  0.08611257,
       -0.36935362, 21.566942  ,  0.74405277,  0.9342644 , -0.6718916 ],
      dtype=float32)

In [41]:
import sys

gpt_unique = set()

for i, article in enumerate(gpt2_X):
    gpt_unique.add(tuple(list(article)))
    if i % 50_000 ==0:
        print('i', i)
    

i 0
i 10000
i 20000
i 30000
i 40000
i 50000
i 60000
i 70000
i 80000
i 90000
i 100000
i 110000
i 120000
i 130000
i 140000
i 150000
i 160000
i 170000
i 180000
i 190000
i 200000
i 210000
i 220000
i 230000
i 240000
i 250000
i 260000
i 270000
i 280000
i 290000
i 300000
i 310000
i 320000
i 330000
i 340000
i 350000
i 360000
i 370000
i 380000
i 390000
i 400000
i 410000
i 420000
i 430000
i 440000
i 450000
i 460000
i 470000
i 480000
i 490000
i 500000
i 510000
i 520000
i 530000
i 540000
i 550000
i 560000
i 570000
i 580000
i 590000
i 600000
i 610000
i 620000
i 630000
i 640000
i 650000
i 660000
i 670000
i 680000
i 690000
i 700000
i 710000
i 720000
i 730000
i 740000
i 750000
i 760000
i 770000
i 780000
i 790000
i 800000
i 810000
i 820000
i 830000
i 840000
i 850000
i 860000
i 870000
i 880000
i 890000
i 900000
i 910000
i 920000
i 930000
i 940000
i 950000
i 960000
i 970000
i 980000
i 990000
i 1000000
i 1010000
i 1020000
i 1030000
i 1040000
i 1050000
i 1060000
i 1070000
i 1080000
i 1090000
i 1100000
i 11

In [49]:
print('unique:', len(gpt_unique))
print('all:', len(gpt2_X))

unique: 1288424
all: 1298505


In [50]:
tfidf_unique = set()

for i, article in enumerate(tfidf_X):
    tfidf_unique.add(tuple(list(article)))
    if i % 50_000 ==0:
        print('i', i)

i 0
i 50000
i 100000
i 150000
i 200000
i 250000
i 300000
i 350000
i 400000
i 450000
i 500000
i 550000
i 600000
i 650000
i 700000
i 750000
i 800000
i 850000
i 900000
i 950000
i 1000000
i 1050000
i 1100000
i 1150000
i 1200000
i 1250000


In [51]:
print('unique:', len(tfidf_unique))
print('all:', len(tfidf_X))

unique: 1278567
all: 1298479


In [62]:
gpt_i = 0
tfidf_i = 0

while gpt_i < gpt2_y.shape[0] and tfidf_i < tfidf_y.shape[0]:
    
    tfidf_labels = tfidf_y[tfidf_i].nonzero()[0]
    gpt_labels = gpt2_y[gpt_i].nonzero()[0]
    
    if (tfidf_labels != gpt_labels).all():
        print('found not equal', 'gpt:', gpt_i, 'tfidf:', tfidf_i)
        
    gpt_i += 1
    tfidf_i += 1
    
    

found not equal gpt: 51940 tfidf: 51940


  if (tfidf_labels != gpt_labels).all():


AttributeError: 'bool' object has no attribute 'all'

In [88]:
tfidf_y[51940]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [89]:
gpt2_y[51941]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,