In [1]:
import torch
import json
import h5py
from tqdm import tqdm

In [2]:
# Arbitrary values for the settings that the cells below read.
limit, layer = None, None
first_half_only, second_half_only = False, False

# Bookkeeping dicts; the `_d` suffix is there to prevent namespace conflicts.
num_neurons_d = dict()  # {fname, int}
representations_d = dict()

# An arbitrary value for what would be the loop variable: one activations file.
fname = "/data/sls/temp/belinkov/contextual-corr-analysis/contextualizers/elmo_original/ptb_pos_dev.hdf5"

In [3]:
# Open the activations file read-only. This notebook only reads from it, and
# passing the mode explicitly avoids relying on h5py's default, which differs
# between h5py versions and can open the file writable.
activations_h5 = h5py.File(fname, 'r')
activations_h5 # an h5py File, an h5py Group

<HDF5 file "ptb_pos_dev.hdf5" (mode r+)>

In [4]:
# The first element of the 'sentence_to_index' dataset is a JSON document;
# decoding it yields the "sentence dict".
sentence_index_json = activations_h5['sentence_to_index'][0]
sentence_d = json.loads(sentence_index_json)
sentence_d # (sentence, string index) pairs. Why not the other order?

{"' This is loyalty intelligently bestowed . ''": '0',
 "'' Mr. Allen objected to this analogy because it seems to `` assimilate the status of blacks to that of animals -- as a mere project of charity , of humaneness . ''": '1',
 "'' None believe the proposed management LBOs will be easy to complete , especially at B. Altman & Co. , which is under Chapter 11 bankruptcy protection .": '2',
 "( A Fortune article on Saul Steinberg was entitled , `` Fear and Loathing in the Corporate Boardrooms . '' )": '3',
 '( A ratings point represents 904,000 television households ; shares indicate the percentage of sets in use . )': '4',
 "( But who 's counting . )": '5',
 '( CenTrust shares have fallen sharply in price from a high of $ 15.125 in 1986 to close yesterday at $ 2.875 . )': '6',
 '( Dong-A had had a technology agreement with Jeep maker American Motors Corp. , now a part of Chrysler Corp . )': '7',
 '( During its centennial year , The Wall Street Journal will report events of the past cent

In [5]:
# Flip the mapping so it is keyed by index string rather than by sentence.
# NOTE: sentence_d is rebound here, so running this cell a second time flips
# the mapping back again.
sentence_d = {index: sentence for sentence, index in sentence_d.items()}

# The index strings, cut down to the first `limit` entries (None keeps all).
indices = list(sentence_d)[:limit]
indices

['0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 '10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '20',
 '21',
 '22',
 '23',
 '24',
 '25',
 '26',
 '27',
 '28',
 '29',
 '30',
 '31',
 '32',
 '33',
 '34',
 '35',
 '36',
 '37',
 '38',
 '39',
 '40',
 '41',
 '42',
 '43',
 '44',
 '45',
 '46',
 '47',
 '48',
 '49',
 '50',
 '51',
 '52',
 '53',
 '54',
 '55',
 '56',
 '57',
 '58',
 '59',
 '60',
 '61',
 '62',
 '63',
 '64',
 '65',
 '66',
 '67',
 '68',
 '69',
 '70',
 '71',
 '72',
 '73',
 '74',
 '75',
 '76',
 '77',
 '78',
 '79',
 '80',
 '81',
 '82',
 '83',
 '84',
 '85',
 '86',
 '87',
 '88',
 '89',
 '90',
 '91',
 '92',
 '93',
 '94',
 '95',
 '96',
 '97',
 '98',
 '99',
 '100',
 '101',
 '102',
 '103',
 '104',
 '105',
 '106',
 '107',
 '108',
 '109',
 '110',
 '111',
 '112',
 '113',
 '114',
 '115',
 '116',
 '117',
 '118',
 '119',
 '120',
 '121',
 '122',
 '123',
 '124',
 '125',
 '126',
 '127',
 '128',
 '129',
 '130',
 '131',
 '132',
 '133',
 '134',
 '135',
 '136',
 '137',
 '138'

In [6]:
# Collect one representations tensor per sentence index.
representations_l = []
for sentence_ix in indices:
    # Load the stored activations for this sentence as a float tensor.
    activations = torch.FloatTensor(activations_h5[sentence_ix])
    rank = activations.dim()
    if rank not in (2, 3):
        raise ValueError('Improper array shape in file: ' + fname + "\nShape: " + str(activations.shape))

    if rank == 3:
        # Index along the first dimension: the requested layer, or the top
        # (last) layer when none is given. TO DO: change to self.layer
        representations = activations[-1 if layer is None else layer]
    else:
        # 2-D activations are used as they are.
        representations = activations

    # Optionally keep just one half of the last dimension; when both flags are
    # set, first_half_only takes precedence. TO DO: change to self.first_half_only
    if first_half_only or second_half_only:
        halves = torch.chunk(representations, chunks=2, dim=-1)
        representations = halves[0] if first_half_only else halves[1]

    representations_l.append(representations)

In [7]:
# Display the list of per-sentence tensors collected by the loop above.
representations_l

[tensor([[-0.0162, -0.4435,  0.0620,  ..., -1.7420,  0.2056,  0.3489],
         [-1.2992, -1.9456,  0.4049,  ..., -1.0047, -0.2628,  0.2113],
         [-0.8853, -2.1571,  0.1344,  ..., -0.8442,  0.8933,  0.7662],
         ...,
         [-1.6932, -0.9968,  0.0735,  ...,  0.1323, -0.3753, -0.1973],
         [-1.7984, -1.4073, -0.6393,  ...,  0.2594,  0.0954,  0.0070],
         [-0.6673,  0.0718,  0.3171,  ...,  0.1864,  0.0748,  1.6449]]),
 tensor([[-2.4648e-04, -7.9731e-02, -5.8468e-02,  ...,  2.2272e-01,
          -4.8065e-02,  2.5426e-01],
         [ 2.6289e-02, -9.8873e-01,  3.4468e-01,  ..., -7.5821e-01,
          -1.2360e+00, -1.1519e-01],
         [-1.6947e+00, -3.9532e-01,  3.1134e-01,  ..., -6.5611e-01,
          -1.9792e-01, -4.3493e-01],
         ...,
         [-1.3345e+00, -9.5273e-01,  1.8791e-01,  ...,  9.8387e-01,
           1.7055e+00, -1.6219e-01],
         [-3.4229e-01, -6.5141e-01, -9.7165e-01,  ...,  3.0995e-01,
           2.9020e-02,  2.2108e-02],
         [-3.0843e-

In [10]:
# Shape of the tensor stored for the first sentence index.
representations_l[0].shape

torch.Size([8, 1024])

In [11]:
# Shape of the tensor stored for the second sentence index; its first
# dimension can differ from that of the first entry.
representations_l[1].shape

torch.Size([33, 1024])

In [8]:
# original, gives an error: each entry of representations_l is already a 2-D
# tensor, so iterating over it yields 1-D `token` tensors, and torch.cat()
# needs a sequence of tensors rather than a single tensor.
# The TypeError is the point of this cell, so it is caught and printed; that
# way running the notebook from top to bottom does not stop here.
try:
    torch.cat([torch.stack([torch.cat(token) for token in sentence])
               for sentence in representations_l]).cpu()
except TypeError as err:
    print(f"TypeError: {err}")

TypeError: cat(): argument 'tensors' (position 1) must be tuple of Tensors, not Tensor

In [12]:
# my code: the entries of representations_l are already 2-D tensors, so they
# can be joined directly along the first dimension.
torch.cat(representations_l, dim=0)

tensor([[-0.0162, -0.4435,  0.0620,  ..., -1.7420,  0.2056,  0.3489],
        [-1.2992, -1.9456,  0.4049,  ..., -1.0047, -0.2628,  0.2113],
        [-0.8853, -2.1571,  0.1344,  ..., -0.8442,  0.8933,  0.7662],
        ...,
        [-1.4566, -0.7069,  0.1162,  ...,  0.6021,  2.6262,  0.0305],
        [-0.5817, -1.5737, -0.7076,  ...,  0.1753, -0.1922,  0.0349],
        [-0.9899, -0.4524,  0.2248,  ...,  0.6204,  0.5664,  1.6108]])

In [13]:
# Shape of the concatenation of all per-sentence tensors along dimension 0.
torch.cat(representations_l, dim=0).shape

torch.Size([131611, 1024])