In [1]:
import torch
import json
import h5py
from tqdm import tqdm
from os.path import basename, dirname

In [2]:
# Notebook configuration (arbitrary example values).
# The four `*_l` lists are parallel: position i in each list describes one
# representation file to load.

# Use the CPU even when CUDA is available.
disable_cuda = True

# Upper bound on the number of sentences read from each file.
limit = 10000

# HDF5 files holding the stored activations, one per model.
representation_fname_l = [
    "/data/sls/temp/belinkov/contextual-corr-analysis/contextualizers/elmo_original/ptb_pos_dev.hdf5",
    "/data/sls/temp/belinkov/contextual-corr-analysis/contextualizers/calypso_transformer_6_512_base/ptb_pos_dev.hdf5",
]

# Which layer(s) to load per file: "all" for every layer, otherwise a layer index.
layerspec_l = ["all", -1]

# Keep only the first / second half of the last axis of each representation.
first_half_only_l = [False, False]
second_half_only_l = [False, False]

In [3]:
def fname2mname(fname):
    """
    "filename to model name".

    Returns the name of the directory that directly contains `fname`,
    e.g. ".../elmo_original/ptb_pos_dev.hdf5" -> "elmo_original".
    """
    parent_dir = dirname(fname)
    model_name = basename(parent_dir)
    return model_name

In [4]:
# Accumulators filled in by the loading cells below.
representations_d = {}
num_neurons_d = {}

# Use CUDA only when it has not been disabled and a device is actually available.
use_cuda = (not disable_cuda) and torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')

# for fname in ... loop

In [14]:
# loop variables
# Pick one configuration by position so the loop body can be stepped through
# cell by cell.
ix = 0
fname = representation_fname_l[ix]
layerspec = layerspec_l[ix]
first_half_only, second_half_only = first_half_only_l[ix], second_half_only_l[ix]

In [15]:
# Set `activations_h5`, `sentence_d`, `indices`
# The file is left open on purpose: the following cells read from it.
activations_h5 = h5py.File(fname, 'r')

# The 'sentence_to_index' entry holds a JSON object; invert it so that the
# index is the key and the sentence is the value.
sentence_to_index = json.loads(activations_h5['sentence_to_index'][0])
sentence_d = {ix_key: sentence for sentence, ix_key in sentence_to_index.items()}  # {str ix, sentence}

# Keep at most `limit` indices, in the order they appear in the JSON object.
indices = list(sentence_d)[:limit]

In [16]:
# Set `num_layers`, `num_neurons`, `layers`
# The shape of the first selected sentence's array is taken as representative.
s = activations_h5[indices[0]].shape
if len(s) == 2:
    # A rank-2 array has no leading layer axis; treat it as a single layer.
    num_layers = 1
else:
    num_layers = s[0]
num_neurons = s[-1]  # size of the last axis

# "all" expands to every layer index; any other value is used as the only layer.
if layerspec == "all":
    layers = list(range(num_layers))
else:
    layers = [layerspec]

In [37]:
# Set `num_neurons_d`, `representations_d`
# For every requested layer, gather the activations of all selected sentences
# and store them under the key "<model>_<layer>".
for layer in layers:
    # Create `representations_l`: one tensor per sentence
    representations_l = []
    for sentence_ix in indices:
        # Look the dataset up once and reuse it for the shape check and the read.
        dataset = activations_h5[sentence_ix]

        # Set `dim`
        dim = len(dataset.shape)
        if dim not in (2, 3):
            raise ValueError('Improper array dimension in file: ' +
                             fname + "\nShape: " +
                             str(dataset.shape))

        # Create `activations`
        # Rank-3 arrays are indexed by layer; rank-2 arrays are used as they are.
        activations = torch.FloatTensor(dataset[layer] if dim == 3
                                        else dataset)
        activations = activations.to(device)

        # Create `representations`
        # Optionally keep only one half of the last axis. If both flags are
        # set, `first_half_only` takes precedence.
        representations = activations
        if first_half_only:
            representations = torch.chunk(representations, chunks=2,
                                          dim=-1)[0]
        elif second_half_only:
            representations = torch.chunk(representations, chunks=2,
                                          dim=-1)[1]

        representations_l.append(representations)

    # update
    model_name = "{model}_{layer}".format(model=fname2mname(fname),
                                          layer=layer)
    # Concatenate along the first axis before touching the dicts, so a failure
    # here does not leave a half-written entry behind.
    layer_representations = torch.cat(representations_l)
    # Record the width that is actually stored: it is smaller than the raw
    # `num_neurons` when one of the *_half_only flags is set.
    num_neurons_d[model_name] = layer_representations.shape[-1]
    representations_d[model_name] = layer_representations

2.74 s ± 33.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [5]:
# full
# Load every configured representation file and fill `num_neurons_d` and
# `representations_d` with one entry per "<model>_<layer>" key.

# zip() would silently drop trailing entries if the parallel config lists
# disagreed in length, so check that up front.
config_lists = (representation_fname_l, layerspec_l,
                first_half_only_l, second_half_only_l)
if len({len(config_l) for config_l in config_lists}) != 1:
    raise ValueError('Configuration lists must all have the same length, got: ' +
                     str([len(config_l) for config_l in config_lists]))

for fname, layerspec, first_half_only, second_half_only in tqdm(zip(*config_lists)):
    # The context manager closes each HDF5 file once its data has been copied
    # into tensors, instead of leaving one open handle per file.
    with h5py.File(fname, 'r') as activations_h5:
        # Set `sentence_d`, `indices`
        # The 'sentence_to_index' entry holds a JSON object; invert it so that
        # the index is the key and the sentence is the value.
        sentence_to_index = json.loads(activations_h5['sentence_to_index'][0])
        sentence_d = {ix_key: sentence
                      for sentence, ix_key in sentence_to_index.items()}  # {str ix, sentence}
        indices = list(sentence_d.keys())[:limit]

        # Set `num_layers`, `num_neurons`, `layers`
        # The shape of the first selected sentence's array is taken as representative.
        s = activations_h5[indices[0]].shape
        num_layers = 1 if len(s) == 2 else s[0]
        num_neurons = s[-1]  # raw width of the stored activations
        layers = list(range(num_layers)) if layerspec == "all" else [layerspec]

        # Set `num_neurons_d`, `representations_d`
        for layer in layers:
            # Create `representations_l`: one tensor per sentence
            representations_l = []
            for sentence_ix in indices:
                # Look the dataset up once and reuse it for the shape check
                # and the read.
                dataset = activations_h5[sentence_ix]

                # Set `dim`
                dim = len(dataset.shape)
                if dim not in (2, 3):
                    raise ValueError('Improper array dimension in file: ' +
                                     fname + "\nShape: " +
                                     str(dataset.shape))

                # Create `activations`
                # Rank-3 arrays are indexed by layer; rank-2 arrays are used
                # as they are.
                activations = torch.FloatTensor(dataset[layer] if dim == 3
                                                else dataset)
                activations = activations.to(device)

                # Create `representations`
                # Optionally keep only one half of the last axis. If both
                # flags are set, `first_half_only` takes precedence.
                representations = activations
                if first_half_only:
                    representations = torch.chunk(representations, chunks=2,
                                                  dim=-1)[0]
                elif second_half_only:
                    representations = torch.chunk(representations, chunks=2,
                                                  dim=-1)[1]

                representations_l.append(representations)

            # update
            model_name = "{model}_{layer}".format(model=fname2mname(fname),
                                                  layer=layer)
            # Concatenate along the first axis before touching the dicts, so a
            # failure here does not leave a half-written entry behind.
            layer_representations = torch.cat(representations_l)
            # Record the width that is actually stored: it is smaller than
            # `num_neurons` when one of the *_half_only flags is set.
            num_neurons_d[model_name] = layer_representations.shape[-1]
            representations_d[model_name] = layer_representations

2it [00:10,  6.58s/it]


In [6]:
# Show the neuron count recorded for each "<model>_<layer>" key.
num_neurons_d

{'elmo_original_0': 1024,
 'elmo_original_1': 1024,
 'elmo_original_2': 1024,
 'calypso_transformer_6_512_base_-1': 1024}

In [7]:
# Show the concatenated representation tensor stored under each
# "<model>_<layer>" key (the printed repr is long).
representations_d

{'elmo_original_0': tensor([[-1.4411e-01,  1.0574e+00, -8.1262e-02,  ...,  7.3613e-01,
           9.2834e-02, -1.3002e-01],
         [ 6.6604e-04, -2.5411e-01, -6.2755e-01,  ..., -6.3302e-02,
           1.8540e-01, -1.7286e-01],
         [ 1.9155e-01,  2.2999e-01, -2.8944e-01,  ..., -6.4465e-02,
           5.8102e-01,  2.1768e-01],
         ...,
         [-1.0209e-01, -1.4119e-01,  3.2245e-01,  ...,  5.7759e-01,
           9.0413e-01, -2.5007e-01],
         [-8.8715e-01, -2.0040e-01, -1.0601e+00,  ..., -2.6555e-01,
           2.1146e-01,  1.9773e-01],
         [-3.1370e-01,  3.0314e-01, -1.9021e-02,  ..., -8.7148e-01,
          -4.1681e-01,  3.5492e-01]]),
 'elmo_original_1': tensor([[-1.2613e-01,  1.5737e-01, -1.3427e-01,  ..., -8.5516e-01,
           3.7451e-01,  1.5019e-01],
         [-9.6667e-01, -7.0480e-01, -3.1984e-01,  ..., -5.6957e-01,
          -3.0502e-01, -8.0291e-02],
         [-6.4364e-01, -7.3775e-01, -1.2226e-01,  ..., -1.4182e-01,
           2.7240e-01,  3.8422e-01],
 