In [2]:
import torch
import json
import h5py
from tqdm import tqdm
from os.path import basename, dirname

In [3]:
# Set arguments arbitrarily
limit = 10000
layerspec_l = [
    "full", 
    -1, 
]
first_half_only_l = [
    False, 
    False,
]
second_half_only_l = [
    False,
    False
]
representation_fname_l = [
    "/data/sls/temp/belinkov/contextual-corr-analysis/contextualizers/elmo_original/ptb_pos_dev.hdf5",
    "/data/sls/temp/belinkov/contextual-corr-analysis/contextualizers/calypso_transformer_6_512_base/ptb_pos_dev.hdf5"
]

In [3]:
def fname2mname(fname):
    """
    "filename to model name". 
    """
    return '-'.join([basename(dirname(fname)), basename(fname)])

In [4]:
num_neurons_d = {} 
representations_d = {} 

# for fname in ... loop

In [5]:
# loop variables
ix = 0
layerspec = layerspec_l[ix]
first_half_only = first_half_only_l[ix]
second_half_only = second_half_only_l[ix]
fname = representation_fname_l[ix]

In [6]:
# Set `activations_h5`, `sentence_d`, `indices`
activations_h5 = h5py.File(fname, 'r')
sentence_d = json.loads(activations_h5['sentence_to_index'][0])
temp = {} # TO DO: Make this more elegant?
for k, v in sentence_d.items():
    temp[v] = k
sentence_d = temp # {str ix, sentence}
indices = list(sentence_d.keys())[:limit]

In [7]:
# Set `num_layers`, `num_neurons`, `layers`
s = activations_h5[indices[0]].shape
num_layers = 1 if len(s)==2 else s[0]
num_neurons = s[-1]
if layerspec == "all":
    layers = list(range(num_layers))
elif layerspec == "full":
    layers = ["full"]
else:
    layers = [layerspec]

In [9]:
# Set `num_neurons_d`, `representations_d`
for layer in layers:
    # Create `representations_l`
    representations_l = []
    word_count = 0
    for sentence_ix in indices: 
        # Set `dim`, `n_word`, update `word_count`
        shape = activations_h5[sentence_ix].shape
        dim = len(shape)
        if not (dim == 2 or dim == 3):
            raise ValueError('Improper array dimension in file: ' +
                             fname + "\nShape: " +
                             str(activations_h5[sentence_ix].shape))
        if dim == 3:
            n_word = shape[1]
        elif dim == 2:
            n_word = shape[0]
        word_count += n_word
        
        # Create `activations`
        if layer == "full":
            activations = torch.FloatTensor(activations_h5[sentence_ix])
            if dim == 3:
                activations = activations.permute(1, 0, 2)
                activations = activations.contiguous().view(n_word, -1)
        else:
            activations = torch.FloatTensor(activations_h5[sentence_ix][layer] if dim==3 
                                                else activations_h5[sentence_ix])

        # Create `representations`
        representations = activations
        if first_half_only: 
            representations = torch.chunk(representations, chunks=2,
                                          dim=-1)[0]
        elif second_half_only:
            representations = torch.chunk(representations, chunks=2,
                                          dim=-1)[1]

        representations_l.append(representations)
        
        # If we've loaded in enough words already, stop
        if limit is not None and word_count >= limit:
            break
    
    # update
    model_name = "{model}_{layer}".format(model=fname2mname(fname), 
                                          layer=layer)
    num_neurons_d[model_name] = representations_l[0].size()[-1]
    representations_d[model_name] = torch.cat(representations_l)[:limit]

In [13]:
# full
for loop_var in tqdm(zip(representation_fname_l, layerspec_l,
                         first_half_only_l, second_half_only_l)):
    fname, layerspec, first_half_only, second_half_only = loop_var

    # Set `activations_h5`, `sentence_d`, `indices`
    activations_h5 = h5py.File(fname, 'r')
    sentence_d = json.loads(activations_h5['sentence_to_index'][0])
    temp = {} # TO DO: Make this more elegant?
    for k, v in sentence_d.items():
        temp[v] = k
    sentence_d = temp # {str ix, sentence}
    indices = list(sentence_d.keys())[:limit]

    # Set `num_layers`, `num_neurons`, `layers`
    s = activations_h5[indices[0]].shape
    num_layers = 1 if len(s)==2 else s[0]
    num_neurons = s[-1]
    if layerspec == "all":
        layers = list(range(num_layers))
    elif layerspec == "full":
        layers = ["full"]
    else:
        layers = [layerspec]

    # Set `num_neurons_d`, `representations_d`
    for layer in layers:
        # Create `representations_l`
        representations_l = []
        word_count = 0
        for sentence_ix in indices: 
            # Set `dim`, `n_word`, update `word_count`
            shape = activations_h5[sentence_ix].shape
            dim = len(shape)
            if not (dim == 2 or dim == 3):
                raise ValueError('Improper array dimension in file: ' +
                                 fname + "\nShape: " +
                                 str(activations_h5[sentence_ix].shape))
            if dim == 3:
                n_word = shape[1]
            elif dim == 2:
                n_word = shape[0]
            word_count += n_word

            # Create `activations`
            if layer == "full":
                activations = torch.FloatTensor(activations_h5[sentence_ix])
                if dim == 3:
                    activations = activations.permute(1, 0, 2)
                    activations = activations.contiguous().view(n_word, -1)
            else:
                activations = torch.FloatTensor(activations_h5[sentence_ix][layer] if dim==3 
                                                    else activations_h5[sentence_ix])

            # Create `representations`
            representations = activations
            if first_half_only: 
                representations = torch.chunk(representations, chunks=2,
                                              dim=-1)[0]
            elif second_half_only:
                representations = torch.chunk(representations, chunks=2,
                                              dim=-1)[1]

            representations_l.append(representations)

            # If we've loaded in enough words already, stop
            if limit is not None and word_count >= limit:
                break

        # update
        model_name = "{model}_{layer}".format(model=fname2mname(fname), 
                                              layer=layer)
        num_neurons_d[model_name] = representations_l[0].size()[-1]
        representations_d[model_name] = torch.cat(representations_l)[:limit]

2it [00:20,  7.32s/it]


# Full function

In [6]:
def load_representations(representation_fname_l, limit=None,
                         layerspec_l=None, first_half_only_l=False,
                         second_half_only_l=False):
    def fname2mname(fname):
        """
        "filename to model name". 
        """
        return '-'.join([basename(dirname(fname)), basename(fname)])

    num_neurons_d = {} 
    representations_d = {} 
    
    for loop_var in tqdm(zip(representation_fname_l, layerspec_l,
                             first_half_only_l, second_half_only_l)):
        fname, layerspec, first_half_only, second_half_only = loop_var

        # Set `activations_h5`, `sentence_d`, `indices`
        activations_h5 = h5py.File(fname, 'r')
        sentence_d = json.loads(activations_h5['sentence_to_index'][0])
        temp = {} # TO DO: Make this more elegant?
        for k, v in sentence_d.items():
            temp[v] = k
        sentence_d = temp # {str ix, sentence}
        indices = list(sentence_d.keys())[:limit]

        # Set `num_layers`, `num_neurons`, `layers`
        s = activations_h5[indices[0]].shape
        num_layers = 1 if len(s)==2 else s[0]
        num_neurons = s[-1]
        if layerspec == "all":
            layers = list(range(num_layers))
        elif layerspec == "full":
            layers = ["full"]
        else:
            layers = [layerspec]

        # Set `num_neurons_d`, `representations_d`
        for layer in layers:
            # Create `representations_l`
            representations_l = []
            word_count = 0
            for sentence_ix in indices: 
                # Set `dim`, `n_word`, update `word_count`
                shape = activations_h5[sentence_ix].shape
                dim = len(shape)
                if not (dim == 2 or dim == 3):
                    raise ValueError('Improper array dimension in file: ' +
                                     fname + "\nShape: " +
                                     str(activations_h5[sentence_ix].shape))
                if dim == 3:
                    n_word = shape[1]
                elif dim == 2:
                    n_word = shape[0]
                word_count += n_word

                # Create `activations`
                if layer == "full":
                    activations = torch.FloatTensor(activations_h5[sentence_ix])
                    if dim == 3:
                        activations = activations.permute(1, 0, 2)
                        activations = activations.contiguous().view(n_word, -1)
                else:
                    activations = torch.FloatTensor(activations_h5[sentence_ix][layer] if dim==3 
                                                        else activations_h5[sentence_ix])

                # Create `representations`
                representations = activations
                if first_half_only: 
                    representations = torch.chunk(representations, chunks=2,
                                                  dim=-1)[0]
                elif second_half_only:
                    representations = torch.chunk(representations, chunks=2,
                                                  dim=-1)[1]

                representations_l.append(representations)

                # If we've loaded in enough words already, stop
                if limit is not None and word_count >= limit:
                    break

            # update
            model_name = "{model}_{layer}".format(model=fname2mname(fname), 
                                                  layer=layer)
            num_neurons_d[model_name] = representations_l[0].size()[-1]
            representations_d[model_name] = torch.cat(representations_l)[:limit]   
    
    return num_neurons_d, representations_d

In [7]:
load_representations(representation_fname_l, limit=10000, layerspec_l=layerspec_l, first_half_only_l=first_half_only_l,
                         second_half_only_l=second_half_only_l)

2it [00:03,  2.04s/it]


({'elmo_original-ptb_pos_dev.hdf5_full': 3072,
  'calypso_transformer_6_512_base-ptb_pos_dev.hdf5_-1': 1024},
 {'elmo_original-ptb_pos_dev.hdf5_full': tensor([[-1.4411e-01,  1.0574e+00, -8.1262e-02,  ..., -1.7420e+00,
            2.0556e-01,  3.4890e-01],
          [ 6.6604e-04, -2.5411e-01, -6.2755e-01,  ..., -1.0047e+00,
           -2.6278e-01,  2.1131e-01],
          [ 1.9155e-01,  2.2999e-01, -2.8944e-01,  ..., -8.4425e-01,
            8.9332e-01,  7.6619e-01],
          ...,
          [-2.0288e-02,  1.6141e-01,  3.3374e-01,  ...,  1.0572e+00,
            1.0167e+00, -1.1895e-01],
          [ 1.1309e-01,  1.5746e-01,  1.4142e-01,  ...,  3.0812e-01,
           -3.9564e-01,  4.3130e-01],
          [ 2.4991e-01,  4.4100e-01,  5.2294e-02,  ..., -1.4741e+00,
           -4.3560e-02,  7.0020e-01]]),
  'calypso_transformer_6_512_base-ptb_pos_dev.hdf5_-1': tensor([[  1.0387,  -3.4095,  -1.5543,  ...,   4.0005,   2.9103,  -2.1940],
          [-10.8572,   2.4143,  -0.7933,  ...,   2.0335,   4

In [None]:
load_representations(representation_fname_l, limit=None, layerspec_l=layerspec_l, first_half_only_l=first_half_only_l,
                         second_half_only_l=second_half_only_l)

0it [00:00, ?it/s]