In [None]:
from collections import Counter
from itertools import chain
import mmh3
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from htools import hdir

In [None]:
# NOTE(review): `sents` is first assigned in a later cell, so this cell raises
# NameError on Restart & Run All. The expression only builds a lazy chain
# object; wrap it in list(...) to actually display the flattened tokens.
chain.from_iterable(row.split(' ') for row in sents)

<itertools.chain at 0x1215dc310>

In [None]:
# Toy corpus used throughout the notebook: three short sentences and one
# integer class label per sentence (labels[i] goes with sents[i]).
sents = [
    'I walked to the store so I hope it is not closed.',
    'The theater is closed today and the sky is grey.',
    'His dog is brown while hers is grey.'
]
labels = [0, 1, 1]

For now, just convert each int to a str and hash the string. Another option, designed specifically for ints, is Knuth's multiplicative method:

$$\mathrm{hash}(i) = (i \cdot 2654435761) \bmod 2^{32}$$

In [None]:
def hash_int(x, n_buckets, n_hashes=3):
    """Hash an integer into `n_hashes` bucket indices.

    The integer is converted to its string form and hashed once per seed
    (seeds 0, 1, ..., n_hashes - 1) with `mmh3.hash(..., signed=False)`.
    Each hash value is then reduced modulo `n_buckets`.

    Parameters
    ----------
    x: int
        Value to hash (e.g. a word index).
    n_buckets: int
        Modulus applied to every hash value.
    n_hashes: int
        Number of seeded hashes to compute.

    Returns
    -------
    list[int]: One bucket index per seed, in seed order.
    """
    assert isinstance(x, int), 'Input `x` must have type int.'
    key = str(x)
    buckets = []
    for seed in range(n_hashes):
        buckets.append(mmh3.hash(key, seed, signed=False) % n_buckets)
    return buckets

In [None]:
def hash_int_tensor(x_2d, n_buckets, n_hashes=3):
    """Apply `hash_int` to every element of a 2D batch of integer indices.

    Parameters
    ----------
    x_2d: torch.LongTensor or sequence of rows
        Word indices laid out as (bs x seq_len). Besides a 2D tensor, a list
        of 1D tensors or a nested list of Python ints is accepted.
    n_buckets: int
        Passed through to `hash_int` (modulus for each hash).
    n_hashes: int
        Passed through to `hash_int` (number of hashes per index).

    Returns
    -------
    torch.LongTensor: Hashed indices with shape (bs x seq_len x n_hashes).
        When `x_2d` is a tensor the result is created on the same device.
    """
    # Build the result where the input lives so it can be used directly as
    # indices alongside it; non-tensor inputs fall back to the default device.
    device = x_2d.device if isinstance(x_2d, torch.Tensor) else None

    hashed = []
    for row in x_2d:
        # Tensor/array rows are converted to Python scalars in one call;
        # plain sequences are iterated as they are.
        values = row.tolist() if hasattr(row, 'tolist') else row
        hashed.append([
            hash_int(v.item() if hasattr(v, 'item') else v, n_buckets, n_hashes)
            for v in values
        ])

    # An explicit integer dtype keeps the result usable as indices even when
    # the input is empty (torch.tensor([]) would otherwise default to float32).
    return torch.tensor(hashed, dtype=torch.long, device=device)

In [None]:
# Show every row of the batch twice: first as a tensor, then as a plain
# Python list of ints.
# NOTE(review): `x` is assigned further down (from the DataLoader cell), so
# this cell depends on out-of-order execution.
for row in x:
    print(row, end='\n\n')
    print(row.tolist())

tensor([ 2,  5,  6,  3,  7,  8,  2,  9, 10,  1])

[2, 5, 6, 3, 7, 8, 2, 9, 10, 1]
tensor([13, 14,  1, 15, 16, 17,  3, 18,  1,  4])

[13, 14, 1, 15, 16, 17, 3, 18, 1, 4]


In [None]:
hash_int_tensor(x, 11)

tensor([[[ 8,  2,  7],
         [ 2,  8,  1],
         [ 6,  6, 10],
         [10,  5,  5],
         [ 6,  9,  7],
         [ 5,  9,  4],
         [ 8,  2,  7],
         [ 5, 10,  8],
         [ 7,  8,  6],
         [ 6, 10,  6]],

        [[ 2,  5,  1],
         [ 9,  8,  8],
         [ 6, 10,  6],
         [ 8,  1, 10],
         [ 6,  3,  9],
         [ 2,  6,  9],
         [10,  5,  5],
         [10,  5,  0],
         [ 6, 10,  6],
         [ 4,  8,  6]]])

In [None]:
hash_int_tensor(x[0, None], 13)

tensor([[[10,  9,  8],
         [ 8,  1,  0],
         [ 4,  4,  6],
         [ 5, 12,  6],
         [ 7,  3,  2],
         [ 0,  4,  7],
         [10,  9,  8],
         [ 6,  2,  8],
         [ 0,  1,  0],
         [11,  9,  0]]])

In [None]:
x

tensor([[ 2,  5,  6,  3,  7,  8,  2,  9, 10,  1],
        [13, 14,  1, 15, 16, 17,  3, 18,  1,  4]])

In [None]:
[n.item() for n in x[0]]

[2, 5, 6, 3, 7, 8, 2, 9, 10, 1]

In [None]:
[hash_int(n.item(), 11) for n in x[0]]

[[8, 2, 7],
 [2, 8, 1],
 [6, 6, 10],
 [10, 5, 5],
 [6, 9, 7],
 [5, 9, 4],
 [8, 2, 7],
 [5, 10, 8],
 [7, 8, 6],
 [6, 10, 6]]

In [None]:
for i in range(0, 200, 17):
    print(hash_int(i, 11))

[9, 8, 1]
[2, 6, 9]
[8, 2, 7]
[2, 5, 7]
[10, 10, 2]
[0, 10, 4]
[3, 8, 4]
[6, 5, 0]
[6, 10, 8]
[4, 4, 0]
[7, 1, 0]
[10, 4, 6]


In [None]:
class Data(Dataset):
    """Tiny text dataset: whitespace-tokenised sentences encoded as
    fixed-length rows of word indices, paired with labels.

    Index 0 is used both for padding and for words missing from the
    vocabulary; real words are numbered from 1 in order of decreasing
    frequency.
    """

    def __init__(self, sentences, labels, seq_len):
        """
        Parameters
        ----------
        sentences: iterable of str
            Raw sentences. Each is split on single spaces.
        labels: sequence
            One label per sentence; converted with `torch.tensor`.
        seq_len: int
            Number of tokens kept per sentence. Longer sentences are
            truncated, shorter ones are right-padded with 0.

        Raises
        ------
        ValueError: If the number of labels differs from the number of
            sentences.
        """
        tok_rows = [s.split(' ') for s in sentences]
        self.w2i = self.make_w2i(tok_rows)
        self.seq_len = seq_len
        self.x = self.encode(tok_rows)
        self.y = torch.tensor(labels)
        # __len__ and __getitem__ assume x and y line up row for row; fail
        # early instead of silently dropping rows or raising mid-epoch.
        if len(self.x) != len(self.y):
            raise ValueError(
                f'Got {len(self.x)} sentences but {len(self.y)} labels.'
            )

    def __getitem__(self, i):
        """Return the (encoded sentence, label) pair at position `i`."""
        return self.x[i], self.y[i]

    def __len__(self):
        """Number of examples (taken from the label tensor)."""
        return len(self.y)

    def make_w2i(self, tok_rows):
        """Build the word -> index mapping.

        Words are ranked by frequency over all rows (most common first) and
        numbered starting at 1, leaving 0 free for padding/unknown words.
        """
        counts = Counter(chain.from_iterable(tok_rows))
        return {word: i for i, (word, _) in enumerate(counts.most_common(), 1)}

    def encode(self, tok_rows):
        """Convert tokenised rows into a (n_rows x seq_len) int64 tensor.

        Each row is truncated to `seq_len` tokens; unused trailing positions
        stay 0 and unknown words are mapped to 0 as well.
        """
        # Explicit int64: plain `int` maps to a platform-dependent numpy
        # dtype, and a LongTensor is what index-based layers expect.
        enc = np.zeros((len(tok_rows), self.seq_len), dtype=np.int64)
        for i, row in enumerate(tok_rows):
            idx = [self.w2i.get(w, 0) for w in row[:self.seq_len]]
            enc[i, :len(idx)] = idx
        return torch.tensor(enc)

In [None]:
ds = Data(sents, labels, 10)
ds[1]

(tensor([13, 14,  1, 15, 16, 17,  3, 18,  1,  4]), tensor(1))

In [None]:
dl = DataLoader(ds, batch_size=2)
x, y = next(iter(dl))
x, y

(tensor([[ 2,  5,  6,  3,  7,  8,  2,  9, 10,  1],
         [13, 14,  1, 15, 16, 17,  3, 18,  1,  4]]), tensor([0, 1]))

In [None]:
x.shape

torch.Size([2, 10])

In [None]:
ds.x

tensor([[ 2,  5,  6,  3,  7,  8,  2,  9, 10,  1],
        [13, 14,  1, 15, 16, 17,  3, 18,  1,  4],
        [19, 20,  1, 21, 22, 23,  1,  4,  0,  0]])

In [None]:
ds.w2i

{'is': 1,
 'I': 2,
 'the': 3,
 'grey.': 4,
 'walked': 5,
 'to': 6,
 'store': 7,
 'so': 8,
 'hope': 9,
 'it': 10,
 'not': 11,
 'closed.': 12,
 'The': 13,
 'theater': 14,
 'closed': 15,
 'today': 16,
 'and': 17,
 'sky': 18,
 'His': 19,
 'dog': 20,
 'brown': 21,
 'while': 22,
 'hers': 23}

# To do:
- maybe handle padding differently (i.e. just a row of zeros as usual. Should this happen in hash_int, hash_int_tensor, or BloomEmbedding?)
- maybe use fastai embedding() instead of nn.Embedding? Check if the weight init method might help for this.
- maybe use `nn.EmbeddingBag`?
- experiment with different numbers of embeddings and hashes. Try to find guidelines for what reasonable choices are to prevent collisions. 
    - Eventually, maybe better to let user input vocab size and choose prob of collision, then automatically select values for n_emb and n_hashes. 
- check to make sure indices are working correctly after switching hash_int_tensor to take only 2d tensors. Also consider if this is the preferred way to do this.
- should we let the user choose between mean and sum? Wonder if mean would be better because we could try different values of n_hashes while still loading pre-trained embeddings (since the scale is standardized)? But that probably doesn't work because the hashes will be different anyway.

In [None]:
class BloomEmbedding(nn.Module):
    """Bloom Embedding layer for memory-efficient word representations.

    Each word index is hashed `n_hashes` times and encoded as the sum of the
    selected rows of a small embedding matrix. The number of rows can
    therefore be far lower than the number of words in our vocabulary while
    still giving most words a distinct representation.

    Collisions are still possible: because rows are summed, two words whose
    hashes select the same rows (in any order) receive identical vectors.
    Raising `n_emb` and/or `n_hashes` makes this less likely.

    The reduction in rows allows us to use memory in other ways:
    a larger embedding dimension, more or larger layers after the embedding,
    or larger batch sizes.
    """

    def __init__(self, n_emb=997, emb_dim=100, n_hashes=3):
        """
        Parameters
        ----------
        n_emb: int
            Number of rows to create in the embedding matrix. A prime
            number is recommended. Lower numbers will be more
            memory-efficient but increase the chances of collisions.
        emb_dim: int
            Size of each embedding. If emb_dim=100, each word will
            be represented by a 100-dimensional vector.
        n_hashes: int
            This determines the number of hashes that will be taken
            for each word index, and as a result, the number of rows
            that will be summed to create each representation.
            The higher the number, the lower the chances of a collision.
        """
        super().__init__()
        self.n_emb = n_emb
        self.emb = nn.Embedding(n_emb, emb_dim)
        self.n_hashes = n_hashes

    def forward(self, x):
        """
        Parameters
        ----------
        x: torch.LongTensor
            Input tensor of word indices. (bs x seq_len)

        Returns
        -------
        torch.FloatTensor: Words encoded with combination of embeddings.
            (bs x seq_len x emb_dim)
        """
        # (bs, seq_len, n_hashes)
        hashed = hash_int_tensor(x, self.n_emb, self.n_hashes)
        # The hashed indices may not live on the same device as the weights
        # (e.g. after `.to(device)` on the module); align them before lookup.
        hashed = hashed.to(self.emb.weight.device)
        # (bs, seq_len, n_hashes, emb_dim) -> (bs, seq_len, emb_dim)
        return self.emb(hashed).sum(-2)

In [None]:
be = BloomEmbedding(11, 4)
be.emb.weight

Parameter containing:
tensor([[ 0.9593,  1.0518,  0.6034, -1.1845],
        [ 2.6107,  0.8535,  0.1414,  2.5509],
        [ 0.1232, -0.9368, -1.1105, -0.5421],
        [-0.8812, -0.6248,  0.1107,  1.3015],
        [-0.8906,  0.4898, -0.2640,  0.9300],
        [ 0.0312, -0.0197, -0.4688,  0.6815],
        [-1.2531, -1.1531,  0.2033,  0.0260],
        [-0.7790, -0.6460, -1.5369, -1.4178],
        [-0.2901,  0.2190, -1.0544, -0.3004],
        [-1.7659,  0.6724, -0.5916, -1.2079],
        [-1.1231,  0.2622, -1.1394, -0.7264]], requires_grad=True)

In [None]:
x

tensor([[ 2,  5,  6,  3,  7,  8,  2,  9, 10,  1],
        [13, 14,  1, 15, 16, 17,  3, 18,  1,  4]])

In [None]:
hash_int(2, 11)

[8, 2, 7]

In [None]:
for i in range(24):
    print(hash_int(i, 11))

[9, 8, 1]
[6, 10, 6]
[8, 2, 7]
[10, 5, 5]
[4, 8, 6]
[2, 8, 1]
[6, 6, 10]
[6, 9, 7]
[5, 9, 4]
[5, 10, 8]
[7, 8, 6]
[7, 9, 2]
[1, 7, 3]
[2, 5, 1]
[9, 8, 8]
[8, 1, 10]
[6, 3, 9]
[2, 6, 9]
[10, 5, 0]
[2, 10, 6]
[1, 7, 8]
[2, 7, 1]
[9, 4, 8]
[5, 3, 3]


In [None]:
hash_int(2, 11)

[8, 2, 7]

In [None]:
y = be(x)
y.shape

torch.Size([2, 10, 4])

In [None]:
y[0]

tensor([[-0.9459, -1.3638, -3.7018, -2.2603],
        [ 2.4437,  0.1357, -2.0235,  1.7084],
        [-3.6293, -2.0441, -0.7328, -0.6743],
        [-1.0606,  0.2228, -2.0770,  0.6366],
        [-3.7980, -1.1267, -1.9251, -2.5997],
        [-2.6253,  1.1425, -1.3244,  0.4036],
        [-0.9459, -1.3638, -3.7018, -2.2603],
        [-1.3820,  0.4614, -2.6626, -0.3453],
        [-2.3222, -1.5802, -2.3880, -1.6921],
        [-3.6293, -2.0441, -0.7328, -0.6743]], grad_fn=<SelectBackward>)

In [None]:
x

tensor([[ 2,  5,  6,  3,  7,  8,  2,  9, 10,  1],
        [13, 14,  1, 15, 16, 17,  3, 18,  1,  4]])

In [None]:
be([ds[1][0]])

tensor([[[ 2.7651, -0.1030, -1.4379,  2.6902],
         [-2.3462,  1.1103, -2.7004, -1.8087],
         [-3.6293, -2.0441, -0.7328, -0.6743],
         [ 1.1975,  1.3346, -2.0524,  1.5241],
         [-3.9003, -1.1055, -0.2776,  0.1196],
         [-2.8959, -1.4175, -1.4988, -1.7240],
         [-1.0606,  0.2228, -2.0770,  0.6366],
         [-0.1325,  1.2943, -1.0048, -1.2294],
         [-3.6293, -2.0441, -0.7328, -0.6743],
         [-2.4338, -0.4444, -1.1151,  0.6557]]], grad_fn=<SumBackward1>)

In [None]:
be([ds[2][0]])

tensor([[[-2.2530, -1.8277, -2.0466, -1.2425],
         [ 1.5416,  0.4265, -2.4499,  0.8327],
         [-3.6293, -2.0441, -0.7328, -0.6743],
         [ 1.9549, -0.7293, -2.5060,  0.5909],
         [-2.9467,  1.3812, -1.9100, -0.5783],
         [-1.7311, -1.2693, -0.2474,  3.2845],
         [-3.6293, -2.0441, -0.7328, -0.6743],
         [-2.4338, -0.4444, -1.1151,  0.6557],
         [ 0.5547,  1.7449, -1.5046,  1.0425],
         [ 0.5547,  1.7449, -1.5046,  1.0425]]], grad_fn=<SumBackward1>)

In [None]:
for w, i in ds.w2i.items():
    print(w, i, be(torch.tensor([[i]])).detach().numpy())
#           .emb.weight[hash_int(i, be.n_emb)])

is 1 [[[-3.629335   -2.0440652  -0.73281133 -0.67432237]]]
I 2 [[[-0.9459374 -1.3638294 -3.701799  -2.260304 ]]]
the 3 [[[-1.0606176   0.22278604 -2.076974    0.6365739 ]]]
grey. 4 [[[-2.4338439  -0.44435126 -1.1151254   0.6556832 ]]]
walked 5 [[[ 2.4437425  0.1356715 -2.0235345  1.7083521]]]
to 6 [[[-3.629335   -2.0440652  -0.73281133 -0.6743224 ]]]
store 7 [[[-3.7980285 -1.1267047 -1.925149  -2.599691 ]]]
so 8 [[[-2.6252909   1.1425201  -1.3243927   0.40356058]]]
hope 9 [[[-1.3819773   0.46144292 -2.6625922  -0.34529456]]]
it 10 [[[-2.322223  -1.5801504 -2.3879628 -1.6921202]]]
not 11 [[[-2.421743  -0.9103837 -3.2389848 -3.1678748]]]
closed. 12 [[[ 0.9505731 -0.4172793 -1.2848289  2.434585 ]]]
The 13 [[[ 2.7651021  -0.10298538 -1.4379164   2.6902206 ]]]
theater 14 [[[-2.3461998  1.1103309 -2.700363  -1.8086885]]]
closed 15 [[[ 1.1975132  1.3346475 -2.0524316  1.5240704]]]
today 16 [[[-3.900252   -1.105511   -0.27760884  0.11962378]]]
and 17 [[[-2.8959124 -1.4175131 -1.4987998 -1.7240

In [None]:
torch.tensor([[i]]).shape

torch.Size([1, 1])

In [None]:
hash_int(23, 11,  3)

[5, 3, 3]

In [None]:
# Pass a (1 x 1) LongTensor, matching the (bs x seq_len) tensor input that
# hash_int_tensor is written for.
hashed = hash_int_tensor(torch.tensor([[23]]), 11, 3)
hashed

tensor([[[5, 3, 3]]])

In [None]:
be.emb.weight[hashed]

tensor([[[[-0.8263,  0.3004, -0.2202, -0.6845],
          [-0.1281,  1.5147,  0.0252,  0.3402],
          [-0.1281,  1.5147,  0.0252,  0.3402]]]], grad_fn=<IndexBackward>)

In [None]:
# Pass a (1 x 1) LongTensor, matching the (bs x seq_len) tensor input that
# hash_int_tensor is written for.
hashed = hash_int_tensor(torch.tensor([[4]]), 11, 3)
print('shape:', hashed.shape)
hashed

shape: torch.Size([1, 1, 3])


tensor([[[4, 8, 6]]])

In [None]:
be.emb.weight[hashed]

tensor([[[[-0.7247,  0.6117,  1.5340,  1.1633],
          [-1.0695,  0.5586,  0.8308, -1.2400],
          [-0.4010,  1.0211,  0.6487,  1.4627]]]], grad_fn=<IndexBackward>)

In [None]:
hash_int(2, 11)

[8, 2, 7]

In [None]:
be.emb.weight

Parameter containing:
tensor([[ 4.5428e+00, -1.5926e+00, -1.8052e+00, -4.5093e-01],
        [-1.2740e+00,  6.9280e-01, -1.6181e-01,  1.0787e+00],
        [-5.7357e-01, -3.9490e-01, -2.5627e-01,  5.3629e-01],
        [-4.6838e-02, -3.5406e-01,  6.5238e-01, -1.2767e+00],
        [-3.7521e-01,  5.7965e-05, -2.1415e-01,  7.3730e-01],
        [-1.0251e+00, -1.0557e+00,  1.0477e-01, -6.6462e-01],
        [ 2.6104e-01, -1.6422e+00,  1.5243e+00, -4.5074e-01],
        [ 1.7178e+00, -4.8310e-01,  3.5616e-01, -2.4185e+00],
        [ 8.9364e-01, -2.7994e+00, -1.5495e+00, -1.0016e+00],
        [-2.1734e-01, -6.0759e-01, -1.1346e+00, -6.1147e-01],
        [-3.1615e-01, -9.3613e-01,  6.4561e-01,  1.6669e+00]],
       requires_grad=True)

In [None]:
# forward() documents a (bs x seq_len) LongTensor input, so wrap the index.
be(torch.tensor([[2]]))

tensor([[[8, 2, 7]]])

tensor([[[[ 0.8936, -2.7994, -1.5495, -1.0016],
          [-0.5736, -0.3949, -0.2563,  0.5363],
          [ 1.7178, -0.4831,  0.3562, -2.4185]]]], grad_fn=<EmbeddingBackward>)


tensor([[[ 2.0378, -3.6774, -1.4496, -2.8839]]], grad_fn=<SumBackward1>)

In [None]:
be(torch.tensor([[2, 5]]))

tensor([[[-0.9459, -1.3638, -3.7018, -2.2603],
         [ 2.4437,  0.1357, -2.0235,  1.7084]]], grad_fn=<SumBackward1>)

In [None]:
y

tensor([[[-0.9459, -1.3638, -3.7018, -2.2603],
         [ 2.4437,  0.1357, -2.0235,  1.7084],
         [-3.6293, -2.0441, -0.7328, -0.6743],
         [-1.0606,  0.2228, -2.0770,  0.6366],
         [-3.7980, -1.1267, -1.9251, -2.5997],
         [-2.6253,  1.1425, -1.3244,  0.4036],
         [-0.9459, -1.3638, -3.7018, -2.2603],
         [-1.3820,  0.4614, -2.6626, -0.3453],
         [-2.3222, -1.5802, -2.3880, -1.6921],
         [-3.6293, -2.0441, -0.7328, -0.6743]],

        [[ 2.7651, -0.1030, -1.4379,  2.6902],
         [-2.3462,  1.1103, -2.7004, -1.8087],
         [-3.6293, -2.0441, -0.7328, -0.6743],
         [ 1.1975,  1.3346, -2.0524,  1.5241],
         [-3.9003, -1.1055, -0.2776,  0.1196],
         [-2.8959, -1.4175, -1.4988, -1.7240],
         [-1.0606,  0.2228, -2.0770,  0.6366],
         [-0.1325,  1.2943, -1.0048, -1.2294],
         [-3.6293, -2.0441, -0.7328, -0.6743],
         [-2.4338, -0.4444, -1.1151,  0.6557]]], grad_fn=<SumBackward1>)