## LATENT DIRICHLET ALLOCATION

https://medium.com/@lettier/how-does-lda-work-ill-explain-using-emoji-108abf40fa7d
    
https://www.youtube.com/watch?v=DWJYZq_fQ2A

In [1]:
import random
import matplotlib.pyplot as plt
from collections import Counter
import numpy as np

# Synthetic "ground truth" topics used to generate the toy corpus.
# Each topic has an 'id', its vocabulary in 'words' and, index-aligned with it,
# the relative sampling weight of every word in 'weights' (both lists are
# passed to random.choices() by generate_documents()).
# Note that some words ('space', 'debate') belong to more than one topic.
topic1 = {
    'id': 'sports', 
    'words': ['football', 'basketball', 'gol', 'play', 'match', 'space'],
    'weights': [4, 1, 1, 1, 1, 2]
}
topic2 = {
    'id': 'politics', 
    'words': ['president', 'interview', 'twitter', 'television', 'debate', 'space'],
    'weights': [2, 1, 4, 1, 1, 1]
}
topic3 = {
    'id': 'science', 
    'words': ['science', 'molecula', 'debate', 'space'],
    'weights': [1, 1, 1, 1]
}

topics = [topic1, topic2, topic3]


# Alternative topic set (disabled): three words per topic, uniform weights and
# no word shared between topics.
# topic1 = {
#     'id': 'sports', 
#     'words': ['football', 'basketball', 'match'],
#     'weights': [1,1,1]
# }
# topic2 = {
#     'id': 'politics', 
#     'words': ['president', 'interview', 'twitter'],
#     'weights': [1, 1, 1]
# }
# topic3 = {
#     'id': 'science', 
#     'words': ['science', 'molecula', 'space'],
#     'weights': [1, 1, 1]
# }

# Same assignment as above; it only has an effect when the alternative
# definitions just before it are uncommented.
topics = [topic1, topic2, topic3]


# topic1 = {
#     'id': 'dogs', 
#     'words': ['guau', 'guau2'],
#     'weights': [1, 1]
# }
# topic2 = {
#     'id': 'cats', 
#     'words': ['miaw', 'miaw2'],
#     'weights': [1, 1]
# }

# topics = [topic1, topic2]


## Generar documentos 


In [121]:
def generate_documents(topics, num_docs=100, topics_per_doc=2, words_per_doc=1000):
    """Generate a synthetic corpus by sampling words from a mixture of topics.

    Every document picks ``topics_per_doc`` distinct topics, gives each of them
    a random weight (normalised to sum to 1) and then draws ``words_per_doc``
    words: first a topic according to the document weights, then a word of
    that topic according to the topic's own word weights.

    Args:
        topics: list of dicts with keys 'id', 'words' and 'weights'
            ('weights' is index-aligned with 'words').
        num_docs: number of documents to generate.
        topics_per_doc: number of distinct topics mixed in each document.
        words_per_doc: number of words drawn for each document.

    Returns:
        A tuple ``(documents, docs_idx, words_idx, topics_idx)`` where
        ``documents`` is a list of dicts with keys 'id', 'words' and
        'topics_weights' (list of ``(topic_id, weight)`` pairs), ``docs_idx``
        is the list of document ids, ``words_idx`` is the sorted list of the
        distinct words that actually appear in the corpus and ``topics_idx``
        is the list of topic ids.
    """
    documents = []
    for i in range(num_docs):
        # First select the topics this document talks about.
        doc_topics = random.sample(topics, topics_per_doc)

        # Assign a weight to each topic of the document (normalised to sum 1).
        r = [random.random() for _ in range(topics_per_doc)]
        s = sum(r)
        doc_topics_weights = [j / s for j in r]

        # Now sample the topics to generate the document.
        # NOTE: this is an inefficient version for teaching purposes.
        doc_words = []
        for _ in range(words_per_doc):
            topic = random.choices(doc_topics, doc_topics_weights)[0]  # select one topic
            doc_words.append(random.choices(topic['words'], topic['weights'], k=1)[0])  # choose one word

        documents.append({
            'id': i,
            'words': doc_words,
            'topics_weights': [(t['id'], weight) for t, weight in zip(doc_topics, doc_topics_weights)],
        })

    # Build the indexes once, after the whole corpus exists. Sorting makes the
    # word -> row mapping reproducible from run to run (set order is not).
    words_idx = sorted({word for doc in documents for word in doc['words']})
    docs_idx = [doc['id'] for doc in documents]
    topics_idx = [t['id'] for t in topics]
    return documents, docs_idx, words_idx, topics_idx


# Build the working corpus: 100 documents of 100 words, each mixing 2 topics.
docs, docs_idx, words_idx, topics_idx = generate_documents(topics, num_docs=100, topics_per_doc=2, words_per_doc=100)
docs  # display the generated documents

[{'id': 0,
  'words': ['president',
   'football',
   'television',
   'twitter',
   'football',
   'football',
   'president',
   'space',
   'president',
   'football',
   'twitter',
   'twitter',
   'interview',
   'match',
   'debate',
   'space',
   'president',
   'television',
   'interview',
   'football',
   'space',
   'president',
   'basketball',
   'interview',
   'space',
   'twitter',
   'space',
   'twitter',
   'space',
   'twitter',
   'twitter',
   'football',
   'gol',
   'twitter',
   'president',
   'twitter',
   'twitter',
   'interview',
   'interview',
   'play',
   'interview',
   'twitter',
   'interview',
   'play',
   'match',
   'basketball',
   'space',
   'football',
   'football',
   'twitter',
   'president',
   'football',
   'football',
   'president',
   'twitter',
   'match',
   'debate',
   'president',
   'play',
   'match',
   'space',
   'twitter',
   'football',
   'twitter',
   'space',
   'television',
   'football',
   'football',
   'debat

## Generar tablas auxiliares: words vs topics, docs vs topics

In [3]:
def words_vs_topics_matrix(docs, words_idx, ntopics):
    """Count how many times each word is currently assigned to each topic.

    Args:
        docs: list of documents; each must have an 'assignment' list of
            ``{'word': ..., 'topic': ...}`` dicts.
        words_idx: list of the (hashable) vocabulary words; the position of a
            word in this list is its row in the result.
        ntopics: number of topics (columns of the result).

    Returns:
        A float array of shape ``(len(words_idx), ntopics)`` where entry
        ``[w, t]`` is the number of occurrences of word ``w`` assigned to
        topic ``t`` over the whole corpus.

    Raises:
        ValueError: if an assigned word is not present in ``words_idx``.
    """
    # Resolve word -> row once; calling list.index() inside the loop would
    # rescan the whole vocabulary for every single word occurrence.
    row_of = {}
    for j, word in enumerate(words_idx):
        row_of.setdefault(word, j)  # first occurrence wins, like list.index()

    m = np.zeros((len(words_idx), ntopics))
    for doc in docs:
        for a in doc['assignment']:
            word_j = row_of.get(a['word'])
            if word_j is None:
                raise ValueError(f"{a['word']!r} is not in words_idx")
            m[word_j, a['topic']] += 1
    return m


def docs_vs_topics_matrix(docs, ntopics):
    """Count how many words of each document are currently assigned to each topic.

    Args:
        docs: list of documents; each must have an 'assignment' list of
            dicts carrying a 'topic' index.
        ntopics: number of topics (columns of the result).

    Returns:
        A float array of shape ``(len(docs), ntopics)`` where entry ``[d, t]``
        is the number of words of the ``d``-th document (by position in
        ``docs``) assigned to topic ``t``.
    """
    m = np.zeros((len(docs), ntopics))
    # enumerate() gives the row directly. docs.index(doc) would search by
    # equality, which is slow and sends two equal documents to the same row.
    for doc_i, doc in enumerate(docs):
        for a in doc['assignment']:
            m[doc_i, a['topic']] += 1
    return m


def bag_of_words_matrix(docs, words_idx):
    """Build the document/word count matrix ("bag of words") of a corpus.

    Args:
        docs: list of documents; each must have an integer 'id', used as the
            row of the result, and a 'words' list.
        words_idx: list of the (hashable) vocabulary words; the position of a
            word in this list is its column in the result.

    Returns:
        An integer array of shape ``(len(docs), len(words_idx))`` where entry
        ``[d, w]`` is the number of times word ``w`` appears in document ``d``.

    Raises:
        ValueError: if a document contains a word that is not in ``words_idx``.
    """
    # Resolve word -> column once instead of scanning words_idx for every
    # word occurrence with list.index().
    column_of = {}
    for j, word in enumerate(words_idx):
        column_of.setdefault(word, j)  # first occurrence wins, like list.index()

    m = np.zeros((len(docs), len(words_idx)), dtype=int)
    for doc in docs:
        doc_i = doc['id']
        for word in doc['words']:
            word_j = column_of.get(word)
            if word_j is None:
                raise ValueError(f"{word!r} is not in words_idx")
            m[doc_i, word_j] += 1
    return m


def random_assignment(docs, ntopics):
    """Attach a uniformly random initial topic to every word of every document.

    Each document gets a new 'assignment' key: a list, parallel to its
    'words', of ``{'word': ..., 'topic': ...}`` dicts whose topic is drawn
    from ``0 .. ntopics-1``. The documents are modified in place and nothing
    is returned.
    """
    for document in docs:
        tokens = document['words']
        # One vectorised draw per document, one topic per word occurrence.
        drawn_topics = np.random.randint(0, ntopics, len(tokens))
        document['assignment'] = [
            {'word': token, 'topic': topic}
            for token, topic in zip(tokens, drawn_topics)
        ]
    

# Minimal fixture: two documents whose words already carry a topic assignment.
test_docs = [
    {'id': 0, 'assignment': [{'word': 'A', 'topic': 0}, {'word': 'AA', 'topic': 0}]},
    {'id': 1, 'assignment': [{'word': 'B', 'topic': 1}, {'word': 'BB', 'topic': 1}]},
]

# TEST: element-wise comparison with the expected word/topic counts; the
# displayed boolean array should be True everywhere.
words_vs_topics_matrix(test_docs, words_idx=['A', 'AA', 'B', 'BB'], ntopics=2) == np.array([[1,0],[1,0],[0,1],[0,1]])


array([[ True,  True],
       [ True,  True],
       [ True,  True],
       [ True,  True]])

## Dirichlet allocation versión lenta

In [None]:
def prob_w_belongs_t_in_d(words_vs_topics, docs_vs_topics, d, w, t, old_topic, alpha, beta, debug=False):
    """Return the unnormalised probability that one occurrence of word ``w`` in doc ``d`` belongs to topic ``t``.

    P(w belongs to t) = P(word w | topic t) * P(topic t | doc d)
                      = share of word w inside topic t * share of topic t inside doc d

    Both shares are smoothed (``beta`` for words, ``alpha`` for topics) and are
    computed as if the occurrence being resampled had been removed from the
    counts, since its current assignment must not vote for itself.

    Args:
        words_vs_topics: array ``(nwords, ntopics)`` of word/topic counts.
        docs_vs_topics: array ``(ndocs, ntopics)`` of document/topic counts.
        d: row of the document in ``docs_vs_topics``.
        w: row of the word in ``words_vs_topics``.
        t: candidate topic.
        old_topic: topic the occurrence is currently assigned to (its count is
            included in both matrices).
        alpha: smoothing constant of the document/topic share.
        beta: smoothing constant of the word/topic share.
        debug: when true, print the intermediate values and wait for Enter
            (``input()``) after each evaluation.

    Returns:
        The product of the two smoothed shares (a float).
    """
    nwords, ntopics = words_vs_topics.shape

    if debug:
        print("words_vs_topics=\n", words_vs_topics, words_vs_topics[w, t], (w,t))
        print("docs_vs_topics=\n", docs_vs_topics, docs_vs_topics[d, t], (d,t))
        print("t=", t)
        print("d=", d)
        print("w=", w)
        print("old_topic=", old_topic)

    # The occurrence being resampled is currently counted under old_topic, so
    # it only has to be discounted from the per-topic counts of that topic.
    removed = 1 if old_topic == t else 0
    if removed:
        assert words_vs_topics[w, t] > 0
        assert docs_vs_topics[d, t] > 0

    # The word share is smoothed with beta per vocabulary word, hence
    # beta*nwords in the denominator for every candidate topic.
    word_given_topic = (words_vs_topics[w, t] - removed + beta) \
                       / (words_vs_topics[:, t].sum() - removed + beta*nwords)
    # The length of the document without the current occurrence does not
    # depend on the candidate topic, so 1 is always subtracted here.
    topic_given_doc = (docs_vs_topics[d, t] - removed + alpha) \
                      / (docs_vs_topics[d, :].sum() - 1 + alpha*ntopics)

    if debug:
        if removed:
            print("REMOVING OLD ASIGNATION")
        print(word_given_topic)
        print(topic_given_doc)
        input()  # pause so each step can be inspected

    return word_given_topic * topic_given_doc


def dirichlet_allocation(docs, docs_idx, words_idx, ntopics, niter=400, alpha=0.5, beta=0.01, debug=False):
    """Infer a topic for every word occurrence by repeated resampling (slow, didactic version).

    Every word occurrence starts with a random topic. Then, ``niter`` times,
    each occurrence is reassigned to a topic drawn with probability
    proportional to ``prob_w_belongs_t_in_d``. The word/topic and doc/topic
    count matrices are built once and updated after every reassignment, so
    each draw sees the current state of all the other occurrences.

    Args:
        docs: list of documents (dicts with 'id' and 'words'); an 'assignment'
            list is (re)created on each of them and updated in place.
        docs_idx: list of document ids; the position of an id is the row of
            that document in the doc/topic matrix. It is expected to follow
            the order of ``docs``, because ``docs_vs_topics_matrix`` numbers
            rows by position in ``docs``.
        words_idx: list of vocabulary words; position = row in the word/topic
            matrix.
        ntopics: number of topics to infer.
        niter: number of full sweeps over the corpus.
        alpha: smoothing constant of the document/topic share.
        beta: smoothing constant of the word/topic share.
        debug: forwarded to ``prob_w_belongs_t_in_d`` (prints and pauses).

    Returns:
        The same ``docs`` list, with the final 'assignment' of every document.
    """
    random_assignment(docs, ntopics)

    # Resolve word -> row once; list.index() in the innermost loop would
    # rescan the vocabulary for every occurrence in every sweep.
    word_row = {}
    for j, word in enumerate(words_idx):
        word_row.setdefault(word, j)  # first occurrence wins, like list.index()

    # Built once (this also validates that every word is in words_idx) and
    # then kept in sync below instead of being recomputed on every sweep.
    words_vs_topics = words_vs_topics_matrix(docs, words_idx, ntopics)
    docs_vs_topics = docs_vs_topics_matrix(docs, ntopics)

    for _ in range(niter):
        for doc in docs:
            doc_i = docs_idx.index(doc['id'])
            for assign in doc['assignment']:
                word_j = word_row[assign['word']]
                old_topic = assign['topic']
                prob_by_topic = [0] * ntopics  # unnormalized
                for topic_k in range(ntopics):
                    if debug:
                        print(f"==>> doc {doc_i}, word {word_j}, calculating for topic {topic_k} (old topic was {old_topic})")
                    prob_by_topic[topic_k] = prob_w_belongs_t_in_d(
                        words_vs_topics, docs_vs_topics, doc_i, word_j, topic_k,
                        old_topic=old_topic, alpha=alpha, beta=beta, debug=debug)

                new_topic = random.choices(range(ntopics), prob_by_topic, k=1)[0]
                if new_topic != old_topic:
                    # Move this occurrence from its old topic to the new one.
                    words_vs_topics[word_j, old_topic] -= 1
                    words_vs_topics[word_j, new_topic] += 1
                    docs_vs_topics[doc_i, old_topic] -= 1
                    docs_vs_topics[doc_i, new_topic] += 1
                assign['topic'] = new_topic

    return docs


def show_results(docs):
    """Print, for every document, its generating topic weights and its inferred topic shares.

    Two lines are printed per document, both prefixed with the position of the
    document in ``docs``: first the stored 'topics_weights', then the sorted
    list of ``(topic, fraction of the document's words assigned to it)``.
    """
    for position, document in enumerate(docs):
        print(position, document['topics_weights'])
        counts = Counter(entry['topic'] for entry in document['assignment'])
        n_assigned = sum(counts.values(), 0.0)  # float total -> float shares
        shares = [(topic, hits / n_assigned) for topic, hits in counts.items()]
        print(position, sorted(shares))


In [131]:
%%time
# Run 100 sweeps with 3 topics over the corpus, then print for each document
# its generating topic weights next to the inferred topic shares.
docs = dirichlet_allocation(docs, docs_idx, words_idx, ntopics=3, niter=100)
show_results(docs)

0 [('politics', 0.5401744339260784), ('sports', 0.4598255660739216)]
0 [(0, 0.54), (1, 0.46)]
1 [('politics', 0.5864188742826167), ('science', 0.41358112571738326)]
1 [(0, 0.64), (1, 0.01), (2, 0.35)]
2 [('sports', 0.7231207447241729), ('science', 0.27687925527582713)]
2 [(1, 0.63), (2, 0.37)]
3 [('politics', 0.23431654435232518), ('sports', 0.7656834556476748)]
3 [(0, 0.29), (1, 0.71)]
4 [('science', 0.6175851598695579), ('sports', 0.3824148401304421)]
4 [(1, 0.45), (2, 0.55)]
5 [('politics', 0.3659803489588083), ('sports', 0.6340196510411918)]
5 [(0, 0.38), (1, 0.61), (2, 0.01)]
6 [('politics', 0.7816594986813653), ('sports', 0.2183405013186347)]
6 [(0, 0.78), (1, 0.2), (2, 0.02)]
7 [('science', 0.6598811191253253), ('politics', 0.3401188808746747)]
7 [(0, 0.31), (2, 0.69)]
8 [('sports', 0.6339531098157732), ('politics', 0.3660468901842267)]
8 [(0, 0.29), (1, 0.71)]
9 [('politics', 0.21923481213882992), ('science', 0.7807651878611701)]
9 [(0, 0.16), (2, 0.84)]
10 [('politics', 0.3879

## OK, this works but it needs a speed up: let's try to vectorize it


In [40]:
# Tiny corpus used to develop the vectorised formulas step by step.
test_docs = [
    {'id': 0, 'words': ['A', 'AA']},
    {'id': 1, 'words': ['B', 'BB', 'BB', 'C']},
    {'id': 2, 'words': ['C', 'CC', 'CC', 'A']},
]
# NOTE: this rebinds the global words_idx that was returned by
# generate_documents() for the full corpus.
words_idx = ['A', 'AA', 'B', 'BB', 'C', 'CC']
nwords = len(words_idx)
ndocs = len(test_docs)
ntopics = 2
# Smoothing constants used by the Wnorm (beta) and Dnorm (alpha) formulas below.
beta = 0.01
alpha = 0.5

# Give every word occurrence a random initial topic (adds 'assignment' in place).
random_assignment(test_docs, ntopics=ntopics)

# B: docs x words counts, D: docs x topics counts, W: words x topics counts.
B = bag_of_words_matrix(test_docs, words_idx)
D = docs_vs_topics_matrix(test_docs, ntopics=ntopics)
W = words_vs_topics_matrix(test_docs, words_idx, ntopics=ntopics)

print('B=\n', B)
print('D=\n', D)
print('W=\n', W)


B=
 [[1 1 0 0 0 0]
 [0 0 1 2 1 0]
 [1 0 0 0 1 2]]
D=
 [[2. 0.]
 [2. 2.]
 [2. 2.]]
W=
 [[2. 0.]
 [1. 0.]
 [0. 1.]
 [2. 0.]
 [1. 1.]
 [0. 2.]]


In [41]:
# sum of W[:, t] (column sums): one total per topic
np.sum(W, axis=0)

array([6., 4.])

In [42]:
# sum of D[d, :] (row sums): one total per document, reshaped into a column
# vector so that it can later divide D row by row
np.sum(D, axis=1).reshape((ndocs,1))

array([[2.],
       [4.],
       [4.]])

In [43]:
# (W[w,t] + beta) / (sum(W[:,t]) + beta*nwords): matrix W with every value
# smoothed and normalised by the sum of its column
Wnorm = (W + beta) / (np.sum(W, axis=0) + beta*nwords)
Wnorm

array([[0.33168317, 0.00246305],
       [0.16666667, 0.00246305],
       [0.00165017, 0.24876847],
       [0.33168317, 0.00246305],
       [0.16666667, 0.24876847],
       [0.00165017, 0.49507389]])

In [44]:
# (D[d,t] + alpha) / (sum(D[d,:]) + alpha*ntopics): matrix D with every value
# smoothed and normalised by the sum of its row
Dnorm = (D + alpha) / (np.sum(D, axis=1).reshape((ndocs,1)) + alpha*ntopics) 
Dnorm

array([[0.83333333, 0.16666667],
       [0.5       , 0.5       ],
       [0.5       , 0.5       ]])

In [45]:
# Now both matrices, W-norm and D-norm, have to be "combined" in a special way:
# they have the same number of columns, one per topic. Each column j of W-norm
# has to be combined with the same column j of D-norm (same topic).
# Combining means multiplying every element of the W-norm column by every
# element of the D-norm column, which yields nwords*ndocs elements in total.
# This is achieved by "expanding" one of the columns as many times as the other
# one has elements and then multiplying.
# The result is a matrix P[ndocs, nwords, ntopics].

In [46]:
# Repeat Dnorm once per word along a new axis 1 and multiply by Wnorm, which
# broadcasts over the document axis: P[d, w, t] = Dnorm[d, t] * Wnorm[w, t].
P = np.stack([Dnorm]*Wnorm.shape[0], axis=1) * Wnorm
P

array([[[0.27640264, 0.00041051],
        [0.13888889, 0.00041051],
        [0.00137514, 0.04146141],
        [0.27640264, 0.00041051],
        [0.13888889, 0.04146141],
        [0.00137514, 0.08251232]],

       [[0.16584158, 0.00123153],
        [0.08333333, 0.00123153],
        [0.00082508, 0.12438424],
        [0.16584158, 0.00123153],
        [0.08333333, 0.12438424],
        [0.00082508, 0.24753695]],

       [[0.16584158, 0.00123153],
        [0.08333333, 0.00123153],
        [0.00082508, 0.12438424],
        [0.16584158, 0.00123153],
        [0.08333333, 0.12438424],
        [0.00082508, 0.24753695]]])

In [47]:
# Single entry of P: document 1, word 5, topic 1.
P[1,5,1]

0.24753694581280788

In [48]:
# First normalise P over the topic axis so that it becomes a matrix of
# probabilities (every [d, w, :] vector sums to 1)
Pnorm = P/np.sum(P, axis=2).reshape((ndocs, nwords, 1))
Pnorm

array([[[0.99851702, 0.00148298],
        [0.99705305, 0.00294695],
        [0.03210197, 0.96789803],
        [0.99851702, 0.00148298],
        [0.77010622, 0.22989378],
        [0.01639265, 0.98360735]],

       [[0.99262881, 0.00737119],
        [0.98543689, 0.01456311],
        [0.00658963, 0.99341037],
        [0.99262881, 0.00737119],
        [0.40118577, 0.59881423],
        [0.0033221 , 0.9966779 ]],

       [[0.99262881, 0.00737119],
        [0.98543689, 0.01456311],
        [0.00658963, 0.99341037],
        [0.99262881, 0.00737119],
        [0.40118577, 0.59881423],
        [0.0033221 , 0.9966779 ]]])

## Problema

Pnorm nos permite seleccionar aleatoriamente el nuevo topic para cada palabra de cada documento. El problema que tenemos es que, si un documento contiene la misma palabra más de una vez, la representación "bag of words" que estábamos usando no nos sirve: cada ocurrencia de la misma palabra deberíamos tratarla por separado (pueden venir de distintos topics). Esto complica el trabajar con matrices, ya que la alternativa es representar cada documento como una lista de longitud variable de vectores (un vector por palabra), y eso es ineficiente en numpy.


    -- Representación "Bag of Words": 
    DW = [[2,0,1], # el doc 0 contiene dos veces la palabra 0 y 1 vez la palabra 2
          [0,2,0], # el doc 1 contiene dos veces la palabra 1
          [1,1,0]] # ...

    -- Representación one hot encoding de la misma matriz
    DWb = [[[1,0,0], [1,0,0], [0,0,1]],  # dos vectores distintos para las dos ocurrencias de la palabra 0 
           [[0,1,0], [0,1,0]],
           [[1,0,0], [0,1,0]]
          ]


Podemos seguir de dos/tres formas:

0. Continuar con la representación "bag of words" y tratar igual todas las ocurrencias de la misma palabra en un documento (al asignar un nuevo topic, a todas les asignamos el mismo). Intuyo que el algoritmo no convergerá bien si hay mucho solapamiento de palabras entre topics.

1. Versión secuencial: iterar para cada documento d, para cada palabra w. Pnorm[d][w] contiene el vector de probabilidades para seleccionar el nuevo topic. O(ndocs * nwordsmedio * O(random.choice(ntopics)) ). Lo bueno es que esto sería paralelizable.

2. Versión "one hot encoding" con sparse matrix: el problema aquí es que cada documento tiene una longitud variable de palabras. La solución: mirar cuál es el documento con más palabras e igualar el resto con vectores vacíos, para que la matriz nos quede "cuadrada". Esto es muy bestia en consumo de memoria y por eso lo de usar sparse matrices. 


## Opción 0

Si tengo 5 ocurrencias de la misma palabra en un documento, selecciono el mismo nuevo topic para todas ellas.

Para vectorizar la selección aleatoria del nuevo topic (para cada palabra de cada documento) se necesita una función random.choice() en paralelo

https://stackoverflow.com/a/57238866

In [49]:
# Cumulative sum over the topic axis (last axis) of every probability vector
Pnormcum = Pnorm.cumsum(axis=2)
Pnormcum

array([[[0.99851702, 1.        ],
        [0.99705305, 1.        ],
        [0.03210197, 1.        ],
        [0.99851702, 1.        ],
        [0.77010622, 1.        ],
        [0.01639265, 1.        ]],

       [[0.99262881, 1.        ],
        [0.98543689, 1.        ],
        [0.00658963, 1.        ],
        [0.99262881, 1.        ],
        [0.40118577, 1.        ],
        [0.0033221 , 1.        ]],

       [[0.99262881, 1.        ],
        [0.98543689, 1.        ],
        [0.00658963, 1.        ],
        [0.99262881, 1.        ],
        [0.40118577, 1.        ],
        [0.0033221 , 1.        ]]])

In [50]:
# For every (document, word) probability vector draw one random number in [0, 1)
r = np.random.rand(Pnorm.shape[0], Pnorm.shape[1], 1)
r

array([[[0.5510518 ],
        [0.63758552],
        [0.66343913],
        [0.1548311 ],
        [0.65825813],
        [0.23003623]],

       [[0.27710679],
        [0.13838579],
        [0.18443574],
        [0.99265878],
        [0.88477087],
        [0.53229078]],

       [[0.8456741 ],
        [0.61437329],
        [0.43972944],
        [0.85317956],
        [0.17596085],
        [0.04803068]]])

In [51]:
# And now the trick I love: compare every cumulative probability with the
# random draw of its (document, word) pair (r broadcasts over the topic axis)
q = Pnormcum >= r
q

array([[[ True,  True],
        [ True,  True],
        [False,  True],
        [ True,  True],
        [ True,  True],
        [False,  True]],

       [[ True,  True],
        [ True,  True],
        [False,  True],
        [False,  True],
        [False,  True],
        [False,  True]],

       [[ True,  True],
        [ True,  True],
        [False,  True],
        [ True,  True],
        [ True,  True],
        [False,  True]]])

In [52]:
# argmax over the last axis gives the index of the first True, i.e. the sampled
# topic; here it is kept as a trailing axis of length 1.
Tnew = np.argmax(Pnormcum >= r, axis=-1).reshape((ndocs, nwords, 1))
Tnew

array([[[0],
        [0],
        [1],
        [0],
        [0],
        [1]],

       [[0],
        [0],
        [1],
        [1],
        [1],
        [1]],

       [[0],
        [0],
        [1],
        [0],
        [0],
        [1]]])

In [53]:
# Actually it reads better without the reshape: Tnew has shape (ndocs, nwords)
# and is indexed as Tnew[d, w].
Tnew = np.argmax(Pnormcum >= r, axis=-1)

print(Tnew)
d = 0  # document 0
w = 0  # word 0
Tnew[d,w]

[[0 0 1 0 0 1]
 [0 0 1 1 1 1]
 [0 0 1 0 0 1]]


0

In [83]:
# What is left is to update the W and D matrices.
# np.eye(ntopics)[Tnew] one-hot encodes the new topic of every (doc, word)
# pair; multiplying by the bag-of-words counts (repeated once per topic) puts
# all occurrences of a word in a document under that single new topic.
doc_word_topic = np.eye(ntopics)[Tnew]* np.stack([B]*ntopics, axis=2)
print(doc_word_topic)

# Summing over the word axis gives the new docs x topics counts.
Dnew = doc_word_topic.sum(axis=1)
print(Dnew)

# Summing over the document axis gives the new words x topics counts.
Wnew = doc_word_topic.sum(axis=0)
print(Wnew)

[[[1. 0.]
  [1. 0.]
  [0. 0.]
  [0. 0.]
  [0. 0.]
  [0. 0.]]

 [[0. 0.]
  [0. 0.]
  [0. 1.]
  [0. 2.]
  [0. 1.]
  [0. 0.]]

 [[1. 0.]
  [0. 0.]
  [0. 0.]
  [0. 0.]
  [1. 0.]
  [0. 2.]]]
[[2. 0.]
 [0. 4.]
 [2. 2.]]
[[2. 0.]
 [1. 0.]
 [0. 1.]
 [0. 2.]
 [1. 1.]
 [0. 2.]]


## Opción 1

Versión secuencial (paralelizable)

In [None]:
def dirichlet_allocation(docs, docs_idx, words_idx, ntopics, niter=400, alpha=0.5, beta=0.01, debug=False):
    """Infer a topic for every word occurrence; probabilities are vectorised per sweep.

    Every word occurrence starts with a random topic. On each of the ``niter``
    sweeps the count matrices are rebuilt, the unnormalised probability of
    every (document, word, topic) triple is computed at once, and each
    occurrence is then reassigned by sampling from its own probability vector.
    Unlike the slow version, all draws of a sweep use the counts taken at the
    start of that sweep and the occurrence being resampled is not discounted.

    Args:
        docs: list of documents (dicts with 'id' and 'words'); an 'assignment'
            list is (re)created on each of them and updated in place.
        docs_idx: list of document ids; the position of an id is the row of
            that document in the probability tensor. It is expected to follow
            the order of ``docs``, because ``docs_vs_topics_matrix`` numbers
            rows by position in ``docs``.
        words_idx: list of vocabulary words; position = word index.
        ntopics: number of topics to infer.
        niter: number of sweeps over the corpus.
        alpha: smoothing constant of the document/topic share.
        beta: smoothing constant of the word/topic share.
        debug: accepted for signature compatibility; not used by this version.

    Returns:
        The same ``docs`` list, with the final 'assignment' of every document.
    """
    nwords = len(words_idx)

    random_assignment(docs, ntopics)

    # Resolve word -> index once; list.index() in the innermost loop would
    # rescan the vocabulary for every occurrence in every sweep.
    word_col = {}
    for j, word in enumerate(words_idx):
        word_col.setdefault(word, j)  # first occurrence wins, like list.index()

    topic_ids = range(ntopics)

    for _ in range(niter):

        W = words_vs_topics_matrix(docs, words_idx, ntopics)  # this is still slow
        D = docs_vs_topics_matrix(docs, ntopics)

        Wnorm = (W + beta) / (np.sum(W, axis=0) + beta*nwords)
        Dnorm = (D + alpha) / (np.sum(D, axis=1, keepdims=True) + alpha*ntopics)
        # P[d, w, t] = Dnorm[d, t] * Wnorm[w, t]; broadcasting avoids stacking
        # nwords copies of Dnorm. P is left unnormalised because
        # random.choices() accepts relative weights.
        P = Dnorm[:, np.newaxis, :] * Wnorm[np.newaxis, :, :]

        for doc in docs:
            doc_i = docs_idx.index(doc['id'])
            for assign in doc['assignment']:
                word_j = word_col[assign['word']]
                prob_by_topic = P[doc_i, word_j]
                assign['topic'] = random.choices(topic_ids, prob_by_topic, k=1)[0]

    return docs

In [129]:
%%time
# Same run as for the slow version (3 topics, 100 sweeps), now with the
# per-sweep vectorised probabilities, followed by the per-document report.
docs = dirichlet_allocation(docs, docs_idx, words_idx, ntopics=3, niter=100)
show_results(docs)

0 [('politics', 0.5401744339260784), ('sports', 0.4598255660739216)]
0 [(0, 0.49), (1, 0.5), (2, 0.01)]
1 [('politics', 0.5864188742826167), ('science', 0.41358112571738326)]
1 [(1, 0.63), (2, 0.37)]
2 [('sports', 0.7231207447241729), ('science', 0.27687925527582713)]
2 [(0, 0.65), (2, 0.35)]
3 [('politics', 0.23431654435232518), ('sports', 0.7656834556476748)]
3 [(0, 0.75), (1, 0.25)]
4 [('science', 0.6175851598695579), ('sports', 0.3824148401304421)]
4 [(0, 0.44), (2, 0.56)]
5 [('politics', 0.3659803489588083), ('sports', 0.6340196510411918)]
5 [(0, 0.6), (1, 0.38), (2, 0.02)]
6 [('politics', 0.7816594986813653), ('sports', 0.2183405013186347)]
6 [(0, 0.18), (1, 0.79), (2, 0.03)]
7 [('science', 0.6598811191253253), ('politics', 0.3401188808746747)]
7 [(1, 0.28), (2, 0.72)]
8 [('sports', 0.6339531098157732), ('politics', 0.3660468901842267)]
8 [(0, 0.71), (1, 0.28), (2, 0.01)]
9 [('politics', 0.21923481213882992), ('science', 0.7807651878611701)]
9 [(1, 0.18), (2, 0.82)]
10 [('politic

#### Resultados: esto mejora el código original, de 35 segundos a 6 segundos para hacer 100 iteraciones. 

## Opción 2

Sparse matrices

    -- Representación "Bag of Words": 
    B = [[2,0,1], # el doc 0 contiene dos veces la palabra 0 y 1 vez la palabra 2
         [0,2,0], # el doc 1 contiene dos veces la palabra 1
         [1,1,0]] # ...

    -- Representación one hot encoding de la misma matriz
    B = [[[1,0,0], [1,0,0], [0,0,1]],  # dos vectores distintos para las dos ocurrencias de la palabra 0 
         [[0,1,0], [0,1,0]],
         [[1,0,0], [0,1,0]]]