In [1]:
# examples/train_artm.py
from torch.utils.data import DataLoader

# from artm_lib.artm.model import ARTM
from artm_lib.artm.model2 import FullEM_ARTM
from artm_lib.data.collators import ARTMCollator
from artm_lib.data.dataset import ARTMDatasetParquet
from artm_lib.preprocessing.tokenizer import simple_tokenizer
from artm_lib.preprocessing.vocabulary import build_vocab_and_index_from_parquet

import numpy as np

In [2]:
# 1. Загрузка словаря и данных
token_to_id, doc_index = build_vocab_and_index_from_parquet(
    parquet_dir_str=r".\data\parquets", tokenizer=simple_tokenizer, min_df=5
)
vocab = [""] * len(token_to_id)
for token, idx in token_to_id.items():
    vocab[idx] = token


 Scanning: input.parquet
Vocab size: 8074 | Documents: 2225



In [3]:
# 2. DataLoader
dataset = ARTMDatasetParquet(
    doc_index=doc_index,
    token_to_id=token_to_id,
    text_column="Description",
    tokenizer=simple_tokenizer,
)
loader = DataLoader(
    dataset, batch_size=1, collate_fn=ARTMCollator(len(token_to_id)), num_workers=0
)


In [5]:
for doc_ids, bow in loader:
    print(f"doc_ids: {doc_ids}")
    print(f"Batch shape: {bow.shape}")
    print(f"Non-zero entries: {bow.nnz}")
    #print(f"bow: {bow}")
    break


doc_ids: [0]
Batch shape: (1, 8074)
Non-zero entries: 125


In [11]:
bow.shape
type(bow)
len(vocab)

8074

In [7]:
def _norm(x):
    x = np.maximum(x, np.zeros_like(x))
    norm = x.sum(axis=0)    #// суммирование по столбцам (получается одна строка)
    if norm.any() == 0:
        print(f'{norm=}')
    _eps = 1e-4
    x = np.where(norm > _eps, x / norm, np.zeros_like(x))
    return x

In [40]:
#data = X.copy()
batch_size = 100
data = DataLoader(
    dataset, batch_size=batch_size, collate_fn=ARTMCollator(len(token_to_id)), num_workers=0
)
doc_id, batch = next(iter(data))
#vocabulary = vocab.copy()

n_topics = 10
tol = 1e-3

rng = np.random.default_rng()

# 1 инициалиазация phi_wt и theta_td для всех d ∈ D, w ∈ W, t ∈ T
phi_wt = rng.uniform(size=(len(vocab), n_topics)) # (W, T)
theta_td = np.full(shape=(batch_size, n_topics), fill_value=1. / n_topics) # (D, T)

for i in range(100): # 2 для всех итераций i = 1, . . . , imax
    #print(i)
    # 3 обнулить n_wt, n_td, n_t для всех d ∈ D, w ∈ W, t ∈ T
    n_wt = np.zeros(shape=(len(vocab), n_topics)) # (W, T)
    n_td = np.zeros(shape=(batch_size, n_topics)) # (D, T)
    n_t = np.zeros(shape=(n_topics)) # (T)
    
    for d, n_dw in enumerate(batch): # 4 для всех d ∈ D 
        n_tdw = n_dw * _norm((phi_wt * theta_td[d]).T) # ()
    n_wt
            new_theta_td = _norm(np.sum(n_tdw, axis=0))
            if np.linalg.norm(new_theta_td - theta_td) < tol:
                theta_td = new_theta_td
                break
            theta_td = new_theta_td
        n_wt += n_tdw
    new_phi_wt = _norm(n_wt)    

    diff_phi_wt = np.linalg.norm(new_phi_wt - phi_wt, 'fro')
    print(diff_phi_wt)
    if diff_phi_wt < tol:
        phi_wt = new_phi_wt
        break
    phi_wt = new_phi_wt

print(phi_wt)


IndentationError: unindent does not match any outer indentation level (<string>, line 33)

In [30]:
#batch.shape
n_dw.shape
#n_dw[:, None].shape

(1, 8074)

In [84]:
#n_dw * 
_norm(phi_wt * theta_td[0, None])

ValueError: operands could not be broadcast together with shapes (10,8074) (1,10) 

In [41]:
doc_id, batch = next(iter(data))

In [96]:
# 1 инициалиазация phi_wt и theta_td для всех d ∈ D, w ∈ W, t ∈ T
phi_wt = rng.uniform(size=(len(vocab), n_topics)) # (W, T)
theta_td = np.full(shape=(batch_size, n_topics), fill_value=1. / n_topics) # (D, T)

for i in range(1): # 2 для всех итераций i = 1, . . . , imax
    #print(i)
    # 3 обнулить n_wt, n_td, n_t для всех d ∈ D, w ∈ W, t ∈ T
    n_wt = np.zeros(shape=(len(vocab), n_topics)) # (T, W)
    n_td = np.zeros(shape=(batch_size, n_topics)) # (D, T)
    n_t = np.zeros(shape=(n_topics)) # (T)

In [87]:
n_dw = batch[0]
n_dw.shape

(1, 8074)

In [115]:
print(f"{phi_wt.shape=}")  # (W, T)
#print(f"{phi_wt.T.shape=}")  # (T, W)
#print(f"{theta_td.shape=}")  # (D, T)
print(f"{theta_td[0].shape=}")  # (D, T)

print(_norm((phi_wt * theta_td[0])).shape)

print(f"{n_dw.shape=}")  # (W, 1)
#n_tdw = n_dw.T * _norm((phi_wt * theta_td[0]))
#print(n_tdw.shape)


phi_wt.shape=(8074, 10)
theta_td[0].shape=(10,)
(8074, 10)
n_dw.shape=(1, 8074)


In [92]:
phi_wt.T * theta_td[0]

ValueError: operands could not be broadcast together with shapes (10,8074) (10,) 

In [28]:
for doc_id, batch in data:
    for n_dw in batch:
        n_dw = n_dw#.todense()
        break

In [57]:
x1 = np.arange(12.0).reshape((4, 3))
x2 = np.arange(3.0)
print(x1.shape, x2.shape)
np.multiply(x1, x2)

(4, 3) (3,)


array([[ 0.,  1.,  4.],
       [ 0.,  4., 10.],
       [ 0.,  7., 16.],
       [ 0., 10., 22.]])

In [62]:
x1 = np.arange(12.0).reshape((3, 4))
x2 = np.arange(3.0).reshape((3, 1))
print(x1.shape, x2.shape)
np.multiply(x1, x2)

(3, 4) (3, 1)


array([[ 0.,  0.,  0.,  0.],
       [ 4.,  5.,  6.,  7.],
       [16., 18., 20., 22.]])

In [102]:
x1 = np.arange(60.0).reshape((20, 3))
x2 = np.arange(3.0)#.reshape((3, 1))
x2_1 = x2[:, None]
print(x1.shape, x2_1.shape)
(np.multiply(x1, x2)).shape

(20, 3) (3, 1)


(20, 3)

In [135]:
phi_wt = np.arange(6.0).reshape((3, 2))  # (W, T)
teta_td = np.arange(8.0).reshape((4, 2)) # (T, D)
print(phi_wt)
print(teta_td)
#print(x1.shape, x2_1.shape)
#(np.multiply(x1, x2)).shape
phi_wt_teta_td = phi_wt * teta_td[:, None]
print(phi_wt_teta_td.T)

[[0. 1.]
 [2. 3.]
 [4. 5.]]
[[0. 1.]
 [2. 3.]
 [4. 5.]
 [6. 7.]]
[[[ 0.  0.  0.  0.]
  [ 0.  4.  8. 12.]
  [ 0.  8. 16. 24.]]

 [[ 1.  3.  5.  7.]
  [ 3.  9. 15. 21.]
  [ 5. 15. 25. 35.]]]


In [167]:
# W = 3, T = 2, D = 4
phi_wt = np.arange(6.0).reshape((3, 2)) # (W, T)
print(f"{phi_wt=}")

teta_td = np.arange(8.0).reshape((2, 4)) # (T, D)
print(f"{teta_td=}")

phi_wt_teta_td = (phi_wt * teta_td[:, None].T).transpose(2, 0, 1) # (D, W, T) -> (T, D, W) 
print(f"{phi_wt_teta_td=}")

sum_phi_ws_teta_sd = np.sum(phi_wt_teta_td, axis=0) # (D, W)
print(f"{sum_phi_ws_teta_sd=}")

phi_wt_teta_td_norm = phi_wt_teta_td / sum_phi_ws_teta_sd # (T, D, W)
#phi_wt_teta_td_norm = _norm(phi_wt_teta_td)
print(f"{phi_wt_teta_td_norm=}")


phi_wt=array([[0., 1.],
       [2., 3.],
       [4., 5.]])
teta_td=array([[0., 1., 2., 3.],
       [4., 5., 6., 7.]])
phi_wt_teta_td=array([[[ 0.,  0.,  0.],
        [ 0.,  2.,  4.],
        [ 0.,  4.,  8.],
        [ 0.,  6., 12.]],

       [[ 4., 12., 20.],
        [ 5., 15., 25.],
        [ 6., 18., 30.],
        [ 7., 21., 35.]]])
sum_phi_ws_teta_sd=array([[ 4., 12., 20.],
       [ 5., 17., 29.],
       [ 6., 22., 38.],
       [ 7., 27., 47.]])
phi_wt_teta_td_norm=array([[[0.        , 0.        , 0.        ],
        [0.        , 0.11764706, 0.13793103],
        [0.        , 0.18181818, 0.21052632],
        [0.        , 0.22222222, 0.25531915]],

       [[1.        , 1.        , 1.        ],
        [1.        , 0.88235294, 0.86206897],
        [1.        , 0.81818182, 0.78947368],
        [1.        , 0.77777778, 0.74468085]]])


In [149]:
n_dw = np.arange(12.0).reshape((4, 3)) # (D, W)
print(f"{n_dw=}")

n_dw=array([[ 0.,  1.,  2.],
       [ 3.,  4.,  5.],
       [ 6.,  7.,  8.],
       [ 9., 10., 11.]])


In [171]:
n_tdw = n_dw * phi_wt_teta_td_norm
print(f"{n_tdw=}")

n_tdw=array([[[0.        , 0.        , 0.        ],
        [0.        , 0.47058824, 0.68965517],
        [0.        , 1.27272727, 1.68421053],
        [0.        , 2.22222222, 2.80851064]],

       [[0.        , 1.        , 2.        ],
        [3.        , 3.52941176, 4.31034483],
        [6.        , 5.72727273, 6.31578947],
        [9.        , 7.77777778, 8.19148936]]])


In [176]:
n_wt = np.sum(n_tdw, axis=1).T # (T, D, W) -> (T, W) -> (W, T)
print(f"{n_wt=}")
n_t = np.sum(n_wt, axis=0) # (W, T) -> (T, )
print(f"{n_t=}")

n_wt=array([[ 0.        , 18.        ],
       [ 3.96553773, 18.03446227],
       [ 5.18237634, 20.81762366]])
n_t=array([ 9.14791407, 56.85208593])


In [177]:
new_phi_wt = n_wt / n_t
print(f"{new_phi_wt=}")

new_phi_wt=array([[0.        , 0.31661107],
       [0.43349092, 0.31721725],
       [0.56650908, 0.36617168]])


In [178]:
n_td = np.sum(n_tdw, axis=2) # (T, D, W) -> (T, D)
print(f"{n_td=}")
n_d = np.sum(n_wt, axis=0) # (T, D) -> (D, )
print(f"{n_d=}")

n_td=array([[ 0.        ,  1.16024341,  2.9569378 ,  5.03073286],
       [ 3.        , 10.83975659, 18.0430622 , 24.96926714]])
n_d=array([ 9.14791407, 56.85208593])
