In [1]:
# examples/train_artm.py
from torch.utils.data import DataLoader

# from artm_lib.artm.model import ARTM
from artm_lib.artm.model2 import FullEM_ARTM
from artm_lib.data.collators import ARTMCollator
from artm_lib.data.dataset import ARTMDatasetParquet
from artm_lib.preprocessing.tokenizer import simple_tokenizer
from artm_lib.preprocessing.vocabulary import build_vocab_and_index_from_parquet

import numpy as np

In [2]:
# 1. Загрузка словаря и данных
token_to_id, doc_index = build_vocab_and_index_from_parquet(
    parquet_dir_str=r".\data\parquets", tokenizer=simple_tokenizer, min_df=5
)
vocab = [""] * len(token_to_id)
for token, idx in token_to_id.items():
    vocab[idx] = token


 Scanning: input.parquet
Vocab size: 8074 | Documents: 2225



In [3]:
# 2. DataLoader
dataset = ARTMDatasetParquet(
    doc_index=doc_index,
    token_to_id=token_to_id,
    text_column="Description",
    tokenizer=simple_tokenizer,
)
loader = DataLoader(
    dataset, batch_size=1, collate_fn=ARTMCollator(len(token_to_id)), num_workers=0
)


In [4]:
for doc_ids, bow in loader:
    print(f"doc_ids: {doc_ids}")
    print(f"Batch shape: {bow.shape}")
    print(f"Non-zero entries: {bow.nnz}")
    #print(f"bow: {bow}")
    break


doc_ids: [0]
Batch shape: (1, 8074)
Non-zero entries: 125


In [5]:
bow.shape
type(bow)
len(vocab)

8074

In [6]:
def _norm(x):
    x = np.maximum(x, np.zeros_like(x))
    norm = x.sum(axis=0)    #// суммирование по столбцам (получается одна строка)
    if norm.any() == 0:
        print(f'{norm=}')
    _eps = 1e-4
    x = np.where(norm > _eps, x / norm, np.zeros_like(x))
    return x

In [40]:
#data = X.copy()
batch_size = 100
data = DataLoader(
    dataset, batch_size=batch_size, collate_fn=ARTMCollator(len(token_to_id)), num_workers=0
)
doc_id, batch = next(iter(data))
#vocabulary = vocab.copy()

n_topics = 10
tol = 1e-3

rng = np.random.default_rng()

# 1 инициалиазация phi_wt и theta_td для всех d ∈ D, w ∈ W, t ∈ T
phi_wt = rng.uniform(size=(len(vocab), n_topics)) # (W, T)
theta_td = np.full(shape=(batch_size, n_topics), fill_value=1. / n_topics) # (D, T)

for i in range(100): # 2 для всех итераций i = 1, . . . , imax
    #print(i)
    # 3 обнулить n_wt, n_td, n_t для всех d ∈ D, w ∈ W, t ∈ T
    n_wt = np.zeros(shape=(len(vocab), n_topics)) # (W, T)
    n_td = np.zeros(shape=(batch_size, n_topics)) # (D, T)
    n_t = np.zeros(shape=(n_topics)) # (T)
    
    for d, n_dw in enumerate(batch): # 4 для всех d ∈ D 
        n_tdw = n_dw * _norm((phi_wt * theta_td[d]).T) # ()
    n_wt
            new_theta_td = _norm(np.sum(n_tdw, axis=0))
            if np.linalg.norm(new_theta_td - theta_td) < tol:
                theta_td = new_theta_td
                break
            theta_td = new_theta_td
        n_wt += n_tdw
    new_phi_wt = _norm(n_wt)    

    diff_phi_wt = np.linalg.norm(new_phi_wt - phi_wt, 'fro')
    print(diff_phi_wt)
    if diff_phi_wt < tol:
        phi_wt = new_phi_wt
        break
    phi_wt = new_phi_wt

print(phi_wt)


IndentationError: unindent does not match any outer indentation level (<string>, line 33)

In [30]:
#batch.shape
n_dw.shape
#n_dw[:, None].shape

(1, 8074)

In [84]:
#n_dw * 
_norm(phi_wt * theta_td[0, None])

ValueError: operands could not be broadcast together with shapes (10,8074) (1,10) 

In [41]:
doc_id, batch = next(iter(data))

In [96]:
# 1 инициалиазация phi_wt и theta_td для всех d ∈ D, w ∈ W, t ∈ T
phi_wt = rng.uniform(size=(len(vocab), n_topics)) # (W, T)
theta_td = np.full(shape=(batch_size, n_topics), fill_value=1. / n_topics) # (D, T)

for i in range(1): # 2 для всех итераций i = 1, . . . , imax
    #print(i)
    # 3 обнулить n_wt, n_td, n_t для всех d ∈ D, w ∈ W, t ∈ T
    n_wt = np.zeros(shape=(len(vocab), n_topics)) # (T, W)
    n_td = np.zeros(shape=(batch_size, n_topics)) # (D, T)
    n_t = np.zeros(shape=(n_topics)) # (T)

In [87]:
n_dw = batch[0]
n_dw.shape

(1, 8074)

In [115]:
print(f"{phi_wt.shape=}")  # (W, T)
#print(f"{phi_wt.T.shape=}")  # (T, W)
#print(f"{theta_td.shape=}")  # (D, T)
print(f"{theta_td[0].shape=}")  # (D, T)

print(_norm((phi_wt * theta_td[0])).shape)

print(f"{n_dw.shape=}")  # (W, 1)
#n_tdw = n_dw.T * _norm((phi_wt * theta_td[0]))
#print(n_tdw.shape)


phi_wt.shape=(8074, 10)
theta_td[0].shape=(10,)
(8074, 10)
n_dw.shape=(1, 8074)


In [92]:
phi_wt.T * theta_td[0]

ValueError: operands could not be broadcast together with shapes (10,8074) (10,) 

In [35]:
for doc_id, batch in data:
    for n_dw in batch:
        n_dw = n_dw#.todense()
        break

In [57]:
x1 = np.arange(12.0).reshape((4, 3))
x2 = np.arange(3.0)
print(x1.shape, x2.shape)
np.multiply(x1, x2)

(4, 3) (3,)


array([[ 0.,  1.,  4.],
       [ 0.,  4., 10.],
       [ 0.,  7., 16.],
       [ 0., 10., 22.]])

In [62]:
x1 = np.arange(12.0).reshape((3, 4))
x2 = np.arange(3.0).reshape((3, 1))
print(x1.shape, x2.shape)
np.multiply(x1, x2)

(3, 4) (3, 1)


array([[ 0.,  0.,  0.,  0.],
       [ 4.,  5.,  6.,  7.],
       [16., 18., 20., 22.]])

In [102]:
x1 = np.arange(60.0).reshape((20, 3))
x2 = np.arange(3.0)#.reshape((3, 1))
x2_1 = x2[:, None]
print(x1.shape, x2_1.shape)
(np.multiply(x1, x2)).shape

(20, 3) (3, 1)


(20, 3)

In [9]:
# W = 3, T = 2, D = 4
phi_wt = np.arange(6.0).reshape((3, 2))  # (W, T)
theta_td = np.arange(8.0).reshape((2, 4)) # (T, D)
print(phi_wt)
print(theta_td)
#print(x1.shape, x2_1.shape)
#(np.multiply(x1, x2)).shape
phi_wt_theta_td = phi_wt * theta_td.T[:, None]
print(phi_wt_theta_td.T)

[[0. 1.]
 [2. 3.]
 [4. 5.]]
[[0. 1. 2. 3.]
 [4. 5. 6. 7.]]
[[[ 0.  0.  0.  0.]
  [ 0.  2.  4.  6.]
  [ 0.  4.  8. 12.]]

 [[ 4.  5.  6.  7.]
  [12. 15. 18. 21.]
  [20. 25. 30. 35.]]]


In [28]:
phi_wt = np.arange(6.0).reshape((3, 2)) # (W, T) (3, 2)
print(f"{phi_wt.shape=}")
print(f"{phi_wt=}")

theta_td = np.arange(8.0).reshape((2, 4)) # (T, D) (2, 4)
print(f"{theta_td.shape=}")
print(f"{theta_td=}")

phi_wt_theta_td = (phi_wt * theta_td[:, None].T).transpose(2, 0, 1) # (D, W, T) -> (T, D, W) (4, 3, 2) -> (2, 4, 3) 
print(f"{phi_wt_theta_td.shape=}")
print(f"{phi_wt_theta_td=}")

sum_phi_ws_teta_sd = np.sum(phi_wt_theta_td, axis=0) # (D, W) (4, 3)
print(f"{sum_phi_ws_teta_sd.shape=}")
print(f"{sum_phi_ws_teta_sd=}")

#phi_wt_theta_td_norm = phi_wt_theta_td / sum_phi_ws_teta_sd # (T, D, W) (2, 4, 3)
phi_wt_theta_td_norm = _norm(phi_wt_theta_td) # (T, D, W) (2, 4, 3)
print(f"{phi_wt_theta_td_norm.shape=}")
print(f"{phi_wt_theta_td_norm=}")


phi_wt.shape=(3, 2)
phi_wt=array([[0., 1.],
       [2., 3.],
       [4., 5.]])
teta_td.shape=(2, 4)
teta_td=array([[0., 1., 2., 3.],
       [4., 5., 6., 7.]])
phi_wt_teta_td.shape=(2, 4, 3)
phi_wt_teta_td=array([[[ 0.,  0.,  0.],
        [ 0.,  2.,  4.],
        [ 0.,  4.,  8.],
        [ 0.,  6., 12.]],

       [[ 4., 12., 20.],
        [ 5., 15., 25.],
        [ 6., 18., 30.],
        [ 7., 21., 35.]]])
sum_phi_ws_teta_sd.shape=(4, 3)
sum_phi_ws_teta_sd=array([[ 4., 12., 20.],
       [ 5., 17., 29.],
       [ 6., 22., 38.],
       [ 7., 27., 47.]])
phi_wt_teta_td_norm.shape=(2, 4, 3)
phi_wt_teta_td_norm=array([[[0.        , 0.        , 0.        ],
        [0.        , 0.11764706, 0.13793103],
        [0.        , 0.18181818, 0.21052632],
        [0.        , 0.22222222, 0.25531915]],

       [[1.        , 1.        , 1.        ],
        [1.        , 0.88235294, 0.86206897],
        [1.        , 0.81818182, 0.78947368],
        [1.        , 0.77777778, 0.74468085]]])


In [14]:
n_dw = np.arange(12.0).reshape((4, 3)) # (D, W) (4, 3)
print(f"{n_dw.shape=}")
print(f"{n_dw=}")

n_dw.shape=(4, 3)
n_dw=array([[ 0.,  1.,  2.],
       [ 3.,  4.,  5.],
       [ 6.,  7.,  8.],
       [ 9., 10., 11.]])


In [15]:
n_tdw = n_dw * phi_wt_theta_td_norm # (T, D, W) (2, 4, 3)
print(f"{n_tdw.shape=}")
print(f"{n_tdw=}")

n_tdw.shape=(2, 4, 3)
n_tdw=array([[[0.        , 0.        , 0.        ],
        [0.        , 0.47058824, 0.68965517],
        [0.        , 1.27272727, 1.68421053],
        [0.        , 2.22222222, 2.80851064]],

       [[0.        , 1.        , 2.        ],
        [3.        , 3.52941176, 4.31034483],
        [6.        , 5.72727273, 6.31578947],
        [9.        , 7.77777778, 8.19148936]]])


In [16]:
n_wt = np.sum(n_tdw, axis=1).T # (T, D, W) -> (T, W) -> (W, T) (3, 2)
print(f"{n_wt.shape=}")
print(f"{n_wt=}")
n_t = np.sum(n_wt, axis=0) # (W, T) -> (T, ) (2, )
print(f"{n_t.shape=}")
print(f"{n_t=}")

n_wt.shape=(3, 2)
n_wt=array([[ 0.        , 18.        ],
       [ 3.96553773, 18.03446227],
       [ 5.18237634, 20.81762366]])
n_t.shape=(2,)
n_t=array([ 9.14791407, 56.85208593])


In [29]:
#new_phi_wt = n_wt / n_t # (W, T) (3, 2)
new_phi_wt = _norm(np.sum(n_tdw, axis=1).T) # (W, T) (3, 2)
print(f"{new_phi_wt.shape=}")
print(f"{new_phi_wt=}")


new_phi_wt.shape=(3, 2)
new_phi_wt=array([[0.        , 0.31661107],
       [0.43349092, 0.31721725],
       [0.56650908, 0.36617168]])
norm_phi_wt.shape=(3, 2)
norm_phi_wt=array([[0.        , 0.31661107],
       [0.43349092, 0.31721725],
       [0.56650908, 0.36617168]])


In [26]:
n_td = np.sum(n_tdw, axis=2) # (T, D, W) -> (T, D)
print(f"{n_td.shape=}")
print(f"{n_td=}")
n_d = np.sum(n_td, axis=0) # (T, D) -> (D, )
print(f"{n_d.shape=}")
print(f"{n_d=}")

n_td.shape=(2, 4)
n_td=array([[ 0.        ,  1.16024341,  2.9569378 ,  5.03073286],
       [ 3.        , 10.83975659, 18.0430622 , 24.96926714]])
n_d.shape=(4,)
n_d=array([ 3., 12., 21., 30.])


In [30]:
#new_theta_td = n_td / n_d
new_theta_td = _norm(np.sum(n_tdw, axis=2))
print(f"{new_theta_td.shape=}")
print(f"{new_theta_td=}")

new_theta_td.shape=(2, 4)
new_theta_td=array([[0.        , 0.09668695, 0.14080656, 0.1676911 ],
       [1.        , 0.90331305, 0.85919344, 0.8323089 ]])


In [30]:
def _norm(x, eps = 1e-4):
    x_nonnegative = np.maximum(x, np.zeros_like(x))
    norm = x_nonnegative.sum(axis=0)    #// суммирование по столбцам (получается одна строка)
    x_norm = np.divide(x_nonnegative, norm, out=np.zeros_like(x_nonnegative, dtype=np.float64), where=norm > eps)
    return x_norm

In [35]:
#data = X.copy()
batch_size = 100
data = DataLoader(
    dataset, batch_size=batch_size, collate_fn=ARTMCollator(len(token_to_id)), num_workers=0
)
doc_id, batch = next(iter(data))
#vocabulary = vocab.copy()

n_topics = 10
tol = 1e-3

rng = np.random.default_rng()

# 1 инициалиазация phi_wt и theta_td для всех d ∈ D, w ∈ W, t ∈ T
phi_wt = rng.uniform(size=(len(vocab), n_topics)) # (W, T)
theta_td = np.full(shape=(n_topics, batch_size), fill_value=1. / n_topics) # (T, D)
print(f"{phi_wt.shape=}")
print(f"{theta_td.shape=}")
    
#flag = False
for i in range(200): # 2 для всех итераций i = 1, . . . , imax
    #print(i)
    # 3 обнулить n_wt, n_td, n_t для всех d ∈ D, w ∈ W, t ∈ T
    #n_wt = np.zeros(shape=(len(vocab), n_topics)) # (W, T)
    #n_td = np.zeros(shape=(n_topics, batch_size)) # (T, D)
    #n_t = np.zeros(shape=(n_topics)) # (T)
    
    #for d, n_dw in enumerate(batch): # 4 для всех d ∈ D
    n_dw = batch.toarray()

    phi_wt_theta_td = (phi_wt * theta_td[:, None].T).transpose(2, 0, 1) # (D, W, T) -> (T, D, W)
    phi_wt_theta_td_norm = _norm(phi_wt_theta_td)  # (T, D, W)
    #print(f"{phi_wt_theta_td_norm.shape=}")
    
    n_tdw = n_dw * phi_wt_theta_td_norm # (T, D, W)
    
    #n_wt += np.sum(n_tdw, axis=1).T # (T, D, W) -> (T, W) -> (W, T) (3, 2)
    #n_t += np.sum(n_wt, axis=0) # (W, T) -> (T, ) (2, )
    #n_td += np.sum(n_tdw, axis=2) # (T, D, W) -> (T, D)
    #new_phi_wt = n_wt / n_t # (W, T) (3, 2)
    #new_theta_td = n_td  # (T, D)

    new_phi_wt = _norm(np.sum(n_tdw, axis=1).T) # (W, T) (3, 2)
    new_theta_td = np.sum(n_tdw, axis=2)  # (T, D)

    diff_phi_wt = np.linalg.norm(new_phi_wt - phi_wt, 'fro')
    print(f"{diff_phi_wt=}")
    if diff_phi_wt < tol:
        phi_wt = new_phi_wt
        break
    phi_wt = new_phi_wt
    theta_td = new_theta_td

print(f"{i=}")
print(f"{phi_wt=}")

phi_wt.shape=(8074, 10)
theta_td.shape=(10, 100)
diff_phi_wt=np.float64(163.88085324422386)
diff_phi_wt=np.float64(0.0027280369476043965)
diff_phi_wt=np.float64(0.005329205115481399)
diff_phi_wt=np.float64(0.008648074798655404)
diff_phi_wt=np.float64(0.013323016905576307)
diff_phi_wt=np.float64(0.01940142553980738)
diff_phi_wt=np.float64(0.025413985916460317)
diff_phi_wt=np.float64(0.028574675027603677)
diff_phi_wt=np.float64(0.027781740394943306)
diff_phi_wt=np.float64(0.02436612464964904)
diff_phi_wt=np.float64(0.02069768213902499)
diff_phi_wt=np.float64(0.017584356832470817)
diff_phi_wt=np.float64(0.014873949879927474)
diff_phi_wt=np.float64(0.012105191340555625)
diff_phi_wt=np.float64(0.009695529499106771)
diff_phi_wt=np.float64(0.007967849756599771)
diff_phi_wt=np.float64(0.0067329478911718815)
diff_phi_wt=np.float64(0.005961850388112014)
diff_phi_wt=np.float64(0.005391358600722193)
diff_phi_wt=np.float64(0.004770561136198987)
diff_phi_wt=np.float64(0.004145132561608937)
diff_phi_

In [37]:
top_n = 10
top_words = []
for t in range(n_topics):
    top_indices = np.argsort(phi_wt[t])[::-1][:top_n]
    top_words.append([vocab[i] for i in top_indices])
top_words

[['08', '07', '06', '05', '04', '03', '02', '01', '007', '00'],
 ['08', '07', '06', '05', '04', '03', '02', '01', '007', '00'],
 ['07', '00', '05', '06', '01', '08', '03', '04', '02', '007'],
 ['07', '06', '01', '00', '08', '05', '03', '04', '02', '007'],
 ['00', '07', '06', '08', '04', '05', '03', '02', '01', '007'],
 ['007', '07', '08', '06', '04', '05', '03', '02', '01', '00'],
 ['00', '07', '06', '04', '08', '05', '03', '02', '01', '007'],
 ['00', '07', '06', '08', '04', '05', '03', '02', '01', '007'],
 ['03', '07', '01', '05', '00', '02', '08', '06', '04', '007'],
 ['08', '07', '06', '05', '04', '03', '02', '01', '007', '00']]

In [61]:
dw = n_dw[0]
#[vocab[i] for i, n in enumerate(dw) if n > 0]
#[vocab[i] for i in dataset[0][1]]
#[k for k in vocab if vocab[k] == '08'] 
batch

<Compressed Sparse Row sparse matrix of dtype 'int32'
	with 11491 stored elements and shape (100, 8074)>

In [36]:
batch_size = 100
data = DataLoader(
    dataset, batch_size=batch_size, collate_fn=ARTMCollator(len(token_to_id)), num_workers=0
)
doc_id, batch = next(iter(data))
for d, n_dw in enumerate(batch): # 4 для всех d ∈ D
    get_n_dw = n_dw
    break

In [52]:
print(f"{phi_wt_theta_td_norm.shape=}")
print(f"{n_dw.shape=}")
print(f"{phi_wt_theta_td_norm[:, d].shape=}")


phi_wt_theta_td_norm.shape=(10, 100, 8074)
n_dw.shape=(100, 8074)
phi_wt_theta_td_norm[:, d].shape=(10, 8074)


In [55]:
n_dw_array = n_dw.toarray()
type(n_dw_array)

numpy.ndarray

In [56]:
n_dw_array * phi_wt_theta_td_norm

array([[[0.        , 0.        , 0.        , ..., 0.        ,
         0.        , 0.        ],
        [0.        , 0.        , 0.11127921, ..., 0.        ,
         0.        , 0.        ],
        [0.        , 0.        , 0.        , ..., 0.        ,
         0.        , 0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 0.        ,
         0.        , 0.        ],
        [0.        , 0.        , 0.        , ..., 0.        ,
         0.        , 0.        ],
        [0.        , 0.        , 0.11127921, ..., 0.        ,
         0.        , 0.        ]],

       [[0.        , 0.        , 0.        , ..., 0.        ,
         0.        , 0.        ],
        [0.        , 0.        , 0.16383969, ..., 0.        ,
         0.        , 0.        ],
        [0.        , 0.        , 0.        , ..., 0.        ,
         0.        , 0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 0.        ,
         0.        , 0.        ],
        [0. 