In [None]:
from functools import partial

import numpy as np
import torch
from scipy.sparse import csr_matrix
from torch.utils.data import Dataset, DataLoader

In [None]:
# Corpus and vocabulary settings collected in one place.
data_config = dict(
    csv_dir=r".",
    text_column="Description",
    min_df=5,  # optional: filtering of rare tokens
    max_df=0.95,
    tokenizer=your_custom_tokenizer,  # Callable[[str], List[str]]
)

In [None]:
from pathlib import Path
from collections import defaultdict, Counter
import pandas as pd

class CorpusIndexer:
    """Index every row of every CSV file in a directory and collect token statistics.

    Attributes:
        csv_dir: Directory whose ``*.csv`` files (non-recursive) make up the corpus.
        text_column: Name of the CSV column that holds the document text.
        doc_paths: One ``(filepath, row_position)`` pair per document, in scan order.
        token_freq: Total number of occurrences of each token over the whole corpus.
        doc_freq: Number of documents in which each token occurs at least once.
    """

    def __init__(self, csv_dir: str, text_column: str = "text"):
        self.csv_dir = Path(csv_dir)
        self.text_column = text_column
        self.doc_paths = []      # list of (filepath, line_idx)
        self.token_freq = Counter()
        self.doc_freq = Counter()
        self._scanned = False

    def scan_corpus(self, tokenizer):
        """Walk all CSV files to collect token frequencies and index the documents.

        Any previous scan result is discarded first. Files are visited in sorted
        order so that document positions are deterministic.

        Args:
            tokenizer: Callable that maps a text value to an iterable of tokens.

        Raises:
            KeyError: If a CSV file does not contain ``text_column``.
        """
        self.doc_paths.clear()
        self.token_freq.clear()
        self.doc_freq.clear()
        # Stay "unscanned" until the pass completes, so a failed rescan cannot
        # leave build_vocab working on partial statistics.
        self._scanned = False

        for csv_file in sorted(self.csv_dir.glob("*.csv")):
            texts = pd.read_csv(csv_file)[self.text_column]
            # enumerate() yields row *positions*; readers that use iloc need
            # positions rather than index labels.
            for line_idx, text in enumerate(texts):
                # A missing cell is read as NaN; treat it as an empty document
                # instead of handing a float to the tokenizer. The row is still
                # indexed so positions stay aligned with the file.
                tokens = [] if pd.isna(text) else list(tokenizer(text))
                self.token_freq.update(tokens)
                self.doc_freq.update(set(tokens))
                self.doc_paths.append((str(csv_file), line_idx))
        self._scanned = True

    def build_vocab(self, min_df: int = 1, max_df: float = 1.0):
        """Build a deterministic token -> id mapping filtered by document frequency.

        Args:
            min_df: Minimum number of documents a token must occur in.
            max_df: Maximum share of documents (0.0-1.0) a token may occur in;
                converted to a count with ``int(max_df * total_docs)``.

        Returns:
            Dict mapping each kept token to a consecutive integer id, assigned
            in sorted token order.

        Raises:
            AssertionError: If ``scan_corpus`` has not completed yet.
        """
        assert self._scanned, "Call scan_corpus first!"
        total_docs = len(self.doc_paths)
        min_count = min_df
        max_count = int(max_df * total_docs)

        # Filter on document frequency, not on the total occurrence count:
        # a token repeated many times in a single document is still rare.
        filtered_tokens = sorted(
            token for token, freq in self.doc_freq.items()
            if min_count <= freq <= max_count
        )
        # Sorted order keeps the ids stable between runs.
        return {token: i for i, token in enumerate(filtered_tokens)}

In [None]:
class ARTMDataset(torch.utils.data.Dataset):
    """Map-style dataset that yields ``(document index, token ids)`` from CSV files.

    Args:
        doc_paths: Sequence of ``(filepath, row_position)`` pairs, one per document.
        token_to_id: Frozen vocabulary mapping token -> integer id.
        text_column: Name of the CSV column that holds the document text.
        tokenizer: Callable that maps a text value to an iterable of tokens.
        max_cached_files: How many text columns to keep in memory per dataset
            instance (least recently used is evicted first). ``0`` disables
            caching and re-reads the file on every access. Cached columns are
            not refreshed if the file changes on disk.
    """

    def __init__(self, doc_paths, token_to_id, text_column, tokenizer,
                 max_cached_files: int = 4):
        self.doc_paths = doc_paths
        self.token_to_id = token_to_id  # frozen dict
        self.text_column = text_column
        self.tokenizer = tokenizer
        self.max_cached_files = max_cached_files
        # file path -> text column; dict insertion order doubles as LRU order.
        self._text_cache = {}

    def __len__(self):
        return len(self.doc_paths)

    def _texts_for(self, file_path):
        """Return the text column of ``file_path``, parsing the CSV only on a cache miss."""
        cache = self._text_cache
        texts = cache.pop(file_path, None)
        if texts is None:
            texts = pd.read_csv(file_path)[self.text_column]
        if self.max_cached_files > 0:
            # Evict the oldest entries, then (re-)insert as most recently used.
            while len(cache) >= self.max_cached_files:
                cache.pop(next(iter(cache)))
            cache[file_path] = texts
        return texts

    def __getitem__(self, idx):
        """Return ``(idx, token_ids)`` where ``idx`` is the global document id."""
        file_path, line_idx = self.doc_paths[idx]
        # Parsing the whole CSV for every sample is prohibitively slow,
        # so the column is served from the per-instance cache.
        text = self._texts_for(file_path).iloc[line_idx]
        # A missing cell is read as NaN: treat it as an empty document.
        tokens = [] if pd.isna(text) else self.tokenizer(text)
        # Map to ids, ignoring out-of-vocabulary tokens.
        token_ids = [self.token_to_id[t] for t in tokens if t in self.token_to_id]
        return idx, token_ids

In [None]:
def artm_collate_fn(batch, vocab_size: int):
    """Collate ``(doc_id, token_ids)`` samples into a sparse bag-of-words matrix.

    Args:
        batch: List of ``(doc_id, token_ids)`` pairs.
        vocab_size: Number of columns of the resulting matrix.

    Returns:
        Tuple ``(doc_ids, bow_matrix)``: ``doc_ids`` lists the document ids in
        batch order and ``bow_matrix`` is a ``scipy.sparse.csr_matrix`` of shape
        ``(len(batch), vocab_size)`` and dtype ``int32`` whose row ``i`` holds
        the token counts of the ``i``-th sample. Documents without tokens
        produce an all-zero row.
    """
    doc_ids = []
    rows, cols, data = [], [], []

    for row, (doc_id, token_ids) in enumerate(batch):
        doc_ids.append(doc_id)
        # An empty token list contributes no entries, so no special case is needed.
        for token_id, cnt in Counter(token_ids).items():
            if 0 <= token_id < vocab_size:  # guard against out-of-range ids
                rows.append(row)
                cols.append(token_id)
                data.append(cnt)

    bow_matrix = csr_matrix(
        (data, (rows, cols)),
        shape=(len(batch), vocab_size),
        dtype=np.int32
    )
    return doc_ids, bow_matrix

In [None]:
# 1. Indexing and vocabulary
indexer = CorpusIndexer(".", text_column="Description")
indexer.scan_corpus(tokenizer=my_tokenizer)
token_to_id = indexer.build_vocab(min_df=5, max_df=0.9)

# 2. Dataset and loader
dataset = ARTMDataset(
    doc_paths=indexer.doc_paths,
    token_to_id=token_to_id,
    # Must be the column the corpus was indexed with; a different name here
    # makes every __getitem__ fail with KeyError.
    text_column=indexer.text_column,
    tokenizer=my_tokenizer
)

loader = torch.utils.data.DataLoader(
    dataset,
    batch_size=128,
    # partial instead of lambda: a lambda cannot be pickled when DataLoader
    # workers are started with the "spawn" method.
    collate_fn=partial(artm_collate_fn, vocab_size=len(token_to_id)),
    num_workers=6,
    shuffle=True
)

# 3. Your ARTM model
for epoch in range(num_epochs):
    for doc_ids, bow in loader:
        # bow is a scipy.sparse.csr_matrix
        # Pass it to your own ARTM implementation
        your_artm_model.update(bow)