In [15]:
from data import file_utils

In [16]:
import torch
from torch.utils.data import DataLoader
import numpy as np
import scipy.io
import scipy.sparse
from scipy.sparse import issparse
from sentence_transformers import SentenceTransformer
from data.preprocess import Preprocess
from typing import List, Tuple, Union, Mapping, Any, Callable, Iterable


In [17]:
class DocEmbedModel:
    """Wrapper that turns a list of documents into embeddings.

    Either loads a SentenceTransformer by name or wraps an already-built
    embedding object, and forwards ``encode`` calls to it.
    """

    def __init__(self, 
                 model : Union[str, Any] = "all-MiniLM-L6-v2",
                 device : str = "cuda",
                 verbose = False):
        """
        Args:
            model: Name of a SentenceTransformer model to load, or a
                ready-made object exposing
                ``encode(docs, convert_to_tensor=..., show_progress_bar=...)``.
            device: Device handed to SentenceTransformer when ``model`` is a
                name. A CUDA device is replaced by ``"cpu"`` (with a warning)
                when CUDA is not available. Unused for ready-made objects.
            verbose: Whether ``encode`` asks the model for a progress bar.
        """
        self.verbose = verbose

        if isinstance(model, str):
            # Only the by-name path needs a device, so only resolve it here.
            self.model = SentenceTransformer(model, device = self._resolve_device(device))
        else:
            self.model = model

    @staticmethod
    def _resolve_device(device):
        """Return ``device``, or ``"cpu"`` if it names CUDA and CUDA is unavailable."""
        import warnings

        if device is None:
            # None is passed through unchanged, exactly as before.
            return device

        if str(device).startswith("cuda") and not torch.cuda.is_available():
            warnings.warn(
                f"Requested device '{device}' but CUDA is not available; falling back to 'cpu'."
            )
            return "cpu"

        return device

    def encode(self, 
                docs: List[str],
                convert_to_tensors: bool = False):
        """Embed ``docs`` with the wrapped model.

        Args:
            docs: Documents to embed.
            convert_to_tensors: Forwarded to the model as ``convert_to_tensor``.

        Returns:
            Whatever the wrapped model's ``encode`` returns.
        """
        embeddings = self.model.encode(
                        docs, 
                        convert_to_tensor=convert_to_tensors,
                        show_progress_bar=self.verbose
                    )

        return embeddings

In [18]:
### test
# Smoke test: embed two short sentences with the default model and ask for tensor output.
doc_embed_test = DocEmbedModel()
output = doc_embed_test.encode(["hello hi i am a man", "good morning"], convert_to_tensors=True)

In [19]:
# Inspect the shape of the embeddings produced for the two test sentences.
output.shape

torch.Size([2, 384])

In [20]:
class RawDataset:
    """Training set built in memory from raw documents.

    The documents are run through a preprocessor to obtain a bag-of-words
    matrix, the processed texts and the vocabulary. Contextual document
    embeddings can optionally be computed and appended to the bag-of-words.
    """

    def __init__(self,
                 docs, 
                 preprocess = None,
                 batch_size = 200,
                 device = "cuda",
                 as_tensor = True,
                 contextual_embed = False,
                 pretrained_WE = False,
                 doc_embed_model = "all-MiniLM-L6-v2",
                 embed_model_device = None,
                 vocab = None,
                 verbose = False):
        """
        Args:
            docs: Raw documents to preprocess (and to embed when
                ``contextual_embed`` is set).
            preprocess: Object exposing ``preprocess(docs, pretrained_WE=...)``
                that returns a mapping with ``"train_bow"``, ``"train_texts"``
                and ``"vocab"``. A default ``Preprocess`` is built when None.
            batch_size: Batch size of ``train_dataloader``.
            device: Device the training tensor is moved to. A CUDA device is
                replaced by ``"cpu"`` (with a warning) when CUDA is unavailable.
            as_tensor: When True, convert ``train_data`` to a float tensor and
                build ``train_dataloader``; otherwise it stays a NumPy array.
            contextual_embed: When True, compute document embeddings.
            pretrained_WE: Forwarded to ``preprocess.preprocess``.
            doc_embed_model: Model name used to build a ``DocEmbedModel``, or a
                ready-made object exposing ``encode(docs)``.
            embed_model_device: Device for the embedding model; defaults to
                ``device``.
            vocab: Currently unused; kept so existing callers keep working.
            verbose: Passed to the default ``Preprocess``.

        Raises:
            ValueError: If the bag-of-words and the contextual embeddings have
                a different number of rows when they are concatenated.
        """
        if preprocess is None:
            preprocess = Preprocess(verbose=verbose)

        rst = preprocess.preprocess(docs, pretrained_WE=pretrained_WE)

        self.train_data = rst["train_bow"]
        self.train_texts = rst["train_texts"]
        self.vocab = rst["vocab"]

        # Work on a dense array from here on; the preprocessor may return a sparse matrix.
        if issparse(self.train_data):
            self.train_data = self.train_data.toarray()

        self.vocab_size = len(self.vocab)
        
        if contextual_embed:
            if isinstance(doc_embed_model, str):
                if embed_model_device is None:
                    embed_model_device = device

                self.doc_embedder = DocEmbedModel(
                    doc_embed_model,
                    device = self._resolve_device(embed_model_device)
                )
            else:
                self.doc_embedder = doc_embed_model
            
            # Embeddings are computed from the raw documents, not the processed texts.
            self.train_contextual_embed = self.doc_embedder.encode(docs)
            self.contextual_embed_size = self.train_contextual_embed.shape[1]

        if as_tensor:
            if contextual_embed:
                n_bow = self.train_data.shape[0]
                n_embed = self.train_contextual_embed.shape[0]
                if n_bow != n_embed:
                    raise ValueError(
                        f"Cannot concatenate bag-of-words ({n_bow} rows) with "
                        f"contextual embeddings ({n_embed} rows): row counts differ."
                    )

                self.train_data = np.concatenate((self.train_data, self.train_contextual_embed), axis = 1)
            
            device = self._resolve_device(device)
            self.train_data = torch.from_numpy(self.train_data).float().to(device)

            self.train_dataloader = DataLoader(self.train_data, batch_size, shuffle=True)

    @staticmethod
    def _resolve_device(device):
        """Return ``device``, or ``"cpu"`` if it names CUDA and CUDA is unavailable."""
        import warnings

        if device is None:
            return device

        if str(device).startswith("cuda") and not torch.cuda.is_available():
            warnings.warn(
                f"Requested device '{device}' but CUDA is not available; falling back to 'cpu'."
            )
            return "cpu"

        return device

In [21]:
### test
# Smoke test: build a RawDataset from two toy documents with the default preprocessing.
test_raw_dataset = RawDataset(
    ["hello hi i am a man physics close windown, ahi", "good morning"],
    as_tensor=True,
    verbose=True,
)

loading train texts: 100%|██████████| 2/2 [00:00<?, ?it/s]
parsing texts: 100%|██████████| 2/2 [00:00<00:00, 1967.31it/s]
2025-07-12 22:29:45,148 - TopMost - Real vocab size: 8
2025-07-12 22:29:45,149 - TopMost - Real training size: 2 	 avg length: 4.000


In [22]:
# Show the processed texts, then the training data, each on its own line.
print(test_raw_dataset.train_texts, test_raw_dataset.train_data, sep="\n")

['hello man physics close windown ahi', 'good morning']
tensor([[1., 1., 0., 1., 1., 0., 1., 1.],
        [0., 0., 1., 0., 0., 1., 0., 0.]], device='cuda:0')


In [23]:
class BasicDataset:
    """Pre-processed train/test corpus read from a dataset directory.

    Loads bag-of-words matrices, word embeddings, texts, vocabulary and
    (optionally) labels, and can expose the data as tensors with data loaders.
    """

    def __init__(self,
                 dataset_dir,
                 batch_size=200,
                 read_labels=False,
                 as_tensor=True,
                 contextual_embed=False,
                 doc_embed_model="all-MiniLM-L6-v2",
                 device='cpu'
                ):
        """
        Args:
            dataset_dir: Directory holding the dataset files read by ``load_data``.
            batch_size: Batch size of both data loaders.
            read_labels: Whether ``load_data`` also reads the label files.
            as_tensor: When True, build ``train_data``/``test_data`` tensors
                and their data loaders.
            contextual_embed: When True, embed the train/test texts and, if
                ``as_tensor`` is set, append the embeddings to the bag-of-words.
            doc_embed_model: Passed to ``DocEmbedModel``.
            device: Device for the embedding model and the tensors.
        """
        # Array layouts:
        #   train_bow / test_bow: N x V
        #   word_embeddings:      V x D
        #   vocab:                V entries, ordered by word id.

        self.load_data(dataset_dir, read_labels)
        self.vocab_size = len(self.vocab)

        num_train_docs = self.train_bow.shape[0]
        print("train_size: ", num_train_docs)
        print("test_size: ", self.test_bow.shape[0])
        print("vocab_size: ", self.vocab_size)
        print("average length: {:.3f}".format(self.train_bow.sum(1).sum() / num_train_docs))

        if contextual_embed:
            embedder = DocEmbedModel(doc_embed_model, device)
            self.doc_embedder = embedder
            self.train_contextual_embed = embedder.encode(self.train_texts)
            self.test_contextual_embed = embedder.encode(self.test_texts)

            self.contextual_embed_size = self.train_contextual_embed.shape[1]

        # Without tensors there is nothing further to build.
        if not as_tensor:
            return

        if contextual_embed:
            # Append the document embeddings to the bag-of-words columns.
            train_matrix = np.concatenate((self.train_bow, self.train_contextual_embed), axis=1)
            test_matrix = np.concatenate((self.test_bow, self.test_contextual_embed), axis=1)
        else:
            train_matrix = self.train_bow
            test_matrix = self.test_bow

        self.train_data = torch.from_numpy(train_matrix).to(device)
        self.test_data = torch.from_numpy(test_matrix).to(device)

        # Only the training loader shuffles.
        self.train_dataloader = DataLoader(self.train_data, batch_size=batch_size, shuffle=True)
        self.test_dataloader = DataLoader(self.test_data, batch_size=batch_size, shuffle=False)

    def load_data(self, path, read_labels):
        """Read the dataset files under ``path`` into attributes.

        Args:
            path: Dataset directory.
            read_labels: When True, also read ``train_labels.txt`` and
                ``test_labels.txt`` as integer arrays.
        """

        def _load_dense(npz_path):
            # Sparse .npz on disk -> dense float32 array in memory.
            return scipy.sparse.load_npz(npz_path).toarray().astype('float32')

        self.train_bow = _load_dense(f'{path}/train_bow.npz')
        self.test_bow = _load_dense(f'{path}/test_bow.npz')
        self.pretrained_WE = _load_dense(f'{path}/word_embeddings.npz')

        self.train_texts = file_utils.read_text(f'{path}/train_texts.txt')
        self.test_texts = file_utils.read_text(f'{path}/test_texts.txt')

        if read_labels:
            self.train_labels = np.loadtxt(f'{path}/train_labels.txt', dtype=int)
            self.test_labels = np.loadtxt(f'{path}/test_labels.txt', dtype=int)

        self.vocab = file_utils.read_text(f'{path}/vocab.txt')

In [24]:
import os

# Show the working directory, then the files/folders it contains.
print(os.getcwd(), os.listdir(), sep="\n")

d:\MachineLearning\federated_vae\main
['basic_dataset.ipynb', 'basic_trainer.ipynb', 'data', 'evaluation', 'experiment.ipynb', 'model', 'sample.ipynb', 'test_flwr', 'trainer', 'utils', '__init__.py', '__pycache__']


In [25]:
### test
# Smoke test: load the 20NG dataset directory with the default options.
test_basic_dataset = BasicDataset(dataset_dir="../data/20NG")

train_size:  11314
test_size:  7532
vocab_size:  5000
average length: 110.543


In [26]:
# Walk through the training loader once and print the shape of every batch
# as a quick check of the batching.
for train_data in test_basic_dataset.train_dataloader:
    print(train_data.shape)

torch.Size([200, 5000])
torch.Size([200, 5000])
torch.Size([200, 5000])
torch.Size([200, 5000])
torch.Size([200, 5000])
torch.Size([200, 5000])
torch.Size([200, 5000])
torch.Size([200, 5000])
torch.Size([200, 5000])
torch.Size([200, 5000])
torch.Size([200, 5000])
torch.Size([200, 5000])
torch.Size([200, 5000])
torch.Size([200, 5000])
torch.Size([200, 5000])
torch.Size([200, 5000])
torch.Size([200, 5000])
torch.Size([200, 5000])
torch.Size([200, 5000])
torch.Size([200, 5000])
torch.Size([200, 5000])
torch.Size([200, 5000])
torch.Size([200, 5000])
torch.Size([200, 5000])
torch.Size([200, 5000])
torch.Size([200, 5000])
torch.Size([200, 5000])
torch.Size([200, 5000])
torch.Size([200, 5000])
torch.Size([200, 5000])
torch.Size([200, 5000])
torch.Size([200, 5000])
torch.Size([200, 5000])
torch.Size([200, 5000])
torch.Size([200, 5000])
torch.Size([200, 5000])
torch.Size([200, 5000])
torch.Size([200, 5000])
torch.Size([200, 5000])
torch.Size([200, 5000])
torch.Size([200, 5000])
torch.Size([200,