In [1]:
import sys
# Location of the project checkout; added to sys.path so the `main.*` packages
# imported by the following cells can be resolved.
project_root = "d:/MachineLearning/federated_vae"
# Guard against stacking duplicate entries when this cell is re-run in the same kernel.
if project_root not in sys.path:
    sys.path.append(project_root)

In [2]:
import torch
from main.data.basic_dataset import BasicDataset
import torch.nn as nn
from collections import defaultdict
import numpy as np
from main.utils import _utils

In [94]:
class BasicTrainer:
    """Training / inference driver for a topic model over a ``BasicDataset``.

    The trainer relies on the following members, all of which are accessed by
    the methods below:

    * ``model(batch)`` returns a mapping whose ``'loss'`` entry can be
      back-propagated.
    * ``model.get_theta(batch)`` and ``model.get_beta()`` return tensors.
    * ``dataset`` exposes ``train_data``, ``test_data``, ``train_dataloader``
      and ``vocab``.

    The model is NOT moved to ``device`` by the trainer; only the input batches
    are. The caller is responsible for placing the model on the same device.
    """

    def __init__(self, 
                 model : nn.Module,
                 dataset : "BasicDataset",
                 num_top_words = 15,
                 epochs = 200,
                 learning_rate = 0.002,
                 batch_size = 200,
                 verbose = False,
                 device = "cuda",
                 log_interval = 1):
        """Store the training configuration.

        Args:
            model: module to optimise (see class docstring for the expected API).
            dataset: data container providing the training/test data and vocab.
            num_top_words: default number of words requested per topic in
                ``get_top_words``.
            epochs: number of passes over ``dataset.train_dataloader``.
            learning_rate: learning rate handed to Adam.
            batch_size: chunk size used by ``test`` when running inference.
            verbose: forwarded to ``_utils.get_top_words``.
            device: device every input batch is moved to before it reaches the model.
            log_interval: print the epoch loss every ``log_interval`` epochs;
                a falsy value (``0`` / ``None``) disables the epoch log.
        """
        self.model = model
        self.dataset = dataset
        self.num_top_words = num_top_words
        self.epochs = epochs
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.verbose = verbose
        self.log_interval = log_interval
        # Denominator of the per-epoch average loss that is printed in train().
        self.data_size = len(self.dataset.train_data)
        self.device = device

    def make_optimizer(self):
        """Return a fresh Adam optimizer over all model parameters."""
        return torch.optim.Adam(self.model.parameters(), lr = self.learning_rate)
    
    def train(self):
        """Optimise the model for ``self.epochs`` epochs.

        Returns:
            tuple: ``(top_words, train_theta)`` where ``top_words`` comes from
            ``get_top_words()`` and ``train_theta`` from running ``test`` on
            ``dataset.train_data``.
        """
        optimizer = self.make_optimizer()

        for epoch in range(self.epochs):
            self.model.train()
            total_loss = 0.0
            for batch_data in self.dataset.train_dataloader:
                batch_data = batch_data.to(self.device)
                output = self.model(batch_data)

                batch_loss = output['loss']

                optimizer.zero_grad()
                batch_loss.backward()
                optimizer.step()

                # Accumulate a detached value: the running sum is only used for
                # logging and must not keep every batch's autograd history alive.
                total_loss += batch_loss.detach() * len(batch_data)

            # A falsy log_interval switches the epoch log off (and avoids `% 0`).
            if self.log_interval and epoch % self.log_interval == 0:
                print(f"Epoch: {epoch:03d} | Loss: {total_loss / self.data_size}")

        top_words = self.get_top_words()
        train_theta = self.test(self.dataset.train_data)

        return top_words, train_theta

    def test(self, bow):
        """Run ``model.get_theta`` over ``bow`` in chunks of ``batch_size`` rows.

        Args:
            bow: indexable tensor whose first dimension enumerates the inputs;
                presumably a (documents x vocabulary) bag-of-words matrix.

        Returns:
            numpy.ndarray: the per-batch results stacked along the first axis.

        Note:
            The model is switched to eval mode and is left in eval mode afterwards.
        """
        data_size = bow.shape[0]
        theta = list()
        all_idx = torch.split(torch.arange(data_size), self.batch_size)
        with torch.no_grad():
            self.model.eval()
            for idx in all_idx:
                batch_input = bow[idx].to(self.device)
                batch_theta = self.model.get_theta(batch_input)
                theta.extend(batch_theta.cpu().tolist())

        return np.asarray(theta)

    def get_beta(self):
        """Return ``model.get_beta()`` as a detached NumPy array on the CPU."""
        beta = self.model.get_beta().detach().cpu().numpy()
        return beta

    def get_top_words(self, num_top_words=None):
        """Return the result of ``_utils.get_top_words`` for the current beta.

        Args:
            num_top_words: number of words requested per topic; falls back to
                ``self.num_top_words`` when ``None``.
        """
        if num_top_words is None:
            num_top_words = self.num_top_words
        beta = self.get_beta()
        top_words = _utils.get_top_words(beta, self.dataset.vocab, num_top_words, self.verbose)
        return top_words

    def export_theta(self):
        """Return ``(train_theta, test_theta)`` computed with ``test`` on both splits."""
        train_theta = self.test(self.dataset.train_data)
        test_theta = self.test(self.dataset.test_data)
        return train_theta, test_theta


In [99]:
### test
from main.model.ETM import ETM
# Pick the device once so the model and the trainer's batches always agree,
# and fall back to CPU when no CUDA device is available.
device = "cuda" if torch.cuda.is_available() else "cpu"

test_basic_dataset = BasicDataset(
    dataset_dir = "../../data/20NG"
)
# The trainer does not move the model itself, so place it on `device` here.
test_model = ETM(test_basic_dataset.vocab_size).to(device)
test_basic_trainer = BasicTrainer(
    model=test_model,
    dataset=test_basic_dataset,
    verbose=True,
    epochs = 100,
    device=device
)

train_size:  11314
test_size:  7532
vocab_size:  5000
average length: 110.543


In [100]:
# Run the full training loop; `rst` is the (top_words, train_theta) tuple
# returned by BasicTrainer.train(). The per-epoch loss is printed while it runs.
rst = test_basic_trainer.train()

Epoch: 000 | Loss: 2006.024169921875
Epoch: 001 | Loss: 1504.8192138671875
Epoch: 002 | Loss: 1261.488525390625
Epoch: 003 | Loss: 1135.0997314453125
Epoch: 004 | Loss: 1056.57275390625
Epoch: 005 | Loss: 1007.9369506835938
Epoch: 006 | Loss: 976.2202758789062
Epoch: 007 | Loss: 953.3737182617188
Epoch: 008 | Loss: 936.7518920898438
Epoch: 009 | Loss: 924.4547729492188
Epoch: 010 | Loss: 914.7496948242188
Epoch: 011 | Loss: 906.7327270507812
Epoch: 012 | Loss: 900.0980224609375
Epoch: 013 | Loss: 894.4736328125
Epoch: 014 | Loss: 889.7579345703125
Epoch: 015 | Loss: 885.5658569335938
Epoch: 016 | Loss: 882.1094360351562
Epoch: 017 | Loss: 879.121337890625
Epoch: 018 | Loss: 876.5726318359375
Epoch: 019 | Loss: 874.4020385742188
Epoch: 020 | Loss: 872.5117797851562
Epoch: 021 | Loss: 870.705078125
Epoch: 022 | Loss: 869.1187744140625
Epoch: 023 | Loss: 867.86328125
Epoch: 024 | Loss: 866.7814331054688
Epoch: 025 | Loss: 865.7118530273438
Epoch: 026 | Loss: 864.8282470703125
Epoch: 027 |

In [101]:
########################### test new documents ####################################
from main.data.preprocess import Preprocess

preprocess = Preprocess()

new_docs = [
    "This is a new document about space, including words like space, satellite, launch, orbit.",
    "This is a new document about Microsoft Windows, including words like windows, files, dos."
]

# Parse the unseen documents against the training vocabulary so the resulting
# bag-of-words columns line up with what the model was trained on.
parsed_new_docs, new_bow = preprocess.parse(new_docs, vocab=test_basic_dataset.vocab)
print(new_bow.shape)

print(new_bow.toarray())
# Named `new_bow_tensor` rather than `input` so the builtin `input()` is not
# shadowed for the rest of the session; the device follows the trainer's
# setting instead of a hard-coded "cuda".
new_bow_tensor = torch.as_tensor(new_bow.toarray(), device=test_basic_trainer.device).float()
new_theta = test_basic_trainer.test(new_bow_tensor)

# Column index of the largest theta entry for each new document.
print(new_theta.argmax(1))

parsing texts: 100%|██████████| 2/2 [00:00<?, ?it/s]

(2, 5000)
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
[19  4]





In [102]:
# Unpack the (top_words, train_theta) pair returned by BasicTrainer.train().
top_words, train_theta = rst

In [103]:
# For every new document, look up the top-words entry at the column index
# where its theta row is largest, and print it.
dominant_topics = new_theta.argmax(1)
for topic_id in dominant_topics:
    print(top_words[topic_id])

posting please space everyone michael clipper company pay experience college anything performance happen texas business
windows software lot public took western includes non library computing population office young images sounds
