# CTGAN과 같은 모델에서 기본적으로 사용되는 기능들을 제공하는 함수들

In [1]:
"""BaseSynthesizer module."""

import contextlib
import functools

import numpy as np
import torch


@contextlib.contextmanager
def set_random_states(random_state, set_model_random_state):
    """Temporarily drive the global numpy/torch RNGs from a model's own random states.

    On entry the global numpy and torch RNG states are saved and replaced by the
    states held in ``random_state``. On exit the (now advanced) global states are
    copied into fresh generator objects, handed to ``set_model_random_state``, and
    the saved global states are put back.

    Args:
        random_state (tuple):
            A pair ``(numpy.random.RandomState, torch.Generator)``; it is unpacked
            directly, so any other kind of value fails here.
        set_model_random_state (function):
            Callback that receives the advanced ``(RandomState, Generator)`` pair.
    """
    global_np_backup = np.random.get_state()
    global_torch_backup = torch.get_rng_state()

    model_np_rng, model_torch_rng = random_state

    # Make the global RNGs continue from wherever the model's generators are.
    np.random.set_state(model_np_rng.get_state())
    torch.set_rng_state(model_torch_rng.get_state())

    try:
        yield
    finally:
        # Snapshot how far the global RNGs advanced while the body was running...
        advanced_np_rng = np.random.RandomState()
        advanced_np_rng.set_state(np.random.get_state())
        advanced_torch_rng = torch.Generator()
        advanced_torch_rng.set_state(torch.get_rng_state())
        # ...and give that snapshot back to the model so the next call continues from it.
        set_model_random_state((advanced_np_rng, advanced_torch_rng))

        # Finally undo the effect on the global RNGs.
        np.random.set_state(global_np_backup)
        torch.set_rng_state(global_torch_backup)


def random_state(function):
    """Decorate a method so it runs under the instance's own random states.

    When ``self.random_states`` is ``None`` the method is called unchanged. Otherwise
    the call happens inside ``set_random_states``, which swaps the instance's states
    into the global RNGs for the duration of the call and stores the advanced states
    back through ``self.set_random_state``.

    Args:
        function (Callable):
            The method to wrap; its first positional argument must be ``self``.

    Returns:
        Callable:
            The wrapped method, carrying the original's name and docstring.
    """

    # ``functools.wraps`` keeps ``__name__``/``__doc__`` of the decorated method intact,
    # which would otherwise all read as ``wrapper``.
    @functools.wraps(function)
    def wrapper(self, *args, **kwargs):
        if self.random_states is None:
            return function(self, *args, **kwargs)

        with set_random_states(self.random_states, self.set_random_state):
            return function(self, *args, **kwargs)

    return wrapper


class BaseSynthesizer:
    """Base class for all default synthesizers of ``CTGAN``.

    The methods below read ``self._device`` and call ``self.set_device``; neither is
    defined in this class, so concrete synthesizers are expected to provide them.
    """

    # ``None`` means no dedicated random state; otherwise a tuple of
    # ``(numpy.random.RandomState, torch.Generator)`` as stored by ``set_random_state``.
    random_states = None

    def __getstate__(self):
        """Improve pickling state for ``BaseSynthesizer``.

        Convert to ``cpu`` device before starting the pickling process in order to be able to
        load the model even when used from an external tool such as ``SDV``. Also, if
        ``random_states`` are set, store their raw states rather than the generator objects.

        Returns:
            dict:
                Python dict representing the object.
        """
        device_backup = self._device
        self.set_device(torch.device('cpu'))
        state = self.__dict__.copy()
        self.set_device(device_backup)
        if (
            isinstance(self.random_states, tuple) and
            isinstance(self.random_states[0], np.random.RandomState) and
            isinstance(self.random_states[1], torch.Generator)
        ):
            state['_numpy_random_state'] = self.random_states[0].get_state()
            state['_torch_random_state'] = self.random_states[1].get_state()
            # The tuple may live on the class rather than in the instance dict, in which
            # case there is nothing to remove from ``state``.
            state.pop('random_states', None)

        return state

    def __setstate__(self, state):
        """Restore the state of a ``BaseSynthesizer``.

        Rebuild ``random_states`` from the raw states if those are present in ``state``
        and then set the device according to the current hardware.

        Args:
            state (dict):
                Mapping produced by ``__getstate__``.
        """
        if '_numpy_random_state' in state and '_torch_random_state' in state:
            np_state = state.pop('_numpy_random_state')
            torch_state = state.pop('_torch_random_state')

            current_torch_state = torch.Generator()
            current_torch_state.set_state(torch_state)

            current_numpy_state = np.random.RandomState()
            current_numpy_state.set_state(np_state)
            state['random_states'] = (
                current_numpy_state,
                current_torch_state
            )

        self.__dict__ = state
        device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        self.set_device(device)

    def save(self, path):
        """Save the model in the passed `path`.

        The model is moved to ``cpu`` while it is written and moved back to its previous
        device afterwards, even when writing fails.
        """
        device_backup = self._device
        self.set_device(torch.device('cpu'))
        try:
            torch.save(self, path)
        finally:
            # Without this a failed write would silently leave the model on the CPU.
            self.set_device(device_backup)

    @classmethod
    def load(cls, path):
        """Load the model stored in the passed `path`.

        The file is a pickle of the whole synthesizer object, so it must come from a
        trusted source: unpickling can execute arbitrary code.
        """
        device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        # ``save`` stores the complete object, not just tensors, so the restricted
        # ``weights_only`` unpickler must be turned off explicitly for the load to work.
        model = torch.load(path, weights_only=False)
        model.set_device(device)
        return model

    def set_random_state(self, random_state):
        """Set the random state.

        Args:
            random_state (int, tuple, or None):
                Either a tuple containing the (numpy.random.RandomState, torch.Generator)
                or an int representing the random seed to use for both random states.
                ``None`` clears the random state.

        Raises:
            TypeError:
                If ``random_state`` is none of the accepted kinds.
        """
        if random_state is None:
            self.random_states = random_state
        elif isinstance(random_state, int):
            self.random_states = (
                np.random.RandomState(seed=random_state),
                torch.Generator().manual_seed(random_state),
            )
        elif (
            isinstance(random_state, tuple) and
            isinstance(random_state[0], np.random.RandomState) and
            isinstance(random_state[1], torch.Generator)
        ):
            self.random_states = random_state
        else:
            raise TypeError(
                f'`random_state` {random_state} expected to be an int or a tuple of '
                '(`np.random.RandomState`, `torch.Generator`)')

# DataSampler 함수 (이산형 변수 랜덤하게 선택한 후 로그확률로 카테고리 선택)

In [2]:
"""DataSampler module."""

import numpy as np


class DataSampler(object):
    """DataSampler samples the conditional vector and corresponding data for CTGAN.

    A column counts as discrete when its ``output_info`` entry holds exactly one span
    whose ``activation_fn`` is ``'softmax'``. The conditional vector is the concatenation
    of the one-hot slots of all discrete columns, in column order.
    """

    def __init__(self, data, output_info, log_frequency):
        """Index the training matrix by discrete column and category.

        Args:
            data (numpy.ndarray):
                2-D transformed training matrix; its width must equal the sum of all
                span dimensions in ``output_info``.
            output_info (list):
                One list of span descriptors per column; each descriptor exposes
                ``dim`` and ``activation_fn``.
            log_frequency (bool):
                If true, category probabilities are derived from ``log(count + 1)``
                instead of the raw counts.
        """
        self._data_length = len(data)

        def is_discrete_column(column_info):
            return (len(column_info) == 1
                    and column_info[0].activation_fn == 'softmax')

        discrete_dims = [
            column_info[0].dim
            for column_info in output_info
            if is_discrete_column(column_info)
        ]
        n_discrete_columns = len(discrete_dims)
        max_category = max(discrete_dims, default=0)

        # First column of each discrete column inside the data matrix.
        self._discrete_column_matrix_st = np.zeros(n_discrete_columns, dtype='int32')
        # First slot of each discrete column inside the conditional vector.
        self._discrete_column_cond_st = np.zeros(n_discrete_columns, dtype='int32')
        self._discrete_column_n_category = np.zeros(n_discrete_columns, dtype='int32')
        # Row i holds the category probabilities of discrete column i, zero-padded on the
        # right up to the widest discrete column.
        self._discrete_column_category_prob = np.zeros((n_discrete_columns, max_category))
        self._n_discrete_columns = n_discrete_columns
        self._n_categories = sum(discrete_dims)

        # Store the row id for each category in each discrete column.
        # For example _rid_by_cat_cols[a][b] is a list of all rows with the
        # a-th discrete column equal value b.
        self._rid_by_cat_cols = []

        st = 0
        current_id = 0
        current_cond_st = 0
        for column_info in output_info:
            if is_discrete_column(column_info):
                span_info = column_info[0]
                ed = st + span_info.dim

                self._rid_by_cat_cols.append(
                    [np.nonzero(data[:, st + j])[0] for j in range(span_info.dim)])

                category_freq = np.sum(data[:, st:ed], axis=0)
                if log_frequency:
                    category_freq = np.log(category_freq + 1)
                category_prob = category_freq / np.sum(category_freq)

                self._discrete_column_category_prob[current_id, :span_info.dim] = category_prob
                self._discrete_column_matrix_st[current_id] = st
                self._discrete_column_cond_st[current_id] = current_cond_st
                self._discrete_column_n_category[current_id] = span_info.dim
                current_cond_st += span_info.dim
                current_id += 1
                st = ed
            else:
                st += sum(span_info.dim for span_info in column_info)
        assert st == data.shape[1]

    def _random_choice_prob_index(self, discrete_column_id):
        """Draw one category per entry of ``discrete_column_id`` from that column's probabilities."""
        probs = self._discrete_column_category_prob[discrete_column_id]
        r = np.expand_dims(np.random.rand(probs.shape[0]), axis=1)
        # Inverse-CDF sampling: the first category whose cumulative probability exceeds r.
        return (probs.cumsum(axis=1) > r).argmax(axis=1)

    def sample_condvec(self, batch):
        """Generate the conditional vector for training.

        Args:
            batch (int):
                Number of conditional vectors to draw.

        Returns:
            ``None`` when there are no discrete columns, otherwise a tuple of:

            cond (batch x #categories):
                The conditional vector.
            mask (batch x #discrete columns):
                A one-hot vector indicating the selected discrete column.
            discrete column id (batch):
                Integer representation of mask.
            category_id_in_col (batch):
                Selected category in the selected discrete column.
        """
        if self._n_discrete_columns == 0:
            return None

        discrete_column_id = np.random.choice(
            np.arange(self._n_discrete_columns), batch)

        cond = np.zeros((batch, self._n_categories), dtype='float32')
        mask = np.zeros((batch, self._n_discrete_columns), dtype='float32')
        mask[np.arange(batch), discrete_column_id] = 1
        category_id_in_col = self._random_choice_prob_index(discrete_column_id)
        category_id = (self._discrete_column_cond_st[discrete_column_id] + category_id_in_col)
        cond[np.arange(batch), category_id] = 1

        return cond, mask, discrete_column_id, category_id_in_col

    def sample_original_condvec(self, batch):
        """Generate conditional vectors for generation from the stored category probabilities.

        Each row gets a single 1, placed on a slot drawn in proportion to the stored
        per-category probabilities of all discrete columns taken together.

        Returns:
            numpy.ndarray or None:
                ``batch x #categories`` float32 matrix, or ``None`` when there are no
                discrete columns.
        """
        if self._n_discrete_columns == 0:
            return None

        # Keep exactly the real category slots of every column and drop only the padding.
        # Filtering on ``prob != 0`` would also drop real categories with zero probability
        # and shift every later index away from its slot in the conditional vector.
        n_slots = self._discrete_column_category_prob.shape[1]
        valid = np.arange(n_slots) < self._discrete_column_n_category[:, np.newaxis]
        category_freq = self._discrete_column_category_prob[valid]
        category_freq = category_freq / np.sum(category_freq)
        col_idxs = np.random.choice(np.arange(len(category_freq)), batch, p=category_freq)
        cond = np.zeros((batch, self._n_categories), dtype='float32')
        cond[np.arange(batch), col_idxs] = 1

        return cond

    def sample_data(self, data, n, col, opt):
        """Sample data from original training data satisfying the sampled conditional vector.

        Args:
            data:
                The training data.
            n (int):
                Number of rows to draw; only used when ``col`` is ``None``.
            col:
                Discrete column id per requested row, or ``None`` for unconditional
                uniform sampling.
            opt:
                Category id (within its column) per requested row.

        Returns:
            Rows of ``data``: ``n`` rows when ``col`` is ``None``, otherwise one row per
            ``(col, opt)`` pair.
        """
        if col is None:
            idx = np.random.randint(len(data), size=n)
            return data[idx]

        idx = [np.random.choice(self._rid_by_cat_cols[c][o]) for c, o in zip(col, opt)]

        return data[idx]

    def dim_cond_vec(self):
        """Return the total number of categories."""
        return self._n_categories

    def generate_cond_from_condition_column_info(self, condition_info, batch):
        """Generate a batch of identical condition vectors for one fixed category.

        Args:
            condition_info (dict):
                Must provide ``'discrete_column_id'`` and ``'value_id'``.
            batch (int):
                Number of rows to produce.

        Returns:
            numpy.ndarray:
                ``batch x #categories`` float32 matrix with a single column set to 1.
        """
        vec = np.zeros((batch, self._n_categories), dtype='float32')
        # The vector only contains discrete-column slots, so the offset has to be the
        # column's start inside the conditional vector, not inside the data matrix.
        id_ = int(self._discrete_column_cond_st[condition_info['discrete_column_id']])
        id_ += int(condition_info['value_id'])
        vec[:, id_] = 1
        return vec

# 연속형 변수에 대해서 GMM으로 [-1, 1]로 정규화 한 후 Tanh로 변환
# 이산형 변수에 대해서 Onehotencoder사용해 원-핫 벡터 사용

In [3]:
!pip install rdt



In [4]:
"""DataTransformer module."""

from collections import namedtuple

import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from rdt.transformers import ClusterBasedNormalizer, OneHotEncoder

# Describes one contiguous slice of the transformed matrix: how many columns it spans
# and the name of the activation function associated with it.
SpanInfo = namedtuple('SpanInfo', 'dim activation_fn')

# Everything recorded about one fitted input column: its name, its kind
# ('continuous' or 'discrete'), the fitted transformer, the spans it produces and
# the total number of output columns.
ColumnTransformInfo = namedtuple(
    'ColumnTransformInfo',
    'column_name column_type transform output_info output_dimensions',
)


class DataTransformer(object):
    """Data Transformer.

    Model continuous columns with a BayesianGMM and normalize them to a scalar between [-1, 1]
    and a vector. Discrete columns are encoded using a OneHotEncoder.
    """

    def __init__(self, max_clusters=10, weight_threshold=0.005):
        """Create a data transformer.

        Args:
            max_clusters (int):
                Maximum number of Gaussian distributions in Bayesian GMM.
            weight_threshold (float):
                Weight threshold for a Gaussian distribution to be kept.
        """
        self._max_clusters = max_clusters
        self._weight_threshold = weight_threshold

    def _fit_continuous(self, data):
        """Train Bayesian GMM for continuous columns.

        Args:
            data (pd.DataFrame):
                A dataframe containing a column.

        Returns:
            namedtuple:
                A ``ColumnTransformInfo`` object.
        """
        column_name = data.columns[0]
        gm = ClusterBasedNormalizer(
            missing_value_generation='from_column',
            # Never ask for more clusters than there are rows.
            max_clusters=min(len(data), self._max_clusters),
            weight_threshold=self._weight_threshold
        )
        gm.fit(data, column_name)
        num_components = sum(gm.valid_component_indicator)

        # One normalized scalar followed by a one-hot over the retained components.
        return ColumnTransformInfo(
            column_name=column_name, column_type='continuous', transform=gm,
            output_info=[SpanInfo(1, 'tanh'), SpanInfo(num_components, 'softmax')],
            output_dimensions=1 + num_components)

    def _fit_discrete(self, data):
        """Fit one hot encoder for discrete column.

        Args:
            data (pd.DataFrame):
                A dataframe containing a column.

        Returns:
            namedtuple:
                A ``ColumnTransformInfo`` object.
        """
        column_name = data.columns[0]
        ohe = OneHotEncoder()
        ohe.fit(data, column_name)
        num_categories = len(ohe.dummies)

        return ColumnTransformInfo(
            column_name=column_name, column_type='discrete', transform=ohe,
            output_info=[SpanInfo(num_categories, 'softmax')],
            output_dimensions=num_categories)

    def fit(self, raw_data, discrete_columns=()):
        """Fit the ``DataTransformer``.

        Fits a ``ClusterBasedNormalizer`` for continuous columns and a
        ``OneHotEncoder`` for discrete columns.

        This step also counts the #columns in matrix data and span information.

        Args:
            raw_data (pd.DataFrame or numpy.ndarray):
                The data to fit on. Arrays are wrapped in a dataframe whose column
                names are the stringified column positions.
            discrete_columns (iterable):
                Names of the columns to treat as discrete; every other column is
                treated as continuous.
        """
        self.output_info_list = []
        self.output_dimensions = 0
        # Remembers the input kind so ``inverse_transform`` can return the same kind.
        self.dataframe = True

        if not isinstance(raw_data, pd.DataFrame):
            self.dataframe = False
            # work around for RDT issue #328 Fitting with numerical column names fails
            discrete_columns = [str(column) for column in discrete_columns]
            column_names = [str(num) for num in range(raw_data.shape[1])]
            raw_data = pd.DataFrame(raw_data, columns=column_names)

        self._column_raw_dtypes = raw_data.infer_objects().dtypes
        self._column_transform_info_list = []
        for column_name in raw_data.columns:
            if column_name in discrete_columns:
                column_transform_info = self._fit_discrete(raw_data[[column_name]])
            else:
                column_transform_info = self._fit_continuous(raw_data[[column_name]])

            self.output_info_list.append(column_transform_info.output_info)
            self.output_dimensions += column_transform_info.output_dimensions
            self._column_transform_info_list.append(column_transform_info)

    def _transform_continuous(self, column_transform_info, data):
        """Transform one continuous column into ``[normalized value, one-hot component]``."""
        column_name = data.columns[0]
        flattened_column = data[column_name].to_numpy().flatten()
        data = data.assign(**{column_name: flattened_column})
        gm = column_transform_info.transform
        transformed = gm.transform(data)

        #  Converts the transformed data to the appropriate output format.
        #  The first column (ending in '.normalized') stays the same,
        #  but the label encoded column (ending in '.component') is one hot encoded.
        output = np.zeros((len(transformed), column_transform_info.output_dimensions))
        output[:, 0] = transformed[f'{column_name}.normalized'].to_numpy()
        index = transformed[f'{column_name}.component'].to_numpy().astype(int)
        # Column 0 holds the scalar, so component k lives in column k + 1.
        output[np.arange(index.size), index + 1] = 1.0

        return output

    def _transform_discrete(self, column_transform_info, data):
        """Transform one discrete column with its fitted encoder and return a numpy array."""
        ohe = column_transform_info.transform
        return ohe.transform(data).to_numpy()

    def _synchronous_transform(self, raw_data, column_transform_info_list):
        """Take a Pandas DataFrame and transform columns synchronous.

        Outputs a list with Numpy arrays.
        """
        column_data_list = []
        for column_transform_info in column_transform_info_list:
            column_name = column_transform_info.column_name
            data = raw_data[[column_name]]
            if column_transform_info.column_type == 'continuous':
                column_data_list.append(self._transform_continuous(column_transform_info, data))
            else:
                column_data_list.append(self._transform_discrete(column_transform_info, data))

        return column_data_list

    def _parallel_transform(self, raw_data, column_transform_info_list):
        """Take a Pandas DataFrame and transform columns in parallel.

        Outputs a list with Numpy arrays.
        """
        processes = []
        for column_transform_info in column_transform_info_list:
            column_name = column_transform_info.column_name
            data = raw_data[[column_name]]
            process = None
            if column_transform_info.column_type == 'continuous':
                process = delayed(self._transform_continuous)(column_transform_info, data)
            else:
                process = delayed(self._transform_discrete)(column_transform_info, data)
            processes.append(process)

        return Parallel(n_jobs=-1)(processes)

    def transform(self, raw_data):
        """Take raw data and output a matrix data."""
        if not isinstance(raw_data, pd.DataFrame):
            column_names = [str(num) for num in range(raw_data.shape[1])]
            raw_data = pd.DataFrame(raw_data, columns=column_names)

        # Only use parallelization with larger data sizes.
        # Otherwise, the transformation will be slower.
        if raw_data.shape[0] < 500:
            column_data_list = self._synchronous_transform(
                raw_data,
                self._column_transform_info_list
            )
        else:
            column_data_list = self._parallel_transform(
                raw_data,
                self._column_transform_info_list
            )

        return np.concatenate(column_data_list, axis=1).astype(float)

    def _inverse_transform_continuous(self, column_transform_info, column_data, sigmas, st):
        """Map ``[normalized value, component scores]`` back to the original continuous column.

        When ``sigmas`` is given, the normalized value is replaced by a draw from a
        normal distribution centred on it with standard deviation ``sigmas[st]``.
        """
        gm = column_transform_info.transform
        data = pd.DataFrame(
            column_data[:, :2], columns=list(gm.get_output_sdtypes())).astype(float)
        # The second column is overwritten with the index of the highest-scoring component.
        data[data.columns[1]] = np.argmax(column_data[:, 1:], axis=1)
        if sigmas is not None:
            selected_normalized_value = np.random.normal(data.iloc[:, 0], sigmas[st])
            data.iloc[:, 0] = selected_normalized_value

        return gm.reverse_transform(data)

    def _inverse_transform_discrete(self, column_transform_info, column_data):
        """Map a one-hot block back to the original discrete column."""
        ohe = column_transform_info.transform
        data = pd.DataFrame(column_data, columns=list(ohe.get_output_sdtypes()))
        return ohe.reverse_transform(data)[column_transform_info.column_name]

    def inverse_transform(self, data, sigmas=None):
        """Take matrix data and output raw data.

        Output uses the same type as input to the ``fit`` function.
        Either np array or pd dataframe.
        """
        st = 0
        recovered_column_data_list = []
        column_names = []
        for column_transform_info in self._column_transform_info_list:
            dim = column_transform_info.output_dimensions
            column_data = data[:, st:st + dim]
            if column_transform_info.column_type == 'continuous':
                recovered_column_data = self._inverse_transform_continuous(
                    column_transform_info, column_data, sigmas, st)
            else:
                recovered_column_data = self._inverse_transform_discrete(
                    column_transform_info, column_data)

            recovered_column_data_list.append(recovered_column_data)
            column_names.append(column_transform_info.column_name)
            st += dim

        recovered_data = np.column_stack(recovered_column_data_list)
        recovered_data = (pd.DataFrame(recovered_data, columns=column_names)
                          .astype(self._column_raw_dtypes))
        if not self.dataframe:
            recovered_data = recovered_data.to_numpy()

        return recovered_data

    def convert_column_name_value_to_id(self, column_name, value):
        """Get the ids of the given `column_name` and of `value` inside it.

        Args:
            column_name:
                Name of a discrete column seen during ``fit``.
            value:
                A category of that column.

        Returns:
            dict:
                ``discrete_column_id`` (position among the discrete columns),
                ``column_id`` (position among all columns) and ``value_id`` (position
                of ``value`` in the column's one-hot encoding).

        Raises:
            ValueError:
                If the column does not exist, is not discrete, or does not contain
                ``value``.
        """
        discrete_counter = 0
        column_id = 0
        for column_transform_info in self._column_transform_info_list:
            if column_transform_info.column_name == column_name:
                break
            if column_transform_info.column_type == 'discrete':
                discrete_counter += 1

            column_id += 1

        else:
            raise ValueError(f"The column_name `{column_name}` doesn't exist in the data.")

        # Only discrete columns have a one-hot encoder and a slot in the conditional
        # vector; a continuous column would otherwise yield meaningless ids.
        if column_transform_info.column_type != 'discrete':
            raise ValueError(f"The column_name `{column_name}` is not a discrete column.")

        ohe = column_transform_info.transform
        data = pd.DataFrame([value], columns=[column_transform_info.column_name])
        one_hot = ohe.transform(data).to_numpy()[0]
        if sum(one_hot) == 0:
            raise ValueError(f"The value `{value}` doesn't exist in the column `{column_name}`.")

        return {
            'discrete_column_id': discrete_counter,
            'column_id': column_id,
            'value_id': np.argmax(one_hot)
        }

# G, D, Q 네트워크 생성

# Q네트워크에서 평균이랑 분산 pac 사이즈 고려하지 않고 Batch_size,latent_dim으로 출력하는 판별기 코드

In [5]:
import torch
from torch import nn
from torch.nn import Module, Linear, LeakyReLU, Dropout, Sequential

class Discriminator(Module):
    """Discriminator for the CTGAN with D_head and Q_head for InfoGAN.

    Rows are processed in packs: ``pac`` consecutive input rows are concatenated into
    one vector before the shared layers, so the shared trunk and both heads produce one
    output per pack.
    """

    def __init__(self, input_dim, discriminator_dim, latent_dim, pac=10):
        """Build the shared trunk and the two heads.

        Args:
            input_dim (int):
                Width of a single input row.
            discriminator_dim (iterable of int):
                Output width of each hidden layer of the shared trunk.
            latent_dim (int):
                Width of the latent code predicted by the Q head.
            pac (int):
                Number of rows concatenated into one pack.
        """
        super(Discriminator, self).__init__()
        dim = input_dim * pac
        self.pac = pac
        self.pacdim = dim

        # Shared layers between D_head and Q_head
        seq = []
        for item in list(discriminator_dim):
            seq += [Linear(dim, item), LeakyReLU(0.2), Dropout(0.5)]
            dim = item

        self.shared = Sequential(*seq)

        # D_head for real/fake classification
        self.D_head = Linear(dim, 1)

        # Q_head for latent code prediction
        self.Q_mu = Linear(dim, latent_dim)
        self.Q_var = Linear(dim, latent_dim)

    def calc_gradient_penalty(self, real_data, fake_data, device='cpu', pac=10, lambda_=10):
        """Compute the gradient penalty.

        The penalty is ``lambda_`` times the mean squared deviation from 1 of the
        gradient norm of the discriminator output, taken per pack at random
        interpolations between ``real_data`` and ``fake_data``.

        Note that ``pac`` here is its own argument and is not read from ``self.pac``.
        """
        # One interpolation weight per pack, shared by all rows and features of the pack.
        alpha = torch.rand(real_data.size(0) // pac, 1, 1, device=device)
        alpha = alpha.repeat(1, pac, real_data.size(1))
        alpha = alpha.view(-1, real_data.size(1))

        interpolates = alpha * real_data + ((1 - alpha) * fake_data)

        # Only the real/fake score takes part in the penalty.
        disc_interpolates = self(interpolates)[0]

        gradients = torch.autograd.grad(
            outputs=disc_interpolates, inputs=interpolates,
            grad_outputs=torch.ones(disc_interpolates.size(), device=device),
            create_graph=True, retain_graph=True, only_inputs=True
        )[0]

        gradients_view = gradients.view(-1, pac * real_data.size(1)).norm(2, dim=1) - 1
        gradient_penalty = ((gradients_view) ** 2).mean() * lambda_

        return gradient_penalty

    def forward(self, input_):
        """Apply the Discriminator and Q network to the `input_`.

        Args:
            input_ (torch.Tensor):
                ``(batch, input_dim)`` tensor whose batch size is a multiple of ``pac``.

        Returns:
            tuple:
                ``disc_output`` of shape ``(batch // pac, 1)``, and ``mu`` and ``var``
                of shape ``(batch, latent_dim)``, where every row carries the
                prediction of the pack that row belongs to.
        """
        batch_size = input_.size(0)
        assert batch_size % self.pac == 0
        # Combine input according to pacdim: rows [k * pac, (k + 1) * pac) form pack k.
        combined_input = input_.view(batch_size // self.pac, -1)
        shared_output = self.shared(combined_input)

        # D_head for real/fake classification
        disc_output = self.D_head(shared_output)

        # Q_head for latent code prediction
        mu = self.Q_mu(shared_output)
        var = torch.exp(self.Q_var(shared_output))  # Ensure variance is positive

        # Expand the per-pack predictions to one row per sample. Each pack's prediction
        # must be repeated ``pac`` times *in place* so row i receives pack i // pac;
        # tiling the whole tensor would hand row i the prediction of another pack.
        mu = mu.repeat_interleave(self.pac, dim=0)
        var = var.repeat_interleave(self.pac, dim=0)

        return disc_output, mu, var

# Test with batch size and pac
# Smoke test: run one forward pass through a freshly built Discriminator and print the
# shapes of its three outputs. batch_size is chosen as a multiple of pac because the
# forward pass asserts divisibility.
input_dim = 128
discriminator_dim = [256, 128]
latent_dim = 10
pac = 10
batch_size = 100

discriminator = Discriminator(input_dim, discriminator_dim, latent_dim, pac=pac)

# Generate random data for testing
real_data = torch.randn(batch_size, input_dim)
fake_data = torch.randn(batch_size, input_dim)  # created but not used by the lines below
disc_output, mu, var = discriminator(real_data)
# disc_output has one row per pack; mu and var have one row per sample.
print(disc_output.shape, mu.shape, var.shape)


torch.Size([10, 1]) torch.Size([100, 10]) torch.Size([100, 10])


# pac사이즈 고려해서 평균이랑 분산 출력하는 판별기 -> 이렇게 되면 차원이 맞지않음.(Q네트워크는 굳이 pac 사이즈 고려 필요 없음)

In [6]:
# import torch
# from torch import nn
# from torch.nn import Module, Linear, LeakyReLU, Dropout, Sequential

# class Discriminator(Module):
#     """Discriminator for the CTGAN with D_head and Q_head for InfoGAN."""

#     def __init__(self, input_dim, discriminator_dim, latent_dim, pac=10):
#         super(Discriminator, self).__init__()
#         dim = input_dim * pac
#         self.pac = pac
#         self.pacdim = dim

#         # Shared layers between D_head and Q_head
#         seq = []
#         for item in list(discriminator_dim):
#             seq += [Linear(dim, item), LeakyReLU(0.2), Dropout(0.5)]
#             dim = item

#         self.shared = Sequential(*seq)

#         # D_head for real/fake classification
#         self.D_head = Linear(dim, 1)

#         # Q_head for latent code prediction
#         self.Q_mu = Linear(dim, latent_dim)
#         self.Q_var = Linear(dim, latent_dim)

#     def calc_gradient_penalty(self, real_data, fake_data, device='cpu', pac=10, lambda_=10):
#         """Compute the gradient penalty."""
#         alpha = torch.rand(real_data.size(0) // pac, 1, 1, device=device)
#         alpha = alpha.repeat(1, pac, real_data.size(1))
#         alpha = alpha.view(-1, real_data.size(1))

#         interpolates = alpha * real_data + ((1 - alpha) * fake_data)

#         disc_interpolates = self(interpolates)[0]

#         gradients = torch.autograd.grad(
#             outputs=disc_interpolates, inputs=interpolates,
#             grad_outputs=torch.ones(disc_interpolates.size(), device=device),
#             create_graph=True, retain_graph=True, only_inputs=True
#         )[0]

#         gradients_view = gradients.view(-1, pac * real_data.size(1)).norm(2, dim=1) - 1
#         gradient_penalty = ((gradients_view) ** 2).mean() * lambda_

#         return gradient_penalty

#     def forward(self, input_):
#         """Apply the Discriminator and Q network to the `input_`."""
#         assert input_.size()[0] % self.pac == 0
#         shared_output = self.shared(input_.view(-1, self.pacdim))


#         # D_head for real/fake classification
#         disc_output = self.D_head(shared_output)

#         # Q_head for latent code prediction
#         mu = self.Q_mu(shared_output)
#         var = torch.exp(self.Q_var(shared_output))  # Ensure variance is positive

#         return disc_output, mu, var

# # Example usage
# input_dim = 128
# discriminator_dim = [256, 128]
# latent_dim = 2
# discriminator = Discriminator(input_dim, discriminator_dim, latent_dim)

# # Generate random data for testing
# real_data = torch.randn(100, input_dim)
# print(real_data.shape)
# fake_data = torch.randn(100, input_dim)
# disc_output, mu, var = discriminator(real_data)
# print(disc_output.shape, mu.shape, var.shape)

In [7]:
import torch
from torch.nn import Module, Linear, BatchNorm1d, ReLU

class Residual(Module):
    """Residual layer for the CTGAN with latent code.

    The layer feeds ``[input, latent code]`` through Linear -> BatchNorm1d -> ReLU and
    returns that activation concatenated in front of the untouched input.
    """

    def __init__(self, i, o, latent_dim):
        """Create the layer.

        Args:
            i (int):
                Width of the incoming features.
            o (int):
                Width of the newly computed features.
            latent_dim (int):
                Width of the latent code appended to the input.
        """
        super().__init__()
        self.fc = Linear(i + latent_dim, o)
        self.bn = BatchNorm1d(o)
        self.relu = ReLU()
        self.latent_dim = latent_dim

    def forward(self, input_, latent_code):
        """Apply the Residual layer to the `input_` and `latent_code`.

        Returns a tensor of width ``o + i``: the new features followed by ``input_``.
        """
        n_rows = input_.size(0)
        # ``expand`` lets a latent code with a single row be shared by every input row.
        code_per_row = latent_code.expand(n_rows, self.latent_dim)
        features = torch.cat((input_, code_per_row), dim=1)
        activated = self.relu(self.bn(self.fc(features)))
        return torch.cat((activated, input_), dim=1)


In [8]:
from torch.nn import Sequential

class Generator(Module):
    """Generator for the CTGAN with latent code.

    A stack of ``Residual`` layers, each of which also receives the latent code,
    followed by a final ``Linear`` projection to the data width.
    """

    def __init__(self, embedding_dim, generator_dim, data_dim, latent_dim):
        """Create the generator.

        Args:
            embedding_dim (int):
                Width of the generator input.
            generator_dim (iterable of int):
                Number of new features added by each ``Residual`` layer.
            data_dim (int):
                Width of the generated output.
            latent_dim (int):
                Width of the latent code passed to every ``Residual`` layer.
        """
        super().__init__()
        self.latent_dim = latent_dim

        width = embedding_dim
        layers = []
        for hidden_width in generator_dim:
            layers.append(Residual(width, hidden_width, latent_dim))
            # A Residual layer outputs its new features plus its input, so widths add up.
            width += hidden_width
        layers.append(Linear(width, data_dim))
        self.seq = Sequential(*layers)

    def forward(self, input_, latent_code):
        """Apply the Generator to the `input_` and `latent_code`."""
        hidden = input_
        for layer in self.seq:
            # Only the Residual layers take the latent code; the final Linear does not.
            if isinstance(layer, Residual):
                hidden = layer(hidden, latent_code)
            else:
                hidden = layer(hidden)
        return hidden


In [9]:
def sample(self, n, condition_column=None, condition_value=None):
    """Sample data similar to the training data.

    Choosing a condition_column and condition_value will increase the probability of the
    discrete condition_value happening in the condition_column. The condition is only
    applied when both are given.

    Args:
        n (int):
            Number of rows to sample.
        condition_column (string):
            Name of a discrete column.
        condition_value (string):
            Name of the category in the condition_column which we wish to increase the
            probability of happening.

    Returns:
        numpy.ndarray or pandas.DataFrame
    """
    batch_size = self._batch_size

    global_condition_vec = None
    if condition_column is not None and condition_value is not None:
        condition_info = self._transformer.convert_column_name_value_to_id(
            condition_column, condition_value)
        global_condition_vec = self._data_sampler.generate_cond_from_condition_column_info(
            condition_info, batch_size)

    # Always generates whole batches; the surplus rows are cut off after the loop.
    steps = n // batch_size + 1
    batches = []
    for _ in range(steps):
        # Draw the noise input of the generator.
        mean = torch.zeros(batch_size, self._embedding_dim)
        std = mean + 1
        fakez = torch.normal(mean=mean, std=std).to(self._device)

        if global_condition_vec is None:
            condvec = self._data_sampler.sample_original_condvec(batch_size)
        else:
            condvec = global_condition_vec.copy()

        # ``None`` means there is nothing to condition on; otherwise the condition
        # vector is appended to the noise.
        if condvec is not None:
            cond_tensor = torch.from_numpy(condvec).to(self._device)
            fakez = torch.cat([fakez, cond_tensor], dim=1)

        # NOTE(review): the generator is called with the noise only. The ``Generator``
        # class in this file also takes a latent code in ``forward`` - confirm which
        # generator ``self._generator`` is expected to be.
        fake = self._generator(fakez)
        fakeact = self._apply_activate(fake)
        batches.append(fakeact.detach().cpu().numpy())

    data = np.concatenate(batches, axis=0)[:n]

    return self._transformer.inverse_transform(data)

In [10]:
import numpy as np
import torch

class MutualInformationLoss(torch.nn.Module):
    """Negative Gaussian log-likelihood of ``x`` given a mean and a variance."""

    def __init__(self):
        super().__init__()

    def forward(self, x, mu, var):
        """Return the negative log-likelihood of ``x`` under ``N(mu, var)``.

        The element-wise log-likelihood is summed over dimension 1 and the
        result is averaged over the remaining entries.

        Args:
            x (torch.Tensor):
                Observed values.
            mu (torch.Tensor):
                Mean of the Gaussian.
            var (torch.Tensor):
                Variance of the Gaussian.

        Returns:
            torch.Tensor:
                Scalar tensor holding the negative log-likelihood.
        """
        # 1e-6 keeps the logarithm and the division away from zero variance.
        log_normalizer = -0.5 * torch.log(var * (2 * np.pi) + 1e-6)
        squared_error = (x - mu) ** 2
        scaled_error = squared_error / (var * 2.0 + 1e-6)
        log_likelihood = log_normalizer - scaled_error

        return -log_likelihood.sum(dim=1).mean()


In [11]:
# # CTGAN
# from torch import nn, optim
# from tqdm import tqdm
# from torch.nn import functional
# import warnings

# import numpy as np
# import pandas as pd
# import torch
# from torch import optim
# from torch.nn import BatchNorm1d, Dropout, LeakyReLU, Linear, Module, ReLU, Sequential, functional
# from tqdm import tqdm


# class CTGAN(BaseSynthesizer):
#     """Conditional Table GAN Synthesizer.

#     This is the core class of the CTGAN project, where the different components
#     are orchestrated together.
#     For more details about the process, please check the [Modeling Tabular data using
#     Conditional GAN](https://arxiv.org/abs/1907.00503) paper.

#     Args:
#         embedding_dim (int):
#             Size of the random sample passed to the Generator. Defaults to 128.
#         generator_dim (tuple or list of ints):
#             Size of the output samples for each one of the Residuals. A Residual Layer
#             will be created for each one of the values provided. Defaults to (256, 256).
#         discriminator_dim (tuple or list of ints):
#             Size of the output samples for each one of the Discriminator Layers. A Linear Layer
#             will be created for each one of the values provided. Defaults to (256, 256).
#         generator_lr (float):
#             Learning rate for the generator. Defaults to 2e-4.
#         generator_decay (float):
#             Generator weight decay for the Adam Optimizer. Defaults to 1e-6.
#         discriminator_lr (float):
#             Learning rate for the discriminator. Defaults to 2e-4.
#         discriminator_decay (float):
#             Discriminator weight decay for the Adam Optimizer. Defaults to 1e-6.
#         batch_size (int):
#             Number of data samples to process in each step.
#         discriminator_steps (int):
#             Number of discriminator updates to do for each generator update.
#             From the WGAN paper: https://arxiv.org/abs/1701.07875. WGAN paper
#             default is 5. Default used is 1 to match original CTGAN implementation.
#         log_frequency (boolean):
#             Whether to use log frequency of categorical levels in conditional
#             sampling. Defaults to ``True``.
#         verbose (boolean):
#             Whether to have print statements for progress results. Defaults to ``False``.
#         epochs (int):
#             Number of training epochs. Defaults to 300.
#         pac (int):
#             Number of samples to group together when applying the discriminator.
#             Defaults to 10.
#         cuda (bool):
#             Whether to attempt to use cuda for GPU computation.
#             If this is False or CUDA is not available, CPU will be used.
#             Defaults to ``True``.
#     """

#     def __init__(self, embedding_dim=128, generator_dim=(256, 256), discriminator_dim=(256, 256),
#                  latent_dim=1, generator_lr=2e-4, generator_decay=1e-6, discriminator_lr=2e-4,
#                  discriminator_decay=1e-6, batch_size=500, discriminator_steps=1,
#                  log_frequency=True, verbose=False, epochs=300, pac=10, cuda=True):


#         assert batch_size % 2 == 0

#         self._embedding_dim = embedding_dim
#         self._generator_dim = generator_dim
#         self._discriminator_dim = discriminator_dim
#         self._latent_dim = latent_dim  # latent_dim 추가

#         self._generator_lr = generator_lr
#         self._generator_decay = generator_decay
#         self._discriminator_lr = discriminator_lr
#         self._discriminator_decay = discriminator_decay

#         self._batch_size = batch_size
#         self._discriminator_steps = discriminator_steps
#         self._log_frequency = log_frequency
#         self._verbose = verbose
#         self._epochs = epochs
#         self.pac = pac

#         if not cuda or not torch.cuda.is_available():
#             device = 'cpu'
#         elif isinstance(cuda, str):
#             device = cuda
#         else:
#             device = 'cuda'

#         self._device = torch.device(device)

#         self._transformer = None
#         self._data_sampler = None
#         self._generator = None
#         self.loss_values = None


#     @staticmethod
#     def _gumbel_softmax(logits, tau=1, hard=False, eps=1e-10, dim=-1):
#         """Deals with the instability of the gumbel_softmax for older versions of torch.

#         For more details about the issue:
#         https://drive.google.com/file/d/1AA5wPfZ1kquaRtVruCd6BiYZGcDeNxyP/view?usp=sharing

#         Args:
#             logits […, num_features]:
#                 Unnormalized log probabilities
#             tau:
#                 Non-negative scalar temperature
#             hard (bool):
#                 If True, the returned samples will be discretized as one-hot vectors,
#                 but will be differentiated as if it is the soft sample in autograd
#             dim (int):
#                 A dimension along which softmax will be computed. Default: -1.

#         Returns:
#             Sampled tensor of same shape as logits from the Gumbel-Softmax distribution.
#         """
#         for _ in range(10):
#             transformed = functional.gumbel_softmax(logits, tau=tau, hard=hard, eps=eps, dim=dim)
#             if not torch.isnan(transformed).any():
#                 return transformed

#         raise ValueError('gumbel_softmax returning NaN.')


#     def _apply_activate(self, data):
#       """Apply proper activation function to the output of the generator."""
#       data_t = []
#       st = 0
#       for column_info in self._transformer.output_info_list:
#           for span_info in column_info:
#               if span_info.activation_fn == 'tanh':
#                   ed = st + span_info.dim
#                   data_t.append(torch.tanh(data[:, st:ed]))
#                   st = ed
#               elif span_info.activation_fn == 'softmax':
#                   ed = st + span_info.dim
#                   transformed = self._gumbel_softmax(data[:, st:ed], tau=0.2)
#                   data_t.append(transformed)
#                   st = ed
#               else:
#                   raise ValueError(f'Unexpected activation function {span_info.activation_fn}.')

#       return torch.cat(data_t, dim=1)

#     def _cond_loss(self, data, c, m):
#       """Compute the cross entropy loss on the fixed discrete column."""
#       loss = []
#       st = 0
#       st_c = 0
#       for column_info in self._transformer.output_info_list:
#           for span_info in column_info:
#               if len(column_info) != 1 or span_info.activation_fn != 'softmax':
#                   # not discrete column
#                   st += span_info.dim
#               else:
#                   ed = st + span_info.dim
#                   ed_c = st_c + span_info.dim
#                   tmp = functional.cross_entropy(
#                       data[:, st:ed],
#                       torch.argmax(c[:, st_c:ed_c], dim=1),
#                       reduction='none'
#                   )
#                   loss.append(tmp)
#                   st = ed
#                   st_c = ed_c

#       loss = torch.stack(loss, dim=1)  # noqa: PD013

#       return (loss * m).sum() / data.size()[0]


#     # 이산형 열이 존재하는지
#     def _validate_discrete_columns(self, train_data, discrete_columns):
#       """Check whether ``discrete_columns`` exists in ``train_data``.

#       Args:
#           train_data (numpy.ndarray or pandas.DataFrame):
#               Training Data. It must be a 2-dimensional numpy array or a pandas.DataFrame.
#           discrete_columns (list-like):
#               List of discrete columns to be used to generate the Conditional
#               Vector. If ``train_data`` is a Numpy array, this list should
#               contain the integer indices of the columns. Otherwise, if it is
#               a ``pandas.DataFrame``, this list should contain the column names.
#       """
#       if isinstance(train_data, pd.DataFrame):
#           invalid_columns = set(discrete_columns) - set(train_data.columns)
#       elif isinstance(train_data, np.ndarray):
#           invalid_columns = []
#           for column in discrete_columns:
#               if column < 0 or column >= train_data.shape[1]:
#                   invalid_columns.append(column)
#       else:
#           raise TypeError('``train_data`` should be either pd.DataFrame or np.array.')

#       if invalid_columns:
#           raise ValueError(f'Invalid columns found: {invalid_columns}')

#     @random_state
#     def fit(self, train_data, discrete_columns=(), epochs=None):
#       """Fit the CTGAN Synthesizer models to the training data.
#       # CTGAN 모델 학습 (latent code 추가)

#       Args:
#           train_data (numpy.ndarray or pandas.DataFrame):
#               Training Data. It must be a 2-dimensional numpy array or a pandas.DataFrame.
#           discrete_columns (list-like):
#               List of discrete columns to be used to generate the Conditional
#               Vector. If ``train_data`` is a Numpy array, this list should
#               contain the integer indices of the columns. Otherwise, if it is
#               a ``pandas.DataFrame``, this list should contain the column names.
#       """
#       self._validate_discrete_columns(train_data, discrete_columns)

#       if epochs is None:
#           epochs = self._epochs
#       else:
#           warnings.warn(
#               ('`epochs` argument in `fit` method has been deprecated and will be removed '
#                 'in a future version. Please pass `epochs` to the constructor instead'),
#               DeprecationWarning
#           )

#       self._transformer = DataTransformer()
#       self._transformer.fit(train_data, discrete_columns)

#       train_data = self._transformer.transform(train_data)

#       self._data_sampler = DataSampler(
#           train_data,
#           self._transformer.output_info_list,
#           self._log_frequency)

#       data_dim = self._transformer.output_dimensions

#       self._generator = Generator(
#           self._embedding_dim + self._data_sampler.dim_cond_vec(),
#           self._generator_dim,
#           data_dim,
#           self._latent_dim  # latent_dim 추가
#       ).to(self._device)

#       discriminator = Discriminator(
#           data_dim + self._data_sampler.dim_cond_vec(),
#           self._discriminator_dim,
#           latent_dim=self._latent_dim,  # latent_dim 추가
#           pac=self.pac
#       ).to(self._device)

#       optimizerG = optim.Adam(
#           self._generator.parameters(), lr=self._generator_lr, betas=(0.5, 0.9),
#           weight_decay=self._generator_decay
#       )

#       optimizerD = optim.Adam(
#           discriminator.parameters(), lr=self._discriminator_lr,
#           betas=(0.5, 0.9), weight_decay=self._discriminator_decay
#       )

#       mean = torch.zeros(self._batch_size, self._embedding_dim, device=self._device)
#       std = torch.ones(self._batch_size, self._embedding_dim, device=self._device)

#       self.loss_values = pd.DataFrame(columns=['Epoch', 'Generator Loss', 'Discriminator Loss'])

#       epoch_iterator = tqdm(range(epochs), disable=(not self._verbose))
#       if self._verbose:
#           description = 'Gen. ({gen:.2f}) | Discrim. ({dis:.2f})'
#           epoch_iterator.set_description(description.format(gen=0, dis=0))

#       steps_per_epoch = max(len(train_data) // self._batch_size, 1)
#       for i in epoch_iterator:
#           for id_ in range(steps_per_epoch):

#               for n in range(self._discriminator_steps):
#                   fakez = torch.normal(mean=mean, std=std)
#                   # latent_code = torch.normal(mean=torch.zeros(self._batch_size, self._latent_dim), std=torch.ones(self._batch_size, self._latent_dim)).to(self._device)  # latent_code 추가
#                   latent_code = torch.rand(self._batch_size, self._latent_dim, device=self._device) * 2 - 1  # uniform 분포에서 latent code 생성

#                   condvec = self._data_sampler.sample_condvec(self._batch_size)
#                   if condvec is None:
#                       c1, m1, col, opt = None, None, None, None
#                       real = self._data_sampler.sample_data(
#                           train_data, self._batch_size, col, opt)
#                   else:
#                       c1, m1, col, opt = condvec
#                       c1 = torch.from_numpy(c1).to(self._device)
#                       m1 = torch.from_numpy(m1).to(self._device)
#                       fakez = torch.cat([fakez, c1], dim=1)

#                       perm = np.arange(self._batch_size)
#                       np.random.shuffle(perm)
#                       real = self._data_sampler.sample_data(
#                           train_data, self._batch_size, col[perm], opt[perm])
#                       c2 = c1[perm]
#                   # conditional vector와 noise에다가 latent_code 추가
#                   fake = self._generator(fakez, latent_code)  # latent_code 추가
#                   fakeact = self._apply_activate(fake)

#                   real = torch.from_numpy(real.astype('float32')).to(self._device)

#                   if c1 is not None:
#                       fake_cat = torch.cat([fakeact, c1], dim=1)
#                       real_cat = torch.cat([real, c2], dim=1)
#                   else:
#                       real_cat = real
#                       fake_cat = fakeact

#                   y_fake = discriminator(fake_cat)
#                   y_real = discriminator(real_cat)

#                   pen = discriminator.calc_gradient_penalty(
#                       real_cat, fake_cat, self._device, self.pac)
#                   loss_d = -(torch.mean(y_real) - torch.mean(y_fake))

#                   optimizerD.zero_grad(set_to_none=False)
#                   pen.backward(retain_graph=True)
#                   loss_d.backward()
#                   optimizerD.step()

#               fakez = torch.normal(mean=mean, std=std)
#               # latent_code = torch.normal(mean=torch.zeros(self._batch_size, self._latent_dim), std=torch.ones(self._batch_size, self._latent_dim)).to(self._device)  # latent_code 추가
#               latent_code = torch.rand(self._batch_size, self._latent_dim, device=self._device) * 2 - 1  # uniform 분포에서 latent_code 생성
#               condvec = self._data_sampler.sample_condvec(self._batch_size)

#               if condvec is None:
#                   c1, m1, col, opt = None, None, None, None
#               else:
#                   c1, m1, col, opt = condvec
#                   c1 = torch.from_numpy(c1).to(self._device)
#                   m1 = torch.from_numpy(m1).to(self._device)
#                   fakez = torch.cat([fakez, c1], dim=1)

#               fake = self._generator(fakez, latent_code)  # latent_code 추가
#               fakeact = self._apply_activate(fake)

#               if c1 is not None:
#                   y_fake = discriminator(torch.cat([fakeact, c1], dim=1))
#               else:
#                   y_fake = discriminator(fakeact)

#               if condvec is None:
#                   cross_entropy = 0
#               else:
#                   cross_entropy = self._cond_loss(fake, c1, m1)

#               mu, var = self._q_network(fake)  # Q 네트워크로부터 mu와 var 추출
#               mi_loss = self.mutual_information_loss(latent_code, mu, var)  # mutual information loss 계산

#               loss_g = -torch.mean(y_fake) + cross_entropy + mi_loss  # mutual information loss를 Generator 손실에 추가

#               optimizerG.zero_grad(set_to_none=False)
#               loss_g.backward()
#               optimizerG.step()

#           generator_loss = loss_g.detach().cpu().item()
#           discriminator_loss = loss_d.detach().cpu().item()

#           epoch_loss_df = pd.DataFrame({
#               'Epoch': [i],
#               'Generator Loss': [generator_loss],
#               'Discriminator Loss': [discriminator_loss]
#           })
#           if not self.loss_values.empty:
#               self.loss_values = pd.concat(
#                   [self.loss_values, epoch_loss_df]
#               ).reset_index(drop=True)
#           else:
#               self.loss_values = epoch_loss_df

#           if self._verbose:
#               epoch_iterator.set_description(
#                   description.format(gen=generator_loss, dis=discriminator_loss)
#               )



#     def sample(self, n, condition_column=None, condition_value=None):
#         """Sample data similar to the training data.

#         Choosing a condition_column and condition_value will increase the probability of the
#         discrete condition_value happening in the condition_column.

#         Args:
#             n (int):
#                 Number of rows to sample.
#             condition_column (string):
#                 Name of a discrete column.
#             condition_value (string):
#                 Name of the category in the condition_column which we wish to increase the
#                 probability of happening.

#         Returns:
#             numpy.ndarray or pandas.DataFrame
#         """
#         if condition_column is not None and condition_value is not None:
#             condition_info = self._transformer.convert_column_name_value_to_id(
#                 condition_column, condition_value)
#             global_condition_vec = self._data_sampler.generate_cond_from_condition_column_info(
#                 condition_info, self._batch_size)
#         else:
#             global_condition_vec = None

#         steps = n // self._batch_size + 1
#         data = []
#         for i in range(steps):
#             mean = torch.zeros(self._batch_size, self._embedding_dim)
#             std = mean + 1
#             fakez = torch.normal(mean=mean, std=std).to(self._device)
#             latent_code = torch.rand(self._batch_size, self._latent_dim, device=self._device) * 2 - 1  # uniform 분포에서 latent code 생성

#             if global_condition_vec is not None:
#                 condvec = global_condition_vec.copy()
#             else:
#                 condvec = self._data_sampler.sample_original_condvec(self._batch_size)

#             if condvec is None:
#                 pass
#             else:
#                 c1 = condvec
#                 c1 = torch.from_numpy(c1).to(self._device)
#                 fakez = torch.cat([fakez, c1], dim=1)

#             fake = self._generator(fakez, latent_code) # latent code 추가
#             fakeact = self._apply_activate(fake)
#             data.append(fakeact.detach().cpu().numpy())

#         data = np.concatenate(data, axis=0)
#         data = data[:n]

#         return self._transformer.inverse_transform(data)

#     def set_device(self, device):
#       """Set the `device` to be used ('GPU' or 'CPU)."""
#       self._device = device
#       if self._generator is not None:
#           self._generator.to(self._device)
#       if self._discriminator is not None:
#           self._discriminator.to(self._device)
#       if self._q_network is not None:
#           self._q_network.to(self._device)


In [12]:
# CTGAN
from torch import nn, optim
from tqdm import tqdm
from torch.nn import functional
import warnings

import numpy as np
import pandas as pd
import torch
from torch import optim
from torch.nn import BatchNorm1d, Dropout, LeakyReLU, Linear, Module, ReLU, Sequential, functional
from tqdm import tqdm


class CTGAN(BaseSynthesizer):
    """Conditional Table GAN Synthesizer.

    This is the core class of the CTGAN project, where the different components
    are orchestrated together.
    For more details about the process, please check the [Modeling Tabular data using
    Conditional GAN](https://arxiv.org/abs/1907.00503) paper.

    Args:
        embedding_dim (int):
            Size of the random sample passed to the Generator. Defaults to 128.
        generator_dim (tuple or list of ints):
            Size of the output samples for each one of the Residuals. A Residual Layer
            will be created for each one of the values provided. Defaults to (256, 256).
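        discriminator_dim (tuple or list of ints):
            Size of the output samples for each one of the Discriminator Layers. A Linear Layer
            will be created for each one of the values provided. Defaults to (256, 256).
        latent_dim (int):
            Size of the latent code that is sampled uniformly from [-1, 1) and passed to the
            Generator together with the noise vector. Defaults to 1.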
        generator_lr (float):
            Learning rate for the generator. Defaults to 2e-4.
        generator_decay (float):
            Generator weight decay for the Adam Optimizer. Defaults to 1e-6.
        discriminator_lr (float):
            Learning rate for the discriminator. Defaults to 2e-4.
        discriminator_decay (float):
            Discriminator weight decay for the Adam Optimizer. Defaults to 1e-6.
        batch_size (int):
            Number of data samples to process in each step.
        discriminator_steps (int):
            Number of discriminator updates to do for each generator update.
            From the WGAN paper: https://arxiv.org/abs/1701.07875. WGAN paper
            default is 5. Default used is 1 to match original CTGAN implementation.
        log_frequency (boolean):
            Whether to use log frequency of categorical levels in conditional
            sampling. Defaults to ``True``.
        verbose (boolean):
            Whether to have print statements for progress results. Defaults to ``False``.
        epochs (int):
            Number of training epochs. Defaults to 300.
        pac (int):
            Number of samples to group together when applying the discriminator.
            Defaults to 10.
        cuda (bool or str):
            Whether to attempt to use cuda for GPU computation. If a string is given and
            CUDA is available, the string is used as the device name.
            If this is False or CUDA is not available, CPU will be used.
            Defaults to ``True``.
    """

    def __init__(self, embedding_dim=128, generator_dim=(256, 256), discriminator_dim=(256, 256),
                 latent_dim=1, generator_lr=2e-4, generator_decay=1e-6, discriminator_lr=2e-4,
                 discriminator_decay=1e-6, batch_size=500, discriminator_steps=1,
                 log_frequency=True, verbose=False, epochs=300, pac=10, cuda=True):
        """Store the hyper-parameters and resolve the torch device.

        No network is built here: the transformer, data sampler, generator,
        discriminator and Q network attributes all start as ``None``.

        Args:
            embedding_dim (int):
                Size of the random noise vector.
            generator_dim (tuple or list of ints):
                Layer sizes handed to the generator.
            discriminator_dim (tuple or list of ints):
                Layer sizes handed to the discriminator.
            latent_dim (int):
                Size of the latent code.
            generator_lr (float):
                Learning rate of the generator.
            generator_decay (float):
                Weight decay of the generator.
            discriminator_lr (float):
                Learning rate of the discriminator.
            discriminator_decay (float):
                Weight decay of the discriminator.
            batch_size (int):
                Number of rows per step. Must be even.
            discriminator_steps (int):
                Number of discriminator updates per generator update.
            log_frequency (bool):
                Forwarded to the data sampler.
            verbose (bool):
                Whether progress is displayed.
            epochs (int):
                Number of training epochs.
            pac (int):
                Number of samples grouped together by the discriminator.
            cuda (bool or str):
                Falsy, or CUDA unavailable: run on CPU. A string is used as
                the device name; any other truthy value selects ``'cuda'``.
        """
        assert batch_size % 2 == 0

        # Network shapes.
        self._embedding_dim = embedding_dim
        self._generator_dim = generator_dim
        self._discriminator_dim = discriminator_dim
        self._latent_dim = latent_dim  # size of the latent code

        # Optimizer settings.
        self._generator_lr, self._generator_decay = generator_lr, generator_decay
        self._discriminator_lr, self._discriminator_decay = (
            discriminator_lr, discriminator_decay)

        # Training loop settings.
        self._batch_size = batch_size
        self._discriminator_steps = discriminator_steps
        self._log_frequency = log_frequency
        self._verbose = verbose
        self._epochs = epochs
        self.pac = pac

        # ``torch.cuda.is_available`` is only consulted when cuda is truthy.
        if cuda and torch.cuda.is_available():
            device_name = cuda if isinstance(cuda, str) else 'cuda'
        else:
            device_name = 'cpu'

        self._device = torch.device(device_name)

        # Components that stay unset until they are assigned later on.
        self._transformer = None
        self._data_sampler = None
        self._generator = None
        self.loss_values = None
        self._discriminator = None  # placeholder for the discriminator
        self._q_network = None  # placeholder for the Q network
        self.mutual_information_loss = MutualInformationLoss()  # loss module instance

    @staticmethod
    def _gumbel_softmax(logits, tau=1, hard=False, eps=1e-10, dim=-1):
        """Deals with the instability of the gumbel_softmax for older versions of torch.

        For more details about the issue:
        https://drive.google.com/file/d/1AA5wPfZ1kquaRtVruCd6BiYZGcDeNxyP/view?usp=sharing

        Args:
            logits […, num_features]:
                Unnormalized log probabilities
            tau:
                Non-negative scalar temperature
            hard (bool):
                If True, the returned samples will be discretized as one-hot vectors,
                but will be differentiated as if it is the soft sample in autograd
            dim (int):
                A dimension along which softmax will be computed. Default: -1.

        Returns:
            Sampled tensor of same shape as logits from the Gumbel-Softmax distribution.
        """
        for _ in range(10):
            transformed = functional.gumbel_softmax(logits, tau=tau, hard=hard, eps=eps, dim=dim)
            if not torch.isnan(transformed).any():
                return transformed

        raise ValueError('gumbel_softmax returning NaN.')

    def _apply_activate(self, data):
        """Apply proper activation function to the output of the generator."""
        data_t = []
        st = 0
        for column_info in self._transformer.output_info_list:
            for span_info in column_info:
                if span_info.activation_fn == 'tanh':
                    ed = st + span_info.dim
                    data_t.append(torch.tanh(data[:, st:ed]))
                    st = ed
                elif span_info.activation_fn == 'softmax':
                    ed = st + span_info.dim
                    transformed = self._gumbel_softmax(data[:, st:ed], tau=0.2)
                    data_t.append(transformed)
                    st = ed
                else:
                    raise ValueError(f'Unexpected activation function {span_info.activation_fn}.')

        return torch.cat(data_t, dim=1)

    def _cond_loss(self, data, c, m):
        """Compute the cross entropy loss on the fixed discrete column."""
        loss = []
        st = 0
        st_c = 0
        for column_info in self._transformer.output_info_list:
            for span_info in column_info:
                if len(column_info) != 1 or span_info.activation_fn != 'softmax':
                    # not discrete column
                    st += span_info.dim
                else:
                    ed = st + span_info.dim
                    ed_c = st_c + span_info.dim
                    tmp = functional.cross_entropy(
                        data[:, st:ed],
                        torch.argmax(c[:, st_c:ed_c], dim=1),
                        reduction='none'
                    )
                    loss.append(tmp)
                    st = ed
                    st_c = ed_c

        loss = torch.stack(loss, dim=1)  # noqa: PD013

        return (loss * m).sum() / data.size()[0]

    def _validate_discrete_columns(self, train_data, discrete_columns):
        """Check whether ``discrete_columns`` exists in ``train_data``.

        Args:
            train_data (numpy.ndarray or pandas.DataFrame):
                Training Data. It must be a 2-dimensional numpy array or a pandas.DataFrame.
            discrete_columns (list-like):
                List of discrete columns to be used to generate the Conditional
                Vector. If ``train_data`` is a Numpy array, this list should
                contain the integer indices of the columns. Otherwise, if it is
                a ``pandas.DataFrame``, this list should contain the column names.
        """
        if isinstance(train_data, pd.DataFrame):
            invalid_columns = set(discrete_columns) - set(train_data.columns)
        elif isinstance(train_data, np.ndarray):
            invalid_columns = []
            for column in discrete_columns:
                if column < 0 or column >= train_data.shape[1]:
                    invalid_columns.append(column)
        else:
            raise TypeError('``train_data`` should be either pd.DataFrame or np.array.')

        if invalid_columns:
            raise ValueError(f'Invalid columns found: {invalid_columns}')

    @random_state
    def fit(self, train_data, discrete_columns=(), epochs=None):
        """Fit the CTGAN Synthesizer models to the training data.
        # CTGAN 모델 학습 (latent code 추가)

        Args:
            train_data (numpy.ndarray or pandas.DataFrame):
                Training Data. It must be a 2-dimensional numpy array or a pandas.DataFrame.
            discrete_columns (list-like):
                List of discrete columns to be used to generate the Conditional
                Vector. If ``train_data`` is a Numpy array, this list should
                contain the integer indices of the columns. Otherwise, if it is
                a ``pandas.DataFrame``, this list should contain the column names.
        """
        self._validate_discrete_columns(train_data, discrete_columns)

        if epochs is None:
            epochs = self._epochs
        else:
            warnings.warn(
                ('`epochs` argument in `fit` method has been deprecated and will be removed '
                 'in a future version. Please pass `epochs` to the constructor instead'),
                DeprecationWarning
            )

        self._transformer = DataTransformer()
        self._transformer.fit(train_data, discrete_columns)

        train_data = self._transformer.transform(train_data)

        self._data_sampler = DataSampler(
            train_data,
            self._transformer.output_info_list,
            self._log_frequency)

        data_dim = self._transformer.output_dimensions

        self._generator = Generator(
            self._embedding_dim + self._data_sampler.dim_cond_vec(),
            self._generator_dim,
            data_dim,
            self._latent_dim  # latent_dim 추가
        ).to(self._device)

        self._discriminator = Discriminator(
            data_dim + self._data_sampler.dim_cond_vec(),
            self._discriminator_dim,
            latent_dim=self._latent_dim,  # latent_dim 추가
            pac=self.pac
        ).to(self._device)

        optimizerG = optim.Adam(
            self._generator.parameters(), lr=self._generator_lr, betas=(0.5, 0.9),
            weight_decay=self._generator_decay
        )

        optimizerD = optim.Adam(
            self._discriminator.parameters(), lr=self._discriminator_lr,
            betas=(0.5, 0.9), weight_decay=self._discriminator_decay
        )

        mean = torch.zeros(self._batch_size, self._embedding_dim, device=self._device)
        std = torch.ones(self._batch_size, self._embedding_dim, device=self._device)

        self.loss_values = pd.DataFrame(columns=['Epoch', 'Generator Loss', 'Discriminator Loss'])

        epoch_iterator = tqdm(range(epochs), disable=(not self._verbose))
        if self._verbose:
            description = 'Gen. ({gen:.2f}) | Discrim. ({dis:.2f})'
            epoch_iterator.set_description(description.format(gen=0, dis=0))

        steps_per_epoch = max(len(train_data) // self._batch_size, 1)
        for i in epoch_iterator:
            for id_ in range(steps_per_epoch):

                for n in range(self._discriminator_steps):
                    fakez = torch.normal(mean=mean, std=std)
                    latent_code = torch.rand(self._batch_size, self._latent_dim, device=self._device) * 2 - 1  # uniform 분포에서 latent code 생성

                    condvec = self._data_sampler.sample_condvec(self._batch_size)
                    if condvec is None:
                        c1, m1, col, opt = None, None, None, None
                        real = self._data_sampler.sample_data(
                            train_data, self._batch_size, col, opt)
                    else:
                        c1, m1, col, opt = condvec
                        c1 = torch.from_numpy(c1).to(self._device)
                        m1 = torch.from_numpy(m1).to(self._device)
                        fakez = torch.cat([fakez, c1], dim=1)

                        perm = np.arange(self._batch_size)
                        np.random.shuffle(perm)
                        real = self._data_sampler.sample_data(
                            train_data, self._batch_size, col[perm], opt[perm])
                        c2 = c1[perm]
                    # conditional vector와 noise에다가 latent_code 추가
                    fake = self._generator(fakez, latent_code)  # latent_code 추가
                    fakeact = self._apply_activate(fake)

                    real = torch.from_numpy(real.astype('float32')).to(self._device)

                    if c1 is not None:
                        fake_cat = torch.cat([fakeact, c1], dim=1)
                        real_cat = torch.cat([real, c2], dim=1)
                    else:
                        real_cat = real
                        fake_cat = fakeact

                    y_fake, _, _ = self._discriminator(fake_cat)
                    y_real, _, _ = self._discriminator(real_cat)

                    pen = self._discriminator.calc_gradient_penalty(
                        real_cat, fake_cat, self._device, self.pac)
                    loss_d = -(torch.mean(y_real) - torch.mean(y_fake))

                    optimizerD.zero_grad(set_to_none=False)
                    pen.backward(retain_graph=True)
                    loss_d.backward()
                    optimizerD.step()

                fakez = torch.normal(mean=mean, std=std)
                latent_code = torch.rand(self._batch_size, self._latent_dim, device=self._device) * 2 - 1  # uniform 분포에서 latent code 생성
                condvec = self._data_sampler.sample_condvec(self._batch_size)

                if condvec is None:
                    c1, m1, col, opt = None, None, None, None
                else:
                    c1, m1, col, opt = condvec
                    c1 = torch.from_numpy(c1).to(self._device)
                    m1 = torch.from_numpy(m1).to(self._device)
                    fakez = torch.cat([fakez, c1], dim=1)

                fake = self._generator(fakez, latent_code)  # latent_code 추가
                fakeact = self._apply_activate(fake)

                if c1 is not None:
                    y_fake, mu, var = self._discriminator(torch.cat([fakeact, c1], dim=1))
                else:
                    y_fake, mu, var = self._discriminator(fakeact)

                if condvec is None:
                    cross_entropy = 0
                else:
                    cross_entropy = self._cond_loss(fake, c1, m1)

                # 여기서 latent_code만 있는게 아니라 fake 데이터에서 연속형 변수만큼만 뽑아야 함
                mi_loss = self.mutual_information_loss(latent_code, mu, var)  # mutual information loss 계산

                loss_g = -torch.mean(y_fake) + cross_entropy + mi_loss  # mutual information loss를 Generator 손실에 추가

                optimizerG.zero_grad(set_to_none=False)
                loss_g.backward()
                optimizerG.step()

            generator_loss = loss_g.detach().cpu().item()
            discriminator_loss = loss_d.detach().cpu().item()

            epoch_loss_df = pd.DataFrame({
                'Epoch': [i],
                'Generator Loss': [generator_loss],
                'Discriminator Loss': [discriminator_loss]
            })
            if not self.loss_values.empty:
                self.loss_values = pd.concat(
                    [self.loss_values, epoch_loss_df]
                ).reset_index(drop=True)
            else:
                self.loss_values = epoch_loss_df

            if self._verbose:
                epoch_iterator.set_description(
                    description.format(gen=generator_loss, dis=discriminator_loss)
                )

    def sample(self, n, condition_column=None, condition_value=None):
        """Sample data similar to the training data.

        Choosing a condition_column and condition_value will increase the probability of the
        discrete condition_value happening in the condition_column.

        Args:
            n (int):
                Number of rows to sample.
            condition_column (string):
                Name of a discrete column.
            condition_value (string):
                Name of the category in the condition_column which we wish to increase the
                probability of happening.

        Returns:
            numpy.ndarray or pandas.DataFrame
        """
        if condition_column is not None and condition_value is not None:
            condition_info = self._transformer.convert_column_name_value_to_id(
                condition_column, condition_value)
            global_condition_vec = self._data_sampler.generate_cond_from_condition_column_info(
                condition_info, self._batch_size)
        else:
            global_condition_vec = None

        # Ceiling division so that an ``n`` that is an exact multiple of the batch size does
        # not trigger an extra, discarded generator pass. At least one batch is always
        # generated so that ``n == 0`` still yields an empty array with the right columns.
        steps = max((n + self._batch_size - 1) // self._batch_size, 1)

        # The noise distribution parameters do not change between batches. They are kept on
        # the CPU (as before) and only the drawn noise is moved to the target device.
        mean = torch.zeros(self._batch_size, self._embedding_dim)
        std = mean + 1

        data = []
        # Sampling never back-propagates, so skip building the autograd graph.
        with torch.no_grad():
            for _ in range(steps):
                fakez = torch.normal(mean=mean, std=std).to(self._device)
                # Latent code drawn uniformly from [-1, 1).
                latent_code = torch.rand(
                    self._batch_size, self._latent_dim, device=self._device) * 2 - 1

                if global_condition_vec is not None:
                    condvec = global_condition_vec.copy()
                else:
                    condvec = self._data_sampler.sample_original_condvec(self._batch_size)

                if condvec is not None:
                    c1 = torch.from_numpy(condvec).to(self._device)
                    fakez = torch.cat([fakez, c1], dim=1)

                # The generator receives the (optionally conditioned) noise and the latent code.
                fake = self._generator(fakez, latent_code)
                fakeact = self._apply_activate(fake)
                data.append(fakeact.detach().cpu().numpy())

        # Drop the surplus rows of the last batch.
        data = np.concatenate(data, axis=0)[:n]

        return self._transformer.inverse_transform(data)

    def set_device(self, device):
        """Set the ``device`` to be used ('GPU' or 'CPU') and move the networks onto it.

        Args:
            device:
                Target device; it is stored on the instance and passed to ``.to()`` of the
                generator, discriminator and Q-network that are not ``None``.
        """
        self._device = device
        for attribute in ('_generator', '_discriminator', '_q_network'):
            network = getattr(self, attribute)
            if network is not None:
                network.to(self._device)


In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score

# Load the dataset from the CSV file.
data = pd.read_csv("/content/adult.csv")
# Encode the dependent variable as 0/1 (the label keys below include a leading space).
data['income'] = data['income'].replace({' <=50K': 0, ' >50K': 1})

In [15]:
# Split the data frame into a training set and a test set (70% / 30%).
# NOTE(review): this used to read `real_data`, which is not defined in the visible cells;
# `data` is the frame loaded and label-encoded above — confirm it is the intended input.
train_df, test_df = train_test_split(data, test_size=0.3, random_state=42)

In [48]:
# Set up the synthesizer that will generate samples from the training data.
# Names of the columns that are discrete
discrete_columns = [
    'workclass', 'education', 'marital-status', 'occupation', 'relationship',
    'race', 'sex', 'native-country', 'income',
]

# Hyperparameters for the CTGAN model
embedding_dim, latent_dim = 128, 1  # latent_dim is the dimension of the latent code
generator_dim = (256, 256)
discriminator_dim = (256, 256)
batch_size, pac = 500, 10  # pac is chosen as a divisor of batch_size
epochs = 100
cuda = True

# Initialize the CTGAN model
ctgan = CTGAN(
    embedding_dim=embedding_dim, latent_dim=latent_dim,
    generator_dim=generator_dim, discriminator_dim=discriminator_dim,
    batch_size=batch_size, pac=pac,
    epochs=epochs, cuda=cuda,
)

In [49]:
# Train the model on the training split, then create 10,000 synthetic rows.
ctgan.fit(train_df, discrete_columns)
synthetic_data = ctgan.sample(n=10000)

In [50]:
# One-hot encode the categorical columns with get_dummies.
synthetic_data_get_dummies = pd.get_dummies(synthetic_data)
test_df_get_dummies = pd.get_dummies(test_df)

# A rare category may be present in only one of the two frames, so give both frames the
# union of the dummy columns and fill the missing ones with False. The alignment is done
# unconditionally: comparing only the number of columns would skip it when both frames
# have the same column count but different categories.
synthetic_data_get_dummies, test_df_get_dummies = synthetic_data_get_dummies.align(
    test_df_get_dummies, join='outer', axis=1, fill_value=False)

# Split features and target only after alignment so train and test share the same columns.
X_train = synthetic_data_get_dummies.drop('income', axis=1)
y_train = synthetic_data_get_dummies['income']

X_test = test_df_get_dummies.drop('income', axis=1)
y_test = test_df_get_dummies['income']

In [51]:
# Show the shapes of the train/test features and targets, one per line.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep='\n')

(10000, 108)
(10000,)
(9769, 108)
(9769,)


In [52]:
# Initialize the decision tree model and train it on the synthetic data.
clf = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

In [53]:
# Predict the labels of the test features with the fitted classifier.
y_pred = clf.predict(X_test)

In [54]:
# latent_dim : 1, epochs : 100
# Compute the F1 score
f1 = f1_score(y_test, y_pred)
print("F1 스코어:", f1)

F1 스코어: 0.5445866782382296


In [47]:
# latent_dim : 1, epochs : 100
# Compute the F1 score
f1 = f1_score(y_test, y_pred)
print("F1 스코어:", f1)

F1 스코어: 0.5606884057971016


In [None]:
# # SVM
# from sklearn.svm import SVC
# # Initialize and train the linear SVM model
# clf = SVC(kernel='linear', random_state=42)
# clf.fit(X_train, y_train)

# # Predict on the test data
# y_pred = clf.predict(X_test)

# # Compute the F1 score
# f1 = f1_score(y_test, y_pred)

# print("F1 스코어:", f1)

In [40]:
from sklearn.neural_network import MLPClassifier

# Initialize the MLP model and train it on the synthetic data.
mlp = MLPClassifier(random_state=42, max_iter=500).fit(X_train, y_train)

# Predict on the test data
y_pred = mlp.predict(X_test)

# Compute the F1 score
f1 = f1_score(y_test, y_pred)

print("F1 스코어:", f1)

F1 스코어: 0.24621072088724585
