In [None]:
!pip install sdv

Collecting sdv
  Downloading sdv-1.19.0-py3-none-any.whl.metadata (14 kB)
Collecting boto3<2.0.0,>=1.28 (from sdv)
  Downloading boto3-1.37.33-py3-none-any.whl.metadata (6.7 kB)
Collecting botocore<2.0.0,>=1.31 (from sdv)
  Downloading botocore-1.37.33-py3-none-any.whl.metadata (5.7 kB)
Collecting copulas>=0.12.1 (from sdv)
  Downloading copulas-0.12.2-py3-none-any.whl.metadata (9.4 kB)
Collecting ctgan>=0.11.0 (from sdv)
  Downloading ctgan-0.11.0-py3-none-any.whl.metadata (10 kB)
Collecting deepecho>=0.7.0 (from sdv)
  Downloading deepecho-0.7.0-py3-none-any.whl.metadata (10 kB)
Collecting rdt>=1.14.0 (from sdv)
  Downloading rdt-1.16.0-py3-none-any.whl.metadata (10 kB)
Collecting sdmetrics>=0.19.0 (from sdv)
  Downloading sdmetrics-0.20.0-py3-none-any.whl.metadata (9.4 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3<2.0.0,>=1.28->sdv)
  Downloading jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)
Collecting s3transfer<0.12.0,>=0.11.0 (from boto3<2.0.0,>=1.28->sdv)
  Downloading s

In [None]:
# Sanity check: report which SDV release the kernel actually imported.
import sdv
print(sdv.__version__)

1.19.0


In [None]:
import pandas as pd
import numpy as np
from sdv.lite import SingleTablePreset
from sklearn.preprocessing import StandardScaler
import joblib
import os
from datetime import datetime
from sdv.lite import SingleTablePreset
from sdv.metadata import SingleTableMetadata

from sdv.single_table import TVAESynthesizer
from sdv.metadata import SingleTableMetadata

In [None]:
def encode_categorical_features(df, target_column):
    """
    One-hot encode every object/category column except the target.

    Args:
        df (pd.DataFrame): Table to encode. It is not modified.
        target_column (str): Column name that must be left as-is even if it is categorical.

    Returns:
        tuple: ``(encoded_frame, categorical_cols)`` where ``encoded_frame`` is the result of
        ``pd.get_dummies`` on the selected columns (all levels kept) and ``categorical_cols``
        is the list of column names that were encoded.
    """
    candidates = df.select_dtypes(include=['object', 'category']).columns
    categorical_cols = [name for name in candidates if name != target_column]

    # drop_first=False keeps one indicator per level, so every category stays recoverable
    encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=False)
    return encoded, categorical_cols



In [None]:
def decode_categorical_features(df_synth, original_df, target_column):
    """
    Collapse one-hot indicator columns of a synthetic table back into categorical columns.

    For every object/category column of ``original_df`` (except ``target_column``), the
    columns of ``df_synth`` named ``<column>_<level>`` are located, the indicator holding
    the row-wise maximum is selected, and its ``<level>`` suffix becomes the decoded value.

    Args:
        df_synth (pd.DataFrame): Synthetic data holding the indicator columns. Not modified.
        original_df (pd.DataFrame): Real table; only its column names and dtypes are used.
        target_column (str): Column that is never decoded.

    Returns:
        pd.DataFrame: Copy of ``df_synth`` in which each group of indicator columns is
        replaced by a single column of string labels. Categorical columns with no
        matching indicator columns are skipped.
    """
    decoded_df = df_synth.copy()
    cat_columns = [
        col
        for col in original_df.select_dtypes(include=['object', 'category']).columns
        if col != target_column
    ]

    # A column that exists in the real table is a feature in its own right, never an
    # indicator, even when its name happens to start with "<other feature>_".
    reserved = set(original_df.columns)

    # Assign indicators to features longest-name-first: with features "Plan" and
    # "Plan_Type", the column "Plan_Type_X" must belong to "Plan_Type", not to "Plan".
    claimed = set()
    groups = {}
    for col in sorted(cat_columns, key=len, reverse=True):
        prefix = col + '_'
        members = [
            c for c in df_synth.columns
            if c.startswith(prefix) and c not in reserved and c not in claimed
        ]
        groups[col] = members
        claimed.update(members)

    # Decode in the original column order so the output layout is stable.
    for col in cat_columns:
        one_hot_cols = groups[col]
        if not one_hot_cols:
            continue

        winners = df_synth[one_hot_cols].idxmax(axis=1)
        decoded_df[col] = winners.str[len(col) + 1:]  # strip the "<col>_" prefix
        decoded_df = decoded_df.drop(columns=one_hot_cols)

    return decoded_df



In [None]:


def train_tvae_model(df, epochs=100):
    """
    Fit a TVAE synthesizer on a real table.

    The table description handed to the synthesizer is auto-detected from ``df`` itself,
    so no hand-written metadata is needed.

    Args:
        df (pd.DataFrame): The real dataset to learn from.
        epochs (int): Number of training epochs passed to the synthesizer.

    Returns:
        TVAESynthesizer: The synthesizer after it has been fitted on ``df``.
    """
    # Let SDV infer the column description directly from the data
    table_metadata = SingleTableMetadata()
    table_metadata.detect_from_dataframe(data=df)

    synthesizer = TVAESynthesizer(metadata=table_metadata, epochs=epochs)
    synthesizer.fit(df)
    return synthesizer



In [None]:
def generate_synthetic_data(tvae, scaler, original_df, feature_names, target_column, num_rows):
    """
    Sample synthetic rows, undo feature scaling, and decode one-hot columns.

    Args:
        tvae: Fitted synthesizer exposing ``sample(num_rows)`` that returns a DataFrame.
        scaler: Fitted scaler exposing ``inverse_transform``. ``None`` skips un-scaling.
        original_df (pd.DataFrame): Real table, used to find the categorical columns to decode.
        feature_names: Collection of column names eligible for un-scaling.
            NOTE(review): presumably the columns the scaler was fitted on, in the same
            order as the sampled frame - confirm against the code that fits the scaler.
        target_column (str): Label column; never un-scaled or decoded.
        num_rows (int): Number of synthetic rows to draw.

    Returns:
        pd.DataFrame: Sampled rows with selected numeric features inverse-transformed and
        one-hot groups collapsed back to categorical columns.
    """
    synthetic_data = tvae.sample(num_rows)

    # Only genuinely numeric feature columns can go through the scaler; category,
    # datetime or string dtypes are not "object" but would still break inverse_transform.
    numeric_cols = [
        col
        for col in synthetic_data.columns
        if col in feature_names
        and col != target_column
        and pd.api.types.is_numeric_dtype(synthetic_data[col])
    ]

    # Scalers reject a zero-feature array, so skip the step when nothing was selected.
    if scaler is not None and numeric_cols:
        synthetic_data[numeric_cols] = scaler.inverse_transform(synthetic_data[numeric_cols])

    # Decode one-hot encoded features
    decoded_df = decode_categorical_features(synthetic_data, original_df, target_column)

    # Re-attach the label explicitly so it survives even if decoding removed a column
    # whose name collides with a one-hot prefix.
    decoded_df[target_column] = synthetic_data[target_column].values
    return decoded_df



In [None]:
def save_files(model, synthetic_df, model_dir='models', data_dir='synthetic_data'):
    """
    Write a fitted model and a synthetic table to timestamped files.

    Args:
        model: Object to serialize with ``joblib.dump``.
        synthetic_df (pd.DataFrame): Table written as CSV without its index.
        model_dir (str): Folder for the ``.pkl`` file; created if missing.
        data_dir (str): Folder for the ``.csv`` file; created if missing.

    Returns:
        tuple: ``(model_path, data_path)`` of the two files that were written.
    """
    # Create both output folders up front; existing folders are left untouched
    for directory in (model_dir, data_dir):
        os.makedirs(directory, exist_ok=True)

    # A single shared timestamp ties the model file to the data saved alongside it
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    model_path = os.path.join(model_dir, f'tvae_model_{timestamp}.pkl')
    data_path = os.path.join(data_dir, f'synthetic_data_{timestamp}.csv')

    joblib.dump(model, model_path)
    synthetic_df.to_csv(data_path, index=False)

    for message in (
        f"✅ Model saved to: {model_path}",
        f"✅ Synthetic data saved to: {data_path}",
    ):
        print(message)

    return model_path, data_path

In [None]:
# Read the real customer-churn table that the synthesizer is fitted on
df = pd.read_csv("customer_churn_dataset-training-master.csv")
target_column = "Churn"

# Fit TVAE on the raw table (default epoch count), then draw a 200k-row synthetic sample
tvae_model = train_tvae_model(df)
synthetic_data = tvae_model.sample(num_rows=200_000)

# Persist the fitted model and the sampled rows under timestamped file names
model_path, data_path = save_files(model=tvae_model, synthetic_df=synthetic_data)




✅ Model saved to: models/tvae_model_20250413_153946.pkl
✅ Synthetic data saved to: synthetic_data/synthetic_data_20250413_153946.csv


In [None]:
# Draw a larger 500k-row sample from the already-fitted model and save it
# (save_files also writes the model again under the new timestamp)
synthetic_data = tvae_model.sample(num_rows=500_000)
model_path, data_path = save_files(model=tvae_model, synthetic_df=synthetic_data)

✅ Model saved to: models/tvae_model_20250413_155017.pkl
✅ Synthetic data saved to: synthetic_data/synthetic_data_20250413_155017.csv
