In [1]:
!pip install sdv

Collecting sdv
  Downloading sdv-1.19.0-py3-none-any.whl.metadata (14 kB)
Collecting boto3<2.0.0,>=1.28 (from sdv)
  Downloading boto3-1.37.28-py3-none-any.whl.metadata (6.7 kB)
Collecting botocore<2.0.0,>=1.31 (from sdv)
  Downloading botocore-1.37.28-py3-none-any.whl.metadata (5.7 kB)
Collecting copulas>=0.12.1 (from sdv)
  Downloading copulas-0.12.2-py3-none-any.whl.metadata (9.4 kB)
Collecting ctgan>=0.11.0 (from sdv)
  Downloading ctgan-0.11.0-py3-none-any.whl.metadata (10 kB)
Collecting deepecho>=0.7.0 (from sdv)
  Downloading deepecho-0.7.0-py3-none-any.whl.metadata (10 kB)
Collecting rdt>=1.14.0 (from sdv)
  Downloading rdt-1.15.1-py3-none-any.whl.metadata (10 kB)
Collecting sdmetrics>=0.19.0 (from sdv)
  Downloading sdmetrics-0.19.0-py3-none-any.whl.metadata (9.4 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3<2.0.0,>=1.28->sdv)
  Downloading jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)
Collecting s3transfer<0.12.0,>=0.11.0 (from boto3<2.0.0,>=1.28->sdv)
  Downloading s

### CTGAN

In [55]:
import pandas as pd
from sdv.single_table import CTGANSynthesizer
from sklearn.model_selection import train_test_split
from sdv.metadata import Metadata

# --- Load and clean the dataset ---------------------------------------------
# Mapping from the raw CSV headers to snake_case-style column names.
column_renames = {
    "Usage Frequency": "Usage_Frequency",
    "Support Calls": "Support_Calls",
    "Payment Delay": "Payment_Delay",
    "Subscription Type": "Subscription_Type",
    "Contract Length": "Contract_Length",
    "Total Spend": "Total_Spend",
    "Last Interaction": "Last_Interaction",
}

# Columns that must be treated as categorical (stored as strings).
categorical_cols = ['Gender', 'Subscription_Type', 'Contract_Length']

# Built as one chained pipeline (no in-place mutation) so re-running the cell
# always yields the same frame.
data = (
    pd.read_csv('customer_churn_dataset-testing-master.csv')
    # Strip whitespace BEFORE renaming; otherwise a padded header such as
    # "Total Spend " would never match the rename mapping.
    .rename(columns=str.strip)
    .rename(columns=column_renames)
    # Drop incomplete rows first so astype(str) cannot create 'nan' strings.
    .dropna()
    .astype({col: str for col in categorical_cols})
)

# --- Train / test split ------------------------------------------------------
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Drop the unique identifier from the training frame
# (errors="ignore" keeps this a no-op if the column is absent).
train_data = train_data.drop(columns=["CustomerID"], errors="ignore")

# --- Metadata ----------------------------------------------------------------
# detect_from_dataframe is a classmethod that returns a NEW Metadata object,
# so the table name has to be passed here; a table added to a separate
# Metadata() instance beforehand would simply be discarded.
metadata = Metadata.detect_from_dataframe(
    data=train_data,
    table_name='customer_churn_dataset',
)

# --- Initialize CTGAN --------------------------------------------------------
# batch_size is chosen to be divisible by the synthesizer's `pac` value
# (40 is divisible by 5 and 10). Alternatively, pass `pac` explicitly together
# with a matching batch size, e.g. batch_size=32 with pac=4, or batch_size=64
# with pac=8.
ctgan = CTGANSynthesizer(metadata, epochs=50, batch_size=40, verbose=True)





In [56]:
# Display the detected metadata to review the sdtype assigned to each column.
metadata

{
    "tables": {
        "table": {
            "columns": {
                "Age": {
                    "sdtype": "numerical"
                },
                "Gender": {
                    "sdtype": "categorical"
                },
                "Tenure": {
                    "sdtype": "numerical"
                },
                "Usage_Frequency": {
                    "sdtype": "numerical"
                },
                "Support_Calls": {
                    "sdtype": "numerical"
                },
                "Payment_Delay": {
                    "sdtype": "numerical"
                },
                "Subscription_Type": {
                    "sdtype": "categorical"
                },
                "Contract_Length": {
                    "sdtype": "categorical"
                },
                "Total_Spend": {
                    "sdtype": "numerical"
                },
                "Last_Interaction": {
                    "sdtype": "numerical"
      

In [57]:
# Train the CTGAN synthesizer on the training split.
# NOTE: this is slow -- the recorded run took about 23 minutes for 50 epochs.
ctgan.fit(train_data)


Gen. (-2.34) | Discrim. (-0.09): 100%|██████████| 50/50 [23:00<00:00, 27.61s/it]


In [58]:

# Sample one synthetic row per row of the held-out test split,
# so the synthetic set has the same number of rows as test_data.
synthetic_data = ctgan.sample(num_rows=test_data.shape[0])


In [59]:
# Preview the first rows of the generated synthetic data.
synthetic_data.head()

Unnamed: 0,Age,Gender,Tenure,Usage_Frequency,Support_Calls,Payment_Delay,Subscription_Type,Contract_Length,Total_Spend,Last_Interaction,Churn
0,51,Male,1,20,1,29,Premium,Monthly,474,8,0
1,55,Male,13,8,1,13,Basic,Annual,473,11,0
2,25,Female,60,20,1,26,Standard,Quarterly,832,21,0
3,23,Female,27,25,1,7,Premium,Annual,1000,29,0
4,47,Male,26,25,1,30,Basic,Monthly,670,7,0
