In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sdv.tabular import GaussianCopula
import warnings

# Pipeline configuration: input dataset, output file, and synthetic sample size.
INPUT_CSV_PATH = 'Downloads/filtered_data.csv'
OUTPUT_CSV_PATH = 'gaussian_copula_synthetic_dataset.csv'
SYNTHETIC_ROW_COUNT = 50000


def impute_missing_values(frame: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *frame* with missing values filled column by column.

    Numeric columns are filled with their median; every other column is
    filled with its most frequent value (the smallest one on ties, since
    ``Series.mode`` returns sorted values). Columns without missing values,
    and columns with no observed values at all, are returned unchanged.

    Unlike an array-based imputer round-trip, this keeps every column, the
    original column order, the index, and the dtype of columns that needed
    no filling, and it works when the frame has only numeric or only
    non-numeric columns.

    Args:
        frame: Input data; it is not modified.

    Returns:
        A new DataFrame with the same columns and index as *frame*.
    """
    imputed = frame.copy()
    for column in imputed.columns:
        series = imputed[column]
        missing = series.isna()
        # Nothing to fill, or nothing to compute a fill value from.
        if not missing.any() or missing.all():
            continue
        is_numeric = (
            pd.api.types.is_numeric_dtype(series)
            and not pd.api.types.is_bool_dtype(series)
        )
        if is_numeric:
            fill_value = series.median()
        else:
            fill_value = series.mode(dropna=True).iloc[0]
        imputed[column] = series.fillna(fill_value)
    return imputed


# The guard lets the helper above be imported without triggering file I/O;
# in a notebook cell or when run as a script the pipeline still executes.
if __name__ == "__main__":
    # Load the existing dataset and drop columns that are entirely NaN.
    data = pd.read_csv(INPUT_CSV_PATH).dropna(axis=1, how='all')

    # Fill the remaining gaps (median for numeric, mode for everything else).
    data_imputed = impute_missing_values(data)

    # Silence library warnings only while fitting and sampling, instead of
    # disabling warnings for the rest of the process.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        gaussian_copula = GaussianCopula()
        gaussian_copula.fit(data_imputed)
        gaussian_copula_synthetic_data = gaussian_copula.sample(SYNTHETIC_ROW_COUNT)

    # Save the GaussianCopula-generated synthetic dataset.
    gaussian_copula_synthetic_data.to_csv(OUTPUT_CSV_PATH, index=False)
    print(f"Gaussian Copula synthetic dataset saved with {gaussian_copula_synthetic_data.shape[0]} data points")


  from .autonotebook import tqdm as notebook_tqdm


Gaussian Copula synthetic dataset saved with 50000 data points
