<a href="https://colab.research.google.com/github/jhawkins311/Capstone/blob/main/colab_scripts/synthetic_data_processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Synthetic Data Generator and Evaluator Colab Notebook
# Author: Capstone Project Team

# ============================
# 1. SETUP
# ============================
# Install the notebook's dependencies quietly (-q): SDV supplies the synthesizers
# and evaluation reports imported below; pandas and matplotlib are used for
# loading the data and drawing the histograms.
!pip install -q sdv[all] pandas matplotlib scikit-learn
# PyDrive supplies the GoogleAuth / GoogleDrive classes used for the Drive upload
# at the end of the notebook.
!pip install -q PyDrive

import os
import shutil
import uuid
import zipfile
import pandas as pd
import matplotlib.pyplot as plt
from sdv.metadata import SingleTableMetadata
from sdv.single_table import CTGANSynthesizer, TVAESynthesizer, GaussianCopulaSynthesizer
from sdv.evaluation.single_table import evaluate_quality, run_diagnostic
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from sklearn.model_selection import train_test_split

# ============================
# 2. USER INPUT SECTION
# ============================
# Upload your dataset here (manual step in Colab)
from google.colab import files
uploaded = files.upload()

# `uploaded` maps each uploaded filename to its content. Fail with a clear
# message instead of an opaque IndexError when nothing was uploaded.
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and choose a CSV file.")

# Only the first uploaded file is used as the training dataset.
file_path = next(iter(uploaded))
df = pd.read_csv(file_path)

# Optional: select sensitive and target columns manually
sensitive_cols = []  # e.g., ['gender', 'marital_status']
target_col = None  # e.g., 'default_payment_next_month'

# Generate Job ID: the first 8 characters of a random UUID name the output folder
job_id = str(uuid.uuid4())[:8]
os.makedirs(job_id, exist_ok=True)

# ============================
# 3. METADATA + PREP
# ============================
# Infer the table schema from the uploaded dataframe and persist it as JSON
# inside the job folder.
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(df)
metadata.save_to_json(f"{job_id}/metadata.json")

# Also keep a plain-text dump of the metadata dictionary for quick inspection
metadata_text = str(metadata.to_dict())
with open(f"{job_id}/metadata.txt", "w") as metadata_dump:
    metadata_dump.write(metadata_text)

# ============================
# 4. TRAIN MODELS
# ============================
def _collect_report_details(report):
    """Merge the per-property detail tables of an SDMetrics report into one frame.

    ``get_details`` needs a property name, so every property listed by
    ``get_properties`` is queried and tagged with a ``Property`` column.
    """
    properties = report.get_properties()
    # NOTE(review): current SDMetrics returns a DataFrame with a "Property"
    # column; the fallback iterates the keys in case a mapping is returned.
    if isinstance(properties, pd.DataFrame):
        property_names = list(properties["Property"])
    else:
        property_names = list(properties)

    tables = [
        report.get_details(property_name=name).assign(Property=name)
        for name in property_names
    ]
    return pd.concat(tables, ignore_index=True) if tables else pd.DataFrame()


def _save_histograms(frame, title, path):
    """Save a histogram grid of the numeric columns of ``frame`` to ``path``."""
    numeric = frame.select_dtypes(include='number')
    if numeric.empty:
        # DataFrame.hist raises when there is nothing numeric to plot.
        print(f"[WARN] No numeric data to plot; skipping {path}")
        return
    # DataFrame.hist creates its own figure, so none is opened beforehand;
    # that figure becomes the current one and is closed after saving.
    numeric.hist(bins=30, figsize=(12, 8))
    plt.suptitle(title, fontsize=16)
    plt.savefig(path)
    plt.close()


def train_and_evaluate_model(model_name, SynthesizerClass):
    """Train one synthesizer on ``df`` and write its outputs into the job folder.

    Reads the module-level ``df``, ``metadata`` and ``job_id``. Writes, under
    ``job_id/``, the synthetic sample, the diagnostic and quality detail
    tables, and histogram images of the real and synthetic numeric columns.

    Args:
        model_name: Label used in log messages and as the output file prefix.
        SynthesizerClass: Synthesizer class; called as
            ``SynthesizerClass(metadata, **options)``.
    """
    import inspect

    print(f"\n[INFO] Training {model_name}...")
    # Pass each tuning option only if the constructor declares it. The previous
    # check looked for 'epochs' among co_varnames (which also lists locals) and
    # then passed verbose=True unconditionally, which fails for a synthesizer
    # that accepts epochs but not verbose.
    declared = inspect.signature(SynthesizerClass).parameters
    requested = {"epochs": 300, "verbose": True}
    options = {key: value for key, value in requested.items() if key in declared}
    synthesizer = SynthesizerClass(metadata, **options)
    synthesizer.fit(df)
    synthetic = synthesizer.sample(len(df))

    print("[INFO] Running diagnostics and evaluation...")
    diagnostic = run_diagnostic(real_data=df, synthetic_data=synthetic, metadata=metadata)
    quality = evaluate_quality(real_data=df, synthetic_data=synthetic, metadata=metadata)

    # Save outputs
    synthetic.to_csv(f"{job_id}/{model_name}_synthetic.csv", index=False)
    _collect_report_details(diagnostic).to_csv(
        f"{job_id}/{model_name}_diagnostic.csv", index=False
    )
    _collect_report_details(quality).to_csv(
        f"{job_id}/{model_name}_quality.csv", index=False
    )

    # Visualize basic comparison plots: real vs. synthetic distributions
    _save_histograms(df, "Real Data Distributions", f"{job_id}/{model_name}_real.png")
    _save_histograms(
        synthetic,
        f"{model_name} Synthetic Distributions",
        f"{job_id}/{model_name}_synthetic.png",
    )

# Train all models: run the same train/evaluate pipeline once per synthesizer,
# in the order CTGAN, TVAE, GaussianCopula.
for model_label, synthesizer_cls in (
    ("CTGAN", CTGANSynthesizer),
    ("TVAE", TVAESynthesizer),
    ("GaussianCopula", GaussianCopulaSynthesizer),
):
    train_and_evaluate_model(model_label, synthesizer_cls)

# ============================
# 5. ZIP & UPLOAD TO GOOGLE DRIVE
# ============================
print("[INFO] Zipping results...")
archive_name = f"{job_id}_results.zip"
# The context manager finalises and closes the archive even if a write fails.
with zipfile.ZipFile(archive_name, 'w', zipfile.ZIP_DEFLATED) as zipf:
    # Loop names deliberately avoid `files`, which would shadow the
    # google.colab `files` module imported for the upload step.
    for root, _, filenames in os.walk(job_id):
        for filename in filenames:
            full_path = os.path.join(root, filename)
            # Keep each file's path relative to the job folder so files in
            # sub-directories do not collide inside the archive.
            relative_path = os.path.relpath(full_path, job_id)
            zipf.write(full_path, arcname=os.path.join(job_id, relative_path))

print("[INFO] Authenticating PyDrive...")
from google.colab import auth
auth.authenticate_user()

gauth = GoogleAuth()
# NOTE(review): LocalWebserverAuth starts a browser-based OAuth flow on the
# machine running this code; confirm it works in the hosted Colab runtime.
# Colab's PyDrive example instead assigns application-default credentials to
# `gauth.credentials` after `auth.authenticate_user()` - TODO verify.
gauth.LocalWebserverAuth()
drive = GoogleDrive(gauth)

print("[INFO] Uploading zip to Google Drive...")
uploaded_file = drive.CreateFile({"title": archive_name})
uploaded_file.SetContentFile(archive_name)
uploaded_file.Upload()
file_url = f"https://drive.google.com/uc?id={uploaded_file['id']}"

print("\n[✔] Job Completed")
print(f"Job ID: {job_id}")
print(f"Download Link: {file_url}")
