In [1]:
import pandas as pd
from mostlyai.sdk import MostlyAI

# load original data: the credit card default dataset from the public demo-data repo
repo_url = 'https://github.com/mostly-ai/public-demo-data'
df_original = pd.read_csv(f'{repo_url}/raw/dev/creditcard_default/creditcard_default.csv.gz')

# instantiate SDK (local=True)
mostly = MostlyAI(local=True)

# train a generator
# NOTE: the generator and table are named after the data they hold
# (credit card defaults, target column `has_default`).
g = mostly.train(config={
        'name': 'Credit Card Default',       # name of the generator
        'tables': [{                         # provide list of table(s)
            'name': 'credit_card_default',   # name of the table
            'data': df_original,             # the original data as pd.DataFrame
            'tabular_model_configuration': { # tabular model configuration (optional)
                'max_training_time': 2,      # cap runtime for demo; set None for max accuracy
                # model, max_epochs, ...     # further model configurations (optional)
                'differential_privacy': {    # differential privacy configuration (optional)
                    'max_epsilon': 5.0,      # - max epsilon value, used as stopping criterion
                    'delta': 1e-5,           # - delta value
                }
            },
            # columns, keys, compute, ...    # further table configurations (optional)
        }]
    },
    start=True,                              # start training immediately (default: True)
    wait=True,                               # wait for completion (default: True)
)

Output()

In [2]:
# Quick sanity check: request 100 synthetic samples from the trained generator
# and display them as the cell output.
df_samples = mostly.probe(g, size=100)
df_samples

Unnamed: 0,limit_bal,sex,education,marriage,age,pay_0,pay_2,pay_3,pay_4,pay_5,...,bill_amt4,bill_amt5,bill_amt6,pay_amt1,pay_amt2,pay_amt3,pay_amt4,pay_amt5,pay_amt6,has_default
0,96730,2,1,1,49,0,0,-1,0,5,...,91598,109384,248,2527,5241,2258,1022,4957,0,0
1,525350,1,3,1,40,0,-1,0,-1,-1,...,21782,13774,34455,2121,3395,3369,3625,5027,106157,0
2,37270,1,2,2,56,0,0,0,0,0,...,65883,130273,824,1296,2003,1438,4559,2003,1001,1
3,105750,2,2,1,53,0,0,-1,0,0,...,11605,378426,164084,4227,2574,1633,4918,1570,4639,1
4,10000,1,2,1,50,0,0,0,0,0,...,5929,3783,19038,2000,2640,1000,1004,1000,620,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,550540,2,1,2,27,2,0,0,0,0,...,141131,18975,19158,5461,2738,5259,3008,1194,296,0
96,607530,1,3,3,34,0,0,0,0,0,...,6146,124593,3580,4154,5124,3306,1227,1092,2005,0
97,133630,1,2,2,29,-1,-2,0,0,0,...,15445,107774,19318,13347,3301,268,302,1522,2352,0
98,297980,2,1,2,37,-1,-2,-2,4,-1,...,42451,28402,1185,0,0,324,501,2094,1001,0


In [None]:
# Seeded probe: pass a seed DataFrame with fixed values for a few columns.
# The seed columns must be columns of the training table, so they are taken
# from the credit card default data (age / sex / education).
# NOTE(review): presumably one synthetic record is returned per seed row - confirm.
df_samples = mostly.probe(g, seed=pd.DataFrame({
    'age': [25, 40, 60],
    'sex': [1, 2, 2],
    'education': [1, 2, 3],
}))
df_samples

In [3]:
# number of synthetic records requested from the generator
n_synthetic_rows = 500_000

# run the generation against the trained generator, then pull the
# generated records into a DataFrame and show it as the cell output
sd = mostly.generate(g, size=n_synthetic_rows)
df_synthetic = sd.data()
df_synthetic

Output()

Unnamed: 0,limit_bal,sex,education,marriage,age,pay_0,pay_2,pay_3,pay_4,pay_5,...,bill_amt4,bill_amt5,bill_amt6,pay_amt1,pay_amt2,pay_amt3,pay_amt4,pay_amt5,pay_amt6,has_default
0,665810,2,1,0,66,0,0,0,0,0,...,40426,29192,19293,6148,2040,10368,394,6723,0,1
1,137030,2,2,1,47,-1,2,-1,-1,-1,...,22990,9759,27271,140,18422,721,10345,1038,5024,0
2,477210,1,2,2,30,0,0,2,0,0,...,-13,77927,49416,389,0,502,2004,3521,0,0
3,319500,2,1,2,49,0,0,0,0,0,...,39288,23672,34,7800,0,696,7635,538,5029,0
4,176720,2,1,3,53,-1,6,2,2,4,...,107,11301,26310,2193,0,0,0,336,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499995,97570,2,6,2,30,-2,0,0,0,2,...,85662,84,4902,3003,4010,1533,2002,3037,1490,1
499996,145350,1,3,1,42,1,-1,0,-1,0,...,596,84242,473634,0,5070,1002,1005,611,948,0
499997,10000,1,3,1,36,-1,0,0,2,0,...,9684,19388,22925,2040,5537,443,6000,3016,3015,0
499998,396010,1,4,2,29,-2,2,2,-1,0,...,10826,18288,11709,3006,0,16060,2000,0,0,0


In [4]:
from datasets import Dataset
from huggingface_hub import login

# Authenticate against the Hugging Face Hub (interactive prompt; no token is stored in the notebook)
login()

# Convert the synthetic DataFrame to a Hugging Face Dataset and push it to the Hub.
# preserve_index=False keeps the pandas index out of the published dataset.
synthetic_dataset = Dataset.from_pandas(df_synthetic, preserve_index=False)
synthetic_dataset.push_to_hub("synthetic-credit-card-defaults")

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/500 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/ZennyKenny/synthetic-credit-card-defaults/commit/62594205e5dcd7484313fd99428c4f2571eb63fb', commit_message='Upload dataset', commit_description='', oid='62594205e5dcd7484313fd99428c4f2571eb63fb', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/ZennyKenny/synthetic-credit-card-defaults', endpoint='https://huggingface.co', repo_type='dataset', repo_id='ZennyKenny/synthetic-credit-card-defaults'), pr_revision=None, pr_num=None)