In [None]:
import pandas as pd
from mostlyai.sdk import MostlyAI

# Fetch the original credit-card fraud table from the public demo-data repo
repo_url = 'https://github.com/mostly-ai/public-demo-data'
df_original = pd.read_csv(f'{repo_url}/raw/dev/creditcard_fraud/fraud.csv.gz')

# Create the SDK client (local=True selects the SDK's local mode)
mostly = MostlyAI(local=True)

# Build the generator configuration bottom-up so each layer is easy to read.

# Differential-privacy settings for the tabular model (optional)
dp_settings = {
    'max_epsilon': 5.0,  # max epsilon value, used as stopping criterion
    'delta': 1e-5,       # delta value
}

# Tabular model settings (optional); further keys such as model, max_epochs, ...
tabular_model_settings = {
    'max_training_time': 2,  # cap runtime for demo; set None for max accuracy
    'differential_privacy': dp_settings,
}

# A single table definition; further keys such as columns, keys, compute, ...
fraud_table = {
    'name': 'fraud',         # name of the table
    'data': df_original,     # the original data as pd.DataFrame
    'tabular_model_configuration': tabular_model_settings,
}

generator_config = {
    'name': 'Credit Card Fraud',  # name of the generator
    'tables': [fraud_table],      # list of table(s)
}

# Train the generator from the assembled configuration
g = mostly.train(
    config=generator_config,
    start=True,  # start training immediately (default: True)
    wait=True,   # wait for completion (default: True)
)

Output()

In [4]:
# Draw a small batch of synthetic rows from the trained generator
df_samples = mostly.probe(
    g,
    size=100,  # number of synthetic rows requested
)
df_samples  # last expression -> rendered as the cell output

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,171780,2.18692,0.42510,1.70606,-1.66984,-1.78719,0.13467,-1.18616,0.13560,0.34146,...,0.19911,-0.65050,0.03489,0.46011,0.21833,-0.72073,-0.09970,-0.00822,40.03,0
1,50878,1.37094,-0.15642,1.99814,-1.15017,1.94364,-1.01502,-1.69689,0.19513,-0.83324,...,-0.30484,-7.43430,-0.02217,-1.05101,-0.29647,-0.54336,-0.46148,-0.02618,1.74,0
2,126885,-1.29209,-0.20474,0.18084,0.77489,0.46804,-0.81432,0.40407,-0.11793,-1.04775,...,-0.14684,-0.88191,0.00737,0.75941,0.79535,-0.71451,-0.03552,-0.38494,37.05,0
3,86585,0.98479,-0.53931,1.13078,-2.54038,-0.37447,-0.31093,0.26009,0.36246,-1.27377,...,0.21789,-0.64528,0.16401,0.06106,-0.59337,0.08992,-0.00835,-0.05291,119.93,0
4,37712,-1.07580,-0.73503,1.28728,1.10322,1.71715,1.50671,0.25282,1.13553,1.38051,...,-0.06146,-0.75049,-0.04009,0.12331,-0.35951,-0.10651,-0.01200,0.01550,59.37,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,83904,2.07017,-1.10247,1.98703,-0.36082,-0.33517,-0.40490,-0.07842,-0.00947,-0.37080,...,-0.10430,-0.28439,0.02455,-0.66764,0.33665,0.61745,0.01378,0.08132,14.02,0
96,35724,1.32034,1.47617,1.09268,-0.31464,-1.69804,0.95152,-0.87543,0.19867,0.94231,...,-0.11416,-0.42011,0.04012,0.31643,0.74817,0.33178,0.27130,0.06853,11.85,0
97,39131,-0.38448,0.06588,1.81040,0.97211,-0.74727,-0.28699,-0.64920,0.10784,-1.33688,...,0.11626,0.33218,-0.19075,-0.03857,0.20898,-0.92159,0.02245,0.34067,317.75,0
98,82406,-2.04706,0.92765,-2.29912,-1.12028,-1.32522,0.75533,0.05011,-0.14394,0.26699,...,0.40452,0.56976,0.17565,0.85080,-0.19960,-0.12607,-0.09684,0.03461,3742.34,0


In [5]:
# Seed frame: fix Amount and Time for three rows; the probe output below
# carries these values through and fills in the remaining columns.
seed_values = pd.DataFrame({
    'Amount': [40.03, 14.02, 1.74],
    'Time': [171780, 37712, 83904],
})

# Conditionally probe the generator using the seed frame
df_samples = mostly.probe(g, seed=seed_values)
df_samples  # last expression -> rendered as the cell output

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,171780,1.11576,-2.77638,-2.02667,0.99669,0.69968,-0.56336,0.63531,-0.00333,-0.37643,...,-0.05584,1.11505,-0.16567,0.36832,-0.19786,0.08242,7.82601,0.18114,40.03,0
1,37712,1.90427,0.53809,1.81518,1.1919,0.02212,2.36727,0.63782,0.41815,-0.02401,...,-1.40258,0.08917,-0.01301,0.47111,-0.27814,-0.49496,-0.02409,0.1316,14.02,0
2,83904,0.0388,0.11281,0.27882,1.12167,-0.97669,-0.18628,-0.04998,0.02552,-1.55481,...,0.04899,-0.8348,0.31275,0.49872,0.76453,-0.21152,0.04112,0.01943,1.74,0


In [6]:
# Generate a full synthetic dataset from the trained generator
sd = mostly.generate(
    g,
    size=500_000,  # number of synthetic rows requested
)

# Pull the generated data into the notebook and show it
# (the rendered output below is a pandas-style table)
df_synthetic = sd.data()
df_synthetic  # last expression -> rendered as the cell output

Output()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,146199,2.01467,0.57116,0.16464,0.75753,0.6638,-0.65968,-1.17587,1.16585,-0.43884,...,0.05173,1.49229,0.01899,0.0673,0.47932,-0.08791,-0.10828,0.06519,10.78,0
1,142439,-1.86879,-0.7174,-1.3188,-1.33087,0.42966,-0.25401,-1.39403,-0.59895,-0.96013,...,-0.08921,0.11499,-0.0716,-1.16618,0.6012,-0.39407,0.2499,-0.35562,74.21,0
2,125769,-1.59341,-1.42935,-1.36896,-0.46871,1.54862,-0.73101,0.02114,0.92982,-0.35979,...,-0.27474,0.11237,0.18847,-0.17905,-0.24569,-0.2176,-0.00986,0.01216,4.57,0
3,87479,2.0884,-0.16415,-0.14401,0.43316,1.92267,-1.62636,-0.99858,0.17308,0.33944,...,-0.05609,-0.54982,-0.07284,-0.81163,0.6286,-0.15181,0.15034,-0.04921,67.22,0
4,51064,1.20851,1.51925,-1.06905,1.32278,0.5501,-0.10649,-0.71236,0.09322,0.39786,...,-0.13234,0.79504,0.08656,-0.0721,-0.58513,0.25105,-0.05931,-0.02886,232.37,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499995,138101,-0.63764,0.13164,0.80531,-0.35363,0.3342,0.11375,-1.0228,0.0193,0.01565,...,0.18348,0.49197,0.33281,-0.2582,-0.57705,-0.36758,-0.01197,-0.07054,13.39,0
499996,153006,1.74794,-1.85203,0.31912,0.48635,-0.45767,-0.39648,-0.55676,0.15675,-1.20452,...,0.01584,-0.45553,0.00539,0.57826,-0.00345,1.04016,0.20785,0.00566,4.78,0
499997,129241,-1.23813,0.95948,0.10372,0.72531,1.3267,3.29791,0.36615,-0.05459,1.19772,...,0.11757,1.14183,0.3073,-0.27633,-0.93202,-0.3543,-0.10924,-0.31845,1.13,0
499998,47602,-1.55344,0.0113,1.73389,-0.80197,-0.44043,-0.59251,0.07329,-0.44042,0.2209,...,0.18775,-6.28519,0.40457,-0.33553,1.04514,-0.96876,-0.09022,0.04477,1.74,0


In [7]:
from datasets import Dataset
from huggingface_hub import login

# Authenticate with the Hugging Face Hub (renders an interactive login widget)
login()

# Name of the dataset repo to push to; the commit output below shows it being
# resolved under the logged-in account's namespace.
hub_repo_name = "synthetic-credit-card-fraud"

# Convert the synthetic DataFrame to a Hugging Face Dataset and push it to the Hub.
# NOTE(review): the name `players_dataset` looks like a leftover from another
# notebook; it holds the synthetic credit-card fraud data. Kept as-is in case
# other cells reference it.
players_dataset = Dataset.from_pandas(df_synthetic)
players_dataset.push_to_hub(hub_repo_name)

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/500 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/ZennyKenny/synthetic-credit-card-fraud/commit/02d620cf21f8e9453173a1f8486edd79ffdd4642', commit_message='Upload dataset', commit_description='', oid='02d620cf21f8e9453173a1f8486edd79ffdd4642', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/ZennyKenny/synthetic-credit-card-fraud', endpoint='https://huggingface.co', repo_type='dataset', repo_id='ZennyKenny/synthetic-credit-card-fraud'), pr_revision=None, pr_num=None)