In [None]:
import pandas as pd
from mostlyai.sdk import MostlyAI

# load original data
repo_url = 'https://github.com/mostly-ai/public-demo-data'
df_original = pd.read_csv(f'{repo_url}/raw/dev/airbnb/airbnb.csv.gz')

# instantiate SDK
mostly = MostlyAI()

# train a generator
g = mostly.train(config={
        'name': 'Airbnb',          # name of the generator
        'tables': [{                         # provide list of table(s)
            'name': 'airbnb',                # name of the table
            'data': df_original,             # the original data as pd.DataFrame
            'tabular_model_configuration': { # tabular model configuration (optional)
                'max_training_time': 2,      # cap runtime for demo; set None for max accuracy
                # model, max_epochs,,..      # further model configurations (optional)
                'differential_privacy': {    # differential privacy configuration (optional)
                    'max_epsilon': 5.0,      # - max epsilon value, used as stopping criterion
                    'delta': 1e-5,           # - delta value
                }
            },
            # columns, keys, compute,..      # further table configurations (optional)
        }]
    },
    start=True,                              # start training immediately (default: True)
    wait=True,                               # wait for completion (default: True)
)

Output()

In [None]:
df_samples = mostly.probe(g, size=100)
df_samples

In [3]:
df_samples = mostly.probe(g, seed=pd.DataFrame({
    'neighbourhood': ['Flatbush', 'Jamaica'],
    'room_type': ['Private Room', 'Shared Room'],
}))
df_samples

Unnamed: 0,neighbourhood_group,neighbourhood,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,availability_365,latitude,longitude
0,Manhattan,Flatbush,Private Room,66,3,9,2018-10-22,1.77,0,40.7267,-73.97728
1,Manhattan,Jamaica,Shared Room,299,2,4,2017-03-19,0.44,322,40.71701,-73.96224


In [4]:
sd = mostly.generate(g, size=100_000)
df_synthetic = sd.data()
df_synthetic

Output()

Unnamed: 0,neighbourhood_group,neighbourhood,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,availability_365,latitude,longitude
0,Manhattan,Upper East Side,Entire home/apt,178,1,16,2019-03-04,0.12,365,40.73354,-73.93819
1,Manhattan,Gramercy,Entire home/apt,79,1,21,2015-12-22,0.04,300,40.64878,-73.93888
2,Manhattan,East Harlem,Entire home/apt,92,30,112,2019-01-01,,77,40.71831,-73.94129
3,Brooklyn,Rockaway Beach,Private room,55,1,3,2016-09-18,0.3,6,40.68687,-73.98868
4,Manhattan,Astoria,Entire home/apt,676,1,43,2018-09-20,1.06,0,40.80006,-73.77163
...,...,...,...,...,...,...,...,...,...,...,...
99995,Brooklyn,Prospect-Lefferts Gardens,Private room,38,2,69,2015-10-21,0.3,158,40.79448,-73.93642
99996,Queens,Briarwood,Entire home/apt,55,7,34,2019-06-30,0.84,192,40.68047,-73.95265
99997,Brooklyn,Bushwick,Entire home/apt,114,3,3,2019-06-21,0.64,0,40.764,-73.89489
99998,Bronx,Lower East Side,Entire home/apt,150,10,1,2018-10-03,2.48,231,40.74071,-73.98508


In [10]:
from datasets import Dataset
from huggingface_hub import login

# Log in to Hugging Face Hub
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [11]:
airbnb = Dataset.from_pandas(df_synthetic)

# Upload each dataset separately
airbnb.push_to_hub("synthetic-airbnb-rentals-metadata")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/100 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/ZennyKenny/synthetic-airbnb-rentals-metadata/commit/a1966d3073417ffebd178c6026841757e3825672', commit_message='Upload dataset', commit_description='', oid='a1966d3073417ffebd178c6026841757e3825672', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/ZennyKenny/synthetic-airbnb-rentals-metadata', endpoint='https://huggingface.co', repo_type='dataset', repo_id='ZennyKenny/synthetic-airbnb-rentals-metadata'), pr_revision=None, pr_num=None)