In [None]:
import pandas as pd
from mostlyai.sdk import MostlyAI

# Read the original customer-feedback table from its CSV URL.
df_original = pd.read_csv('https://huggingface.co/datasets/ZennyKenny/demo_customer_nps/raw/main/demo_customer_feedback.csv')

# Instantiate the SDK with local=True.
mostly = MostlyAI(local=True)

# Assemble the generator configuration bottom-up so each level can be read on its own.
# Differential privacy settings (optional).
privacy_settings = {
    'max_epsilon': 5.0,  # max epsilon value, used as stopping criterion
    'delta': 1e-5,       # delta value
}

# Tabular model settings (optional).
# A specific model can be selected with a 'model' entry; `mostly.models()`
# lists the available names per model type.
tabular_settings = {
    'max_training_time': 2,  # cap runtime for demo; set None for max accuracy
    'differential_privacy': privacy_settings,
}

# One table entry: its name plus the original data as a pd.DataFrame.
table_spec = {
    'name': 'train',
    'data': df_original,
    'tabular_model_configuration': tabular_settings,
}

# Top-level generator config: generator name and the list of table(s).
generator_config = {
    'name': 'Net Promotor Score',
    'tables': [table_spec],
}

# Train the generator: start immediately and block until training completes
# (both flags are spelled out even though True is the default for each).
g = mostly.train(config=generator_config, start=True, wait=True)

Output()

In [7]:
from mostlyai.sdk import MostlyAI

# Re-create the SDK handle so this cell can run on its own.
# Note: local=True is passed here, the same argument used in the training cell.
mostly = MostlyAI(local=True)

# Show the model names the SDK reports, grouped by model type.
mostly.models()

{'TABULAR': ['MOSTLY_AI/Small', 'MOSTLY_AI/Medium', 'MOSTLY_AI/Large'],
 'LANGUAGE': ['MOSTLY_AI/LSTMFromScratch-3m',
  'microsoft/phi-1_5',
  '(HuggingFace-hosted models)']}

In [8]:
# Ask the trained generator `g` (bound in the training cell) for 100 samples via probe().
df_samples = mostly.probe(g, size=100)
# Bare last expression so the notebook renders the returned table.
df_samples

Unnamed: 0,customer_id,customer_name,customer_segment,customer_nps,customer_comment
0,467826,_RARE_,smb,9.3,"Product is okay, but it feels overpriced for w..."
1,419978,_RARE_,smb,6.9,User-friendly platform and exceptional support...
2,330872,_RARE_,smb,1.9,"Pretty good value for money, but some features..."
3,863351,_RARE_,enterprise,3.1,Excellent customer care and great value for th...
4,751376,_RARE_,sole_proprietor,2.4,Worst product I’ve used. Would not recommend t...
...,...,...,...,...,...
95,110118,_RARE_,smb,6.4,Excellent customer care and great value for th...
96,925582,_RARE_,smb,1.3,"Support team is responsive, but the platform i..."
97,455365,_RARE_,smb,9.2,User-friendly platform and exceptional support...
98,146910,_RARE_,smb,0.2,"Pretty good value for money, but some features..."


In [9]:
# Generate 100,000 synthetic records from the trained generator `g`
# (bound in the training cell).
sd = mostly.generate(g, size=100_000)
# Fetch the generated records from the returned object as a table.
df_synthetic = sd.data()
# Bare last expression so the notebook renders a preview of the result.
df_synthetic

Output()

Unnamed: 0,customer_id,customer_name,customer_segment,customer_nps,customer_comment
0,841887,_RARE_,sole_proprietor,7.6,"Support team is responsive, but the platform i..."
1,387053,_RARE_,sole_proprietor,4.8,Fantastic value and very easy to understand.
2,471613,_RARE_,sole_proprietor,9.0,"Satisfied with the platform, but there is room..."
3,127234,_RARE_,enterprise,2.9,Not worth the price. Very disappointing service.
4,443536,_RARE_,enterprise,9.2,Terrible experience. The platform is difficult...
...,...,...,...,...,...
99995,847432,_RARE_,sole_proprietor,0.2,Terrible experience. The platform is difficult...
99996,971682,_RARE_,smb,0.3,Terrible experience. The platform is difficult...
99997,259798,_RARE_,sole_proprietor,8.4,Excellent customer care and great value for th...
99998,471713,_RARE_,sole_proprietor,0.5,"Product is okay, but it feels overpriced for w..."


In [10]:
from datasets import Dataset

# Wrap the synthetic DataFrame in a Hugging Face `datasets.Dataset`.
# NOTE(review): the name `airbnb` looks like a leftover from another notebook; the
# data here is synthetic NPS customer feedback. Consider renaming it once it is
# confirmed that no later cell depends on this name.
airbnb = Dataset.from_pandas(df_synthetic)

# Push the single synthetic dataset to the Hub under the repo name below.
airbnb.push_to_hub("synthetic-net-promotor-score")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/100 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/ZennyKenny/synthetic-net-promotor-score/commit/a899d5148d605e2330cbd90f77965e12e87593cf', commit_message='Upload dataset', commit_description='', oid='a899d5148d605e2330cbd90f77965e12e87593cf', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/ZennyKenny/synthetic-net-promotor-score', endpoint='https://huggingface.co', repo_type='dataset', repo_id='ZennyKenny/synthetic-net-promotor-score'), pr_revision=None, pr_num=None)