In [30]:
import pandas as pd
from sdv.metadata import SingleTableMetadata

### Without any changes

In [66]:
from sdv.single_table import GaussianCopulaSynthesizer

# Baseline run: read the sales data as-is and let SDV detect the column metadata.
df = pd.read_csv("SampleDataFoodSales.csv")
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data=df)

# FixedCombinations constraint over the (Category, Product) columns.
constraint = {
    'constraint_class': 'FixedCombinations',
    'constraint_parameters': {'column_names': ['Category', 'Product']},
}
# FixedCombinations constraint over the (Region, City) columns.
constraint_1 = {
    'constraint_class': 'FixedCombinations',
    'constraint_parameters': {'column_names': ['Region', 'City']},
}

# Register both constraints, fit on the real data, then draw 1000 synthetic rows.
synthesizer = GaussianCopulaSynthesizer(metadata)
synthesizer.add_constraints(constraints=[constraint, constraint_1])
synthesizer.fit(df)
synthetic_data = synthesizer.sample(num_rows=1000)

Sampling rows: 100%|████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 11096.42it/s]


In [67]:
from sdv.evaluation.single_table import evaluate_quality

# Quality report comparing the baseline synthetic sample with the real data.
quality_report = evaluate_quality(
    real_data=df, synthetic_data=synthetic_data, metadata=metadata,
)

Creating report: 100%|███████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00,  7.44it/s]



Overall Quality Score: 73.25%

Properties:
Column Shapes: 86.38%
Column Pair Trends: 60.11%


### Manual addition of 2 cities along with some random MRP values

In [31]:
# Re-read the sales CSV so this experiment starts from a freshly loaded frame.
df = pd.read_csv("SampleDataFoodSales.csv")

In [52]:
# Two hand-written rows introducing Seattle and Chicago.
# Only ID, Region and City are populated here; no other columns are supplied.
df1 = pd.DataFrame(
    [
        ('ID00001', 'West', 'Seattle'),
        ('ID00002', 'Midwest', 'Chicago'),
    ],
    columns=['ID', 'Region', 'City'],
)

# Alternative (disabled): the same two rows with every column filled in.
# df1 = pd.DataFrame({'ID': ['ID00001', 'ID00002'],
#                    'Date': ['1-Jan', '1-Jan'],
#                    'Region': ['West', 'Midwest'],
#                    'City': ['Seattle', 'Chicago'],
#                    'Category': ['Bars', 'Crackers'],
#                    'Product': ['Carrot', 'Whole Wheat'],
#                    'Qty': [34, 45],
#                    'UnitPrice': [1.77, 3.49],
#                    'TotalPrice': [60.18, 157.05]})


In [53]:
# Stack the two extra rows under the original data and renumber the index.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# is the supported equivalent and yields the same combined frame.
df_final = pd.concat([df, df1], ignore_index=True)

  df_final = df.append(df1, ignore_index =True)


In [54]:
# Detect the column metadata again, this time from the augmented frame.
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data=df_final)

# Declare ID as an id column following the pattern "ID" + five digits,
# then use it as the table's primary key.
metadata.update_column(column_name='ID', sdtype='id', regex_format='ID[0-9]{5}')
metadata.set_primary_key(column_name='ID')

# FixedCombinations constraint over the (Category, Product) columns.
constraint = {
    'constraint_class': 'FixedCombinations',
    'constraint_parameters': {'column_names': ['Category', 'Product']},
}
# FixedCombinations constraint over the (Region, City) columns.
constraint_1 = {
    'constraint_class': 'FixedCombinations',
    'constraint_parameters': {'column_names': ['Region', 'City']},
}

In [55]:
from sdv.single_table import GaussianCopulaSynthesizer

# NOTE(review): this run passes default_distribution='uniform', whereas the
# baseline synthesizer above is built with no such argument, so the two quality
# scores differ in more than just the added rows. Confirm this is intended.
synthesizer = GaussianCopulaSynthesizer(metadata, default_distribution='uniform')
synthesizer.add_constraints(constraints=[constraint, constraint_1])

# Fit on the augmented data and draw 1000 synthetic rows.
synthesizer.fit(df_final)
synthetic_data = synthesizer.sample(num_rows=1000)

Sampling rows: 100%|████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 24730.13it/s]


In [56]:
# Number of synthetic rows per city.
synthetic_data['City'].value_counts()

Seattle        192
San Diego      172
New York       165
Chicago        163
Los Angeles    161
Boston         147
Name: City, dtype: int64

In [59]:
# Show the synthetic rows whose City is Chicago.
synthetic_data.loc[synthetic_data['City'] == "Chicago"]

Unnamed: 0,ID,Date,Region,City,Category,Product,Qty,UnitPrice,TotalPrice
3,ID00003,19-Sep,Midwest,Chicago,Cookies,Oatmeal Raisin,140.0,3.07,375.89
11,ID00011,5-Aug,Midwest,Chicago,Cookies,Oatmeal Raisin,186.0,2.85,632.74
16,ID00016,22-Sep,Midwest,Chicago,Bars,Bran,187.0,3.26,502.78
20,ID00020,21-Jun,Midwest,Chicago,Bars,Banana,201.0,2.91,104.39
25,ID00025,19-Jan,Midwest,Chicago,Bars,Bran,107.0,2.01,104.58
...,...,...,...,...,...,...,...,...,...
977,ID00977,27-Jul,Midwest,Chicago,Bars,Bran,32.0,2.40,50.86
978,ID00978,6-Nov,Midwest,Chicago,Bars,Bran,215.0,2.37,267.20
989,ID00989,9-Jul,Midwest,Chicago,Bars,Bran,42.0,3.32,186.23
995,ID00995,25-Jan,Midwest,Chicago,Cookies,Arrowroot,288.0,1.36,506.48


In [60]:
from sdv.evaluation.single_table import evaluate_quality

# Quality report comparing the synthetic sample with the augmented real data.
quality_report = evaluate_quality(
    real_data=df_final, synthetic_data=synthetic_data, metadata=metadata,
)

Creating report: 100%|███████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00,  9.79it/s]



Overall Quality Score: 53.7%

Properties:
Column Shapes: 65.96%
Column Pair Trends: 41.45%


In [61]:
from sdv.evaluation.single_table import run_diagnostic

# Diagnostic report for the synthetic sample against the augmented real data.
diagnostic_report = run_diagnostic(
    real_data=df_final, synthetic_data=synthetic_data, metadata=metadata,
)

Creating report: 100%|███████████████████████████████████████████████████████████████████| 4/4 [00:01<00:00,  2.90it/s]


DiagnosticResults:

SUCCESS:
✓ The synthetic data covers over 90% of the numerical ranges present in the real data
✓ The synthetic data covers over 90% of the categories present in the real data
✓ Over 90% of the synthetic rows are not copies of the real data
✓ The synthetic data follows over 90% of the min/max boundaries set by the real data





In [64]:
from sdv.evaluation.single_table import get_column_plot

# Plot the UnitPrice column for the real (augmented) data and the synthetic data.
fig = get_column_plot(
    real_data=df_final, synthetic_data=synthetic_data,
    column_name='UnitPrice', metadata=metadata,
)
fig.show()

In [43]:
# Save the synthetic rows to test.csv (path relative to the working directory).
# NOTE(review): to_csv also writes the DataFrame index as an unnamed first
# column by default — confirm that is wanted, otherwise pass index=False.
synthetic_data.to_csv("test.csv")