In [1]:
%pip install be-great datasets transformers trl sdmetric

[31mERROR: Could not find a version that satisfies the requirement sdmetric (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for sdmetric[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.


## Step 0: load dataset

First we load the table we want to synthesize.

In [2]:
#from google.colab import drive
#drive.mount('/content/gdrive')

In [3]:
data_path = "csv/wilt.csv"
#data_path = "ChurnModeling.csv"
#data_path = "iris.csv"
data_name = data_path.replace(".csv", "").replace("csv/", "")
test_idx = 500 # signals 0.5 split ratio

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split

data = pd.read_csv(data_path)
target = data.columns[-1]
#num_sample = min(500, len(data))
num_sample = None
if num_sample is not None:
    data = data.sample(num_sample, random_state = 42)

In [5]:
balanced_data = data

In [6]:
print(balanced_data.shape, balanced_data.columns)

(4839, 6) Index(['GLCM_Pan', 'Mean_G', 'Mean_R', 'Mean_NIR', 'SD_Plan', 'class'], dtype='object')


In [7]:
train, test = train_test_split(balanced_data, test_size=0.2, random_state=42)

## Step 1: supervised-finetuning for table generation

In this step, we finetune a distillgpt2 model to perform synthetic table generation.

In [8]:
from be_great import GReaT
from transformers import AutoModelForCausalLM

duration = 500
max_seq_len = 150

trained_checkpoint = None
#trained_checkpoint = "./great_checkpoint_ctrBalanced500_100"

model_great = GReaT(llm='gpt2', batch_size=32,  epochs=duration, fp16=True,save_steps=30000)
if trained_checkpoint is not None:
    model_great.load_from_dir(trained_checkpoint)

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
if trained_checkpoint is None:
    model_great.fit(train)

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss


In [None]:
model_great.save(f"checkpoints/great_checkpoint_{data_name}_{test_idx}")



In [None]:
base_model = model_great.model
base_model.save_pretrained(f"checkpoints/trained_base_model_{data_name}_{test_idx}")

In [None]:
synthetic_data = model_great.sample(n_samples=len(train),max_length=max_seq_len)

406it [00:08, 48.60it/s]                         


In [None]:
synthetic_data.to_csv(f"synth_data/{data_name}_GReaT_default_{test_idx}.csv",index=False)

## Step 2: Create DPO Dataset

A DPO-compatible dataset should have 3 entries: prompt, chosen, rejected.

We select the conditional columns from real rows as our prompt. For chosen set we use the corresponding real columns; for rejected set: 1, if the target is not in the prompt, then we alter the target value in the chosen set to create a rejected set with "wrong" target value, which should be less favorable, 2 if the target is in the prompt, then we replace all values in the chosen set with values from a row with different target.

In [None]:
import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict
import random

def create_perturbed_dataset(df, target, p):
    random.seed(42)  # For reproducibility
    np.random.seed(42)
    
    data = []

    all_unique_categories = df[target].unique()
    
    for idx, row in df.iterrows():
        # Perturb the order of cells
        shuffled_row = row.sample(frac=1)
        #print(shuffled_row)
        
        # Determine split index
        split_idx = int(p * len(shuffled_row))
        
        # Split into prompt and chosen sets
        prompt_set = shuffled_row[:split_idx]
        chosen_set = shuffled_row[split_idx:]
        
        if target in chosen_set.index:
            # If target is in chosen set
            chosen_target_value = chosen_set[target]
            # Change the category of the target column to create the rejected set
            different_categories = all_unique_categories[all_unique_categories != chosen_target_value]
            rejected_target_value = np.random.choice(different_categories)
            rejected_set = chosen_set.copy()
            rejected_set[target] = rejected_target_value
        else:
            # If target is in prompt set
            chosen_target_value = prompt_set[target]
            # Sample another row with different target classes
            other_rows = df[df[target] != chosen_target_value].sample(1, random_state=42)
            rejected_set = chosen_set.copy()
            for col in rejected_set.index:
                if col != target:
                    rejected_set[col] = other_rows.iloc[0][col]
        
        # Convert to strings
        prompt_str = ", ".join([f"{col} is {val}" for col, val in prompt_set.items()])
        chosen_str = ", ".join([f"{col} is {val}" for col, val in chosen_set.items()])
        rejected_str = ", ".join([f"{col} is {val}" for col, val in rejected_set.items()])
        
        data.append({
            "prompt": prompt_str,
            "chosen": chosen_str,
            "rejected": rejected_str
        })
    
    # Create a HuggingFace dataset
    dataset = Dataset.from_pandas(pd.DataFrame(data))
    return dataset


In [None]:
split_ratio = test_idx / 1000
dataset = create_perturbed_dataset(train, target, split_ratio)
print(dataset)

Dataset({
    features: ['prompt', 'chosen', 'rejected'],
    num_rows: 400
})


## Step 3: DPO Training



In [None]:
from transformers import AutoTokenizer,AutoModelForCausalLM
from trl.models.modeling_value_head import AutoModelForCausalLMWithValueHead
from trl import DPOTrainer,DPOConfig
import torch

llm = 'gpt2'
tokenizer = AutoTokenizer.from_pretrained(llm)
tokenizer.pad_token = tokenizer.eos_token

base_model_path = f"checkpoints/trained_base_model_{data_name}_{test_idx}"
beta = 0.1
epochs = 3
output_dir = f"checkpoints/trained_dpo_model_{data_name}_{test_idx}"

model = AutoModelForCausalLM.from_pretrained(
    base_model_path # location of saved SFT model
)
model_ref = AutoModelForCausalLM.from_pretrained(
    base_model_path
)

training_args = DPOConfig(
    beta=beta,
    output_dir=output_dir
)

dpo_trainer = DPOTrainer(
    model,
    model_ref,
    args=training_args,
    beta=beta,
    train_dataset=dataset,
    eval_dataset=dataset,
    tokenizer=tokenizer
)


Map: 100%|██████████| 400/400 [00:00<00:00, 932.13 examples/s]
Map: 100%|██████████| 400/400 [00:00<00:00, 948.24 examples/s]


In [None]:
dpo_trainer.train()

Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss


TrainOutput(global_step=150, training_loss=0.13053329467773436, metrics={'train_runtime': 25.8637, 'train_samples_per_second': 46.397, 'train_steps_per_second': 5.8, 'total_flos': 0.0, 'train_loss': 0.13053329467773436, 'epoch': 3.0})

In [None]:
dpo_trainer.save_model()


## Step 4: Utility Evaluation

Finally we load the trained parameters back to GReaT model, generate synthetic data, train another XGBoost on new synthtic data and observe changes its utility

In [None]:
# Run this cell if we are returning after RL training.
model = AutoModelForCausalLM.from_pretrained(output_dir)

In [None]:
model_great.parameters = model.parameters
new_synthetic_data = model_great.sample(n_samples=len(train),max_length=duration)

401it [00:08, 48.52it/s]                         


In [None]:
new_synthetic_data.to_csv(f"synth_data/{data_name}_GReaTDPO_default_{test_idx}.csv",index=False)

In [None]:
def column_type_mapping(df):
    type_mapping = {}
    for column in df.columns:
        if pd.api.types.is_numeric_dtype(df[column]):
            type_mapping[column] = 'continuous'
        else:
            type_mapping[column] = 'categorical'
    return type_mapping

In [None]:
from evaluator import *
import os

report_dir = "report"
os.makedirs(report_dir, exist_ok=True)

config = {"holdout_seed":42, "holdout_size":0.2, 'target_column':target,
        "fidelity_metrics": ["SumStats", "ColumnShape", "ColumnShapeHoldout"],
    "privacy_metrics": [],
    "utility_metrics": ["TabularUtility"],}
column_name_to_datatype = column_type_mapping(balanced_data)

save_path = f"{report_dir}/{data_name}_GReaT_default_{test_idx}"
evaluation_pipeline = EvaluationPipeline(real_data=balanced_data, synth_data=synthetic_data, column_name_to_datatype=column_name_to_datatype, config=config, save_path=save_path)
evaluation_pipeline.run_pipeline()



Initialing Evaluator...
Fidelity Evaluation Module initialized
Privacy Evaluation Module initialized
Utility Evaluation Module initialized
evaluated  SumStats
Generating report ...
(1/2) Evaluating Column Shapes: : 100%|██████████| 15/15 [00:00<00:00, 2166.55it/s]
(2/2) Evaluating Column Pair Trends: : 100%|██████████| 105/105 [00:01<00:00, 70.51it/s]

Overall Quality Score: 31.14%

Properties:
- Column Shapes: 43.78%
- Column Pair Trends: 18.5%
Generating report ...
(1/2) Evaluating Column Shapes: : 100%|██████████| 15/15 [00:00<00:00, 2398.48it/s]
(2/2) Evaluating Column Pair Trends: : 100%|██████████| 105/105 [00:01<00:00, 74.01it/s]

Overall Quality Score: 30.54%

Properties:
- Column Shapes: 43.23%
- Column Pair Trends: 17.85%
evaluated  ColumnShape
Traceback (most recent call last):
  File "/opt/conda/envs/llm/lib/python3.10/site-packages/evaluator/interfaces/evaluation_interface.py", line 36, in evaluate
    metric_instance = self.metric_factory.create_instance(metric, self.real

  metrics = pd.concat([metrics, pd.DataFrame({


Fitting XGBoost
Fitting CatBoost
Fitted real data!
Fitting NaiveBayes
Fitting KNeighbors
Fitting DecisionTree
Fitting RandomForest


  _warn_prf(average, modifier, msg_start, len(result))
  metrics = pd.concat([metrics, pd.DataFrame({


Fitting XGBoost
Fitting CatBoost
Fitted synth data!
Fitting NaiveBayes
Fitting KNeighbors
Fitting DecisionTree
Fitting RandomForest


  metrics = pd.concat([metrics, pd.DataFrame({


Fitting XGBoost
Fitting CatBoost
Fitted augmented data!
evaluated  TabularUtility
Plotting box plots
Plotting  SumStats
Plotting column shape
Plotting  ColumnShape
Plotting  TabularUtility
Making report path at: report/census1000_GReaT_default_100
Error converting  ColumnShape_real_table.csv  to csv
Error converting  ColumnShape_synthetic_table.csv  to csv


In [None]:
# For RL-based data
save_path = f"{report_dir}/{data_name}_GReaTDPO_default_{test_idx}"
evaluation_pipeline = EvaluationPipeline(real_data=balanced_data, synth_data=new_synthetic_data, column_name_to_datatype=column_name_to_datatype, config=config, save_path=save_path)
evaluation_pipeline.run_pipeline()

Initialing Evaluator...
Fidelity Evaluation Module initialized
Privacy Evaluation Module initialized
Utility Evaluation Module initialized
evaluated  SumStats
Generating report ...
(1/2) Evaluating Column Shapes: : 100%|██████████| 15/15 [00:00<00:00, 2072.97it/s]
(2/2) Evaluating Column Pair Trends: :   0%|          | 0/105 [00:00<?, ?it/s]

(2/2) Evaluating Column Pair Trends: : 100%|██████████| 105/105 [00:01<00:00, 68.22it/s]

Overall Quality Score: 31.45%

Properties:
- Column Shapes: 44.33%
- Column Pair Trends: 18.56%
Generating report ...
(1/2) Evaluating Column Shapes: : 100%|██████████| 15/15 [00:00<00:00, 2239.35it/s]
(2/2) Evaluating Column Pair Trends: : 100%|██████████| 105/105 [00:01<00:00, 72.25it/s]

Overall Quality Score: 30.68%

Properties:
- Column Shapes: 43.52%
- Column Pair Trends: 17.85%
evaluated  ColumnShape
Traceback (most recent call last):
  File "/opt/conda/envs/llm/lib/python3.10/site-packages/evaluator/interfaces/evaluation_interface.py", line 36, in evaluate
    metric_instance = self.metric_factory.create_instance(metric, self.real_data, self.synth_data, self.holdout_data, self.column_name_to_datatype, self.config)
  File "/opt/conda/envs/llm/lib/python3.10/site-packages/evaluator/factories/metric_factory.py", line 29, in create_instance
    return metric_class(real_data, synth_data, holdou


The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.



Fitting XGBoost
Fitting CatBoost
Fitted real data!
Fitting NaiveBayes
Fitting KNeighbors
Fitting DecisionTree
Fitting RandomForest



The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.



Fitting XGBoost
Fitting CatBoost
Fitted synth data!
Fitting NaiveBayes
Fitting KNeighbors
Fitting DecisionTree
Fitting RandomForest



The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.



Fitting XGBoost
Fitting CatBoost
Fitted augmented data!
evaluated  TabularUtility
Plotting box plots
Plotting  SumStats
Plotting column shape
Plotting  ColumnShape
Plotting  TabularUtility
Making report path at: report/census1000_GReaTDPO_default_100
Error converting  ColumnShape_real_table.csv  to csv
Error converting  ColumnShape_synthetic_table.csv  to csv


## Compare F1 scores

In [None]:
df_pre = pd.read_csv(f"{report_dir}/{data_name}_GReaT_default_{test_idx}/TabularUtility_synth.csv")
df_post = pd.read_csv(f"{report_dir}/{data_name}_GReaTDPO_default_{test_idx}/TabularUtility_synth.csv")

print(df_pre['F1'].mean())
print(df_post['F1'].mean())