In [1]:
%pip install be-great datasets transformers trl sdmetric

[31mERROR: Could not find a version that satisfies the requirement sdmetric (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for sdmetric[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.


## Step 0: load dataset

First we load the table we want to synthesize.

In [2]:
#from google.colab import drive
#drive.mount('/content/gdrive')

In [3]:
#data_path = "census1000.csv"
#data_path = "ChurnModeling.csv"
data_path = "ctrBalanced500.csv"
data_name = data_path.replace(".csv", "")
test_idx = 100

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

data = pd.read_csv(data_path)

num_sample = min(500, len(data))
if num_sample is not None:
    data = data.sample(num_sample, random_state = 42)

In [5]:
balanced_data = data
target = data.columns[-1]

# Assume target is label encoded as 0/1

target_encoder = LabelEncoder()
balanced_data[target] = target_encoder.fit_transform(balanced_data[target])


# Separate the rows where label is 1
data_1 = data[data[target] == 1]
data_0 = data[data[target] == 0]

# Randomly sample from rows where label is 0
data_0_sampled = data_0.sample(n=min(len(data_1), len(data_0)), random_state=42)

# Combine the two dataframes
balanced_data = pd.concat([data_1, data_0_sampled])

balanced_data[target].value_counts()

label
1    262
0    238
Name: count, dtype: int64

In [6]:
print(balanced_data.shape, balanced_data.columns)

(500, 24) Index(['age', 'residence', 'city', 'emui_dev', 'device_name', 'device_size',
       'task_id', 'adv_id', 'creat_type_cd', 'adv_prim_id', 'slot_id',
       'spread_app_id', 'hispace_app_tags', 'app_second_class', 'pt_d',
       'u_refreshTimes_x', 'u_feedLifeCycle_y', 'u_refreshTimes_y', 'i_cat',
       'i_upTimes', 'e_m', 'e_pl', 'user_id', 'label'],
      dtype='object')


In [7]:
train, test = train_test_split(balanced_data, test_size=0.2, random_state=42)

## Step 1: supervised-finetuning for table generation

In this step, we finetune a distillgpt2 model to perform synthetic table generation.

In [11]:
from be_great import GReaT
from transformers import AutoModelForCausalLM

duration = 500
max_seq_len = 500

trained_checkpoint = None
#trained_checkpoint = "./great_checkpoint_ctrBalanced500_100"

model_great = GReaT(llm='gpt2', batch_size=32,  epochs=duration, fp16=True,save_steps=30000)
if trained_checkpoint is not None:
    model_great.load_from_dir(trained_checkpoint)

In [12]:
if trained_checkpoint is None:
    model_great.fit(train)

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,0.8372
1000,0.6514
1500,0.6123
2000,0.5792
2500,0.5486
3000,0.5212
3500,0.4981
4000,0.478
4500,0.4622
5000,0.4495


In [14]:
f"./great_checkpoint_{data_name}_{test_idx}"

'./great_checkpoint_ctrBalanced500_100'

In [13]:
model_great.save(f"./great_checkpoint_{data_name}_{test_idx}")



In [15]:
base_model = model_great.model
base_model.save_pretrained(f"./trained_base_model_{data_name}_{test_idx}")

In [16]:
synthetic_data = model_great.sample(n_samples=len(train),max_length=max_seq_len)

465it [00:20, 22.28it/s]                         


In [17]:
synthetic_data.to_csv(f"{data_name}_GReaT_default_{test_idx}.csv",index=False)

## Step 2: reward model training

Our reward model is a powerful classifier trained on the real tabular data. We apply it on synthetic table rows, and the reward is maximize when the distance between synthetic and predicted class probabilities are minimized. The idea is that the synthetic data should preserved the feature-target relationship as found in the real data, by powerful classsifiers such as XGboost.

In [18]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report

real_columns = train.columns

X_train, X_test = train.iloc[:, :-1], test.iloc[:, :-1]
y_train, y_test = train.iloc[:, -1], test.iloc[:, -1]

# Identify numerical and categorical columns
numerical_features = X_train.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X_train.select_dtypes(include=['object', 'category']).columns

# Preprocessing for numerical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler())
])

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Define the model and preprocessors
classifier = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
pipeline_real = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', classifier)])

# Fit the pipeline on the training data
pipeline_real.fit(X_train, y_train)

# Predict on the test data
y_pred = pipeline_real.predict(X_test)

# Evaluation as a sanity check
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.61
Classification Report:
               precision    recall  f1-score   support

           0       0.67      0.56      0.61        54
           1       0.56      0.67      0.61        46

    accuracy                           0.61       100
   macro avg       0.62      0.61      0.61       100
weighted avg       0.62      0.61      0.61       100



In [19]:
# Now try fitting a model from synthetic data and evaluate on test set
x_synth, y_synth = synthetic_data.iloc[:, :-1],synthetic_data.iloc[:, -1]

classifier_synth = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
pipeline_synth = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', classifier_synth)])
pipeline_synth.fit(x_synth, y_synth)
y_pred = pipeline_synth.predict(X_test)
from sklearn.metrics import accuracy_score, classification_report
print("Synthetic Accuracy:", accuracy_score(y_test, y_pred))
print("Synthetic Classification Report:\n", classification_report(y_test, y_pred))

Synthetic Accuracy: 0.52
Synthetic Classification Report:
               precision    recall  f1-score   support

           0       0.57      0.48      0.52        54
           1       0.48      0.57      0.52        46

    accuracy                           0.52       100
   macro avg       0.52      0.52      0.52       100
weighted avg       0.53      0.52      0.52       100



## Step 3: reward finetuning

Now we define a reward function that does the following


1.   Given a generated text, reverse it back to a table row.
2.   Use the pre-trained classifier to predict its target based generated features.
3.   Given the distance between predicted and synthetic targets, calculate its reward.

Then we finetune the model trained above in PPO setting.

Cross check to see if PPO and RL work correctly



In [20]:
import numpy as np
import torch
from be_great.great_utils import _convert_text_to_tabular_data, _convert_tokens_to_text


def calcualte_reward(synth_data, pipeline):
    X_synth = synth_data.iloc[:, :-1]
    y_synth = pd.to_numeric(synth_data.iloc[:, -1])
    #y_synth = REAL_LABEL_ENCODER.transform(y_synth)

    # Apply preprocessing pipeline
    #print(X_synth.dtypes)

    # Get predicted class probabilities
    y_pred_proba = pipeline.predict_proba(X_synth)[:, 1]

    rewards = 1 - np.abs(y_pred_proba - y_synth)

    # Format rewards for PPO trainer
    # the PPO trainer expects a list of tensors
    return [torch.tensor(r) for r in rewards]

# Custom reward function
def reward_function(output_text, pipeline, columns, num_cols):
    # Replace with your actual reward computation logic
    synth_data = _convert_text_to_tabular_data(output_text, columns)
    # Remove rows where we have not generated anything
    #synth_data = synth_data[~(synth_data == "placeholder").any(axis=1)]

    # Remove rows where all values are NaN
    #synth_data = synth_data.dropna(how="all")

    # Remove rows with flawed numerical values but keep NaNs
    #print(len(num_cols), synth_data.shape,num_cols)
    for i_num_cols in num_cols:
        synth_data[i_num_cols] = pd.to_numeric(
            synth_data[i_num_cols], errors="coerce"
        )
        #print("coerced_series",coerced_series)
        #print("synth_data[i_num_cols]",synth_data[i_num_cols])
        #print("i_num_cols",i_num_cols)
        #synth_data = synth_data[
        #    coerced_series.notnull() | synth_data[i_num_cols].isna()
        #]
    #print(synth_data.shape)

    # Convert numerical columns to float
    synth_data[num_cols] = synth_data[num_cols].astype(float)
    #print(synth_data.dtypes)
    if len(synth_data) > 0:
        return calcualte_reward(synth_data, pipeline)
    else:
        return [torch.tensor(0.0, dtype=torch.float32)]

In [21]:
import random

def df_to_string_list(df, fraction_to_select=1):
    string_list = []
    for _, row in df.iterrows():
        # Calculate the number of cells to select based on the fraction
        if fraction_to_select < 1:
            num_to_select = max(1, int(len(row) * fraction_to_select))  # Ensure at least one cell is selected
            
            # Randomly select the specified fraction of cells from the row
            selected_columns = random.sample(list(row.index), num_to_select)
        else:
            selected_columns = df.columns
        
        # Create the row string using only the selected columns
        row_string = ", ".join([f"{col} is {row[col]}" for col in selected_columns])
        string_list.append(row_string)
    return string_list

def df_cells_to_string_list(df):
    cell_string_list = []
    for col in df.columns:
        for val in df[col]:
            cell_string_list.append(f"{col} is {val}")
    return cell_string_list


In [22]:
numerical_features

Index(['age', 'residence', 'city', 'emui_dev', 'device_name', 'device_size',
       'task_id', 'adv_id', 'creat_type_cd', 'adv_prim_id', 'slot_id',
       'spread_app_id', 'hispace_app_tags', 'app_second_class', 'pt_d',
       'u_refreshTimes_x', 'u_feedLifeCycle_y', 'u_refreshTimes_y', 'i_cat',
       'i_upTimes', 'e_m', 'e_pl', 'user_id'],
      dtype='object')

In [23]:
real_columns[-1]

'label'

In [24]:
df_text = df_to_string_list(train)

# Test reconstruction of table from text
rewards = reward_function(df_text, pipeline_real, real_columns, list(numerical_features))
print(torch.mean(torch.tensor([r.item() for r in rewards])))

tensor(0.7642)


## Make Training Data In DPO Format

In [25]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from trl import PPOTrainer, PPOConfig
from trl.models.modeling_value_head import AutoModelForCausalLMWithValueHead
import torch
from torch.utils.data import DataLoader

llm = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(llm, padding_side='left')
tokenizer.pad_token = tokenizer.eos_token

# Load the trained base model into AutoModelForCausalLMWithValueHead
model = AutoModelForCausalLMWithValueHead.from_pretrained(f"./trained_base_model_{data_name}_{test_idx}")

class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

# Define dataset. Use different columns as condition to improve diversity
# or use target to simplify training
# or use feature to get target
#starting_df = train.drop(columns=[target])
starting_df = train[[target]]
dataset = CustomDataset(df_to_string_list(starting_df,1))
batch_size = 8
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Define PPO Configuration
ppo_config = PPOConfig(
    model_name=llm,
    learning_rate=5e-7,  # Slightly increased learning rate
    batch_size=batch_size,
    ppo_epochs=6,
    mini_batch_size=4,
    gradient_accumulation_steps=2,
)

# Initialize the PPO Trainer
ppo_trainer = PPOTrainer(
    config=ppo_config,
    model=model,
    ref_model=None,  # Reference model
    tokenizer=tokenizer,
    dataset=dataset
)

In [25]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

In [26]:
# PPO Training loop

#https://huggingface.co/docs/trl/main/en/ppo_trainer

#import warnings
#warnings.filterwarnings('ignore')

for epoch in range(ppo_config.ppo_epochs):
    epoch_rewards = []

    for batch in data_loader:
        try:
            # Tokenize inputs and move to device
            inputs = tokenizer(batch, return_tensors='pt', padding=True, truncation=True).to(device)
            input_ids = inputs['input_ids'].to(device)

            # Generate outputs from the model
            # Set max_length to the desired length of the generated text
            max_length = max_seq_len  # Adjust as needed for your application
            generated_ids = model.generate(input_ids, max_length=max_length, do_sample=True,temperature=0.70, pad_token_id=50256)

            generated_texts = [tokenizer.decode(generated_ids[i], skip_special_tokens=True) for i in range(generated_ids.size(0))]

            #rewards = [reward_function([generated_text], pipeline, real_columns, list(numerical_features)) for generated_text in generated_texts]
            rewards = reward_function(generated_texts, pipeline_real, real_columns, list(numerical_features))
            #print(len(generated_texts),len(rewards))

            #rewards = [item for sublist in rewards for item in sublist]

            epoch_rewards.extend(rewards)

            queries = [input_ids[i] for i in range(input_ids.size(0))]
            responses = [generated_ids[i] for i in range(generated_ids.size(0))]
            rewards = [reward.clone().detach() for reward in rewards]

            ppo_trainer.step(queries, responses, rewards)
        except Exception as e:
            print(f"Error: {e}")
            continue
        #break

    avg_reward = torch.mean(torch.stack(epoch_rewards)).item()
    print(f"Epoch: {epoch+1}/{ppo_config.ppo_epochs}, Average Reward: {avg_reward}")

print("PPO training completed!")


You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch: 1/6, Average Reward: 0.553227961063385




Epoch: 2/6, Average Reward: 0.5562756061553955




Epoch: 3/6, Average Reward: 0.5386244058609009




Error: Unable to parse string "1 0" at position 1




Epoch: 4/6, Average Reward: 0.5276709198951721




Error: Unable to parse string "0)" at position 1




Error: Unable to parse string "0) pt_d" at position 5
Error: Unable to parse string "1).0" at position 0




Error: Unable to parse string "1 1" at position 3




Error: Unable to parse string "0) __index_level_0__" at position 3




Epoch: 5/6, Average Reward: -2.611746311187744




Error: Unable to parse string "1 0" at position 0
Error: Unable to parse string "1)." at position 5




Error: Unable to parse string "0) hispace_app_tags" at position 5




Error: Unable to parse string "1)" at position 2
Error: Unable to parse string "0 0" at position 3
Error: Unable to parse string "0)" at position 3




Error: Unable to parse string "0)" at position 0
Error: Unable to parse string "1 0" at position 7
Error: Unable to parse string "0); device_name" at position 5




Error: Unable to parse string "1 1" at position 7




Error: Unable to parse string "1)" at position 7




Error: Unable to parse string "0)." at position 5
Error: Unable to parse string "1 0" at position 7




Error: Unable to parse string "0).0" at position 5
Error: Unable to parse string "1 1.0" at position 2
Error: Unable to parse string "1) pt_d" at position 0
Epoch: 6/6, Average Reward: 0.5482426285743713
PPO training completed!


In [None]:
save_directory = f"RL_trained_{data_name}_{test_idx}"
model.save_pretrained(save_directory)

## Step 4 (TODO)

Finally we load the trained parameters back to GReaT model, generate synthetic data, train another XGBoost on new synthtic data and observe changes its utility

In [None]:
# Run this cell if we are returning after RL training.
save_directory = f"RL_trained_{data_name}_{test_idx}"
model = AutoModelForCausalLMWithValueHead.from_pretrained(save_directory)

llm = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(llm, padding_side='left')
tokenizer.pad_token = tokenizer.eos_token

Some weights of the model checkpoint at RL_trained_census1000_100 were not used when initializing GPT2LMHeadModel: ['v_head.summary.bias', 'v_head.summary.weight']
- This IS expected if you are initializing GPT2LMHeadModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPT2LMHeadModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
model_great.parameters = model.parameters
new_synthetic_data = model_great.sample(n_samples=len(train),max_length=duration)

new_df_text = df_to_string_list(new_synthetic_data)

new_rewards = reward_function(new_df_text, pipeline, real_columns, list(numerical_features)+[real_columns[-1]])

print(torch.mean(torch.stack(new_rewards)))

188it [00:03, 59.82it/s]                        

tensor(0.6004)





In [None]:
new_synthetic_data.to_csv(f"{data_name}_GReaTRL_default_{test_idx}.csv",index=False)

In [None]:
x_synth_new, y_synth_new = new_synthetic_data.iloc[:,:-1], new_synthetic_data.iloc[:,-1]


classifier_synth = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
pipeline_synth_new = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', classifier_synth)])

pipeline_synth_new.fit(x_synth_new, y_synth_new)
y_pred = pipeline_synth_new.predict(X_test)
from sklearn.metrics import accuracy_score, classification_report
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
# Evaluation as sanity check
# from sklearn.metrics import accuracy_score, classification_report
# print("Accuracy:", accuracy_score(y_test, y_pred))
# print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.6363636363636364
Classification Report:
               precision    recall  f1-score   support

           0       0.59      0.91      0.71        22
           1       0.80      0.36      0.50        22

    accuracy                           0.64        44
   macro avg       0.69      0.64      0.61        44
weighted avg       0.69      0.64      0.61        44



In [None]:
new_synthetic_data

Unnamed: 0,residence,city,emui_dev,device_name,device_size,task_id,adv_id,creat_type_cd,adv_prim_id,slot_id,...,pt_d,u_refreshTimes_x,u_feedLifeCycle_y,u_refreshTimes_y,i_cat,i_upTimes,e_m,e_pl,user_id,label
0,26.0,172.0,20.0,248.0,2032.0,17116.0,21776.0,8.0,1852.0,17.0,...,2.022061e+11,0.0,16.0,0.0,98.0,0.0,73.0,1509.0,212965.0,0.0
1,18.0,297.0,21.0,183.0,2401.0,31706.0,12646.0,8.0,1036.0,54.0,...,2.022060e+11,5.0,17.0,5.0,98.0,9.0,705.0,2835.0,189469.0,0.0
2,29.0,116.0,20.0,153.0,2032.0,21812.0,18340.0,8.0,2066.0,38.0,...,2.022061e+11,8.0,17.0,8.0,98.0,9.0,1444.0,1853.0,124834.0,1.0
3,21.0,291.0,35.0,351.0,2032.0,26452.0,22532.0,2.0,1524.0,38.0,...,2.022061e+11,6.0,17.0,6.0,171.0,0.0,1205.0,2182.0,148931.0,1.0
4,20.0,328.0,29.0,337.0,1656.0,28290.0,13224.0,8.0,1036.0,17.0,...,2.022061e+11,7.0,17.0,7.0,98.0,9.0,1482.0,332.0,241069.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1708,39.0,429.0,21.0,333.0,2117.0,12092.0,16444.0,3.0,1482.0,54.0,...,2.022061e+11,0.0,17.0,0.0,17.0,9.0,1194.0,2999.0,203992.0,0.0
1709,21.0,220.0,35.0,319.0,2117.0,19358.0,19175.0,8.0,1909.0,59.0,...,2.022060e+11,8.0,17.0,8.0,98.0,9.0,1277.0,656.0,216007.0,0.0
1710,21.0,434.0,37.0,252.0,2117.0,222881.0,12620.0,3.0,1557.0,26.0,...,2.022061e+11,8.0,17.0,7.0,98.0,0.0,1097.0,797.0,260086.0,0.0
1711,33.0,319.0,11.0,151.0,2032.0,33257.0,13886.0,10.0,1036.0,16.0,...,2.022061e+11,6.0,17.0,6.0,98.0,0.0,591.0,2893.0,206589.0,0.0
