In [14]:
%pip install be-great datasets transformers trl sdmetric

[31mERROR: Could not find a version that satisfies the requirement sdmetric (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for sdmetric[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.


## Step 0: load dataset

First we load the table we want to synthesize.

In [15]:
#from google.colab import drive
#drive.mount('/content/gdrive')

In [16]:
#data_path = "/content/gdrive/MyDrive/stats261/ctr_subset.csv"
data_path = "ctr_subset_imbalanced.csv"

In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split

data = pd.read_csv(data_path,index_col=0)

In [18]:
# Adjust order of columns to make target label the last
labels = data.pop('label')
data['label'] = labels
real_columns = data.columns

# Separate the rows where label is 1
data_1 = data[data['label'] == 1]
data_0 = data[data['label'] == 0]

# Randomly sample from rows where label is 0
data_0_sampled = data_0.sample(n=len(data_1), random_state=42)

# Combine the two dataframes
balanced_data = pd.concat([data_1, data_0_sampled])

train, test = train_test_split(balanced_data, test_size=0.2, random_state=42)

In [19]:
balanced_data['label'].value_counts()

label
1    2773
0    2773
Name: count, dtype: int64

In [20]:
balanced_data.to_csv("ctrTaskBalanced.csv",index=False)

In [21]:
print(balanced_data.shape, balanced_data.columns)

(5546, 24) Index(['age', 'residence', 'city', 'series_dev', 'series_group', 'emui_dev',
       'device_name', 'device_size', 'slot_id', 'pt_d', 'u_refreshTimes_x',
       'u_feedLifeCycle_x', 'u_browserLifeCycle', 'u_browserMode',
       'u_feedLifeCycle_y', 'u_refreshTimes_y', 'i_cat', 'i_dislikeTimes',
       'i_upTimes', 'e_m', 'e_po', 'e_pl', 'user_id', 'label'],
      dtype='object')


## Step 1: supervised-finetuning for table generation

In this step, we finetune a distillgpt2 model to perform synthetic table generation.

In [22]:
from be_great import GReaT
from transformers import AutoModelForCausalLM

duration = 500
max_seq_len = 500

#trained_checkpoint = None
trained_checkpoint = "./great_checkpoint_balanced"

model_great = GReaT(llm='distilgpt2', batch_size=32,  epochs=duration, fp16=True,save_steps=30000)
if trained_checkpoint is not None:
    model_great.load_from_dir(trained_checkpoint)

In [23]:
model_great.fit(train)

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,0.8108
1000,0.6624
1500,0.642
2000,0.6269
2500,0.615
3000,0.6038
3500,0.59
4000,0.5771
4500,0.5677
5000,0.5609


<be_great.great_trainer.GReaTTrainer at 0x7f79713375e0>

In [24]:
model_great.save("./great_checkpoint_balanced")



In [25]:
base_model = model_great.model
base_model.save_pretrained("./trained_base_model_balanced")

In [26]:
synthetic_data = model_great.sample(n_samples=len(train),max_length=max_seq_len)

4528it [01:45, 43.02it/s]                          


In [27]:
synthetic_data.to_csv("ctrTaskBalanced_GReaT_default_5546.csv",index=False)

## Step 2: reward model training

Our reward model is a powerful classifier trained on the real tabular data. We apply it on synthetic table rows, and the reward is maximize when the distance between synthetic and predicted class probabilities are minimized. The idea is that the synthetic data should preserved the feature-target relationship as found in the real data, by powerful classsifiers such as XGboost.

In [28]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
import xgboost as xgb

X_train, X_test = train.iloc[:, :-1], test.iloc[:, :-1]
y_train, y_test = train.iloc[:, -1],  test.iloc[:, -1]  # assume y is already encoded as 0 and 1

# Identify numerical and categorical columns
numerical_features = X_train.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X_train.select_dtypes(include=['object', 'category']).columns

# Preprocessing for numerical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler())
])

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])


# Define the model and preprocessors
classifier = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', classifier)])


pipeline.fit(X_train, y_train)

# Evaluation as sanity check
y_pred = pipeline.predict(X_test)
from sklearn.metrics import accuracy_score, classification_report
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.8162162162162162
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.83      0.82       542
           1       0.83      0.80      0.82       568

    accuracy                           0.82      1110
   macro avg       0.82      0.82      0.82      1110
weighted avg       0.82      0.82      0.82      1110



In [29]:
# Now try fitting a model from synthetic data and evaluate on test set
x_synth, y_synth = synthetic_data.iloc[:, :-1],synthetic_data.iloc[:, -1]

classifier_synth = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
pipeline_synth = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', classifier_synth)])
pipeline_synth.fit(x_synth, y_synth)
y_pred = pipeline_synth.predict(X_test)
from sklearn.metrics import accuracy_score, classification_report
print("Synthetic Accuracy:", accuracy_score(y_test, y_pred))
print("Synthetic Classification Report:\n", classification_report(y_test, y_pred))

Synthetic Accuracy: 0.7684684684684685
Synthetic Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.69      0.74       542
           1       0.74      0.85      0.79       568

    accuracy                           0.77      1110
   macro avg       0.77      0.77      0.77      1110
weighted avg       0.77      0.77      0.77      1110



## Step 3: reward finetuning

Now we define a reward function that does the following


1.   Given a generated text, reverse it back to a table row.
2.   Use the pre-trained classifier to predict its target based generated features.
3.   Given the distance between predicted and synthetic targets, calculate its reward.

Then we finetune the model trained above in PPO setting.

Cross check to see if PPO and RL work correctly



In [30]:
import numpy as np
import torch
from be_great.great_utils import _convert_text_to_tabular_data, _convert_tokens_to_text


def calcualte_reward(synth_data, classifier, preprocessor):
    X_synth = synth_data.iloc[:, :-1]
    y_synth = synth_data.iloc[:, -1]

    # Apply preprocessing pipeline
    #print(X_synth)
    X_synth_transformed = preprocessor.transform(X_synth)

    # Get predicted class probabilities
    y_pred_proba = classifier.predict_proba(X_synth_transformed)[:, 1]

    rewards = 1 - np.abs(y_pred_proba - y_synth)

    # Format rewards for PPO trainer
    # the PPO trainer expects a list of tensors
    return [torch.tensor(r) for r in rewards]

# Custom reward function
def reward_function(output_text, clasifier, preprocessor, columns, num_cols):
    # Replace with your actual reward computation logic
    synth_data = _convert_text_to_tabular_data(output_text, columns)
    # Remove rows where we have not generated anything
    synth_data = synth_data[~(synth_data == "placeholder").any(axis=1)]

    # Remove rows where all values are NaN
    synth_data = synth_data.dropna(how="all")

    # Remove rows with flawed numerical values but keep NaNs
    #print(len(num_cols), synth_data.shape)
    for i_num_cols in num_cols:
        coerced_series = pd.to_numeric(
            synth_data[i_num_cols], errors="coerce"
        )
        synth_data = synth_data[
            coerced_series.notnull() | synth_data[i_num_cols].isna()
        ]

    # Convert numerical columns to float
    synth_data[num_cols] = synth_data[num_cols].astype(float)
    if len(synth_data) > 0:
        return calcualte_reward(synth_data, clasifier, preprocessor)
    else:
        return [torch.tensor(0.0, dtype=torch.float32)]

In [31]:
def df_to_string_list(df):
    string_list = []
    for _, row in df.iterrows():
        row_string = ", ".join([f"{col} is {val}" for col, val in row.items()])
        string_list.append(row_string)
    return string_list

def df_cells_to_string_list(df):
    cell_string_list = []
    for col in df.columns:
        for val in df[col]:
            cell_string_list.append(f"{col} is {val}")
    return cell_string_list


In [32]:
numerical_features

Index(['age', 'residence', 'city', 'series_dev', 'series_group', 'emui_dev',
       'device_name', 'device_size', 'slot_id', 'pt_d', 'u_refreshTimes_x',
       'u_feedLifeCycle_x', 'u_browserLifeCycle', 'u_browserMode',
       'u_feedLifeCycle_y', 'u_refreshTimes_y', 'i_cat', 'i_dislikeTimes',
       'i_upTimes', 'e_m', 'e_po', 'e_pl', 'user_id'],
      dtype='object')

In [33]:
real_columns[-1]

'label'

In [34]:
df_text = df_to_string_list(synthetic_data)

# Test reconstruction of table from text
rewards = reward_function(df_text, classifier, preprocessor, real_columns, list(numerical_features)+[real_columns[-1]])

In [35]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from trl import PPOTrainer, PPOConfig
from trl.models.modeling_value_head import AutoModelForCausalLMWithValueHead
import torch
from torch.utils.data import DataLoader

llm = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(llm, padding_side='left')
tokenizer.pad_token = tokenizer.eos_token

# Load the trained base model into AutoModelForCausalLMWithValueHead
model = AutoModelForCausalLMWithValueHead.from_pretrained("./trained_base_model_balanced")

class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

# Define dataset. Use different columns as condition to improve diversity
dataset = CustomDataset(df_cells_to_string_list(train))
batch_size = 32
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Define PPO Configuration
ppo_config = PPOConfig(
    model_name=llm,
    learning_rate=1e-5,  # Slightly increased learning rate
    batch_size=batch_size,
    ppo_epochs=4,
    mini_batch_size=16,
    gradient_accumulation_steps=2,
)

# Initialize the PPO Trainer
ppo_trainer = PPOTrainer(
    config=ppo_config,
    model=model,
    ref_model=None,  # Reference model
    tokenizer=tokenizer,
    dataset=dataset
)

In [36]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

In [37]:
# PPO Training loop

#https://huggingface.co/docs/trl/main/en/ppo_trainer

import warnings
warnings.filterwarnings('ignore')

for epoch in range(ppo_config.ppo_epochs):
    epoch_rewards = []

    for batch in data_loader:
        try:
            # Tokenize inputs and move to device
            inputs = tokenizer(batch, return_tensors='pt', padding=True, truncation=True).to(device)
            input_ids = inputs['input_ids'].to(device)

            # Generate outputs from the model
            # Set max_length to the desired length of the generated text
            max_length = 300  # Adjust as needed for your application
            generated_ids = model.generate(input_ids, max_length=max_length, do_sample=True,temperature=0.70, pad_token_id=50256)

            generated_texts = [tokenizer.decode(generated_ids[i], skip_special_tokens=True) for i in range(generated_ids.size(0))]

            rewards = [reward_function([generated_text], classifier, preprocessor, real_columns, list(numerical_features)+[real_columns[-1]]) for generated_text in generated_texts]

            rewards = [item for sublist in rewards for item in sublist]

            epoch_rewards.extend(rewards)

            queries = [input_ids[i] for i in range(input_ids.size(0))]
            responses = [generated_ids[i] for i in range(generated_ids.size(0))]
            rewards = [reward.clone().detach() for reward in rewards]

            ppo_trainer.step(queries, responses, rewards)
        except:
            continue

    avg_reward = torch.mean(torch.stack(epoch_rewards)).item()
    print(f"Epoch: {epoch+1}/{ppo_config.ppo_epochs}, Average Reward: {avg_reward}")

print("PPO training completed!")


You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch: 1/4, Average Reward: 0.0025789199862629175
Epoch: 2/4, Average Reward: 0.0
Epoch: 3/4, Average Reward: 0.0
Epoch: 4/4, Average Reward: 0.0
PPO training completed!


In [38]:
save_directory = "RL_trained_balanced"
model.save_pretrained(save_directory)

## Step 4 (TODO)

Finally we load the trained parameters back to GReaT model, generate synthetic data, train another XGBoost on new synthtic data and observe changes its utility

In [39]:
# Run this cell if we are returning after RL training.
save_directory = "RL_trained_balanced"
model = AutoModelForCausalLMWithValueHead.from_pretrained(save_directory)

llm = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(llm, padding_side='left')
tokenizer.pad_token = tokenizer.eos_token

Some weights of the model checkpoint at RL_trained_balanced were not used when initializing GPT2LMHeadModel: ['v_head.summary.bias', 'v_head.summary.weight']
- This IS expected if you are initializing GPT2LMHeadModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPT2LMHeadModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [40]:
model_great.parameters = model.parameters
new_synthetic_data = model_great.sample(n_samples=len(train),max_length=duration)

new_df_text = df_to_string_list(new_synthetic_data)

new_rewards = reward_function(df_text, classifier, preprocessor, real_columns, list(numerical_features)+[real_columns[-1]])



4523it [01:44, 43.47it/s]                          


In [41]:
new_synthetic_data.to_csv("ctrTaskBalanced_GReaTRL_default_5546.csv",index=False)

In [42]:
x_synth_new, y_synth_new = new_synthetic_data.iloc[:,:-1], new_synthetic_data.iloc[:,-1]

classifier_synth = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
pipeline_synth = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', classifier_synth)])

pipeline_synth.fit(x_synth_new, y_synth_new)
y_pred = pipeline_synth.predict(X_test)
from sklearn.metrics import accuracy_score, classification_report
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
# Evaluation as sanity check
# from sklearn.metrics import accuracy_score, classification_report
# print("Accuracy:", accuracy_score(y_test, y_pred))
# print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.7810810810810811
Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.71      0.76       542
           1       0.76      0.85      0.80       568

    accuracy                           0.78      1110
   macro avg       0.79      0.78      0.78      1110
weighted avg       0.78      0.78      0.78      1110



In [43]:
new_synthetic_data

Unnamed: 0,age,residence,city,series_dev,series_group,emui_dev,device_name,device_size,slot_id,pt_d,...,u_feedLifeCycle_y,u_refreshTimes_y,i_cat,i_dislikeTimes,i_upTimes,e_m,e_po,e_pl,user_id,label
0,8.0,20.0,170.0,16.0,5.0,35.0,324.0,2401.0,54.0,2.022060e+11,...,17.0,7.0,98.0,0.0,9.0,1319.0,6.0,607.0,125659.0,1.0
1,3.0,20.0,372.0,30.0,3.0,35.0,351.0,1656.0,16.0,2.022061e+11,...,17.0,7.0,216.0,0.0,0.0,1205.0,6.0,1705.0,269415.0,0.0
2,8.0,21.0,220.0,11.0,8.0,20.0,215.0,2103.0,16.0,2.022061e+11,...,17.0,6.0,108.0,0.0,9.0,1025.0,6.0,1347.0,244655.0,0.0
3,8.0,39.0,429.0,31.0,3.0,20.0,346.0,2117.0,22.0,2.022061e+11,...,17.0,6.0,98.0,0.0,9.0,565.0,6.0,2883.0,240624.0,1.0
4,6.0,33.0,319.0,27.0,2.0,11.0,351.0,1656.0,16.0,2.022060e+11,...,17.0,6.0,98.0,0.0,9.0,1205.0,6.0,1350.0,213802.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4431,7.0,16.0,120.0,31.0,3.0,21.0,151.0,2451.0,17.0,2.022060e+11,...,11.0,0.0,10.0,0.0,9.0,591.0,6.0,2478.0,123769.0,1.0
4432,3.0,26.0,380.0,34.0,7.0,20.0,217.0,1565.0,38.0,2.022060e+11,...,17.0,4.0,219.0,0.0,9.0,1347.0,6.0,2016.0,244655.0,1.0
4433,2.0,20.0,328.0,17.0,4.0,18.0,288.0,2032.0,16.0,2.022060e+11,...,17.0,8.0,199.0,0.0,0.0,1300.0,2.0,140.0,190725.0,1.0
4434,2.0,18.0,297.0,21.0,4.0,30.0,125.0,1505.0,40.0,2.022061e+11,...,15.0,0.0,218.0,5.0,9.0,1364.0,7.0,723.0,270369.0,1.0
