In [1]:
# Run identifiers: generator checkpoint, dataset name, and target epsilon.
model_name = 'princeton-nlp/Sheared-LLaMA-1.3B'
dataset_name = 'tab'
target_epsilon = 'inf'

# The "/" in the hub id is replaced so the run name can be used inside file names.
safe_model_name = model_name.replace("/", "_")
model_config = f'{safe_model_name}_{dataset_name}_DP_{target_epsilon}'

# Path to the CSV file where the outputs are saved
synthetic_data_path = f'./data/synthetic/{model_config}_outputs-final.csv'

### Downstream Utility Evaluation

In [2]:
import pandas as pd
import json
from transformers import TrainingArguments as HfTrainingArguments
from syntheval.eval.downstream.classify.train_classifier import TrainingArguments, ModelArguments, Classifier, Arguments
from syntheval.utils.utils import create_classification_dataset
from syntheval.utils.filtering import process_df

2025-03-23 21:34:43.719124: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1742780083.737107  127092 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1742780083.742749  127092 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-23 21:34:43.762190: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/kramesh3/nltk_data...
[nltk_data]   Package stopwords is a

#### Classification: Creating the dataset

Filtering data and creating a structured format out of raw synthetic text.

We assume that the synthetic text was generated together with its labels (in most setups, the labels serve as control codes).

In [3]:
# Create the label mapping from the original data,
# and create a test set for evaluating the model once it is trained
from datasets import load_from_disk, concatenate_datasets
from syntheval.utils.utils import encode_labels

In [4]:
# Build the country label mapping from the complete corpus (all three splits).
tab_data = load_from_disk('./data/generator/data/tab/')

# Every column other than these three is removed from the dataset.
kept_columns = ('country', 'text', 'year')
col_names = [name for name in tab_data['train'].column_names if name not in kept_columns]
tab_data = tab_data.remove_columns(col_names)

# Fold validation and test into 'train' so the label encoder is given every split.
all_splits = [tab_data[split_name] for split_name in ('train', 'validation', 'test')]
tab_data['train'] = concatenate_datasets(all_splits)

# json_mapping_exists=False: no mapping file is assumed to exist yet at this path.
mapping_path = f'./data/benchmark/classification/data/{dataset_name}-mapping.json'
_, _ = encode_labels(
    tab_data['train'],
    label_column='country',
    json_mapping_exists=False,
    json_mapping_path=mapping_path,
    multilabel=False,
)


Converting to pd.Dataframe format...
Saving label mapping to./data/benchmark/classification/data/tab-mapping.json...


In [5]:
from pathlib import Path

# Reload the splits from disk: the mapping cell above dropped columns and overwrote 'train'.
tab_data = load_from_disk('./data/generator/data/tab/')

# Encode the labels with the mapping JSON that already exists (json_mapping_exists=True).
# NOTE(review): the held-out file is built from the 'validation' split even though it is
# saved as test.csv — confirm this is intended.
df, _ = encode_labels(tab_data['validation'], label_column = 'country', json_mapping_exists = True,
                      json_mapping_path = f'./data/benchmark/classification/data/{dataset_name}-mapping.json', multilabel=False)

test_file_path = f'./data/benchmark/classification/data/test/{dataset_name}/test.csv'
# DataFrame.to_csv does not create missing folders, so make sure the target directory exists.
Path(test_file_path).parent.mkdir(parents=True, exist_ok=True)
print(f"Saving test file to: {test_file_path}")
df.to_csv(test_file_path)

Converting to pd.Dataframe format...
Label mapping already exists.
Saving test file to: ./data/benchmark/classification/data/test/tab/test.csv


In [12]:
# Use this mapping for converting the synthetic data to a structured format
# Filter the raw synthetic outputs, then write the structured train/validation/test files
# using the label mapping that already exists on disk.
df = process_df(pd.read_csv(synthetic_data_path), text_column = 'output_text')
_, _, _ = create_classification_dataset(
    df,
    label_column = 'country',
    multilabel = False,
    json_mapping_exists = True,
    json_mapping_path = f'./data/benchmark/classification/data/{dataset_name}-mapping.json',
    output_dir = f'./data/benchmark/classification/data/{model_config}',
    train_ratio = 0.7,
    val_ratio = 0.15,
    test_ratio = 0.15,
)

Label mapping already exists.
Data saved to ./data/benchmark/classification/data/princeton-nlp_Sheared-LLaMA-1.3B_tab_DP_inf
Train: 1577 samples, Validation: 338 samples, Test: 339 samples


In [14]:
# The number of classes is the number of entries in the saved label mapping.
mapping_path = f'./data/benchmark/classification/data/{dataset_name}-mapping.json'
with open(mapping_path) as mapping_file:
    data = json.load(mapping_file)

n_labels_task = len(data)
print(f"Number of labels: {n_labels_task}")

Number of labels: 17


#### Classification: Training the model

This step can also be run as a script; a sample script is provided in eval.downstream.classify.

In [None]:
if __name__ == "__main__":
    # Fine-tune the downstream classifier on the structured synthetic dataset.
    train_args = TrainingArguments()
    model_args = ModelArguments()

    # Base model; its name is also embedded in the checkpoint directory below.
    model_args.model_name = 'bert-base-uncased'
    model_args.problem_type = 'single_label_classification'
    model_args.n_labels = n_labels_task
    model_args.is_train = True

    # Text and label columns to read from the dataset files.
    model_args.text_field = 'output_text'
    model_args.label_field = 'Label'

    # Where the dataset is read from and where the fine-tuned model is stored.
    model_args.path_to_dataset = f'./data/benchmark/classification/data/{model_config}'
    model_args.path_to_model = f'./data/benchmark/classification/models/{model_args.model_name}_{model_config}'

    args = Arguments(train=train_args, model=model_args)

    print("Training:\n")
    obj = Classifier(args = args)
    obj.finetune_model()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Training:

Loading training and validation data.


Generating train split: 0 examples [00:00, ? examples/s]

  return pd.read_csv(xopen(filepath_or_buffer, "rb", download_config=download_config), **kwargs)


Generating validation split: 0 examples [00:00, ? examples/s]

  return pd.read_csv(xopen(filepath_or_buffer, "rb", download_config=download_config), **kwargs)


Loading base model for fine-tuning...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preprocessing dataset!


Running tokenizer on dataset:   0%|          | 0/1577 [00:00<?, ? examples/s]



Running tokenizer on dataset:   0%|          | 0/338 [00:00<?, ? examples/s]

Model training begins...




Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.717368,0.31351,0.38009,0.340342,0.849112
2,No log,0.23602,0.74872,0.769231,0.758321,0.982249
3,No log,0.161291,0.896011,0.918552,0.904642,0.994083


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


#### Classification: Testing the model

In [18]:
if __name__ == "__main__":
    # Evaluate the fine-tuned classifier on the held-out CSV built from the original data.
    train_args = TrainingArguments()
    model_args = ModelArguments()

    # Switch the classifier from training to test mode.
    model_args.is_train = False
    model_args.is_test = True

    # Same base model and task setup as in the training cell.
    model_args.model_name = 'bert-base-uncased'
    model_args.problem_type = "single_label_classification"
    model_args.n_labels = n_labels_task

    # Text and label columns of the test CSV, plus the extra columns to retain.
    model_args.text_field = 'text'
    model_args.label_field = 'Label'
    model_args.retain_columns = ['country', 'year']

    # Checkpoint to load, test file to read, and where the outputs are written.
    model_args.path_to_model = f'./data/benchmark/classification/models/{model_args.model_name}_{model_config}'
    model_args.path_to_dataset = f'./data/benchmark/classification/data/test/{dataset_name}/test.csv'
    model_args.path_to_output_csv = f'./data/benchmark/classification/test-results/{model_args.model_name}_{model_config}_test_outputs.csv'
    model_args.path_to_aggregated_results = './data/benchmark/classification/compiled_benchmark_results.csv'

    args = Arguments(train=train_args, model=model_args)

    print("Testing:\n")
    obj = Classifier(args = args)
    obj.test_model()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Testing:

Loading test data
Checkpoint exists:  ./data/benchmark/classification/models/bert-base-uncased_princeton-nlp_Sheared-LLaMA-1.3B_tab_DP_inf 
Loading model from the checkpoint...
Preprocessing dataset...


Running tokenizer on dataset:   0%|          | 0/127 [00:00<?, ? examples/s]



Model evaluation begins...




Saving file!
Evaluation results:  {'eval_loss': 0.1836952269077301, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_f1': 1.0, 'eval_accuracy': 1.0, 'eval_runtime': 0.4263, 'eval_samples_per_second': 297.888, 'eval_steps_per_second': 7.037}


#### Classification: Fairness auditing of the trained classifier

In [19]:
from syntheval.eval.downstream.classify.visualize import tabulate_results

# Tabulate the saved test outputs, with the fairness table grouped by country.
path_to_test_output = f'./data/benchmark/classification/test-results/{model_args.model_name}_{model_config}_test_outputs.csv'
tabulate_results(
    [path_to_test_output],
    n_labels = n_labels_task,
    problem_type = "multiclass",
    subgroup_type = "country",
    print_fairness = True,
)

Evaluation Results:
+-----------+--------+-------+----------+---------------+---------------+
| Precision | Recall |  F1   | Accuracy | F1-micro-diff | F1-macro-diff |
+-----------+--------+-------+----------+---------------+---------------+
|   1.000   | 1.000  | 1.000 |  1.000   |     0.00      |     0.00      |
+-----------+--------+-------+----------+---------------+---------------+

Fairness Results:
+---+----------+------------+----------------+----------+----------+
|   | Accuracy | Group Type | Num of Samples | f1_macro | f1_micro |
+---+----------+------------+----------------+----------+----------+
| 0 |   1.0    |    DNK     |       4        |   1.0    |   1.0    |
| 1 |   1.0    |    GBR     |       61       |   1.0    |   1.0    |
| 2 |   1.0    |    IRL     |       1        |   1.0    |   1.0    |
| 3 |   1.0    |    NOR     |       1        |   1.0    |   1.0    |
| 4 |   1.0    |    POL     |       7        |   1.0    |   1.0    |
| 5 |   1.0    |    SWE     |       11 

### Descriptive Analysis of Synthetic Data

In [3]:
import nltk
import pandas as pd
from syntheval.eval.descriptive.descriptor import TextDescriptor
from syntheval.eval.descriptive.arguments import TextDescriptorArgs
# Fetch the NLTK 'punkt_tab' resource; as the last expression, the call's return value
# (True in the recorded run) becomes the cell output.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /home/kramesh3/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/kramesh3/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/kramesh3/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [6]:
from datasets import load_from_disk
# Load the synthetic outputs and the real corpus.
synth_df = pd.read_csv(synthetic_data_path)
real_texts = load_from_disk('./data/generator/data/tab')

# Trim both sides to the size of the smaller one so the comparison uses equal sample counts.
len_samples = min(len(synth_df), len(real_texts['train']))
synth_df = synth_df.head(len_samples)

# Keep only the 'text' column of the first len_samples real training rows.
real_texts = real_texts['train'].select(range(len_samples))['text']

#### Text length and distributional comparisons

In [14]:
# Build the descriptor over the synthetic texts, with the real texts as the reference set.
# Plotting is requested through TextDescriptorArgs(produce_plot=True).
desc_analyze = TextDescriptor(texts = synth_df['output_text'].tolist(), args = TextDescriptorArgs(produce_plot=True), reference_texts = real_texts)

In [15]:
# Compare the synthetic texts with the reference texts on the three requested metrics.
# NOTE(review): this calls an underscore-prefixed (private by convention) method — confirm
# whether the library exposes a public entry point for this comparison.
desc_analyze._compare_to_reference_distribution(metrics = ['text-length', 'jaccard', 'cosine'])

Comparing text length...
+-------------------+-----------------------+-----------------------+--------------+
| Metric            |   Text Distribution 1 |   Text Distribution 2 |   Difference |
| Avg. Length       |               181.798 |              1343.42  |    -1161.62  |
+-------------------+-----------------------+-----------------------+--------------+
| Min Length        |               146     |               185     |      -39     |
+-------------------+-----------------------+-----------------------+--------------+
| Max Length        |               219     |              5144     |    -4925     |
+-------------------+-----------------------+-----------------------+--------------+
| Avg. Unique Words |               114.62  |               487.966 |     -373.346 |
+-------------------+-----------------------+-----------------------+--------------+
Comparing distributions...
Jaccard similarity: 0.148
Cosine similarity: 0.416


In [31]:
import pyLDAvis
# Enable pyLDAvis rendering inside the notebook, then request a 3-topic display from the descriptor.
pyLDAvis.enable_notebook()
tm = desc_analyze._topic_modeling_display(num_topics=3)

In [33]:
# Bare expression: the notebook renders the object returned by the topic-modeling call above.
tm

### Privacy Leakage Assessment

#### Privacy: Defining the entities

In [35]:
import re
import pandas as pd
from datasets import load_from_disk

# Real training split (used for its entity annotations) and the synthetic outputs to audit.
synth_df = pd.read_csv(synthetic_data_path)
real_texts = load_from_disk('./data/generator/data/tab')['train']

In [40]:
# Collect the span text of every PERSON / DATETIME mention in the real annotations.
# Each annotation record is indexed by annotator, and each annotator entry carries
# an 'entity_mentions' list.
target_entity_types = ('PERSON', 'DATETIME')
entities = []
skipped_records = 0
for annotation in real_texts['annotations']:
    try:
        for annotator in annotation:
            for entity in annotation[annotator]['entity_mentions']:
                if entity['entity_type'] in target_entity_types:
                    entities.append(entity['span_text'])
    except (KeyError, TypeError):
        # Best-effort extraction: a record without the expected structure is skipped
        # (mentions gathered from it before the error are kept), but it is counted so
        # the loss is visible instead of silent.
        skipped_records += 1

if skipped_records:
    print(f"Skipped {skipped_records} annotation record(s) with an unexpected structure")

In [41]:
# Number of entity strings collected from the real annotations.
print(len(entities))

2990


#### Evaluating leakage of entities

In [42]:
from syntheval.eval.privacy.metrics import entity_leakage, search_and_compute_EPO

In [43]:
# Run the entity-leakage check of the synthetic outputs against the collected entity strings.
# total_leakage is treated as a fraction by the print cell below (it is multiplied by 100).
# NOTE(review): the third argument looks like an output pickle path for the detailed
# analysis — confirm against the entity_leakage signature.
total_leakage, privacy_analysis = entity_leakage(synth_df['output_text'].tolist(), entities, 'privacy-leakage.pkl')

In [47]:
# Report the leakage fraction as a percentage with three decimals.
print(f"Percentage of leaked entities: {100*total_leakage:.3f} %")

Percentage of leaked entities: 1.801 %


#### Evaluating span memorization

Span memorization is evaluated for only the first 15 entities because the search is time-intensive.

In [None]:
# Keep only the first 15 entities for the span-memorization search.
# This overwrites `entities`; re-run the extraction cell to get the full list back.
entities = entities[:15]

In [None]:
# Reference frame: the first 10 synthetic texts, stored under a 'text' column.
# NOTE(review): the reference is built from synthetic outputs rather than the real corpus —
# confirm this is intended.
t_df = pd.DataFrame({'text': synth_df['output_text'].tolist()[:10]})


# NOTE(review): `fake_entities` and `text_field` are not defined in any cell shown in this
# notebook, so this call raises NameError on a fresh kernel (Restart & Run All). The 15-entity
# `entities` list from the previous cell is presumably what was meant for entity_patterns,
# and text_field should name the text column — confirm both before relying on this cell.
# NOTE(review): the return value is discarded here; capture it if the next cell reports it.
search_and_compute_EPO(synth_file = synth_df, ref_file = t_df, 
                       synth_phrase_file_path = 'synth-outputs.csv', ref_phrase_file_path = 'ref-outputs.csv',
                       entity_patterns = fake_entities, max_window_len = 3,
                       text_field = text_field)

Length: 10
Total number of entities 2990


In [22]:
# NOTE(review): this prints `total_leakage`, which the visible cells only assign from
# entity_leakage(...). The recorded value differs from the entity-leakage percentage above,
# so the variable was presumably reassigned from the span-memorization result in a cell that
# is not shown — assign that result explicitly before this print.
print(f"Percentage of leaked entity contexts: {100*total_leakage:.3f} %")

Percentage of leaked entity contexts: 9.306 %


### Qualitative Evaluation Against Real Data

In [2]:
import pandas as pd
from syntheval.eval.text_quality.metrics import TextQualityEval
from syntheval.eval.text_quality.arguments import MauveArgs, LMArgs, FrechetArgs
from dataclasses import dataclass
from datasets import load_from_disk


# Load the synthetic and real samples and truncate both to the size of the smaller set.
synthetic_samples = pd.read_csv(synthetic_data_path)
real_samples = load_from_disk('./data/generator/data/tab')
len_samples = min(len(synthetic_samples), len(real_samples['train']))
synthetic_samples = synthetic_samples.head(len_samples)
real_samples = real_samples['train'].select(range(len_samples))

# Evaluation frame, aligned by position: generated text in 'source', real text in 'reference'.
df = pd.DataFrame({})
df['source'] = synthetic_samples['output_text']
df['reference'] = real_samples['text']


@dataclass
class args_temp:
    """Bundle of the three argument types handed to TextQualityEval."""
    FrechetArgs: FrechetArgs
    MauveArgs: MauveArgs
    LMArgs: LMArgs


# NOTE(review): the argument classes themselves (not instances) are passed here —
# confirm TextQualityEval expects classes rather than constructed objects.
args_ = args_temp(FrechetArgs=FrechetArgs, MauveArgs=MauveArgs, LMArgs=LMArgs)
qual_estimator = TextQualityEval(args_)

2025-03-21 16:28:51.622122: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1742588931.640903   93933 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1742588931.646692   93933 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-21 16:28:51.668562: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Compute perplexity and the FID score on the source/reference frame.
# The return values are not captured; the results are presumably kept on the estimator,
# since the next cell reads them through return_results().
qual_estimator.calculate_perplexity(df)
qual_estimator.calculate_fid_score(df)

Using the latest cached version of the module from /home/kramesh3/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--perplexity/8ab643ad86f568b7d1d5f7822373fa7401ff5ff0297ccf114b0ca6a33be96bc0 (last modified on Thu Dec 19 16:22:06 2024) since it couldn't be found locally at evaluate-metric--perplexity, or remotely on the Hugging Face Hub.


  0%|          | 0/64 [00:00<?, ?it/s]

In [4]:
# Print the results returned by the estimator as a table.
qual_estimator.print_metrics(qual_estimator.return_results())

Automated Open-Ended Text Evaluation Metrics:
+-------+------------+
|  fid  | perplexity |
+-------+------------+
| 0.771 |   16.846   |
+-------+------------+
