In [1]:

# import .py files
import os
import pandas as pd
import PIL
import json
# import train_model
# import generate_synthetic
# import train_classifier
# import test_imgs
import split_dataset as split
import train_model
import numpy as np
import torch
from datetime import datetime
import shutil


In [2]:
path_home = '/home/pathorad3090/Documents/Hadar/SyntheticEvaluation'
path_models = path_home + "/models"
path_raw_data = path_home + "/data/mnist_images"

In [3]:
# set parameters for which train_size, gen_size, synthetic/real ratio, GAN train cutoff
# set initial dir for experiments

# Set parameters as lists of integer values
seed = 42
train_sizes = [50, 100, 400, 800, 1000, 5000, 10000]  # List of different training sizes
gen_sizes = [30]      # List of different generation sizes
synthetic_real_ratio = 0.5          # Ratio of synthetic to real data
gan_train_cutoff = 5000             # Number of GAN training iterations before switching
train_ratio = 0.8


# Print out the configured parameters for verification
print("Training Configuration:")
print(f"Train Size: {train_sizes}")
print(f"Generation Size: {gen_sizes}")
print(f"Synthetic/Real Ratio: {synthetic_real_ratio}")
print(f"StyleGAN2-ADA Training Cutoff: {gan_train_cutoff}")
# print(f"Experiment Run Directory: {experiment_run_dir}")
print("-" * 40)  # Separator for clarity


Training Configuration:
Train Size: [50, 100, 400, 800, 1000, 5000, 10000]
Generation Size: [30]
Synthetic/Real Ratio: 0.5
StyleGAN2-ADA Training Cutoff: 5000
----------------------------------------


In [4]:
# Use preprocessing.ipynb to create a proper dataset
# Distribute files to relevant subfolders + create JSON


In [5]:
# split into train/test

full_json = path_home + '/data/dataset_full.json'

# split into test/train subsets
train_json = full_json

train_df, test_df = split.split_train_test(train_json, train_ratio, seed=seed)

train_output_file = f"{path_raw_data}/train_data.json"
test_output_file = f"{path_raw_data}/test_data.json"

split.save_data(train_df, train_output_file)
split.save_data(test_df, test_output_file)

split.print_class_distribution(train_df, "Train")
split.print_class_distribution(test_df, "Test")


Splitting data: 100%|██████████| 10/10 [00:00<00:00, 105.23class/s]


Train data statistics:
Total number of samples: 47995
0    4738
1    5393
2    4766
3    4904
4    4673
5    4336
6    4734
7    5012
8    4680
9    4759
Name: label, dtype: int64
Test data statistics:
Total number of samples: 12005
0    1185
1    1349
2    1192
3    1227
4    1169
5    1085
6    1184
7    1253
8    1171
9    1190
Name: label, dtype: int64


In [6]:
# train MLP/CNN classifier, test for benchmark using test_imgs.py
# V

In [7]:
!export MKL_SERVICE_FORCE_INTEL=1

In [None]:
# for loop: create dir, create dataset, train model, generate syn-images, test

# from random import seed
train_json = path_raw_data + '/train_data.json'
# input_file = path_home + '/data/dataset.json'



# train_sizes = [40000]
# train_sizes = [50]
for train_size in train_sizes:
    # Create a name for the subset and the model
    model_name = f"model_{float(train_size) // 1000}K_{datetime.now().strftime('%Y%m%d_%H-%M')}"
    path_model = os.path.join(path_models, model_name)
    path_experiments = os.path.join(path_model, 'experiments')
    path_dataset = os.path.join(path_model, 'dataset')
    path_subset_json = f"{path_model}/dataset_subset_size_{train_size}_seed_{seed}.json"
    
    _,_,path_model_images = split.open_folders(model_name, path_model)
    subset_df = split.subset_data(train_json, train_size, seed)
    
    # print('subset_df: ',subset_df)
    print('subset_df, path_raw_data, path_model_images: ', path_raw_data, path_model_images)
    split.copy_images_to_model_and_dataset(subset_df, path_raw_data, path_model_images)
    split.save_data(subset_df, path_subset_json)

    # split the subset train images into separate dirs by class 
    split.distribute_files_to_label_dirs(path_model_images)
    # Get dataset.json labels for the subset
    split.generate_labels_json(path_model_images, path_model_images, "dataset.json")
    
    print(f"Creating dataset for {model_name}...")
    train_model.create_dataset(path_home, path_model_images, path_dataset)

    print(f"Training {model_name}...")
    train_model.run_stylegan_training(path_home, path_experiments, path_dataset, snap=10)
    
    
    split.delete_images_and_dataset_dirs(path_model)
    
    path_latest_pkl_file = split.get_latest_pkl_file(path_model)
    if path_latest_pkl_file:
        print(f"Most recent .pkl file: {path_latest_pkl_file}")
        for gen_size in gen_sizes:
            gen_size = gen_size // 10
            path_generations = os.path.join(os.path.join(path_model, 'generations_'), str(gen_size*10))
            print(f"Generating synthetic images for {model_name}...")
            train_model.generate_stylegan_images(path_home,path_latest_pkl_file ,path_generations , f"0-{gen_size}")


Subsetting classes: 100%|██████████| 10/10 [00:00<00:00, 2167.15class/s]

subset_df, path_raw_data, path_model_images:  /home/pathorad3090/Documents/Hadar/SyntheticEvaluation/data/mnist_images /home/pathorad3090/Documents/Hadar/SyntheticEvaluation/models/model_0.0K_20241103_22-59/images
Starting to generate labels JSON file...
Base directory: /home/pathorad3090/Documents/Hadar/SyntheticEvaluation/models/model_0.0K_20241103_22-59/images
Generated labels JSON file with 50 entries.
Creating dataset for model_0.0K_20241103_22-59...
Creating dataset with command: python /home/pathorad3090/Documents/Hadar/SyntheticEvaluation/stylegan2-ada-pytorch/dataset_tool.py --source /home/pathorad3090/Documents/Hadar/SyntheticEvaluation/models/model_0.0K_20241103_22-59/images --dest /home/pathorad3090/Documents/Hadar/SyntheticEvaluation/models/model_0.0K_20241103_22-59/dataset



  resample = { 'box': PIL.Image.BOX, 'lanczos': PIL.Image.LANCZOS }[resize_filter]
  resample = { 'box': PIL.Image.BOX, 'lanczos': PIL.Image.LANCZOS }[resize_filter]
100%|██████████| 50/50 [00:00<00:00, 5658.19it/s]


Training model_0.0K_20241103_22-59...
Running command: python /home/pathorad3090/Documents/Hadar/SyntheticEvaluation/stylegan2-ada-pytorch/train.py --snap 10 --cond=1 --outdir /home/pathorad3090/Documents/Hadar/SyntheticEvaluation/models/model_0.0K_20241103_22-59/experiments --data /home/pathorad3090/Documents/Hadar/SyntheticEvaluation/models/model_0.0K_20241103_22-59/dataset --kimg=1000

Training options:
{
  "num_gpus": 1,
  "image_snapshot_ticks": 10,
  "network_snapshot_ticks": 10,
  "metrics": [
    "fid50k_full"
  ],
  "random_seed": 0,
  "training_set_kwargs": {
    "class_name": "training.dataset.ImageFolderDataset",
    "path": "/home/pathorad3090/Documents/Hadar/SyntheticEvaluation/models/model_0.0K_20241103_22-59/dataset",
    "use_labels": true,
    "max_size": 50,
    "xflip": false,
    "resolution": 32
  },
  "data_loader_kwargs": {
    "pin_memory": true,
    "num_workers": 3,
    "prefetch_factor": 2
  },
  "G_kwargs": {
    "class_name": "training.networks.Generator",

AttributeError: module 'split_dataset' has no attribute 'delete_subdirs_and_move_file'

In [None]:
print(torch.cuda.is_available())
print(torch.cuda.device_count())


In [None]:
# show results - relevant metric: avg/class accuracy, f1, precision, recall, AUC-ROC...
# graph/tabular


In [None]:
# delete datasets (keep logs)

In [None]:
# def open_image_folder(source_dir, *, max_images=100000):
#     input_images = [str(f) for f in sorted(Path(source_dir).rglob('*')) if is_image_ext(f) and os.path.isfile(f)]

#     # Load labels.
#     labels = {}
#     meta_fname = os.path.join(source_dir, 'dataset.json')
#     if os.path.isfile(meta_fname):
#         with open(meta_fname, 'r') as file:
#             labels = json.load(file)['labels']
#             if labels is not None:
#                 labels = { x[0]: x[1] for x in labels }
#             else:
#                 labels = {}


#     def iterate_images():
#         for idx, fname in enumerate(input_images):
#             arch_fname = os.path.relpath(fname, source_dir)
#             arch_fname = arch_fname.replace('\\', '/')
#             img = np.array(PIL.Image.open(fname))
#             yield dict(img=img, label=labels.get(arch_fname))
#             if idx >= max_idx-1:
#                 break
#     return max_idx, iterate_images()
