In [None]:

# import .py files
import os
import pandas as pd
import PIL
import json
# import train_model
# import generate_synthetic
# import train_classifier
# import test_imgs
import split_dataset as split
import train_model
import numpy as np
import torch
from datetime import datetime
import shutil


In [None]:
# Project root; every other location used by this notebook is derived from it.
path_home = '/home/pathorad3090/Documents/Hadar/SyntheticEvaluation'
# Parent directory under which each trained model gets its own sub-folder.
path_models = f"{path_home}/models"
# Raw image directory; the train/test split JSON files are written here as well.
path_raw_data = f"{path_home}/data/mnist_images"

In [None]:
import logging

# Timestamped INFO-level logging for the whole notebook.
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)

# ---- Experiment parameters -------------------------------------------------
# Random seed handed to the split/subset helpers in later cells.
seed = 42
# Candidate numbers of real training images to experiment with.
train_sizes = [50, 100, 400, 800, 1000, 5000, 10000]
# Candidate numbers of synthetic images to generate.
gen_sizes = [30]
# Ratio of synthetic to real data.
synthetic_real_ratio = 0.5
# Number of GAN training iterations before switching.
gan_train_cutoff = 5000
# Fraction of the full dataset assigned to the training split.
train_ratio = 0.8

# ---- Echo the configuration so a run's log records what was used -----------
logger.info("Training Configuration:")
for param_label, param_value in (
    ("Train Sizes", train_sizes),
    ("Generation Sizes", gen_sizes),
    ("Synthetic/Real Ratio", synthetic_real_ratio),
    ("StyleGAN2-ADA Training Cutoff", gan_train_cutoff),
    ("Train Ratio", train_ratio),
):
    logger.info(f"{param_label}: {param_value}")
logger.info("-" * 40)  # visual separator


In [None]:
# Use preprocessing.ipynb to create a proper dataset
# Distribute files to relevant subfolders + create JSON


In [None]:
import logging

# Logging setup (same format as the configuration cell).
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)

# The JSON describing the full dataset is the input of the train/test split.
full_json = f"{path_home}/data/dataset_full.json"
train_json = full_json

logger.info(f"Loading data from {full_json} for splitting into train/test subsets.")

# Partition the full dataset using the ratio and seed from the configuration cell.
train_df, test_df = split.split_train_test(train_json, train_ratio, seed=seed)

# Both subsets are stored next to the raw images.
train_output_file = path_raw_data + "/train_data.json"
test_output_file = path_raw_data + "/test_data.json"

# Persist the training subset first, then the testing subset.
for part_name, part_df, part_file in (
    ("training", train_df, train_output_file),
    ("testing", test_df, test_output_file),
):
    logger.info(f"Saving {part_name} data to {part_file}.")
    split.save_data(part_df, part_file)

# Report the class distribution of each subset.
logger.info("Printing class distribution for training and testing datasets.")
for part_df, part_title in ((train_df, "Train"), (test_df, "Test")):
    split.print_class_distribution(part_df, part_title)

logger.info("Data splitting and saving completed successfully.")


In [None]:
# train MLP/CNN classifier, test for benchmark using test_imgs.py
# V

In [None]:
# `!export VAR=...` runs in a temporary subshell that exits immediately, so the
# variable never reaches this kernel or anything it launches. Setting it through
# os.environ changes the kernel's own environment, which child processes inherit.
# NOTE(review): libraries already imported in this kernel may have read the
# variable before this cell runs; move this ahead of the imports if the
# in-kernel setting matters — confirm.
import os

os.environ["MKL_SERVICE_FORCE_INTEL"] = "1"

In [None]:
import os
import logging
from datetime import datetime

# NOTE: this cell used to do `from random import seed`, which rebound the name
# `seed` from the integer set in the configuration cell to the `random.seed`
# function. That function object then ended up in the subset file name and was
# passed to `split.subset_data` as the seed. The import has been removed so the
# configured integer seed is used.

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Guard against a stale kernel in which `seed` is still bound to something else.
if not isinstance(seed, int):
    raise TypeError(
        f"`seed` must be the integer from the configuration cell, got {seed!r}; "
        "re-run the configuration cell before this one."
    )

# Training split written by the train/test split cell.
train_json = path_raw_data + '/train_data.json'

# Overrides the `train_sizes` list from the configuration cell for this run.
train_sizes = [40000]  # Adjust this as needed

for train_size in train_sizes:
    # Name the model after its training size (in thousands) and the start time.
    # True division with the `g` format gives "40K" or "0.05K"; the previous
    # floor division produced "40.0K" and "0.0K" for every size below 1000.
    model_name = f"model_{train_size / 1000:g}K_{datetime.now().strftime('%Y%m%d_%H-%M')}"
    path_model = os.path.join(path_models, model_name)
    path_experiments = os.path.join(path_model, 'experiments')
    path_dataset = os.path.join(path_model, 'dataset')
    path_subset_json = f"{path_model}/dataset_subset_size_{train_size}_seed_{seed}.json"

    logger.info(f"Creating directories for model: {model_name}")
    # Only the third returned value (the model's image directory) is needed here.
    _, _, path_model_images = split.open_folders(model_name, path_model)

    logger.info(f"Generating subset data from {train_json} with size {train_size} and seed {seed}")
    subset_df = split.subset_data(train_json, train_size, seed)

    logger.info(f"Subset DataFrame created. Path raw data: {path_raw_data}, Path model images: {path_model_images}")
    split.copy_images_to_model_and_dataset(subset_df, path_raw_data, path_model_images)

    logger.info(f"Saving subset data to {path_subset_json}")
    split.save_data(subset_df, path_subset_json)

    logger.info(f"Distributing files into label directories at {path_model_images}")
    split.distribute_files_to_label_dirs(path_model_images)

    logger.info("Generating labels JSON for the subset.")
    split.generate_labels_json(path_model_images, path_model_images, "dataset.json")

    logger.info(f"Creating dataset for {model_name}...")
    train_model.create_dataset(path_home, path_model_images, path_dataset)

    logger.info(f"Training model {model_name}...")
    train_model.run_stylegan_training(path_home, path_experiments, path_dataset, snap=10)

    logger.info(f"Cleaning up model directories for {model_name}")
    split.delete_images_and_dataset_dirs(path_model)

    # Generation uses the most recent network snapshot found under the model directory.
    path_latest_pkl_file = split.get_latest_pkl_file(path_model)
    if path_latest_pkl_file:
        logger.info(f"Most recent .pkl file: {path_latest_pkl_file}")
        for gen_size in gen_sizes:
            # Upper bound of the seed range handed to the generator; the requested
            # size is rounded down to a multiple of 10 for the directory name and log.
            # NOTE(review): if "0-N" is treated as an inclusive range this yields
            # N + 1 seeds — confirm against train_model.generate_stylegan_images.
            seeds_upper = gen_size // 10
            gen_total = seeds_upper * 10
            # One folder per generation size, e.g. "generations_30" (previously the
            # size became a sub-directory of a folder literally named "generations_").
            path_generations = os.path.join(path_model, f"generations_{gen_total}")
            logger.info(f"Generating synthetic images for {model_name} with generation size {gen_total}...")
            train_model.generate_stylegan_images(path_home, path_latest_pkl_file, path_generations, f"0-{seeds_upper}")
    else:
        logger.warning("No .pkl file found for generation.")


In [None]:
# Sanity check: report CUDA availability, then the number of visible devices.
print(torch.cuda.is_available(), torch.cuda.device_count(), sep="\n")


In [None]:
# show results - relevant metric: avg/class accuracy, f1, precision, recall, AUC-ROC...
# graph/tabular


In [None]:
# delete datasets (keep logs)

In [None]:
# def open_image_folder(source_dir, *, max_images=100000):
#     input_images = [str(f) for f in sorted(Path(source_dir).rglob('*')) if is_image_ext(f) and os.path.isfile(f)]

#     # Load labels.
#     labels = {}
#     meta_fname = os.path.join(source_dir, 'dataset.json')
#     if os.path.isfile(meta_fname):
#         with open(meta_fname, 'r') as file:
#             labels = json.load(file)['labels']
#             if labels is not None:
#                 labels = { x[0]: x[1] for x in labels }
#             else:
#                 labels = {}


#     def iterate_images():
#         for idx, fname in enumerate(input_images):
#             arch_fname = os.path.relpath(fname, source_dir)
#             arch_fname = arch_fname.replace('\\', '/')
#             img = np.array(PIL.Image.open(fname))
#             yield dict(img=img, label=labels.get(arch_fname))
#             if idx >= max_idx-1:
#                 break
#     return max_idx, iterate_images()
