In [59]:
import torch
import random
import os, shutil, glob
from PIL import Image
from tqdm import tqdm
from pathlib import Path

In [60]:
# Root folder of the art dataset; every other path in this notebook is built from it.
art_image_dir = '../../../datasets/art'

# Create <root>/<split>/<class> for every split/class combination.
# os.makedirs(..., exist_ok=True) is idempotent and also repairs a split folder
# that already exists but is missing one of its class sub-folders, which a
# "create only when the split folder is absent" check would silently skip.
for split_name in ('train', 'test', 'validation'):
    for class_name in ('ai-generated', 'web-scraping'):
        os.makedirs(os.path.join(art_image_dir, split_name, class_name), exist_ok=True)

# The split folders live directly under art_image_dir, so cells that iterate
# over os.listdir(art_image_dir) use this list to skip them.
excluded_directories = [os.path.join(art_image_dir, "train"), os.path.join(art_image_dir, "test"), os.path.join(art_image_dir, "validation")]


In [61]:
# Raw image folders, one per class, located directly under the dataset root.
ai_images_dir, real_images_dir = (
    os.path.join(art_image_dir, class_folder)
    for class_folder in ('AiArtData', 'RealArt')
)


In [62]:
# Delete the "<name>.<ext>:Zone.Identifier" sidecar files from both raw image folders
# (presumably left behind when the images were copied from Windows -- confirm).
pattern = '*.*:Zone.Identifier'

# Glob inside each image folder rather than the current working directory.
# glob returns the path joined with the folder, so os.remove receives a path
# that actually resolves; a bare file name would be looked up relative to the
# notebook's working directory and never match.
files_to_delete = []
for images_dir in (ai_images_dir, real_images_dir):
    # glob.escape keeps any glob metacharacters in the folder path literal.
    files_to_delete.extend(glob.glob(os.path.join(glob.escape(images_dir), pattern)))

for sidecar_path in files_to_delete:
    os.remove(sidecar_path)

### Convert the images to a standard format - PNG

In [63]:
# Re-encode every JPG/JPEG/GIF in the raw class folders as a PNG stored next to
# the original (same base name). The originals are left in place by this cell.
# NOTE(review): two sources that share a base name (e.g. a.jpg and a.gif) map to
# the same a.png, so the later one overwrites the earlier one.
convertible_extensions = ('.jpg', '.jpeg', '.gif')

for folder in os.listdir(art_image_dir):
    folder_path = os.path.join(art_image_dir, folder)
    # Skip the train/test/validation folders, and any stray file in the dataset
    # root (os.listdir on a file would raise NotADirectoryError).
    if folder_path in excluded_directories or not os.path.isdir(folder_path):
        continue
    for file in tqdm(os.listdir(folder_path)):
        # Compare in lower case so '.JPG' / '.Jpeg' / '.GIF' are converted too.
        if not file.lower().endswith(convertible_extensions):
            continue
        file_path = os.path.join(folder_path, file)
        new_file_name = os.path.splitext(file_path)[0] + '.png'
        # The context manager closes the source file as soon as the PNG is
        # written instead of keeping one open handle per converted image.
        with Image.open(file_path) as img:
            img.save(new_file_name, 'PNG')

100%|██████████| 537/537 [00:00<00:00, 1816404.23it/s]
100%|██████████| 435/435 [00:00<00:00, 1747626.67it/s]


### Remove the original JPG/JPEG/GIF files left over after the PNG conversion

In [64]:
# Delete the JPG/JPEG/GIF originals from the raw class folders, but only when a
# PNG with the same base name exists, so an image whose conversion failed or
# never ran is not lost.
for folder in os.listdir(art_image_dir):
    folder_path = os.path.join(art_image_dir, folder)
    # Skip the train/test/validation folders and any stray file in the dataset root.
    if folder_path in excluded_directories or not os.path.isdir(folder_path):
        continue
    for file in tqdm(os.listdir(folder_path)):
        # Case-insensitive match so '.JPG' etc. are handled as well.
        if not file.lower().endswith(('.jpg', '.jpeg', '.gif')):
            continue
        file_path = os.path.join(folder_path, file)
        png_path = os.path.splitext(file_path)[0] + '.png'
        if not os.path.exists(png_path):
            # No converted copy: keep the only version of this image.
            print(f"Kept (no PNG copy found): {file_path}")
            continue
        os.remove(file_path)
        print(f"Removed: {file_path}")

100%|██████████| 537/537 [00:00<00:00, 1806207.90it/s]
100%|██████████| 435/435 [00:00<00:00, 1351497.96it/s]


### Resize the images to a width of 800 px while maintaining aspect ratio

In [65]:
# Target width in pixels; the height is derived per image to keep its aspect ratio.
width = 800

# Resize every image in the raw class folders in place (the file is overwritten).
for folder in os.listdir(art_image_dir):
    folder_path = os.path.join(art_image_dir, folder)
    # Skip the train/test/validation folders and any stray file in the dataset root.
    if folder_path in excluded_directories or not os.path.isdir(folder_path):
        continue
    print("----working on folder----", folder_path)
    for file in tqdm(os.listdir(folder_path)):
        image_path = os.path.join(folder_path, file)
        try:
            img = Image.open(image_path)
        except OSError:
            # Pillow's "cannot identify image file" error is an OSError subclass,
            # as is the error for a sub-directory; report and move on instead of
            # aborting the whole run on one non-image entry.
            print(f"Skipped (not a readable image): {image_path}")
            continue
        with img:
            if img.width == width:
                continue  # already at the target width
            # Integer arithmetic avoids float rounding in width / (w / h);
            # max(1, ...) keeps extremely wide images from getting height 0.
            height = max(1, width * img.height // img.width)
            resized_image = img.resize((width, height))
        # Save only after the source handle is closed: resized_image is a new
        # in-memory image, so overwriting the original path is safe here.
        resized_image.save(image_path)



----working on folder---- ../../../datasets/art/AiArtData


100%|██████████| 537/537 [00:00<00:00, 25475.80it/s]


----working on folder---- ../../../datasets/art/RealArt


100%|██████████| 435/435 [00:00<00:00, 23322.54it/s]


### Copy 70% of data to train, 20% to test, 10% to validation

In [66]:
# Source folders holding the raw images of each class.
aiart_data_dir = '../../../datasets/art/AiArtData'
real_art_dir = '../../../datasets/art/RealArt'

# Destination folders: one per (split, class) pair.
ai_train_dir = '../../../datasets/art/train/ai-generated'
real_train_dir = '../../../datasets/art/train/web-scraping'
ai_test_dir = '../../../datasets/art/test/ai-generated'
real_test_dir = '../../../datasets/art/test/web-scraping'
ai_val_dir = '../../../datasets/art/validation/ai-generated'
real_val_dir = '../../../datasets/art/validation/web-scraping'

# List each class folder, then shuffle so the split really is random.
# os.listdir order is arbitrary, so the names are sorted first and shuffled
# with a fixed seed: re-running the cell on the same files gives the same split.
split_rng = random.Random(42)
aiart_files = sorted(os.listdir(aiart_data_dir))
real_art_files = sorted(os.listdir(real_art_dir))
split_rng.shuffle(aiart_files)
split_rng.shuffle(real_art_files)

# Number of files per split: 70% train, 20% test.
aiart_train_count = int(0.7 * len(aiart_files))
real_art_train_count = int(0.7 * len(real_art_files))

aiart_test_count = int(0.2 * len(aiart_files))
real_art_test_count = int(0.2 * len(real_art_files))

# Nominal 10% validation size. The validation slices below take *everything*
# left after train and test, so rounding never drops a file and the actual
# validation size can be slightly larger than these counts.
aiart_val_count = int(0.1 * len(aiart_files))
real_art_val_count = int(0.1 * len(real_art_files))

# Slice the shuffled lists into consecutive, non-overlapping splits.
aiart_train_files = aiart_files[:aiart_train_count]
real_train_files = real_art_files[:real_art_train_count]

aiart_test_files = aiart_files[aiart_train_count:aiart_train_count + aiart_test_count]
real_test_files = real_art_files[real_art_train_count:real_art_train_count + real_art_test_count]

aiart_val_files = aiart_files[aiart_train_count + aiart_test_count:]
real_val_files = real_art_files[real_art_train_count + real_art_test_count:]

# (source folder, destination folder, file names) for every split of every class.
# AI training images go to ai_train_dir; sending them to real_train_dir would
# label them as web-scraped art and leave train/ai-generated empty.
copy_plan = [
    (aiart_data_dir, ai_train_dir, aiart_train_files),
    (aiart_data_dir, ai_test_dir, aiart_test_files),
    (aiart_data_dir, ai_val_dir, aiart_val_files),
    (real_art_dir, real_train_dir, real_train_files),
    (real_art_dir, real_test_dir, real_test_files),
    (real_art_dir, real_val_dir, real_val_files),
]

for source_dir, destination_dir, file_names in copy_plan:
    for file in file_names:
        shutil.copy(os.path.join(source_dir, file), os.path.join(destination_dir, file))


FileNotFoundError: [Errno 2] No such file or directory: '../../../datasets/art/AIArtData'

In [None]:
# Use the GPU when torch reports CUDA as available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cuda'

In [None]:
# IPython shell escape: run the nvidia-smi command-line tool and show its output in the cell.
!nvidia-smi

Sat Apr 13 18:19:08 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.60.01              Driver Version: 551.76         CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3080 Ti     On  |   00000000:01:00.0  On |                  N/A |
| 44%   42C    P8             34W /  400W |     699MiB /  12288MiB |     15%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
def walk_through_dir(dir_path):
    """Print a one-line summary for every directory found under ``dir_path``.

    The tree is traversed with ``os.walk``; for each visited directory the
    number of immediate sub-directories and the number of files it contains
    are printed. Every file is counted and reported as an "image", whatever
    its type.

    Args:
        dir_path: Root directory to traverse.

    Returns:
        None. The summary is written to standard output.
    """
    for walk_entry in os.walk(dir_path):
        # os.walk yields (current directory, sub-directory names, file names).
        dirpath, dirnames, filesnames = walk_entry
        print(f"There are {len(dirnames)} directories and {len(filesnames)} images in '{dirpath}'")

# Print the directory/file counts for the whole dataset tree rooted at art_image_dir.
walk_through_dir(art_image_dir)

There are 2 directories and 0 images in '../../../datasets/art'
There are 0 directories and 534 images in '../../../datasets/art/AiArtData'
There are 0 directories and 431 images in '../../../datasets/art/RealArt'


### Create a new folder for holding all images and split into training, test, and validation