In [2]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import PIL

In [2]:
mapping_file_path = "data/CelebAMask-HQ/CelebA-HQ-Identity.csv"

# load csv
id_map = pd.read_csv(mapping_file_path)
id_map.head(10)

Unnamed: 0,image_id,orig_file,identity
0,0.jpg,119614.jpg,7423
1,1.jpg,099095.jpg,7319
2,2.jpg,200122.jpg,6632
3,3.jpg,081060.jpg,3338
4,4.jpg,202041.jpg,9178
5,5.jpg,000615.jpg,6461
6,6.jpg,050916.jpg,1725
7,7.jpg,166546.jpg,774
8,8.jpg,143862.jpg,5866
9,9.jpg,101742.jpg,7556


In [61]:
id_map['orig_file']

0        119614.jpg
1        099095.jpg
2        200122.jpg
3        081060.jpg
4        202041.jpg
            ...    
29995    052546.jpg
29996    086632.jpg
29997    170416.jpg
29998    074714.jpg
29999    116055.jpg
Name: orig_file, Length: 30000, dtype: object

# CelebA-RAW

In [3]:
# Path to the attribute file
file_path = 'data/CelebA-RAW/text_data/CelebA_Attributes.txt'

# Read the lines
with open(file_path, 'r') as f:
    lines = f.readlines()

# Skip the first line (number of entries)
# Second line is the header (attribute names)
# Remaining lines are data
attribute_names = lines[1].strip().split()
data = [line.strip().split() for line in lines[2:]]

# Create DataFrame
CelebA_Attribute_df = pd.DataFrame(data, columns=['image_id'] + attribute_names)
for attr in attribute_names:
    CelebA_Attribute_df[attr] = CelebA_Attribute_df[attr].astype('Int16')

CelebA_Attribute_df

Unnamed: 0,image_id,5_o_Clock_Shadow,Arched_Eyebrows,Attractive,Bags_Under_Eyes,Bald,Bangs,Big_Lips,Big_Nose,Black_Hair,...,Sideburns,Smiling,Straight_Hair,Wavy_Hair,Wearing_Earrings,Wearing_Hat,Wearing_Lipstick,Wearing_Necklace,Wearing_Necktie,Young
0,000001.jpg,-1,1,1,-1,-1,-1,-1,-1,-1,...,-1,1,1,-1,1,-1,1,-1,-1,1
1,000002.jpg,-1,-1,-1,1,-1,-1,-1,1,-1,...,-1,1,-1,-1,-1,-1,-1,-1,-1,1
2,000003.jpg,-1,-1,-1,-1,-1,-1,1,-1,-1,...,-1,-1,-1,1,-1,-1,-1,-1,-1,1
3,000004.jpg,-1,-1,1,-1,-1,-1,-1,-1,-1,...,-1,-1,1,-1,1,-1,1,1,-1,1
4,000005.jpg,-1,1,1,-1,-1,-1,1,-1,-1,...,-1,-1,-1,-1,-1,-1,1,-1,-1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
202594,202595.jpg,-1,-1,1,-1,-1,-1,1,-1,-1,...,-1,-1,-1,-1,-1,-1,1,-1,-1,1
202595,202596.jpg,-1,-1,-1,-1,-1,1,1,-1,-1,...,-1,1,1,-1,-1,-1,-1,-1,-1,1
202596,202597.jpg,-1,-1,-1,-1,-1,-1,-1,-1,1,...,-1,1,-1,-1,-1,-1,-1,-1,-1,1
202597,202598.jpg,-1,1,1,-1,-1,-1,1,-1,1,...,-1,1,-1,1,1,-1,1,-1,-1,1


In [None]:
# save dataframe
CelebA_Attribute_df.to_csv('data/CelebA-RAW/CelebA-ATRRIBUTES-CSV.csv', index=False)

In [None]:
# Directory containing the images
image_dir = 'data/CelebA-RAW/images'

# List to store image metadata
image_data = []
valid_extensions = ('.jpg', '.jpeg', '.png')

# Iterate through each image in the directory
for filename in os.listdir(image_dir):
    if filename.lower().endswith(valid_extensions):
        image_id = os.path.splitext(filename)[0]
        image_path = os.path.join(image_dir, filename)
        
        # Open the image to get dimensions
        with PIL.Image.open(image_path) as img:
            width, height = img.size
        
        # Append the metadata to the list
        image_data.append({
            'image_id': image_id,
            'width': width,
            'height': height
        })

# Create a DataFrame
CelebA_res_df = pd.DataFrame(image_data)
CelebA_res_df.to_csv("data/CelebA-RAW/CelebA-ALL-RES.csv", index=False)

CelebA_res_df

Unnamed: 0,image_id,width,height
0,000001,409,687
1,000002,423,594
2,000003,500,281
3,000004,1648,2464
4,000005,610,826
...,...,...,...
202594,202595,1920,1440
202595,202596,400,620
202596,202597,228,285
202597,202598,535,373


In [None]:
df = CelebA_res_df.copy()

# Define resolution conditions
high_res_condition = (df['width'] >= 1024) & (df['height'] >= 1024)
mid_res_condition = (df['width'] >= 512) & (df['height'] >= 512) & ~high_res_condition
low_res_condition = ~high_res_condition & ~mid_res_condition


# Create the new columns
df['high_res'] = high_res_condition.astype(int)
df['mid_res'] = mid_res_condition.astype(int)
df['low_res'] = low_res_condition.astype(int)

df

Unnamed: 0,image_id,width,height,high_res,mid_res,low_res
0,000001,409,687,0,0,1
1,000002,423,594,0,0,1
2,000003,500,281,0,0,1
3,000004,1648,2464,1,0,0
4,000005,610,826,0,1,0
...,...,...,...,...,...,...
202594,202595,1920,1440,1,0,0
202595,202596,400,620,0,0,1
202596,202597,228,285,0,0,1
202597,202598,535,373,0,0,1


In [48]:
# DataFrame for high-resolution images
high_res_df = df[df['high_res'] == 1][['image_id', 'width', 'height']]

# DataFrame for mid-resolution images
mid_res_df = df[df['mid_res'] == 1][['image_id', 'width', 'height']]

# DataFrame for low-resolution images
low_res_df = df[df['low_res'] == 1][['image_id', 'width', 'height']]

if any(high_res_df['width']<1024) or any(mid_res_df['width']<512):
    print("Warning: Some images are not at the expected resolution.")
    if len(high_res_df) + len(mid_res_df) + len(low_res_df) != len(df):
        print("Warning: Some images are missing resolution information.")
else: print("All images are at the expected resolution.")

All images are at the expected resolution.


In [49]:
high_res_path = 'data/CelebA-RAW/CelebA-HIGH-RES.csv'
mid_res_path = 'data/CelebA-RAW/CelebA-MID-RES.csv'
low_res_path = 'data/CelebA-RAW/CelebA-LOW-RES.csv'

# save dataframe
high_res_df.to_csv(high_res_path, index=False)
mid_res_df.to_csv(mid_res_path, index=False)
low_res_df.to_csv(low_res_path, index=False)

In [50]:
high_res_df

Unnamed: 0,image_id,width,height
3,000004,1648,2464
8,000009,1333,2000
23,000024,1600,1200
43,000044,2436,2661
53,000054,1800,1800
...,...,...,...
202563,202564,1200,1807
202571,202572,1024,1535
202581,202582,1280,1916
202590,202591,2164,3000


In [51]:
mid_res_df

Unnamed: 0,image_id,width,height
4,000005,610,826
7,000008,637,896
18,000019,570,751
24,000025,737,1024
29,000030,1000,1299
...,...,...,...
202536,202537,630,790
202556,202557,586,572
202579,202580,825,750
202591,202592,968,1290


In [52]:
low_res_df

Unnamed: 0,image_id,width,height
0,000001,409,687
1,000002,423,594
2,000003,500,281
5,000006,410,594
6,000007,334,500
...,...,...,...
202592,202593,373,500
202593,202594,410,595
202595,202596,400,620
202596,202597,228,285


In [53]:
mid_res_df[(mid_res_df['width'] > 1024) | (mid_res_df['height'] > 1024)] # may fall in high res

Unnamed: 0,image_id,width,height
29,000030,1000,1299
99,000100,859,1124
120,000121,900,1350
128,000129,817,1222
149,000150,698,1056
...,...,...,...
202494,202495,800,1100
202502,202503,850,1183
202514,202515,936,1163
202523,202524,769,1153


# Copy Images to Specific Folder

In [None]:
import os
import shutil

# Function to copy images
def copy_images(df,source_dir, dest_folder):
    print("processing...")
    for image_id in df['image_id']:
        # Find the actual file in source directory (could be .jpg, .jpeg, .png)
        for ext in ['.jpg', '.jpeg', '.png']:
            src_file = os.path.join(source_dir, image_id + ext)
            if os.path.exists(src_file):
                shutil.copy2(src_file, os.path.join(dest_folder, os.path.basename(src_file)))
                break  # Found and copied, move to next image

In [55]:
# Source image directory
source_dir = 'data/CelebA-RAW/images'

# Destination directories
dest_base = 'data/CelebA-RAW/processed'
high_dir = os.path.join(dest_base, 'high_res')
mid_dir = os.path.join(dest_base, 'mid_res')
low_dir = os.path.join(dest_base, 'low_res')

# Create directories if they don't exist
os.makedirs(high_dir, exist_ok=True)
os.makedirs(mid_dir, exist_ok=True)
os.makedirs(low_dir, exist_ok=True)

In [57]:
# Copy images to their respective folders
copy_images(
    df=high_res_df,
    source_dir=source_dir,
    dest_folder=high_dir
)

print("High-resolution images copied to their respective folders.")

High-resolution images copied to their respective folders.


In [60]:
copy_images(
    df=mid_res_df,
    source_dir=source_dir,
    dest_folder=mid_dir
)