In [8]:
import os
import numpy as np
import pandas as pd
from PIL import Image

def preprocess_and_save_images_pil(csv_path, raw_image_folder, processed_folder, image_size=(224, 224)):
    """Resize every image referenced by a CSV and cache it as a float32 ``.npy`` array.

    The CSV at ``csv_path`` is read with pandas and must contain an ``image``
    column of filenames. Each distinct, non-null filename is looked up in
    ``raw_image_folder``, opened with PIL, converted to RGB, resized to
    ``image_size``, scaled from [0, 255] to [0, 1] and written to
    ``processed_folder`` under the same base name with a ``.npy`` extension.

    Missing files and per-image failures are reported on stdout and skipped so
    that one bad image does not abort the whole batch. The final summary line
    reports the number of arrays actually saved; a second line is printed only
    when some referenced images were skipped.

    Args:
        csv_path: Path to the CSV file listing the images to process.
        raw_image_folder: Directory containing the original image files.
        processed_folder: Output directory; created if it does not exist.
        image_size: Target size passed to PIL's ``Image.resize``
            (``(width, height)``).

    Returns:
        None. Results are written to disk and progress is printed.
    """
    os.makedirs(processed_folder, exist_ok=True)

    df = pd.read_csv(csv_path)
    used_filenames = df['image'].dropna().unique()

    saved_count = 0
    missing_count = 0
    failed_count = 0

    for filename in used_filenames:
        raw_path = os.path.join(raw_image_folder, filename)
        # The extension is dropped, so two names that differ only by extension
        # (e.g. "x.JPG" and "x.jpg") map to the same output file.
        save_name = os.path.splitext(filename)[0] + ".npy"
        save_path = os.path.join(processed_folder, save_name)

        if not os.path.exists(raw_path):
            print(f"⚠️ Missing image: {raw_path}")
            missing_count += 1
            continue

        try:
            # The context manager releases the file handle as soon as the
            # pixel data has been decoded by convert().
            with Image.open(raw_path) as raw_img:
                img = raw_img.convert("RGB").resize(image_size)

            # Convert to NumPy and normalize to [0, 1]
            img_array = np.array(img).astype("float32") / 255.0

            # Save as .npy
            np.save(save_path, img_array)

        except Exception as e:
            # Best-effort batch: report the failure and keep going.
            print(f"❌ Error processing {filename}: {e}")
            failed_count += 1
        else:
            saved_count += 1

    # Report what was really written, not how many names the CSV referenced.
    print(f"✅ Finished preprocessing {saved_count} images into: {processed_folder}")
    if missing_count or failed_count:
        print(
            f"⚠️ Skipped {missing_count} missing and {failed_count} failed "
            f"of {len(used_filenames)} referenced images"
        )


In [12]:
# Run the preprocessing once per dataset. The matched CSV, the raw image
# folder and the output folder all follow one naming scheme keyed by the
# dataset tag (the raw folder uses the tag without its underscore).
repo_root = "/Users/Shai/OneDrive/Documents/THESIS/thesis-airq"
raw_images_root = "/Users/Shai/OneDrive/Desktop/THESIS_data/images"

for dataset_tag in ("7_24", "10_19", "11_10"):
    preprocess_and_save_images_pil(
        csv_path=f"{repo_root}/data_matching/matched/{dataset_tag}_matched.csv",
        raw_image_folder=f"{raw_images_root}/{dataset_tag.replace('_', '')} images/",
        processed_folder=f"{repo_root}/preprocessing_images/processed_images/{dataset_tag}/",
        image_size=(224, 224),
    )


✅ Finished preprocessing 36 images into: /Users/Shai/OneDrive/Documents/THESIS/thesis-airq/preprocessing_images/processed_images/7_24/
✅ Finished preprocessing 37 images into: /Users/Shai/OneDrive/Documents/THESIS/thesis-airq/preprocessing_images/processed_images/10_19/
✅ Finished preprocessing 33 images into: /Users/Shai/OneDrive/Documents/THESIS/thesis-airq/preprocessing_images/processed_images/11_10/


In [13]:
# Sanity check for the 10_19 dataset: how many distinct images the matched CSV
# references, and how many rows point at each one.
matched_1019_csv = "/Users/Shai/OneDrive/Documents/THESIS/thesis-airq/data_matching/matched/10_19_matched.csv"
df = pd.read_csv(matched_1019_csv)
image_column = df["image"]
print(image_column.nunique())
print(image_column.value_counts())


37
image
2019-10-19 104440.JPG    28511
2019-10-19 111435.JPG    22284
2019-10-19 104604.JPG    13514
2019-10-19 122032.jpg    12249
2019-10-19 124454.jpg    10916
2019-10-19 115942.JPG    10572
2019-10-19 130733.jpg     8602
2019-10-19 132416.jpg     8219
2019-10-19 134327.jpg     5622
2019-10-19 140059.jpg     1029
2019-10-19 111323.JPG      521
2019-10-19 111233.JPG      270
2019-10-19 104534.JPG      259
2019-10-19 104445.JPG      217
2019-10-19 104511.JPG      197
2019-10-19 111424.JPG       95
2019-10-19 115933.jpg       81
2019-10-19 111305.JPG       78
2019-10-19 111314.JPG       77
2019-10-19 115923.jpg       63
2019-10-19 124448.jpg       53
2019-10-19 134320.jpg       41
2019-10-19 130729.jpg       32
2019-10-19 115930.jpg       27
2019-10-19 122029.jpg       26
2019-10-19 122026.jpg       26
2019-10-19 124443.jpg       26
2019-10-19 140050.jpg       26
2019-10-19 132413.jpg       25
2019-10-19 130723.jpg       25
2019-10-19 132410.jpg       25
2019-10-19 130726.jpg       25