In [None]:
import glob
import re
import sqlite3
import time
import traceback
from pathlib import Path

import cv2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial import distance
from sklearn.model_selection import train_test_split

In [None]:
# --- Input tables -----------------------------------------------------------
HART16 = "data/gz2_hart16.csv"
FILENAME_MAPPINGS = "data/gz2_filename_mapping.csv"

# --- Image directories --------------------------------------------------------
# Every directory keeps its trailing "/" because the notebook builds file paths
# by plain string concatenation / f-strings.
ORIGINAL_IMAGES_DIR = "data/images/"
PROCESSED_IMAGES_DIR = "data/images_processed/"
TRAIN_IMAGES_DIR = "data/train_images/"
TEST_IMAGES_DIR = "data/test_images/"

# --- Reproducibility ----------------------------------------------------------
# Seed handed to train_test_split.
RANDOM_STATE = 32

In [None]:
# Column -> dtype mapping for the Hart16 table. The nine leading metadata
# columns are spelled out; the per-answer statistic columns are generated
# below because they all follow the pattern "<task>_<answer>_<statistic>".
dtype_hart = {
    "dr7objid": "Int64",
    "ra": float,
    "dec": float,
    "rastring": str,
    "decstring": str,
    "sample": str,
    "gz2_class": str,
    "total_classifications": int,
    "total_votes": int,
}

# (task prefix, answers of that task), in the order the columns are declared.
_HART_TASK_ANSWERS = (
    ("t01_smooth_or_features", ("a01_smooth", "a02_features_or_disk", "a03_star_or_artifact")),
    ("t02_edgeon", ("a04_yes", "a05_no")),
    ("t03_bar", ("a06_bar", "a07_no_bar")),
    ("t04_spiral", ("a08_spiral", "a09_no_spiral")),
    ("t05_bulge_prominence", ("a10_no_bulge", "a11_just_noticeable", "a12_obvious", "a13_dominant")),
    ("t06_odd", ("a14_yes", "a15_no")),
    ("t07_rounded", ("a16_completely_round", "a17_in_between", "a18_cigar_shaped")),
    ("t08_odd_feature", ("a19_ring", "a20_lens_or_arc", "a21_disturbed", "a22_irregular",
                         "a23_other", "a24_merger", "a38_dust_lane")),
    ("t09_bulge_shape", ("a25_rounded", "a26_boxy", "a27_no_bulge")),
    ("t10_arms_winding", ("a28_tight", "a29_medium", "a30_loose")),
    ("t11_arms_number", ("a31_1", "a32_2", "a33_3", "a34_4", "a36_more_than_4", "a37_cant_tell")),
)

# Statistic suffix -> dtype; every answer carries these six columns in this order.
_HART_STAT_DTYPES = (
    ("count", int),
    ("weight", float),
    ("fraction", float),
    ("weighted_fraction", float),
    ("debiased", float),
    ("flag", int),
)

for _task, _answers in _HART_TASK_ANSWERS:
    for _answer in _answers:
        for _stat, _stat_dtype in _HART_STAT_DTYPES:
            dtype_hart[f"{_task}_{_answer}_{_stat}"] = _stat_dtype

# Columns actually loaded from the CSV: four identifying columns followed by
# every "*_debiased" column (later cells rely on the debiased columns starting
# at index 4).
hart_keep = ["dr7objid", "ra", "dec", "gz2_class"] + [
    name for name in dtype_hart if name.endswith("_debiased")
]

In [None]:
# Load the objid <-> asset_id (image file name) mapping table.
_mappings_dtypes = {"objid": "Int64", "sample": str, "asset_id": int}
df_mappings = pd.read_csv(FILENAME_MAPPINGS, header=0, dtype=_mappings_dtypes)
df_mappings.info()

In [None]:
# Load only the columns listed in hart_keep from the Hart16 table.
_hart_read_options = dict(header=0, usecols=hart_keep, dtype=dtype_hart)
df_hart16 = pd.read_csv(HART16, **_hart_read_options)
df_hart16.info()

In [None]:
# How many mapping rows repeat an objid already seen (True) vs. not (False).
df_mappings.duplicated(subset="objid").value_counts()

In [None]:
# How many Hart16 rows repeat a dr7objid already seen (True) vs. not (False).
df_hart16.duplicated(subset="dr7objid").value_counts()

In [None]:
# Collect every row whose objid occurs more than once and report the asset_id
# range those rows span.
_dupe_mask = df_mappings.duplicated("objid", keep=False)
_dupe_df_mappings = (
    df_mappings[_dupe_mask]
    .sort_values("asset_id")
    .reset_index(drop=True)
)
_dupe_asset_ids = _dupe_df_mappings['asset_id']
_dupe_id_first = _dupe_asset_ids.iloc[0]
_dupe_id_last = _dupe_asset_ids.iloc[-1]
print(f"asset_id in objid duplicates: First: {_dupe_id_first}, Last: {_dupe_id_last}, diff:{_dupe_id_last-_dupe_id_first}, count:{_dupe_df_mappings.shape[0]}")

In [None]:
# Keep only rows below this asset_id.
# NOTE(review): presumably chosen from the duplicate asset_id range printed by
# the previous cell -- confirm against that output.
_asset_id_cutoff = 295306
df_mappings_clean = df_mappings.loc[df_mappings["asset_id"].lt(_asset_id_cutoff)]
df_mappings_clean.duplicated(subset="objid").value_counts()

In [None]:
# Inner-join the mapping table with the Hart16 table on the object id;
# validate="one_to_one" makes pandas raise if either side has duplicate keys.
df_joined = df_mappings_clean.merge(
    df_hart16,
    how="inner",
    left_on="objid",
    right_on="dr7objid",
    sort=False,
    validate="one_to_one",
)
print(df_joined.shape)

# Count, then discard, the rows that contain at least one missing value.
_rows_with_nulls = df_joined.isnull().any(axis=1).sum()
print(_rows_with_nulls)
df_joined = df_joined.dropna()
print(df_joined.shape)
df_joined.head()

In [None]:
image_files = glob.glob(ORIGINAL_IMAGES_DIR + "*.jpg")
print(len(image_files))

# The asset id is the numeric file stem ("data/images/1541.jpg" -> 1541).
# Path(...).stem is used instead of a regex on "images/<digits>.jpg" so the
# parsing does not depend on the path separator glob returns, and files whose
# stem is not a plain number are skipped instead of crashing the cell.
image_names = sorted(
    int(stem)
    for stem in (Path(img).stem for img in image_files)
    if stem.isdecimal()
)
image_names[0:10]

In [None]:
table_asset = df_joined["asset_id"].to_list()

# Build both id sets once and diff them in each direction.
_table_ids = set(table_asset)
_image_ids = set(image_names)

# asset ids present in the joined table but without an image file
not_in_images = list(_table_ids - _image_ids)
print(len(not_in_images))

# image files without a row in the joined table
not_in_table = list(_image_ids - _table_ids)
print(len(not_in_table))

In [None]:
# Drop table rows that have no image file. The explicit .copy() makes
# df_joined_clean an independent frame: later cells add/overwrite the
# "class_reduced" column on it, which on a bare boolean-mask slice triggers
# pandas' SettingWithCopyWarning.
df_joined_clean = df_joined[~df_joined["asset_id"].isin(not_in_images)].copy()
df_joined_clean.shape

In [None]:
# Frequency of every gz2_class label in the Hart16 table.
gz2class_vc = df_hart16["gz2_class"].value_counts()
print("Total classes:", len(gz2class_vc))

# Labels that occur fewer than 12 times.
_is_very_rare = gz2class_vc.lt(12)
very_rare_classes = gz2class_vc[_is_very_rare]
very_rare_classes

In [None]:
# One character out of "1234+?" immediately followed by one out of "tml".
regex_spiral_arm_and_winding = re.compile(r"([1234+?][tml])")


def convert_rare_class(original_class_name: str) -> str:
    """Shorten a class name by removing the two characters before its 3-character tail.

    The name is returned unchanged when it does not end with ")", when it
    starts with "E", or when ``regex_spiral_arm_and_winding`` matches nowhere
    in it. Otherwise the last three characters (e.g. "(r)") are kept and the
    two characters directly in front of them are dropped, so "Sc2m(r)"
    becomes "Sc(r)".

    Args:
        original_class_name: Class label to convert.

    Returns:
        The shortened label, or the input itself when no rule applies.
    """
    ends_with_paren = original_class_name.endswith(")")
    starts_with_e = original_class_name.startswith("E")
    if starts_with_e or not ends_with_paren:
        return original_class_name

    if regex_spiral_arm_and_winding.search(original_class_name) is None:
        return original_class_name

    # head = everything except the last five characters, tail = the last three
    head, tail = original_class_name[:-5], original_class_name[-3:]
    return head + tail

# Original label -> converted label for every very rare class.
very_rare_class_mapping = {
    class_name: convert_rare_class(class_name)
    for class_name in very_rare_classes.index
}

# Number of very rare labels before vs. distinct labels after the conversion.
print(len(very_rare_classes.index))
print(len(set(very_rare_class_mapping.values())))

In [None]:
# New column: gz2_class with the very rare labels replaced by their converted form.
_reduced_labels = df_joined_clean["gz2_class"].replace(very_rare_class_mapping)
df_joined_clean["class_reduced"] = _reduced_labels
print(df_joined_clean["class_reduced"].nunique())

In [None]:
# Labels that still occur at most 3 times are pooled into one "SuperRare" label.
vc = df_joined_clean["class_reduced"].value_counts()
vc_super_rare = vc[vc.le(3)]
super_rare_class_mapping = dict.fromkeys(vc_super_rare.index, "SuperRare")

_pooled_labels = df_joined_clean["class_reduced"].replace(super_rare_class_mapping)
df_joined_clean["class_reduced"] = _pooled_labels
print(df_joined_clean["class_reduced"].nunique())

In [None]:
# Final size of the cleaned table, followed by its first rows.
_n_rows, _n_columns = df_joined_clean.shape
print((_n_rows, _n_columns))
df_joined_clean.head(n=5)

In [None]:
# Persist the cleaned table to SQLite, replacing any previous "galaxy_data"
# table. try/finally guarantees the connection is closed even when to_sql fails.
connection = sqlite3.connect("data/galaxy_data.sqlite")
try:
    df_joined_clean.to_sql("galaxy_data", connection, index=False, if_exists="replace")
finally:
    connection.close()

In [None]:
# Per debiased column: number of rows whose value exceeds 0.5.
_debiased_columns = hart_keep[4:]
labels = [column.replace("_debiased", "") for column in _debiased_columns]
feature_distribution = df_joined_clean[_debiased_columns].gt(0.5).sum()
display(feature_distribution)

# Bar chart of the same counts, with the "_debiased" suffix stripped from the tick labels.
ax = feature_distribution.plot.bar(figsize=(16, 6))
ax.set_xticklabels(labels)
plt.show()

In [None]:
MIN_CONTOUR_AREA = 100  # contours with a smaller area are ignored
# Reference point used to pick the "central" contour.
# NOTE(review): presumably the centre of 424x424 source images -- confirm.
IMAGE_CENTER = (212, 212)
# Start/end coordinate (used for both x and y) of the 106px and 212px squares
# centred on IMAGE_CENTER[0].
RECT_106_START = IMAGE_CENTER[0] - 106//2
RECT_212_START = IMAGE_CENTER[0] - 212//2
RECT_106_END = RECT_106_START + 106
RECT_212_END = RECT_212_START + 212
TARGET_SIZE = (106, 106)  # size of every image produced by process_image


def process_image(image_name: str, save_dir: str = None, visualize: bool = False):
    """Reduce one original image to a TARGET_SIZE grayscale image.

    The image ``{ORIGINAL_IMAGES_DIR}{image_name}.jpg`` is converted to
    grayscale, blurred, thresholded and dilated. Among the external contours
    of that mask with an area of at least MIN_CONTOUR_AREA, the one whose
    centroid is closest to IMAGE_CENTER is selected, and its bounding box
    decides how the grayscale image is reduced:

    * box inside the central 106x106 square -> that square is cropped as is;
    * box inside the central 212x212 square -> that square is cropped and
      resized to TARGET_SIZE;
    * otherwise, or when no usable contour exists -> the whole frame is
      resized to TARGET_SIZE.

    Args:
        image_name: File stem of the image; it is only interpolated into the
            path, so an int asset id works as well as a string.
        save_dir: Directory prefix (with trailing separator) the result is
            written to as ``{save_dir}{image_name}.png``; ``None`` skips saving.
        visualize: When true, prints which square was used and shows a 2x4
            figure with the intermediate processing steps.

    Returns:
        None.

    Raises:
        FileNotFoundError: The source image could not be read.
        OSError: The result could not be written to ``save_dir``.
    """
    image_path = f"{ORIGINAL_IMAGES_DIR}{image_name}.jpg"
    image_orig = cv2.imread(image_path)
    if image_orig is None:
        # cv2.imread reports failure by returning None instead of raising.
        raise FileNotFoundError(f"Could not read image `{image_path}`")

    gray = cv2.cvtColor(image_orig, cv2.COLOR_BGR2GRAY)
    blurred = cv2.GaussianBlur(gray, (5, 5), 0)
    _, thresh = cv2.threshold(blurred, 30, 255, cv2.THRESH_BINARY)
    element = cv2.getStructuringElement(cv2.MORPH_RECT, ksize=(5, 5))
    dilated = cv2.dilate(thresh, element, iterations=1)
    contours, _ = cv2.findContours(dilated.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # (contour, distance of its centroid to IMAGE_CENTER) for every contour
    # that is large enough and has a non-zero area moment.
    candidates = []
    for contour in contours:
        if cv2.contourArea(contour) < MIN_CONTOUR_AREA:
            continue
        M = cv2.moments(contour)
        if M["m00"] == 0:
            continue
        centroid = (int(M["m10"] / M["m00"]), int(M["m01"] / M["m00"]))
        candidates.append((contour, distance.euclidean(IMAGE_CENTER, centroid)))

    # An image without any usable contour falls through to the whole-frame
    # resize below instead of failing on min() of an empty list.
    closest_contour = min(candidates, key=lambda item: item[1])[0] if candidates else None

    bounding_rect = None
    in_106_rect = False
    in_212_rect = False
    if closest_contour is not None:
        bounding_rect = cv2.boundingRect(closest_contour)
        br_x, br_y, br_w, br_h = bounding_rect

        def fits_inside(start, end):
            # True when the bounding box lies entirely within [start, end] on both axes.
            return br_x >= start and br_y >= start and br_x + br_w <= end and br_y + br_h <= end

        in_106_rect = fits_inside(RECT_106_START, RECT_106_END)
        in_212_rect = not in_106_rect and fits_inside(RECT_212_START, RECT_212_END)

    if in_106_rect:
        final_image = gray[RECT_106_START:RECT_106_END, RECT_106_START:RECT_106_END]
    elif in_212_rect:
        cropped_image = gray[RECT_212_START:RECT_212_END, RECT_212_START:RECT_212_END]
        final_image = cv2.resize(cropped_image, TARGET_SIZE)
    else:
        final_image = cv2.resize(gray, TARGET_SIZE)

    if save_dir is not None:
        output_path = f"{save_dir}{image_name}.png"
        # cv2.imwrite returns False (no exception) when it cannot write,
        # e.g. because the target directory does not exist.
        if not cv2.imwrite(output_path, final_image):
            raise OSError(f"Could not write processed image `{output_path}`")

    if visualize:
        # Panel: all accepted contours (green) and the selected one (red).
        img_contour = image_orig.copy()
        accepted_contours = [candidate[0] for candidate in candidates]
        if accepted_contours:
            cv2.drawContours(img_contour, accepted_contours, -1, (0, 255, 0), 1)

        # Panel: selected contour with its bounding box.
        img_bounding_rect = image_orig.copy()
        # Panel: bounding box (red) against the 106 (green) and 212 (yellow) squares.
        img_targets = image_orig.copy()
        if closest_contour is not None:
            br_x, br_y, br_w, br_h = bounding_rect
            box_start, box_end = (br_x, br_y), (br_x + br_w, br_y + br_h)
            cv2.drawContours(img_contour, [closest_contour], -1, (0, 0, 255), 2)
            cv2.drawContours(img_bounding_rect, [closest_contour], -1, (0, 0, 255), 2)
            cv2.rectangle(img_bounding_rect, box_start, box_end, (0, 0, 255), 2)
            cv2.rectangle(img_targets, box_start, box_end, (0, 0, 255), 2)
        cv2.rectangle(img_targets, (RECT_106_START, RECT_106_START), (RECT_106_END, RECT_106_END), (0, 255, 0), 2)
        cv2.rectangle(img_targets, (RECT_212_START, RECT_212_START), (RECT_212_END, RECT_212_END), (0, 255, 255), 2)

        print("In 106:", in_106_rect, "In 212", in_212_rect)
        # plt.subplots creates its own figure; no separate plt.figure() is
        # needed (it would only leave an extra empty figure behind).
        f, ax = plt.subplots(2, 4, figsize=(13, 6.2))
        ax[0][0].imshow(cv2.cvtColor(image_orig, cv2.COLOR_BGR2RGB))
        ax[0][1].imshow(blurred, cmap='gray', vmin=0, vmax=255)
        ax[0][2].imshow(thresh, cmap='gray', vmin=0, vmax=255)
        ax[0][3].imshow(dilated, cmap='gray', vmin=0, vmax=255)
        ax[1][0].imshow(cv2.cvtColor(img_contour, cv2.COLOR_BGR2RGB))
        ax[1][1].imshow(cv2.cvtColor(img_bounding_rect, cv2.COLOR_BGR2RGB))
        ax[1][2].imshow(cv2.cvtColor(img_targets, cv2.COLOR_BGR2RGB))
        ax[1][3].imshow(final_image, cmap='gray', vmin=0, vmax=255)

        ax[0][0].set_title("Orijinal")
        ax[0][1].set_title("Gaussian Blur")
        ax[0][2].set_title("Eşikleme")
        ax[0][3].set_title("Genleşme")
        ax[1][0].set_title("Kontur")
        ax[1][1].set_title("Sınırlayıcı Kutu")
        ax[1][2].set_title("Hedefi Kırp/Ölçekleme")
        ax[1][3].set_title("Son Görüntü")
        plt.show()

In [None]:
# Visual walk-through of the processing steps for two sample images (nothing is saved).
for _demo_image in ("1541", "4002"):
    process_image(_demo_image, save_dir=None, visualize=True)

In [None]:
# Safety guard: refuse to run when any output directory already holds PNGs, so
# an accidental re-run cannot mix old and new results.
_dirs_with_output = [
    directory
    for directory in (PROCESSED_IMAGES_DIR, TRAIN_IMAGES_DIR, TEST_IMAGES_DIR)
    if glob.glob(directory + "*.png")
]
if _dirs_with_output:
    raise RuntimeError(
        f"Processed images already exist in {_dirs_with_output}; "
        "remove them before running this cell again."
    )

# cv2.imwrite does not create missing directories, so make sure the target exists.
Path(PROCESSED_IMAGES_DIR).mkdir(parents=True, exist_ok=True)

start_process_time = time.time()

# Set membership keeps the per-image skip test O(1); not_in_table is a list.
_images_to_skip = set(not_in_table)

current_image = 0
for image in image_names:
    if image in _images_to_skip:
        continue
    try:
        process_image(image, save_dir=PROCESSED_IMAGES_DIR, visualize=False)
    except Exception:
        # Stop at the first failure so it can be inspected before continuing.
        print(f"❌ Image `{image}` failed to process (current_image int= `{current_image}`)")
        traceback.print_exc()
        break
    current_image += 1
    if current_image % 10_000 == 0:
        print(f"  Processed {current_image:,} image files")


_hr, _remainder = divmod(time.time() - start_process_time, 3600)
_min, _sec = divmod(_remainder, 60)
print(f"--- Time Taken: {int(_hr):02d}:{int(_min):02d}:{int(_sec):02d} ---")

In [None]:
# Count the PNG files currently sitting in the processed-images directory.
processed_image_files = glob.glob(f"{PROCESSED_IMAGES_DIR}*.png")
print(len(processed_image_files))

In [None]:
# Safety guards: the split only makes sense when processed images exist and
# nothing has been moved into the train/test directories yet.
if not glob.glob(PROCESSED_IMAGES_DIR + "*.png"):
    raise RuntimeError(f"No processed images found in `{PROCESSED_IMAGES_DIR}`; run the processing cell first.")
if glob.glob(TRAIN_IMAGES_DIR + "*.png") or glob.glob(TEST_IMAGES_DIR + "*.png"):
    raise RuntimeError("Train/test directories already contain images; the split appears to have been done already.")

stratify_data = df_joined_clean["class_reduced"].values
x_image_id_names = df_joined_clean["asset_id"]
print("asset_id column is sorted:", x_image_id_names.is_monotonic_increasing)

# Verify every expected file exists BEFORE moving anything, so a partially
# processed set cannot leave the images scattered over three directories.
_missing_images = [
    asset_id
    for asset_id in x_image_id_names
    if not Path(f"{PROCESSED_IMAGES_DIR}{asset_id}.png").is_file()
]
if _missing_images:
    raise FileNotFoundError(
        f"{len(_missing_images)} processed image(s) are missing, e.g. asset_id {_missing_images[:5]}"
    )

# Stratified split of the asset ids. train_test_split accepts a single array,
# and the chosen indices depend only on the sample count, `stratify` and
# `random_state`, so no placeholder target array is needed.
X_train_assets, X_test_assets = train_test_split(x_image_id_names,
                                                 random_state=RANDOM_STATE,
                                                 stratify=stratify_data)

# Move each processed image into its split directory (created on demand).
for _target_dir, _assets in ((TRAIN_IMAGES_DIR, X_train_assets), (TEST_IMAGES_DIR, X_test_assets)):
    Path(_target_dir).mkdir(parents=True, exist_ok=True)
    for image_name in _assets:
        Path(f"{PROCESSED_IMAGES_DIR}{image_name}.png").rename(f"{_target_dir}{image_name}.png")