In [None]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
import pathlib as Path
import shutil

# First I want to take the CelebA data set and sort it by Hat and No Hat and create a new dataset of sorted images
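### This code does not need to be run and is commented out: the sorted dataset has already been created, and re-running the sort takes significant time because it must process about 200K images
#### Lines starting with a double # are comments; lines starting with a single # are the commented-out code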

In [None]:
## Create directories for test images
## One will hold images with Hat value of 1 and the other with hat value of -1 (No Hat)
## Declare full paths
# test_set_path = '/kaggle/working/test_set'
# hat_set_path = '/kaggle/working/test_set/hat_set'
# no_hat_set_path = '/kaggle/working/test_set/no_hat_set'

## Create Directories using paths
# os.makedirs(test_set_path, exist_ok=True)
# os.makedirs(hat_set_path, exist_ok=True)
# os.makedirs(no_hat_set_path, exist_ok=True)

In [None]:
## Sort Images from celeba dataset into the newly created directories. 
## Will use the CSV to sort based on Has_hat to sort the images into the directories

## Read the CSV that holds the attributes to each image
# attributes = pd.read_csv("/kaggle/input/celeba-dataset/list_attr_celeba.csv")

## Path to the folder that holds all the images
# image_path = '/kaggle/input/celeba-dataset/img_align_celeba/img_align_celeba'

## Create a new dataframe that only has 2 columns, image ID and the hat attribute
# test_images_df = attributes[['image_id', 'Wearing_Hat']]

## Check size of the frame to ensure it looks correct by viewing how many of each attribute are present
# print(test_images_df['Wearing_Hat'].value_counts())

In [None]:

## Iterate through the dataframe using iterrows
# for index, row in test_images_df.iterrows():
    ## image name is set to the ID and wearing hat is set to the value of the hat attribute
    # img_name = row['image_id']
    # wearing_hat = row['Wearing_Hat']

    ## Need the full path so that the images can be sorted to respective folders
    # full_path = os.path.join(image_path, img_name)

    ## If the person is not wearing a hat the image destination is set to the no-hat path
    # if wearing_hat == -1: 
        # destination = os.path.join(no_hat_set_path, img_name)
    ## If the person is wearing a hat path is set to hat path instead
    # else:  
        # destination = os.path.join(hat_set_path, img_name)

    ## Using Shutil the image is then copied into each respective folder
    ## This will allow images to be labeled as they are binary and sorted
    # shutil.copy(full_path, destination)


In [None]:

## Create zip files for both directories
## I did this so that the files could be stored as a new dataset in Kaggle
## This saves time because the images do not need to be re-sorted each time the code is run
# shutil.make_archive('/kaggle/working/test_set/hat_set', 'zip', hat_set_path)
# shutil.make_archive('/kaggle/working/test_set/no_hat_set', 'zip', no_hat_set_path)


# Prepare Test data 

In [None]:
# Collect the test images so a DataFrame can be built from them and their IDs are available
test_image_folder = '/kaggle/input/hat-or-no-hat-that-is-the-question-spring-25/test_set/test_set'

# Gather the file names (not full paths) found in the folder.
# os.listdir returns entries in arbitrary order, so sort them to get the same
# row order in the DataFrame and in the submission on every run.
files = sorted(os.listdir(test_image_folder))

# Once the file names are gathered, remove the extension (.jpg).
# These are the IDs used for the submission; they are in the same order as `files`.
test_image_ids = [os.path.splitext(file)[0] for file in files]



In [None]:
# Build a DataFrame for the test images so they can be passed through a generator:
# 'id' holds each file name and 'path' holds its full path inside the test folder
Test_df = pd.DataFrame(files, columns=['id'])

Test_df['path'] = Test_df['id'].map(
    lambda file_name: os.path.join(test_image_folder, file_name)
)

# Spot-check a single row (position 60) to confirm that the full path was built correctly
path = Test_df['path'].iloc[60]

path

# Preprocess Images

In [None]:
# Create a generator for the test data.
# It is built with default arguments, so no rescaling or augmentation is configured here
# (the model itself contains a Rescaling layer).
test_gen = tf.keras.preprocessing.image.ImageDataGenerator()

In [None]:
# Create training and validation data from the labeled images that are provided

# NOTE: the directory is named test_set, HOWEVER, it actually holds the training images.
# Left this way to avoid having to rename the entire set and re-upload it to Kaggle.
LABELED_IMAGE_DIR = '/kaggle/input/no-hat-and-hat/test_set'


def _load_labeled_subset(subset):
    """Load one side ("training" or "validation") of the 80/20 split of the labeled images.

    Both subsets have to be created with the same seed and validation_split,
    otherwise the two splits can overlap. Creating them through this single
    helper keeps every setting identical except for the subset name.
    """
    return tf.keras.utils.image_dataset_from_directory(
        LABELED_IMAGE_DIR,  # Directory where the images are stored
        labels = "inferred",  # Inferred labels = labels are based on the sub-folder names
        label_mode = "binary",  # Binary as we are dealing with yes or no (hat/no hat)
        batch_size = 32,  # Set the batch size
        image_size = (224, 224),  # Resize the images
        shuffle = True,  # Shuffle data to ensure randomness
        seed = 42,  # Seed so the shuffle and the split can be replicated
        validation_split = .2,  # Hold out 20% to test how well the model learns
        subset = subset,  # Which side of the split to return
    )


training_data = _load_labeled_subset("training")

val_data = _load_labeled_subset("validation")

# Stream the unlabeled test images from the paths stored in Test_df.
# class_mode=None yields images only, and shuffle=False keeps the predictions
# in the same row order as Test_df.
test_data = test_gen.flow_from_dataframe(
    dataframe= Test_df,
    x_col ='path',
    target_size = (224, 224),
    color_mode="rgb",
    class_mode = None, 
    batch_size = 32,
    shuffle = False,
    # flow_from_dataframe resizes with 'nearest' by default, while
    # image_dataset_from_directory (used for training/validation) resizes with
    # 'bilinear'. Use the same method so test images are resized like training images.
    interpolation = 'bilinear',
)

In [None]:
import matplotlib.pyplot as plt


# Visualize a few training images together with their labels to ensure that
# Hats and No Hats are labeled correctly.
# take(2) pulls the first two batches from the training data
for batch_images, batch_labels in training_data.take(2):
    # Only show the first 4 images of each batch, one figure per image
    for idx in range(4):
        plt.figure(figsize=(4,4))
        # Convert the image tensor to a NumPy uint8 array so it can be displayed
        pixels = batch_images[idx].numpy().astype("uint8")
        plt.imshow(pixels)
        # Title each image with its respective label
        plt.title(batch_labels[idx].numpy())
        plt.show()

## Hats are labeled as 0 and No Hats are labeled as 1

# Now we create the model 

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, Input, MaxPool2D, Flatten, Dense, Rescaling, Dropout,GlobalAveragePooling2D
from tensorflow.keras.applications import ResNet50, ResNet101

In [None]:
    # Add Conv2d and Pool layers
    #Conv2D(32, (3,3), activation ='relu'),
    #MaxPool2D((2,2)),
    #Conv2D(64, (3,3), activation = 'relu'),
    #MaxPool2D((2,2)),
    #Conv2D(128, (3,3), activation = 'relu'),
    #MaxPool2D((2,2)),
    #Conv2D(256, (3,3), activation = 'relu'),
    #MaxPool2D((2,2)),
    #Conv2D(512, (3, 3), activation='relu'),
    #MaxPool2D((2, 2)),

In [None]:
# Create an instance of the ResNet101 model with ImageNet weights and without
# its classification head (include_top = False)
base_model = tf.keras.applications.ResNet101(include_top = False,
                                            weights = 'imagenet',
                                           input_shape = (224, 224, 3),)

# Keras layers are trainable by default, so only marking the last 30 layers as
# trainable changes nothing and the whole network would be fine-tuned.
# Freeze every layer except the last 30 so that only those are fine-tuned.
for layer in base_model.layers[:-30]:
    layer.trainable = False
for layer in base_model.layers[-30:]:
    layer.trainable = True

In [None]:
# Assemble the classifier layer by layer:
# rescaling -> ResNet base -> global pooling -> dense head -> sigmoid output
resnet_model = Sequential()

# Rescale the pixel values by 1/255
# NOTE(review): ImageNet-pretrained ResNet weights are normally paired with
# tf.keras.applications.resnet.preprocess_input rather than a 1/255 rescale -
# confirm that this preprocessing choice is intended.
resnet_model.add(Rescaling(1./255, input_shape = (224, 224,3)))

# Add the ResNet model as the feature extractor
resnet_model.add(base_model)

# Pool each feature map down to a single value before the Dense layers
# (global average pooling is used here instead of a Flatten layer)
resnet_model.add(GlobalAveragePooling2D())

# Dense head, with dropout after each hidden layer
resnet_model.add(Dense(256, activation = 'relu'))
resnet_model.add(Dropout(.4))
resnet_model.add(Dense(128, activation = 'relu'))
resnet_model.add(Dropout(.4))

# Binary classification means we want a single sigmoid output unit
resnet_model.add(Dense(1, activation = 'sigmoid'))

In [None]:
# Check to make sure the model is created properly by printing its layer-by-layer summary
resnet_model.summary()

# Train Model

In [None]:
# Compile the model: binary cross-entropy loss (since the output is binary),
# the Adam optimizer with its default settings, and accuracy as the tracked metric.
# NOTE(review): the default Adam learning rate may be high for fine-tuning
# pretrained layers - consider confirming or lowering it.
resnet_model.compile(
    optimizer = 'adam',
    loss = 'binary_crossentropy',
    metrics = ['accuracy'],
)

In [None]:
# Train the model for 15 epochs on the training split,
# evaluating on the validation split as training progresses
resnet_model.fit(
    x = training_data,
    epochs = 15,
    validation_data = val_data,
)

In [None]:
# These are the IDs used for the submission
#test_image_ids

# Run the trained model on the test data; the model ends in a single sigmoid unit,
# so there is one score per image
predictions = resnet_model.predict(test_data)

print(predictions[:10])  # Print the first 10 predictions

## Hats are labeled as 0 and No Hats are labeled as 1
# Turn the scores into class labels with a .5 threshold:
#   score below .5  -> class 0 (Hat)
#   score .5 or above -> class 1 (No Hat)
predicted_class = (np.ravel(predictions) >= .5).astype(int).tolist()

# Translate each class value into its text label: 0 is Hat, anything else is No Hat
class_names = {0: 'Hat'}
predicted_labels = [class_names.get(cls, 'No Hat') for cls in predicted_class]

# Create the DataFrame for the submission (id, prediction)
submission_columns = {'id': test_image_ids, 'class': predicted_labels}
submission_df = pd.DataFrame(submission_columns)

In [None]:
# Sanity check: count how many test images were predicted as each class
print(submission_df['class'].value_counts())

In [None]:
# Save the submission as a CSV file (without the DataFrame index column)
submission_path = "submission.csv"
submission_df.to_csv(submission_path, index=False)
# Report the file that was actually written
# (the previous message said "df", but the output is a CSV file)
print(f"submission saved as {submission_path}")