In [1]:
!pip install kaggle boto3

Collecting kaggle
  Downloading kaggle-1.6.17.tar.gz (82 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: kaggle
  Building wheel for kaggle (setup.py) ... [?25ldone
[?25h  Created wheel for kaggle: filename=kaggle-1.6.17-py3-none-any.whl size=105786 sha256=53e3c4dd0e14c468d4afe1fff6941206c01f61bea93442947ae659f805e3b5d1
  Stored in directory: /home/sagemaker-user/.cache/pip/wheels/ff/55/fb/b27a466be754d2a06ffe0e37b248d844f090a63b51becea85d
Successfully built kaggle
Installing collected packages: kaggle
Successfully installed kaggle-1.6.17


### Kaggle API
This block of code uses the Kaggle API to grab the images. It is possible to import them a different way, but for repeatability this was the most straightforward. Follow the Kaggle API instructions for getting the API access token setup https://www.kaggle.com/docs/api.

In [2]:
# Grabbing dataset from kaggle api
import os 
from kaggle.api.kaggle_api_extended import KaggleApi
api = KaggleApi() 
api.authenticate()

dataset_name = "msambare/fer2013"
download_dir = './fer2013'

api.dataset_download_files(dataset_name, path=download_dir, unzip=True)

Dataset URL: https://www.kaggle.com/datasets/msambare/fer2013


In [6]:
# This uploads all the images from the Kaggle Dataset to S3 
import os
import boto3
from tqdm import tqdm
import sagemaker

# Initialize S3 client
s3_client = boto3.client('s3')

# Initialize SageMaker session and S3 path
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name
account_id = boto3.client("sts").get_caller_identity().get("Account")

# Define local path and S3 destination path
local_path = './fer2013'  # Path where the FER-2013 dataset is stored locally
s3_fer_path = f"s3://{bucket}/group-5/fer"
print(f"Uploading data to: {s3_fer_path}")

# Function to upload files maintaining the folder structure
def upload_directory_to_s3(local_path, s3_bucket_path):
    for subdir, dirs, files in tqdm(os.walk(local_path)):
        for file in files:
            local_file_path = os.path.join(subdir, file)
            
            # Construct the S3 path by preserving the folder structure
            relative_path = os.path.relpath(local_file_path, local_path)
            s3_file_path = os.path.join(s3_bucket_path, relative_path).replace("\\", "/")

            # Upload the file to S3 using boto3 client
            s3_client.upload_file(local_file_path, bucket, s3_file_path)
            print(f"Uploaded: {local_file_path} to {s3_file_path}")

# Call the function to upload the FER-2013 dataset
upload_directory_to_s3(local_path, s3_fer_path)

# Store the S3 path in a variable for later use
%store s3_fer_path


Uploading data to: s3://sagemaker-us-east-1-399018723364/group-5/fer


0it [00:00, ?it/s]

Uploaded: ./fer2013/test/angry/PrivateTest_10131363.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/test/angry/PrivateTest_10131363.jpg
Uploaded: ./fer2013/test/angry/PrivateTest_10304478.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/test/angry/PrivateTest_10304478.jpg
Uploaded: ./fer2013/test/angry/PrivateTest_1054527.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/test/angry/PrivateTest_1054527.jpg
Uploaded: ./fer2013/test/angry/PrivateTest_10590091.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/test/angry/PrivateTest_10590091.jpg
Uploaded: ./fer2013/test/angry/PrivateTest_1109992.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/test/angry/PrivateTest_1109992.jpg
Uploaded: ./fer2013/test/angry/PrivateTest_11296953.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/test/angry/PrivateTest_11296953.jpg
Uploaded: ./fer2013/test/angry/PrivateTest_12000629.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/test/angry/PrivateTest_12

3it [00:37, 12.62s/it]

Uploaded: ./fer2013/test/angry/PublicTest_99509833.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/test/angry/PublicTest_99509833.jpg
Uploaded: ./fer2013/test/angry/PublicTest_99607072.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/test/angry/PublicTest_99607072.jpg
Uploaded: ./fer2013/test/angry/PublicTest_99646813.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/test/angry/PublicTest_99646813.jpg
Uploaded: ./fer2013/test/angry/PublicTest_99743122.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/test/angry/PublicTest_99743122.jpg
Uploaded: ./fer2013/test/disgust/PrivateTest_11895083.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/test/disgust/PrivateTest_11895083.jpg
Uploaded: ./fer2013/test/disgust/PrivateTest_19671520.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/test/disgust/PrivateTest_19671520.jpg
Uploaded: ./fer2013/test/disgust/PrivateTest_21629266.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/test/disgust/Privat

4it [00:42,  9.92s/it]

Uploaded: ./fer2013/test/disgust/PublicTest_9727119.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/test/disgust/PublicTest_9727119.jpg
Uploaded: ./fer2013/test/disgust/PublicTest_97476336.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/test/disgust/PublicTest_97476336.jpg
Uploaded: ./fer2013/test/disgust/PublicTest_98815442.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/test/disgust/PublicTest_98815442.jpg
Uploaded: ./fer2013/test/disgust/PublicTest_99162116.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/test/disgust/PublicTest_99162116.jpg
Uploaded: ./fer2013/test/disgust/PublicTest_9982221.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/test/disgust/PublicTest_9982221.jpg
Uploaded: ./fer2013/test/fear/PrivateTest_10153550.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/test/fear/PrivateTest_10153550.jpg
Uploaded: ./fer2013/test/fear/PrivateTest_10254684.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/test/fear/PrivateT

5it [01:22, 19.67s/it]

Uploaded: ./fer2013/test/fear/PublicTest_99607417.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/test/fear/PublicTest_99607417.jpg
Uploaded: ./fer2013/test/fear/PublicTest_99801821.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/test/fear/PublicTest_99801821.jpg
Uploaded: ./fer2013/test/happy/PrivateTest_10077120.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/test/happy/PrivateTest_10077120.jpg
Uploaded: ./fer2013/test/happy/PrivateTest_10470092.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/test/happy/PrivateTest_10470092.jpg
Uploaded: ./fer2013/test/happy/PrivateTest_10513598.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/test/happy/PrivateTest_10513598.jpg
Uploaded: ./fer2013/test/happy/PrivateTest_10516065.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/test/happy/PrivateTest_10516065.jpg
Uploaded: ./fer2013/test/happy/PrivateTest_10613684.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/test/happy/PrivateTest_106136

6it [02:35, 36.04s/it]

Uploaded: ./fer2013/test/happy/PublicTest_99849498.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/test/happy/PublicTest_99849498.jpg
Uploaded: ./fer2013/test/neutral/PrivateTest_10086748.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/test/neutral/PrivateTest_10086748.jpg
Uploaded: ./fer2013/test/neutral/PrivateTest_10767287.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/test/neutral/PrivateTest_10767287.jpg
Uploaded: ./fer2013/test/neutral/PrivateTest_11123843.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/test/neutral/PrivateTest_11123843.jpg
Uploaded: ./fer2013/test/neutral/PrivateTest_11164800.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/test/neutral/PrivateTest_11164800.jpg
Uploaded: ./fer2013/test/neutral/PrivateTest_11239107.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/test/neutral/PrivateTest_11239107.jpg
Uploaded: ./fer2013/test/neutral/PrivateTest_11262548.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/t

7it [03:24, 40.16s/it]

Uploaded: ./fer2013/test/neutral/PublicTest_99340375.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/test/neutral/PublicTest_99340375.jpg
Uploaded: ./fer2013/test/neutral/PublicTest_99462811.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/test/neutral/PublicTest_99462811.jpg
Uploaded: ./fer2013/test/neutral/PublicTest_99491200.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/test/neutral/PublicTest_99491200.jpg
Uploaded: ./fer2013/test/neutral/PublicTest_99862107.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/test/neutral/PublicTest_99862107.jpg
Uploaded: ./fer2013/test/sad/PrivateTest_10247676.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/test/sad/PrivateTest_10247676.jpg
Uploaded: ./fer2013/test/sad/PrivateTest_10455506.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/test/sad/PrivateTest_10455506.jpg
Uploaded: ./fer2013/test/sad/PrivateTest_10658656.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/test/sad/PrivateTest_10

8it [04:15, 43.44s/it]

Uploaded: ./fer2013/test/sad/PublicTest_9967827.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/test/sad/PublicTest_9967827.jpg
Uploaded: ./fer2013/test/sad/PublicTest_99688200.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/test/sad/PublicTest_99688200.jpg
Uploaded: ./fer2013/test/sad/PublicTest_99741160.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/test/sad/PublicTest_99741160.jpg
Uploaded: ./fer2013/test/sad/PublicTest_99767171.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/test/sad/PublicTest_99767171.jpg
Uploaded: ./fer2013/test/surprise/PrivateTest_10072988.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/test/surprise/PrivateTest_10072988.jpg
Uploaded: ./fer2013/test/surprise/PrivateTest_10089743.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/test/surprise/PrivateTest_10089743.jpg
Uploaded: ./fer2013/test/surprise/PrivateTest_104142.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/test/surprise/PrivateTest_104142.j

9it [04:51, 41.12s/it]

Uploaded: ./fer2013/test/surprise/PublicTest_98567249.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/test/surprise/PublicTest_98567249.jpg
Uploaded: ./fer2013/test/surprise/PublicTest_98972870.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/test/surprise/PublicTest_98972870.jpg
Uploaded: ./fer2013/test/surprise/PublicTest_99242645.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/test/surprise/PublicTest_99242645.jpg
Uploaded: ./fer2013/test/surprise/PublicTest_99446963.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/test/surprise/PublicTest_99446963.jpg
Uploaded: ./fer2013/train/angry/Training_10118481.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/train/angry/Training_10118481.jpg
Uploaded: ./fer2013/train/angry/Training_10120469.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/train/angry/Training_10120469.jpg
Uploaded: ./fer2013/train/angry/Training_10131352.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/train/angry/Tra

11it [07:33, 59.66s/it]

Uploaded: ./fer2013/train/angry/Training_99966135.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/train/angry/Training_99966135.jpg
Uploaded: ./fer2013/train/angry/Training_99982465.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/train/angry/Training_99982465.jpg
Uploaded: ./fer2013/train/disgust/Training_10371709.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/train/disgust/Training_10371709.jpg
Uploaded: ./fer2013/train/disgust/Training_10598340.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/train/disgust/Training_10598340.jpg
Uploaded: ./fer2013/train/disgust/Training_1070239.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/train/disgust/Training_1070239.jpg
Uploaded: ./fer2013/train/disgust/Training_11050021.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/train/disgust/Training_11050021.jpg
Uploaded: ./fer2013/train/disgust/Training_11550217.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/train/disgust/Training_11550217

12it [07:51, 49.36s/it]

Uploaded: ./fer2013/train/disgust/Training_99142151.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/train/disgust/Training_99142151.jpg
Uploaded: ./fer2013/train/disgust/Training_9948239.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/train/disgust/Training_9948239.jpg
Uploaded: ./fer2013/train/disgust/Training_99747227.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/train/disgust/Training_99747227.jpg
Uploaded: ./fer2013/train/disgust/Training_99947220.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/train/disgust/Training_99947220.jpg
Uploaded: ./fer2013/train/fear/Training_10018621.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/train/fear/Training_10018621.jpg
Uploaded: ./fer2013/train/fear/Training_10031494.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/train/fear/Training_10031494.jpg
Uploaded: ./fer2013/train/fear/Training_10110501.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/train/fear/Training_10110501.jpg
Uploa

13it [10:37, 79.92s/it]

Uploaded: ./fer2013/train/fear/Training_99943180.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/train/fear/Training_99943180.jpg
Uploaded: ./fer2013/train/fear/Training_99950628.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/train/fear/Training_99950628.jpg
Uploaded: ./fer2013/train/fear/Training_99979234.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/train/fear/Training_99979234.jpg
Uploaded: ./fer2013/train/fear/Training_99979665.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/train/fear/Training_99979665.jpg
Uploaded: ./fer2013/train/fear/Training_99984859.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/train/fear/Training_99984859.jpg
Uploaded: ./fer2013/train/fear/Training_99999696.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/train/fear/Training_99999696.jpg
Uploaded: ./fer2013/train/happy/Training_10019449.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/train/happy/Training_10019449.jpg
Uploaded: ./fer2013/train

14it [15:31, 138.35s/it]

Uploaded: ./fer2013/train/happy/Training_99966265.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/train/happy/Training_99966265.jpg
Uploaded: ./fer2013/train/happy/Training_99971684.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/train/happy/Training_99971684.jpg
Uploaded: ./fer2013/train/happy/Training_99973350.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/train/happy/Training_99973350.jpg
Uploaded: ./fer2013/train/happy/Training_99976548.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/train/happy/Training_99976548.jpg
Uploaded: ./fer2013/train/happy/Training_99988263.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/train/happy/Training_99988263.jpg
Uploaded: ./fer2013/train/neutral/Training_10002154.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/train/neutral/Training_10002154.jpg
Uploaded: ./fer2013/train/neutral/Training_10031781.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/train/neutral/Training_10031781.jpg
Uploa

15it [18:51, 155.41s/it]

Uploaded: ./fer2013/train/neutral/Training_99917717.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/train/neutral/Training_99917717.jpg
Uploaded: ./fer2013/train/neutral/Training_99959472.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/train/neutral/Training_99959472.jpg
Uploaded: ./fer2013/train/neutral/Training_99968496.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/train/neutral/Training_99968496.jpg
Uploaded: ./fer2013/train/neutral/Training_99969020.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/train/neutral/Training_99969020.jpg
Uploaded: ./fer2013/train/sad/Training_10022789.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/train/sad/Training_10022789.jpg
Uploaded: ./fer2013/train/sad/Training_10031481.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/train/sad/Training_10031481.jpg
Uploaded: ./fer2013/train/sad/Training_10048646.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/train/sad/Training_10048646.jpg
Uploaded:

16it [22:06, 166.94s/it]

Uploaded: ./fer2013/train/sad/Training_99943858.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/train/sad/Training_99943858.jpg
Uploaded: ./fer2013/train/sad/Training_99950687.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/train/sad/Training_99950687.jpg
Uploaded: ./fer2013/train/sad/Training_99987906.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/train/sad/Training_99987906.jpg
Uploaded: ./fer2013/train/sad/Training_99996831.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/train/sad/Training_99996831.jpg
Uploaded: ./fer2013/train/surprise/Training_10013223.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/train/surprise/Training_10013223.jpg
Uploaded: ./fer2013/train/surprise/Training_1002457.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/train/surprise/Training_1002457.jpg
Uploaded: ./fer2013/train/surprise/Training_10028230.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/train/surprise/Training_10028230.jpg
Uploaded: ./f

17it [24:11, 85.35s/it] 

Uploaded: ./fer2013/train/surprise/Training_99916297.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/train/surprise/Training_99916297.jpg
Uploaded: ./fer2013/train/surprise/Training_99924420.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/train/surprise/Training_99924420.jpg
Uploaded: ./fer2013/train/surprise/Training_99937001.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/train/surprise/Training_99937001.jpg
Uploaded: ./fer2013/train/surprise/Training_99951755.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/train/surprise/Training_99951755.jpg
Uploaded: ./fer2013/train/surprise/Training_99984132.jpg to s3://sagemaker-us-east-1-399018723364/group-5/fer/train/surprise/Training_99984132.jpg
Stored 's3_fer_path' (str)





In [None]:
# This only uploads the first 100 images of each emotion (reduces costs associated with storage)
# import os
# import boto3
# from tqdm import tqdm
# import sagemaker
# import shutil

# # Initialize S3 client
# s3_client = boto3.client('s3')

# # Initialize SageMaker session and S3 path
# sess = sagemaker.Session()
# bucket = sess.default_bucket()
# role = sagemaker.get_execution_role()
# region = boto3.Session().region_name
# account_id = boto3.client("sts").get_caller_identity().get("Account")

# # Define local path and S3 destination path
# local_path = './fer2013'  # Path where the FER-2013 dataset is stored locally
# s3_fer_path = f"s3://{bucket}/group-5/fer"
# print(f"Uploading data to: {s3_fer_path}")

# # Function to upload files maintaining the folder structure, limited to 100 images per emotion
# def upload_directory_to_s3(local_path, s3_bucket_path, max_images_per_class=100):
#     # A dictionary to track the number of images uploaded per emotion
#     uploaded_count = {emotion: 0 for emotion in os.listdir(local_path)}

#     # Walk through the dataset directory
#     for subdir, dirs, files in tqdm(os.walk(local_path)):
#         for file in files:
#             emotion = os.path.basename(subdir)
            
#             # Check if we've already uploaded 100 images for this emotion
#             if uploaded_count[emotion] < max_images_per_class:
#                 local_file_path = os.path.join(subdir, file)
                
#                 # Construct the S3 path by preserving the folder structure
#                 relative_path = os.path.relpath(local_file_path, local_path)
#                 s3_file_path = os.path.join(s3_bucket_path, relative_path).replace("\\", "/")

#                 # Upload the file to S3 using boto3 client
#                 s3_client.upload_file(local_file_path, bucket, s3_file_path)
#                 uploaded_count[emotion] += 1
#                 print(f"Uploaded: {local_file_path} to {s3_file_path}")
#             else:
#                 # Stop uploading images for this emotion once the limit is reached
#                 continue

# # Call the function to upload the FER-2013 dataset
# upload_directory_to_s3(local_path, s3_fer_path)

# # Store the S3 path in a variable for later use
# %store s3_fer_path


In [7]:
import os
from PIL import Image
import pandas as pd

# Directory where the images are stored
image_dir = './fer2013'

# Prepare an empty list to store metadata
metadata = []

# Loop over all the images in the directory (keeping folder structure intact)
for subdir, dirs, files in os.walk(image_dir):
    for file in files:
        if file.endswith('.jpg') or file.endswith('.jpeg'):
            # Get the file path
            file_path = os.path.join(subdir, file)
            
            # Open the image to extract metadata
            img = Image.open(file_path)
            width, height = img.size
            file_size = os.path.getsize(file_path)  # Size in bytes
            
            # Optionally, you can also extract labels if stored in the folder structure or filename
            label = subdir.split('/')[-1]  # Assuming folder names are the labels
            
            # Append metadata to the list
            metadata.append({
                'image_filename': file,
                'image_filepath': file_path,
                'label': label,
                'width': width,
                'height': height,
                'file_size': file_size
            })

# Convert the metadata list to a pandas DataFrame
df_metadata = pd.DataFrame(metadata)

# Save the metadata to a CSV file
df_metadata.to_csv('image_metadata.csv', index=False)
# Also save to parquet since this is a relatively large dataset
df_metadata.to_parquet('image_metadata.parquet')

In [9]:
# Add metadata file to S3 (Useful for Athena Queries and feature store)

# Initialize S3 client
s3_client = boto3.client('s3')

# Define S3 bucket and path
# bucket_name = 'image-metadata'
s3_metadata_path = 's3://sagemaker-us-east-1-399018723364/group-5/fer/metadata/image_metadata.parquet'.format(bucket)

# Upload the CSV/Parquet file to S3
s3_client.upload_file('image_metadata.parquet', bucket, 'group-5/fer/metadata/image_metadata.parquet')

In [20]:
# Initialize Athena client
athena_client = boto3.client('athena')

# Athena settings
database_name = 'FERFeatureStore'
output_bucket = 's3://sagemaker-us-east-1-399018723364/group-5/fer/athena-results/'

# SQL query to create the Athena table for metadata (using Parquet format)
create_table_query_parquet = """
CREATE EXTERNAL TABLE IF NOT EXISTS image_metadata (
    image_filename STRING,
    image_filepath STRING,
    label STRING,
    width INT,
    height INT,
    file_size INT
)
STORED AS PARQUET
LOCATION 's3://sagemaker-us-east-1-399018723364/group-5/fer/metadata/';
"""

# Function to run the query in Athena
def run_athena_query(query, database_name, output_bucket):
    response = athena_client.start_query_execution(
        QueryString=query,
        QueryExecutionContext={
            'Database': database_name
        },
        ResultConfiguration={
            'OutputLocation': output_bucket
        }
    )
    return response

# Run the query to create the table
response = run_athena_query(create_table_query_parquet, database_name, output_bucket)

# Get the query execution ID to track the query status
query_execution_id = response['QueryExecutionId']

print(f"Created Athena table. Query Execution ID: {query_execution_id}")



Created Athena table. Query Execution ID: 3ddb108c-4539-40f5-86a6-37d362799fcc


In [21]:
import time

# Initialize Athena client
athena_client = boto3.client('athena')

# Athena settings
database_name = 'FERFeatureStore'
output_bucket = 's3://sagemaker-us-east-1-399018723364/group-5/fer/athena-results/'

# Function to list all tables in a database
def list_tables(database_name):
    try:
        # Start the query execution to list tables
        query = f"SHOW TABLES IN {database_name}"
        response = athena_client.start_query_execution(
            QueryString=query,
            QueryExecutionContext={'Database': database_name},
            ResultConfiguration={'OutputLocation': output_bucket}
        )
        query_execution_id = response['QueryExecutionId']
        print(f"Query started with Execution ID: {query_execution_id}")
        
        # Check query status
        status = 'RUNNING'
        while status in ['RUNNING', 'QUEUED']:
            print("Query is still running...")
            time.sleep(5)
            status = check_query_status(query_execution_id)

        # Fetch results
        results = fetch_query_results(query_execution_id)
        return results
    except Exception as e:
        print(f"Error listing tables: {e}")
        return None

# Function to check query execution status
def check_query_status(query_execution_id):
    try:
        response = athena_client.get_query_execution(
            QueryExecutionId=query_execution_id
        )
        status = response['QueryExecution']['Status']['State']
        return status
    except Exception as e:
        print(f"Error checking query status: {e}")
        return None

# Function to fetch query results
def fetch_query_results(query_execution_id):
    try:
        response = athena_client.get_query_results(
            QueryExecutionId=query_execution_id
        )
        return response['ResultSet']['Rows']
    except Exception as e:
        print(f"Error fetching query results: {e}")
        return None

# List tables in the database
tables = list_tables(database_name)
if tables:
    print("Tables in Athena Database:", tables)
else:
    print("No tables found or failed to list tables.")



Query started with Execution ID: 93ba095f-0765-4757-8303-48567448c678
Query is still running...
Error fetching query results: 'ResultSet'
No tables found or failed to list tables.


## EDA
This section preforms one example of EDA to understand the distribution of the data. This is done by analyzing the labels of the data and understanding the distribution of those labels.

In [22]:
import time
# Initialize Athena client
athena_client = boto3.client('athena')

# Athena settings
database_name = 'FERFeatureStore'
output_bucket = 's3://sagemaker-us-east-1-399018723364/group-5/fer/athena-results/'

# SQL query to get label distribution
query = """
SELECT label, COUNT(*) AS label_count
FROM image_metadata
GROUP BY label
ORDER BY label_count DESC;
"""

# Function to run the query
def run_athena_query(query, database_name, output_bucket):
    response = athena_client.start_query_execution(
        QueryString=query,
        QueryExecutionContext={
            'Database': database_name
        },
        ResultConfiguration={
            'OutputLocation': output_bucket
        }
    )
    return response

# Run the query
response = run_athena_query(query, database_name, output_bucket)

# Get the query execution ID to track the query status
query_execution_id = response['QueryExecutionId']
print(f"Query started with Execution ID: {query_execution_id}")

# Function to check query status
def check_query_status(query_execution_id):
    response = athena_client.get_query_execution(
        QueryExecutionId=query_execution_id
    )
    status = response['QueryExecution']['Status']['State']
    return status

# Check the query status until it is completed
status = check_query_status(query_execution_id)
while status in ['RUNNING', 'QUEUED']:
    print("Query is still running...")
    time.sleep(5)
    status = check_query_status(query_execution_id)

# When the query completes, fetch the results
def fetch_query_results(query_execution_id):
    results = athena_client.get_query_results(
        QueryExecutionId=query_execution_id
    )
    return results

# Fetch the results
results = fetch_query_results(query_execution_id)

# Parse the results into a Pandas DataFrame
rows = results['ResultSet']['Rows']
columns = [col['VarCharValue'] for col in rows[0]['Data']]
data = [
    [col['VarCharValue'] for col in row['Data']]
    for row in rows[1:]  # Skip the header row
]

df = pd.DataFrame(data, columns=columns)
df['label_count'] = pd.to_numeric(df['label_count'])

# Display the DataFrame
print(df)


Query started with Execution ID: 5b239d96-25d0-4880-81f6-355f0723e5a6
Query is still running...


InvalidRequestException: An error occurred (InvalidRequestException) when calling the GetQueryResults operation: Query did not finish successfully. Final query state: FAILED

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Set the style for the plot
sns.set(style="whitegrid")

# Plot the distribution of labels
plt.figure(figsize=(10, 6))
sns.barplot(x='label_count', y='label', data=df, palette='viridis')

# Add title and labels
plt.title('Distribution of Labels in the FER-2013 Dataset', fontsize=16)
plt.xlabel('Number of Images', fontsize=12)
plt.ylabel('Label', fontsize=12)

# Show the plot
plt.show()


## Feature Store
Feature engineering is used to extract features from the raw image data. This includes resizing, normalization and other feature extraction techniques.

In [None]:
from sagemaker.feature_store.feature_group import FeatureGroup
from datetime import datetime

# Define the schema for your feature group
feature_group_name = 'fer2013-feature-group'
feature_definitions = [
    {'FeatureName': 'emotion', 'FeatureType': 'String'},
    {'FeatureName': 'pixel_values', 'FeatureType': 'String'},
    # Add more features as necessary
]

# Create the feature group
feature_group = FeatureGroup(name=feature_group_name,
                             feature_definitions=feature_definitions,
                             s3_uri='s3://{}/group-5/fer/feature-store/')
feature_group.create()
