# Transform the Excel sheet of labels into CSV files for creating an AWS manifest

In [11]:
import pandas as pd
import numpy as np

# Seeded generator so the random train/test assignment is the same on every run.
rs = np.random.RandomState(0)

# Read all sheets of the Excel file (sheet_name=None loads every worksheet,
# keyed by sheet name).
file_path = "./labels_from_google_drive/Women's Health Classification Labels_10Jan_D.xlsx"
sheets_dict = pd.read_excel(file_path, sheet_name=None)

# Probability that an image is assigned to the test split.
test_frac = 0.2

label_sheets = [
    "BreastExam",
    "Menstruation",
    "ReproductiveHealth",
    "Suggestive",
    "WomensHealthGeneral",
    "PregnantNonSuggestive",
    "PregnantSuggestive",
]

# Columns that hold the labels; identical for every sheet, so built once.
label_cols = [f"Label {i}" for i in range(1, 11)]


def _join_labels(values):
    """Join the non-missing label cells of one row into a comma-separated string.

    Cells are converted with str() and stripped, so a numeric cell or stray
    whitespace in the spreadsheet cannot crash the join or create a label that
    differs only by surrounding spaces. Cells that are empty after stripping
    are skipped.
    """
    cleaned = (str(value).strip() for value in values if not pd.isna(value))
    return ",".join(label for label in cleaned if label)


# Display the sheet names and the corresponding DataFrames
df_list = []
for sheet_name in label_sheets:
    # Work on a copy so the raw sheets cached in sheets_dict stay untouched.
    df = sheets_dict[sheet_name].copy()
    print(f"Sheet name: {sheet_name}")

    # randomly split each folder of images into train and test set
    train_or_test = rs.choice(["train", "test"], len(df), p=[1.0 - test_frac, test_frac])

    # add sheet name as prefix to file name
    df["File Name"] = sheet_name + "/" + df["File Name"]

    df["split"] = train_or_test

    # create csv-like string for labels (a plain list also works for an empty sheet)
    df["labels"] = [_join_labels(row) for row in df[label_cols].itertuples(index=False)]
    print(df.head(2))  # Display the first few rows of each sheet
    df_list.append(df.loc[:, ["File Name", "labels", "split"]])
    print()

# concatenate dfs
label_df = pd.concat(df_list)
print("label df:")
print(label_df.head())

Sheet name: BreastExam
                                    File Name      Label 1         Label 2  \
0  BreastExam/dreamstimemaximum_345386762.jpg  breast_exam  health_context   
1  BreastExam/dreamstimemaximum_333223096.jpg  breast_exam  health_context   

          Label 3            Label 4 Label 5 Label 6  Label 7  Label 8  \
0  non_suggestive  athletic_clothing     NaN     NaN      NaN      NaN   
1  non_suggestive                NaN     NaN     NaN      NaN      NaN   

   Label 9  Label 10  split                                             labels  
0      NaN       NaN  train  breast_exam,health_context,non_suggestive,athl...  
1      NaN       NaN  train          breast_exam,health_context,non_suggestive  

Sheet name: Menstruation
                                           File Name       Label 1  \
0  Menstruation/What_are_heavy_periods_Kk8hREp.wi...  menstruation   
1  Menstruation/pexels-karolina-grabowska-7692051...  menstruation   

          Label 2 Label 3         Label

# write manifest with all labels

Write one CSV file for the train set and one for the test set.

In [12]:
# Paths of every label CSV written by the cells below; the manifest conversion
# cell iterates over this list. The writer cells append to it, so re-run this
# cell first when re-running them to avoid duplicate entries.
all_label_files = []

In [13]:
# write final csv from this df
# One CSV per split; each row is: <image path>,<label 1>,<label 2>,...
from datetime import datetime
import csv
import os


# write file with image paths and labels
# NOTE: isoformat() timestamps contain ":" characters, which some file systems
# do not accept in file names.
timestamp = datetime.now().isoformat()
train_filename = f"./manifest/all-labels_train_{timestamp}.csv"
test_filename = f"./manifest/all-labels_test_{timestamp}.csv"

# Make sure the output directory exists before opening files inside it.
os.makedirs("./manifest", exist_ok=True)

for split_name, csv_filename in (("train", train_filename), ("test", test_filename)):
    split_rows = label_df[label_df["split"] == split_name]
    # csv.writer handles quoting, so an image path that contains a comma or a
    # quote can no longer shift the label columns. lineterminator="\n" keeps
    # the same line endings the hand-written rows used.
    with open(csv_filename, "w", newline="") as f:
        writer = csv.writer(f, lineterminator="\n")
        for image_filename, labels, _split in split_rows.itertuples(index=False):
            # `labels` is a comma-joined string; each label becomes its own field.
            writer.writerow([image_filename, *labels.split(",")])

all_label_files.append(train_filename)
all_label_files.append(test_filename)

In [14]:
# write only suggestive and non-suggestive labels
# One CSV per split; each row is: <image path>,<non_suggestive | suggestive | empty>
import csv
import os

# write file with image paths and labels
timestamp = datetime.now().isoformat()
train_filename = f"./manifest/suggestive-non-suggestive_train_{timestamp}.csv"
test_filename = f"./manifest/suggestive-non-suggestive_test_{timestamp}.csv"

# Make sure the output directory exists before opening files inside it.
os.makedirs("./manifest", exist_ok=True)


def _suggestiveness_label(labels):
    """Reduce a comma-joined label string to its suggestive / non-suggestive label.

    The string is split into individual labels and compared exactly, so a
    longer label that merely contains the text "suggestive" is not mistaken
    for the "suggestive" label. "non_suggestive" takes precedence when both
    are present; an empty string is returned when neither is.
    """
    tokens = {token.strip() for token in labels.split(",")}
    if "non_suggestive" in tokens:
        return "non_suggestive"
    if "suggestive" in tokens:
        return "suggestive"
    return ""


for split_name, csv_filename in (("train", train_filename), ("test", test_filename)):
    split_rows = label_df[label_df["split"] == split_name]
    # csv.writer handles quoting of image paths that contain commas or quotes;
    # lineterminator="\n" keeps the line endings of the hand-written rows.
    with open(csv_filename, "w", newline="") as f:
        writer = csv.writer(f, lineterminator="\n")
        for image_filename, labels, _split in split_rows.itertuples(index=False):
            writer.writerow([image_filename, _suggestiveness_label(labels)])

all_label_files.append(train_filename)
all_label_files.append(test_filename)

In [15]:
all_label_files

['./manifest/all-labels_train_2025-01-10T21:09:28.589169.csv',
 './manifest/all-labels_test_2025-01-10T21:09:28.589169.csv',
 './manifest/suggestive-non-suggestive_train_2025-01-10T21:09:29.103691.csv',
 './manifest/suggestive-non-suggestive_test_2025-01-10T21:09:29.103691.csv']

In [16]:
# transform csv into manifest
from csv_to_multilabel_manifest import create_manifest_file
import os

for filename in all_label_files:
    # splitext removes the file extension whatever its length, instead of
    # assuming the name always ends in a four-character ".csv" suffix.
    csv_stem = os.path.splitext(os.path.basename(filename))[0]
    manifest_filename = "./manifest/" + "manifest_" + csv_stem
    # create_manifest_file is defined in csv_to_multilabel_manifest (not visible
    # here); presumably it writes the manifest for `filename` to
    # `manifest_filename`, resolving image paths against s3_path — confirm there.
    create_manifest_file(filename, manifest_filename, s3_path="s3://reliabl-image-labeling-demo/resized/")

# create dataset from manifest file

In [17]:
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0

import argparse
import logging
import time
import boto3

from botocore.exceptions import ClientError

# Module-level logger named after this module; create_empty_dataset() and main()
# log through it.
logger = logging.getLogger(__name__)

def create_empty_dataset(rek_client, project_arn, dataset_type, poll_interval_seconds=5):
    """
    Creates an empty Amazon Rekognition Custom Labels dataset and waits until
    its creation has finished.
    :param rek_client: The Amazon Rekognition Custom Labels Boto3 client.
    :param project_arn: The ARN of the project in which you want to create a dataset.
    :param dataset_type: The type of the dataset that you want to create (train or test).
        It is upper-cased before being sent to the service.
    :param poll_interval_seconds: Seconds to wait between status checks while
        the dataset is still being created.
    :return: The ARN of the created dataset.
    :raises ClientError: If a Rekognition call fails (logged, then re-raised).
    :raises Exception: If creation ends in CREATE_FAILED or in an unexpected status.
    """

    try:
        # Create the dataset.
        logger.info("Creating empty %s dataset for project %s",
            dataset_type, project_arn)

        response = rek_client.create_dataset(
            ProjectArn=project_arn, DatasetType=dataset_type.upper()
        )

        dataset_arn = response['DatasetArn']

        logger.info("dataset ARN: %s", dataset_arn)

        # Poll until the dataset leaves the CREATE_IN_PROGRESS state.
        while True:
            dataset = rek_client.describe_dataset(DatasetArn=dataset_arn)
            status = dataset['DatasetDescription']['Status']

            if status == "CREATE_COMPLETE":
                logger.info("Dataset created: %s", dataset_arn)
                return dataset_arn

            if status == "CREATE_IN_PROGRESS":
                # Message and argument are passed separately so the ARN is
                # interpolated (a single tuple argument is logged verbatim).
                logger.info("Creating dataset: %s ", dataset_arn)
                time.sleep(poll_interval_seconds)
                continue

            if status == "CREATE_FAILED":
                error_message = f"Dataset creation failed: {status} : {dataset_arn}"
            else:
                error_message = f"Failed. Unexpected state for dataset creation: {status} : {dataset_arn}"

            # logger.error rather than logger.exception: no exception is being
            # handled at this point, so there is no traceback to attach.
            logger.error(error_message)
            raise Exception(error_message)

    except ClientError as err:
        logger.exception("Couldn't create dataset: %s", err.response['Error']['Message'])
        raise

def add_arguments(parser):
    """
    Registers the two positional command line arguments on the parser.
    :param parser: The command line parser.
    """

    # (argument name, help text) pairs, registered in this order.
    positional_arguments = (
        ("project_arn",
         "The ARN of the project in which you want to create the empty dataset."),
        ("dataset_type",
         "The type of the empty dataset that you want to create (train or test)."),
    )

    for argument_name, help_text in positional_arguments:
        parser.add_argument(argument_name, help=help_text)


def main():
    """
    Command line entry point: reads project_arn and dataset_type from the
    command line, creates the empty dataset through a Rekognition client built
    from the 'custom-labels-access' profile, and prints the resulting ARN.
    Any exception raised along the way is logged and printed instead of being
    propagated.
    """

    logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")

    try:
        # Get command line arguments.
        cli_parser = argparse.ArgumentParser(usage=argparse.SUPPRESS)
        add_arguments(cli_parser)
        args = cli_parser.parse_args()

        print(f"Creating empty {args.dataset_type} dataset for project {args.project_arn}")

        # Create the empty dataset.
        rekognition_client = boto3.Session(
            profile_name='custom-labels-access'
        ).client("rekognition")

        requested_type = args.dataset_type.lower()
        dataset_arn = create_empty_dataset(
            rekognition_client, args.project_arn, requested_type
        )

        print(f"Finished creating empty dataset: {dataset_arn}")

    except Exception as err:
        # ClientError is an Exception subclass and was reported the same way,
        # so a single handler covers both cases.
        logger.exception("Problem creating empty dataset: %s", err)
        print(f"Problem creating empty dataset: {err}")


In [None]:

    # NOTE(review): this cell repeats the dataset-creation steps from main() and has
    # not been executed. No notebook-level definition of `args` is visible — it is
    # only assigned inside main() — so running the cell as-is presumably fails with
    # a NameError. Confirm whether project_arn / dataset_type should be set
    # explicitly here, or whether the cell should be removed.
    # Create the empty dataset.
    # The client is built from the 'custom-labels-access' profile.
    session = boto3.Session(profile_name='custom-labels-access')
    rekognition_client = session.client("rekognition")

    # dataset_type is lower-cased here; create_empty_dataset upper-cases it again
    # before calling the service.
    dataset_arn=create_empty_dataset(rekognition_client, 
        args.project_arn,
        args.dataset_type.lower())