In [1]:
import boto3
import pandas as pd

# S3 location of the test-split CSV that the rest of the notebook works on.
bucket, key = "ai-bmi-predictor-v2", "test-data/eff_testingA.csv"

s3 = boto3.client("s3")
obj = s3.get_object(Key=key, Bucket=bucket)

# The response "Body" is handed to pandas directly as a file-like object.
# NOTE(review): the displayed frame carries an "Unnamed: 0" column, which
# suggests the CSV was saved together with its index -- confirm whether that
# column is wanted downstream.
data = pd.read_csv(obj["Body"])

data.head()


Unnamed: 0,photo_id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,hip,leg-length,shoulder-breadth,shoulder-to-crotch,thigh,waist,wrist,gender,height_cm,weight_kg
0,e5ae8fe5bbdf611a1e8d06e66e849bdf,0.073159,0.085775,-0.133776,0.881202,0.214236,0.016104,-0.180302,-0.100713,-0.117249,...,106.77469,83.279744,39.922305,70.005128,55.945992,98.25039,20.187082,male,180.0,94.6
1,605a5fd09058c48156b0ef518b63b2de,0.092031,-0.066016,-0.145132,0.687441,0.186508,-0.075221,-0.093846,-0.03584,0.033903,...,102.481633,84.876529,39.974203,73.591637,55.397032,88.003618,17.715785,male,188.9,86.75
2,909c9277309e13ee014e347603aba620,0.057046,-0.051366,-0.148253,0.675916,0.209973,-0.073485,-0.072783,-0.059395,0.00837,...,99.342301,82.275874,36.059983,66.440526,53.742692,82.100598,17.086464,male,179.7,73.85
3,bef6a68bc8dd475c124f6de2413385d3,-0.018792,0.016435,-0.148091,0.464433,0.242849,-0.106556,0.001489,-0.083478,0.096048,...,101.770144,76.081842,34.071748,62.218026,52.396573,83.999124,16.299751,female,166.95,69.05
4,6d7ed4bc4a17546447efed0ca6e2ff11,0.084419,0.065945,-0.153379,0.635377,0.285274,-0.056372,-0.139008,-0.120711,-0.002466,...,94.707063,81.328892,36.834735,64.426273,49.895157,86.020117,16.531431,male,173.2,65.55


In [None]:
# =========================
# Silhouette area features
# =========================

import boto3  # AWS SDK for Python (S3 access)
import numpy as np  # fast array operations
import pandas as pd  # dataframe operations
from PIL import Image  # read PNG images
from io import BytesIO  # convert S3 bytes -> file-like object
from botocore.exceptions import ClientError  # catch S3 missing-key errors

# -------------------------
# CONFIG (edit if needed)
# -------------------------

# Where the silhouette mask PNGs live in S3.
MASK_BUCKET = "amazon-bodym"
FRONT_MASK_PREFIX = "testA/mask/"  # key prefix of the front-view masks
SIDE_MASK_PREFIX = "testA/mask_left/"  # key prefix of the side-view masks

# Dataframe column that holds the photo IDs.
PHOTO_ID_COL = "photo_id"

# Output feature columns: pixel area divided by squared pixel height.
FRONT_AREA_COL = "front_silhouette_area_norm"
SIDE_AREA_COL = "side_silhouette_area_norm"

# Names for optional debug columns (raw pixel areas and mask pixel heights).
# They are declared here only; keep or remove as needed.
FRONT_AREA_RAW_COL = "front_silhouette_area"
SIDE_AREA_RAW_COL = "side_silhouette_area"
FRONT_H_COL = "front_mask_height"
SIDE_H_COL = "side_mask_height"

# -------------------------
# S3 client (reused)
# -------------------------

# A single client is created here and shared by the mask loader defined below,
# which reads it as the module-level name `s3`.
print("Creating S3 client...")
s3 = boto3.client(service_name="s3")

# -----------------------------------------
# Helper: standardize photo_id -> filename
# -----------------------------------------

print("Preparing photo_id normalization...")  # progress log

# Cast to str first, then blank out the entries that were missing in the raw
# column, using the raw column's own null mask. The earlier
# `.astype(str).fillna("")` ran the fill *after* the cast had already turned
# NaN into the literal string "nan", so missing IDs were never blanked.
_raw_photo_ids = data[PHOTO_ID_COL]
photo_id_series = _raw_photo_ids.astype(str).where(_raw_photo_ids.notna(), "")
photo_id_series = photo_id_series.str.strip()  # remove surrounding whitespace
photo_id_series = photo_id_series.str.replace(".png", "", regex=False)  # drop .png if present
photo_id_series = photo_id_series.str.split(".", n=1).str[0]  # drop trailing decimals like ".0"

# Zero-pad width: the longest normalized ID, but never fewer than 4 characters
# (e.g. "0021"). An empty column falls back to 0 before the floor is applied.
pad_len = int(photo_id_series.str.len().max()) if len(photo_id_series) else 0
pad_len = max(pad_len, 4)
print(f"Using zero-pad length = {pad_len}")  # show chosen pad length

def normalize_photo_id(photo_id, pad_length):
    """Turn a raw photo ID into the stem used to build mask PNG keys.

    The value is stringified and trimmed, every ".png" occurrence is removed,
    and everything from the first "." onward is dropped (so "21.0" -> "21").
    An all-digit result is left-padded with zeros to ``pad_length``; any other
    result is returned unchanged.
    """
    stem = str(photo_id).strip().replace(".png", "").partition(".")[0]
    return stem.zfill(pad_length) if stem.isdigit() else stem

# -----------------------------------------
# Helper: read mask PNG from S3 into boolean
# -----------------------------------------

def load_mask_bool_from_s3(bucket, key):
    """Download one mask image from S3 and return it as a boolean array.

    The object is read fully into memory, opened with PIL, converted to
    grayscale mode "L" and thresholded so that every non-zero pixel is True.
    Uses the module-level ``s3`` client; any error raised by ``get_object``
    or by the image decoding propagates to the caller.
    """
    print(f"    Downloading mask: s3://{bucket}/{key}")
    payload = s3.get_object(Bucket=bucket, Key=key)["Body"].read()
    grayscale = Image.open(BytesIO(payload)).convert("L")
    return np.array(grayscale) > 0

# -----------------------------------------
# Helper: compute height from mask
# -----------------------------------------

def mask_height(mask_bool):
    """Return the vertical extent, in rows, of the True pixels of a 2-D mask.

    The extent is ``last_row - first_row + 1`` as a float. ``np.nan`` is
    returned when the array has no elements or contains no True pixel.
    """
    if not mask_bool.size:
        return np.nan
    rows, _cols = np.nonzero(mask_bool)  # row / column indices of True pixels
    if rows.size:
        return float(rows.max() - rows.min() + 1)
    return np.nan

# -----------------------------------------
# Helper: silhouette area + normalization
# -----------------------------------------

def silhouette_area(mask_bool):
    """Return the number of non-zero pixels in the mask as a float.

    An array with no elements yields ``np.nan`` instead of 0.0.
    """
    return float(np.count_nonzero(mask_bool)) if mask_bool.size else np.nan

def normalized_silhouette_area(mask_bool):
    """Return ``(area / height**2, raw_area, height)`` for a boolean mask.

    ``raw_area`` comes from ``silhouette_area`` and ``height`` from
    ``mask_height``. When either is NaN, or the height is not positive, the
    first element is ``np.nan`` while the raw values are still returned so
    they can be inspected.
    """
    area = silhouette_area(mask_bool)
    height = mask_height(mask_bool)
    usable = not (np.isnan(area) or np.isnan(height)) and height > 0
    if usable:
        return float(area / (height * height)), area, height
    return np.nan, area, height

# -----------------------------------------
# Per-record feature computation
# -----------------------------------------

def _load_mask_or_none(view, key, pid):
    """Load one mask from MASK_BUCKET; log and return None if it is unusable."""
    try:
        return load_mask_bool_from_s3(MASK_BUCKET, key)
    except ClientError as e:
        print(f"    ERROR: missing {view} mask for pid={pid} | {e}")  # missing file log
    except OSError as e:
        # PIL reports corrupt or unrecognized image data with OSError
        # subclasses; without this branch one bad file would abort the whole
        # run and discard every result accumulated so far.
        print(f"    ERROR: unreadable {view} mask for pid={pid} | {e}")
    return None


def compute_areas_for_photo_id(photo_id):
    """Compute front and side silhouette features for one record.

    Parameters
    ----------
    photo_id : value taken from the photo-ID column; it is normalized with
        ``normalize_photo_id`` (using the module-level ``pad_len``) to build
        the two S3 keys ``<prefix><pid>.png``.

    Returns
    -------
    tuple of 6 floats
        ``(front_norm, side_norm, front_raw, side_raw, front_h, side_h)``.
        If either mask cannot be loaded, all six values are ``np.nan``; the
        front mask is tried first and the side mask is not requested when the
        front one fails.
    """
    pid = normalize_photo_id(photo_id, pad_len)  # normalize ID to match filenames
    print(f"  Processing photo_id={photo_id} -> pid={pid}")  # per-record log

    masks = {}
    for view, prefix in (("front", FRONT_MASK_PREFIX), ("side", SIDE_MASK_PREFIX)):
        mask = _load_mask_or_none(view, f"{prefix}{pid}.png", pid)
        if mask is None:
            return (np.nan,) * 6  # same all-NaN record for any load failure
        masks[view] = mask

    front_norm, front_raw, front_h = normalized_silhouette_area(masks["front"])  # front features
    side_norm, side_raw, side_h = normalized_silhouette_area(masks["side"])  # side features

    print(
        f"    front_raw_area={front_raw} | front_h={front_h} | front_norm={front_norm} "
        f"|| side_raw_area={side_raw} | side_h={side_h} | side_norm={side_norm}"
    )  # computed values log

    return front_norm, side_norm, front_raw, side_raw, front_h, side_h

# -----------------------------------------
# Run across the dataframe and add columns
# -----------------------------------------

print("Starting silhouette area computation for all records...")

# One accumulator per value returned by compute_areas_for_photo_id, listed in
# that function's return order: normalized areas, raw areas, pixel heights.
front_norms, side_norms = [], []
front_raws, side_raws = [], []
front_hs, side_hs = [], []
_accumulators = (front_norms, side_norms, front_raws, side_raws, front_hs, side_hs)

n = len(data)
print(f"Total rows to process: {n}")

for i, photo_id in enumerate(data[PHOTO_ID_COL].tolist(), start=1):
    print(f"\nRecord {i}/{n}")
    for _column, _value in zip(_accumulators, compute_areas_for_photo_id(photo_id)):
        _column.append(_value)

    if not i % 50:  # periodic summary every 50 records
        print(f"\nProcessed {i}/{n} records so far...")

# Only the two normalized areas become dataframe columns; the raw-area and
# height lists stay available as notebook variables.
print("\nAttaching new feature columns to dataframe...")
data[FRONT_AREA_COL], data[SIDE_AREA_COL] = front_norms, side_norms

In [None]:
# Preview the first five rows of the dataframe.
data.head(n=5)

In [None]:
import boto3  # S3 client
from io import StringIO  # in-memory text buffer

print("Preparing to upload updated dataset to S3...")

s3_out_path = "s3://ai-bmi-predictor-v2/test-data/eff_testingA_v3.csv"
print("Target:", s3_out_path)

# Strip the scheme, then split once on "/" into bucket and key.
out_bucket, out_key = s3_out_path.replace("s3://", "").split("/", 1)
print(f"Parsed -> bucket={out_bucket}, key={out_key}")

# Render the dataframe as CSV text in memory, without the index.
csv_buffer = StringIO()
print("Serializing dataframe to CSV (in-memory)...")
data.to_csv(csv_buffer, index=False)
csv_body = csv_buffer.getvalue()
print(f"CSV size (chars): {len(csv_body):,}")

# Send the UTF-8 encoded text as a single object.
s3 = boto3.client("s3")
print("Uploading to S3...")
_put_kwargs = {
    "Bucket": out_bucket,
    "Key": out_key,
    "Body": csv_body.encode("utf-8"),
    "ContentType": "text/csv",
}
s3.put_object(**_put_kwargs)

print("Upload complete ✅")
print(f"Saved to: {s3_out_path}")
