In [1]:
import boto3
import pandas as pd

# S3 location of the input CSV that every later cell works on.
bucket = "ai-bmi-predictor-v2"
key = "test-data/eff_testingB.csv"

# Client is created with no explicit arguments, so credentials/region come from
# whatever boto3 resolves in this environment.
s3 = boto3.client("s3")
obj = s3.get_object(Bucket=bucket, Key=key)

# The response body is a file-like stream, so pandas can parse it directly.
data = pd.read_csv(obj["Body"])

# Preview the first rows.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like an index
# that was written out by an earlier to_csv call - confirm whether it should be
# dropped (e.g. index_col=0) before it is re-saved downstream.
data.head()


Unnamed: 0,photo_id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,hip,leg-length,shoulder-breadth,shoulder-to-crotch,thigh,waist,wrist,gender,height_cm,weight_kg
0,5e09e1b0d43b6c430709a513f594c591,0.109534,-0.016982,-0.146092,0.516881,0.189749,-0.067232,-0.048589,-0.022137,0.04558,...,91.273094,77.553963,36.633358,64.253914,47.014259,77.113548,16.389431,male,170.0,61.8
1,7e6a57e2fcabf518e9007a77d4cc4960,0.094942,0.000988,-0.126504,0.834527,0.315265,0.002205,-0.067837,-0.09369,-0.021319,...,91.70752,76.172325,34.205143,61.72636,48.404037,73.617821,14.480244,female,169.0,54.5
2,4ee8251ae7e4ad42c75644db390fc5c5,-0.032353,-0.051524,-0.129219,0.430257,0.20928,-0.131291,0.039772,-0.062221,0.07262,...,89.962387,74.400047,31.023907,56.728245,47.468864,72.155304,15.670779,female,164.59,52.1
3,6367d327f96b951e4a498c88d615e8a4,0.056546,0.007736,-0.147426,0.731932,0.264191,-0.050348,-0.145324,-0.088448,-0.021515,...,96.964554,76.552322,36.380489,63.264416,47.635548,82.428825,16.714935,male,168.0,67.4
4,8b16daea6768b92fb27997f76215aa5f,0.004628,-0.047994,-0.155691,0.587151,0.235884,-0.101305,-0.065924,-0.134367,0.021547,...,115.346634,85.163284,39.959301,72.7239,61.20694,115.251938,19.627054,male,184.3,111.3


In [2]:
# =========================
# Silhouette area features
# =========================

import boto3  # AWS SDK for Python (S3 access)
import numpy as np  # fast array operations
import pandas as pd  # dataframe operations
from PIL import Image  # read PNG images
from io import BytesIO  # convert S3 bytes -> file-like object
from botocore.exceptions import ClientError  # catch S3 missing-key errors

# -------------------------
# CONFIG (edit if needed)
# -------------------------

# Where the silhouette masks live in S3: one bucket, one key prefix per view.
MASK_BUCKET = "amazon-bodym"
FRONT_MASK_PREFIX = "testB/mask/"
SIDE_MASK_PREFIX = "testB/mask_left/"

# Dataframe column holding the photo IDs that mask filenames are built from.
PHOTO_ID_COL = "photo_id"

# Output feature columns: silhouette pixel area divided by mask height squared.
FRONT_AREA_COL = "front_silhouette_area_norm"
SIDE_AREA_COL = "side_silhouette_area_norm"

# Optional debug column names (keep or remove): raw pixel areas and pixel heights.
FRONT_AREA_RAW_COL = "front_silhouette_area"
SIDE_AREA_RAW_COL = "side_silhouette_area"
FRONT_H_COL = "front_mask_height"
SIDE_H_COL = "side_mask_height"

# -------------------------
# S3 client (reused)
# -------------------------

# Bind a single module-level `s3` client; load_mask_bool_from_s3 reads this global
# for every mask download instead of creating its own client.
print("Creating S3 client...")  # progress log
s3 = boto3.client("s3")  # create S3 client

# -----------------------------------------
# Helper: standardize photo_id -> filename
# -----------------------------------------

print("Preparing photo_id normalization...")  # progress log

# Keep a handle on the untouched column so missing values can still be detected.
raw_photo_ids = data[PHOTO_ID_COL]

# Stringify first, then blank out rows that were missing in the ORIGINAL column.
# Calling fillna("") after astype(str) is a no-op on pandas versions where astype(str)
# has already turned NaN/None into the literal text "nan"/"None".
photo_id_series = raw_photo_ids.astype(str).where(raw_photo_ids.notna(), "")
photo_id_series = photo_id_series.str.strip()  # remove surrounding whitespace
photo_id_series = photo_id_series.str.replace(".png", "", regex=False)  # drop .png if present
photo_id_series = photo_id_series.str.split(".", n=1).str[0]  # drop trailing decimals like ".0"

# Zero-pad width = longest cleaned ID, but never less than 4 (e.g. "0021").
# The len() guard keeps an empty dataframe from calling int() on a NaN max.
pad_len = int(photo_id_series.str.len().max()) if len(photo_id_series) else 0
pad_len = max(pad_len, 4)
print(f"Using zero-pad length = {pad_len}")  # show chosen pad length

def normalize_photo_id(photo_id, pad_length):
    """Turn a raw photo ID into the stem used for the mask PNG filename.

    The value is stringified and trimmed, every ".png" occurrence is removed,
    and anything from the first remaining "." onward is discarded (so "21.0"
    becomes "21"). Purely numeric stems are left-padded with zeros to
    ``pad_length`` characters; any other stem is returned unchanged.
    """
    text = str(photo_id).strip().replace(".png", "")
    stem, _, _ = text.partition(".")
    return stem.zfill(pad_length) if stem.isdigit() else stem

# -----------------------------------------
# Helper: read mask PNG from S3 into boolean
# -----------------------------------------

def load_mask_bool_from_s3(bucket, key):
    """Download one mask image from S3 and return it as a boolean array.

    Uses the module-level ``s3`` client. The image is converted to 8-bit
    grayscale and every pixel with a value above 0 becomes True; pixels equal
    to 0 become False. Errors raised by ``s3.get_object`` propagate to the caller.
    """
    print(f"    Downloading mask: s3://{bucket}/{key}")
    response = s3.get_object(Bucket=bucket, Key=key)
    raw_bytes = response["Body"].read()
    grayscale = Image.open(BytesIO(raw_bytes)).convert("L")
    return np.array(grayscale) > 0

# -----------------------------------------
# Helper: compute height from mask
# -----------------------------------------

def mask_height(mask_bool):
    """Return the vertical extent, in pixel rows, of the True region of a 2-D mask.

    The extent is (last row containing a True pixel) - (first such row) + 1,
    returned as a float. NaN is returned when the array has zero size or
    contains no True pixel at all.
    """
    if not mask_bool.size:
        return np.nan
    row_idx, _col_idx = np.nonzero(mask_bool)
    if not row_idx.size:
        return np.nan
    top, bottom = row_idx.min(), row_idx.max()
    return float(bottom - top + 1)

# -----------------------------------------
# Helper: silhouette area + normalization
# -----------------------------------------

def silhouette_area(mask_bool):
    """Return the count of nonzero (True) pixels in ``mask_bool`` as a float.

    A zero-size array has nothing to count and yields NaN instead of 0.0.
    """
    has_pixels = mask_bool.size != 0
    return float(np.count_nonzero(mask_bool)) if has_pixels else np.nan

def normalized_silhouette_area(mask_bool):
    """Return ``(normalized_area, raw_area, height)`` for one boolean mask.

    ``raw_area`` comes from silhouette_area and ``height`` from mask_height.
    ``normalized_area`` is raw_area / height**2. It is NaN whenever either
    input is NaN or the height is not positive; the raw area and height are
    still returned in that case so they can be inspected.
    """
    a = silhouette_area(mask_bool)
    h = mask_height(mask_bool)
    usable = not (np.isnan(a) or np.isnan(h) or h <= 0)
    normalized = float(a / (h * h)) if usable else np.nan
    return normalized, a, h

# -----------------------------------------
# Per-record feature computation
# -----------------------------------------

def compute_areas_for_photo_id(photo_id):
    """Compute front and side silhouette features for one photo ID.

    Returns a 6-tuple in this order:
    ``(front_norm, side_norm, front_raw, side_raw, front_h, side_h)``.

    Both masks are downloaded before anything is computed. If either download
    raises ``ClientError``, the error is logged and all six values are NaN;
    a front-mask failure returns before the side mask is requested.
    """
    pid = normalize_photo_id(photo_id, pad_len)
    front_key = f"{FRONT_MASK_PREFIX}{pid}.png"
    side_key = f"{SIDE_MASK_PREFIX}{pid}.png"
    all_nan = (np.nan,) * 6  # result used for any failed download

    print(f"  Processing photo_id={photo_id} -> pid={pid}")

    # Front view first: a failure here short-circuits the whole record.
    try:
        front_mask = load_mask_bool_from_s3(MASK_BUCKET, front_key)
    except ClientError as e:
        print(f"    ERROR: missing front mask for pid={pid} | {e}")
        return all_nan

    # Side view second: a failure here also yields an all-NaN record.
    try:
        side_mask = load_mask_bool_from_s3(MASK_BUCKET, side_key)
    except ClientError as e:
        print(f"    ERROR: missing side mask for pid={pid} | {e}")
        return all_nan

    front_norm, front_raw, front_h = normalized_silhouette_area(front_mask)
    side_norm, side_raw, side_h = normalized_silhouette_area(side_mask)

    print(
        f"    front_raw_area={front_raw} | front_h={front_h} | front_norm={front_norm} "
        f"|| side_raw_area={side_raw} | side_h={side_h} | side_norm={side_norm}"
    )

    return front_norm, side_norm, front_raw, side_raw, front_h, side_h

# -----------------------------------------
# Run across the dataframe and add columns
# -----------------------------------------

print("Starting silhouette area computation for all records...")

# One accumulator per value returned by compute_areas_for_photo_id, kept in the
# same order as that function's return tuple.
front_norms, side_norms = [], []
front_raws, side_raws = [], []
front_hs, side_hs = [], []
feature_lists = (front_norms, side_norms, front_raws, side_raws, front_hs, side_hs)

n = len(data)
print(f"Total rows to process: {n}")

for i, photo_id in enumerate(data[PHOTO_ID_COL].tolist(), start=1):
    print(f"\nRecord {i}/{n}")
    fn, sn, fr, sr, fh, sh = compute_areas_for_photo_id(photo_id)

    # Route each returned value into its matching accumulator.
    for feature_list, value in zip(feature_lists, (fn, sn, fr, sr, fh, sh)):
        feature_list.append(value)

    if not i % 50:
        print(f"\nProcessed {i}/{n} records so far...")

# Only the two normalized areas become dataframe columns; the raw-area and
# height lists are collected above but not attached here.
print("\nAttaching new feature columns to dataframe...")
data[FRONT_AREA_COL] = front_norms
data[SIDE_AREA_COL] = side_norms

Creating S3 client...
Preparing photo_id normalization...
Using zero-pad length = 32
Starting silhouette area computation for all records...
Total rows to process: 1160

Record 1/1160
  Processing photo_id=5e09e1b0d43b6c430709a513f594c591 -> pid=5e09e1b0d43b6c430709a513f594c591
    Downloading mask: s3://amazon-bodym/testB/mask/5e09e1b0d43b6c430709a513f594c591.png
    Downloading mask: s3://amazon-bodym/testB/mask_left/5e09e1b0d43b6c430709a513f594c591.png
    front_raw_area=138365.0 | front_h=886.0 | front_norm=0.17626204464736125 || side_raw_area=81455.0 | side_h=867.0 | side_norm=0.10836263401486519

Record 2/1160
  Processing photo_id=7e6a57e2fcabf518e9007a77d4cc4960 -> pid=7e6a57e2fcabf518e9007a77d4cc4960
    Downloading mask: s3://amazon-bodym/testB/mask/7e6a57e2fcabf518e9007a77d4cc4960.png
    Downloading mask: s3://amazon-bodym/testB/mask_left/7e6a57e2fcabf518e9007a77d4cc4960.png
    front_raw_area=132844.0 | front_h=900.0 | front_norm=0.16400493827160495 || side_raw_area=82615.

In [3]:
# Preview the first rows to confirm the two normalized silhouette-area columns
# (front_silhouette_area_norm, side_silhouette_area_norm) were attached.
data.head()

Unnamed: 0,photo_id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,shoulder-breadth,shoulder-to-crotch,thigh,waist,wrist,gender,height_cm,weight_kg,front_silhouette_area_norm,side_silhouette_area_norm
0,5e09e1b0d43b6c430709a513f594c591,0.109534,-0.016982,-0.146092,0.516881,0.189749,-0.067232,-0.048589,-0.022137,0.04558,...,36.633358,64.253914,47.014259,77.113548,16.389431,male,170.0,61.8,0.176262,0.108363
1,7e6a57e2fcabf518e9007a77d4cc4960,0.094942,0.000988,-0.126504,0.834527,0.315265,0.002205,-0.067837,-0.09369,-0.021319,...,34.205143,61.72636,48.404037,73.617821,14.480244,female,169.0,54.5,0.164005,0.105481
2,4ee8251ae7e4ad42c75644db390fc5c5,-0.032353,-0.051524,-0.129219,0.430257,0.20928,-0.131291,0.039772,-0.062221,0.07262,...,31.023907,56.728245,47.468864,72.155304,15.670779,female,164.59,52.1,0.1681,0.109886
3,6367d327f96b951e4a498c88d615e8a4,0.056546,0.007736,-0.147426,0.731932,0.264191,-0.050348,-0.145324,-0.088448,-0.021515,...,36.380489,63.264416,47.635548,82.428825,16.714935,male,168.0,67.4,0.169669,0.112497
4,8b16daea6768b92fb27997f76215aa5f,0.004628,-0.047994,-0.155691,0.587151,0.235884,-0.101305,-0.065924,-0.134367,0.021547,...,39.959301,72.7239,61.20694,115.251938,19.627054,male,184.3,111.3,0.183146,0.127931


In [4]:
import boto3  # S3 client
from io import StringIO  # in-memory text buffer

print("Preparing to upload updated dataset to S3...")  # track process

s3_out_path = "s3://ai-bmi-predictor-v2/test-data/eff_testingB_v3.csv"  # target S3 path
print("Target:", s3_out_path)  # show target

# ---- parse s3://bucket/key ----
# Strip the scheme only at the start and split once, so bucket and key come from a
# single parse. A malformed path fails here with a clear message rather than a bare
# IndexError.
s3_scheme = "s3://"
if not s3_out_path.startswith(s3_scheme):
    raise ValueError(f"Expected an s3:// URI, got: {s3_out_path!r}")
out_bucket, _, out_key = s3_out_path[len(s3_scheme):].partition("/")
if not out_bucket or not out_key:
    raise ValueError(f"S3 URI must contain both a bucket and a key: {s3_out_path!r}")
print(f"Parsed -> bucket={out_bucket}, key={out_key}")  # confirm parsing

# ---- write CSV to memory ----
csv_buffer = StringIO()  # create in-memory buffer
print("Serializing dataframe to CSV (in-memory)...")  # track process
data.to_csv(csv_buffer, index=False)  # index=False: do not write the row index as a column
csv_body = csv_buffer.getvalue()  # get CSV string content
print(f"CSV size (chars): {len(csv_body):,}")  # character count, not encoded byte count

# ---- upload to S3 ----
s3 = boto3.client("s3")  # create S3 client
print("Uploading to S3...")  # track process
s3.put_object(
    Bucket=out_bucket,                 # destination bucket
    Key=out_key,                       # destination key
    Body=csv_body.encode("utf-8"),     # file bytes
    ContentType="text/csv"             # content type
)  # upload

print("Upload complete ✅")  # done (repaired mis-encoded check-mark character)
print(f"Saved to: {s3_out_path}")  # confirm final path


Preparing to upload updated dataset to S3...
Target: s3://ai-bmi-predictor-v2/test-data/eff_testingB_v3.csv
Parsed -> bucket=ai-bmi-predictor-v2, key=test-data/eff_testingB_v3.csv
Serializing dataframe to CSV (in-memory)...
CSV size (chars): 70,510,782
Uploading to S3...
Upload complete ✅
Saved to: s3://ai-bmi-predictor-v2/test-data/eff_testingB_v3.csv
