1. load the dataset

In [1]:
import boto3
import pandas as pd

# Location of the training table in S3.
bucket, key = "ai-bmi-predictor-v2", "data/eff_training.csv"

# Fetch the object and parse its streaming body directly into a DataFrame.
s3 = boto3.client("s3")
obj = s3.get_object(Key=key, Bucket=bucket)
data = pd.read_csv(obj["Body"])

# Last expression of the cell: renders a preview of the first rows.
data.head()


Unnamed: 0,photo_id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,hip,leg-length,shoulder-breadth,shoulder-to-crotch,thigh,waist,wrist,gender,height_cm,weight_kg
0,6ab1d061f51c6079633aeceed2faeb0b,6.8e-05,0.108145,-0.138813,0.633156,0.346266,-0.046055,0.016021,-0.058632,0.097968,...,105.3339,76.817467,35.362858,65.993683,54.459591,88.813789,16.764332,female,170.5,72.0
1,e94e2e05fb8b099955bbc4fa5ce81e22,0.020843,0.026005,-0.093442,0.736929,0.240569,0.089982,-0.112391,0.000435,-0.07611,...,101.478989,85.154358,37.25676,65.861588,52.773052,89.176338,15.690955,male,178.3,71.8
2,ba6951a4f37fc9302243370e927a02e2,0.014542,-0.071332,-0.154407,0.577781,0.196485,-0.125341,-0.056713,-0.027295,0.094879,...,97.488243,81.410393,37.503147,66.042679,57.059261,82.201988,16.686253,male,176.25,76.5
3,947d16539d4702427aa74f737329ffb9,0.041775,0.075746,-0.128497,0.48501,0.120409,0.011227,0.017852,-0.089796,-0.011273,...,120.586845,69.361534,34.084633,60.41333,65.0,102.323845,17.693762,female,152.1,88.9
4,9326695bf62926ec22690f576a633bba,0.004397,0.05859,-0.154224,0.52814,0.290956,-0.108486,-0.021441,-0.099909,0.08077,...,110.543564,77.160583,38.086231,68.400543,57.172279,107.378578,16.594791,male,171.5,88.4


2. calculate normalized silhouette area

In [3]:
# =========================
# Silhouette area features
# =========================

import boto3  # AWS SDK for Python (S3 access)
import numpy as np  # fast array operations
import pandas as pd  # dataframe operations
from PIL import Image  # read PNG images
from io import BytesIO  # convert S3 bytes -> file-like object
from botocore.exceptions import ClientError  # catch S3 missing-key errors

# -------------------------
# CONFIG (edit if needed)
# -------------------------

# S3 location of the mask PNGs: one bucket, one key prefix per view.
MASK_BUCKET = "amazon-bodym"
FRONT_MASK_PREFIX, SIDE_MASK_PREFIX = "train/mask/", "train/mask_left/"

# Dataframe column that holds the photo IDs.
PHOTO_ID_COL = "photo_id"

# Output column names for the normalized (area / height^2) features.
FRONT_AREA_COL, SIDE_AREA_COL = (
    "front_silhouette_area_norm",
    "side_silhouette_area_norm",
)

# Optional debug column names (keep or remove): raw pixel areas and pixel heights.
FRONT_AREA_RAW_COL, SIDE_AREA_RAW_COL = "front_silhouette_area", "side_silhouette_area"
FRONT_H_COL, SIDE_H_COL = "front_mask_height", "side_mask_height"

# -------------------------
# S3 client (reused)
# -------------------------

# A single client is created here and reused by every mask download below.
print("Creating S3 client...")
s3 = boto3.client(service_name="s3")

# -----------------------------------------
# Helper: standardize photo_id -> filename
# -----------------------------------------

print("Preparing photo_id normalization...")  # progress log

# Build a cleaned, string-typed copy of the IDs; it is used here only to
# measure how long the IDs are.
#
# Missing IDs are blanked using the ORIGINAL column's NA mask. Casting with
# .astype(str) first and calling .fillna("") afterwards does not work on
# pandas versions where the string cast turns NaN into the literal text
# "nan": by then there is nothing left for fillna to fill.
raw_photo_ids = data[PHOTO_ID_COL]
photo_id_series = (
    raw_photo_ids.astype(str)
    .mask(raw_photo_ids.isna(), "")        # missing ID -> empty string
    .str.strip()                           # remove surrounding whitespace
    .str.replace(".png", "", regex=False)  # drop a literal ".png" if present
    .str.split(".", n=1).str[0]            # keep only the text before the first "."
)

# Zero-pad width = longest cleaned ID, with a floor of 4 (e.g. "0021").
# An empty dataframe has no max length, so it falls back to 0 before the floor.
pad_len = int(photo_id_series.str.len().max()) if len(photo_id_series) else 0
pad_len = max(pad_len, 4)
print(f"Using zero-pad length = {pad_len}")  # show chosen pad length

def normalize_photo_id(photo_id, pad_length):
    """Turn a photo ID into the stem used to build mask file names.

    The value is converted to a string and trimmed, every literal ".png" is
    removed, and only the text before the first "." is kept (so "21.0"
    becomes "21"). All-digit results are left-padded with zeros to
    ``pad_length`` characters; anything else is returned unpadded.
    """
    cleaned = str(photo_id).strip().replace(".png", "")
    stem, _, _ = cleaned.partition(".")
    return stem.zfill(pad_length) if stem.isdigit() else stem

# -----------------------------------------
# Helper: read mask PNG from S3 into boolean
# -----------------------------------------

def load_mask_bool_from_s3(bucket, key):
    """Download one mask image from S3 and return it as a boolean array.

    Uses the module-level ``s3`` client. The image is converted to 8-bit
    grayscale ("L" mode) and every pixel with a value above 0 becomes True.
    Errors raised by ``get_object`` propagate to the caller.
    """
    print(f"    Downloading mask: s3://{bucket}/{key}")
    raw_bytes = s3.get_object(Bucket=bucket, Key=key)["Body"].read()
    grayscale = Image.open(BytesIO(raw_bytes)).convert("L")
    # Nonzero pixels are treated as the silhouette.
    return np.array(grayscale) > 0

# -----------------------------------------
# Helper: compute height from mask
# -----------------------------------------

def mask_height(mask_bool):
    """Return the vertical extent, in pixels, of the True region of a 2-D mask.

    The result is ``last_row - first_row + 1`` as a float, where the rows are
    the first and last ones containing any True pixel. NaN is returned for a
    zero-size array or a mask with no True pixels.
    """
    if mask_bool.size == 0:
        return np.nan

    row_idx, _col_idx = np.nonzero(mask_bool)
    if row_idx.size == 0:
        return np.nan

    top, bottom = row_idx.min(), row_idx.max()
    return float(bottom - top + 1)

# -----------------------------------------
# Helper: silhouette area + normalization
# -----------------------------------------

def silhouette_area(mask_bool):
    """Return the number of nonzero (True) pixels in the mask as a float.

    A zero-size array yields NaN rather than 0.0.
    """
    return float(np.count_nonzero(mask_bool)) if mask_bool.size else np.nan

def normalized_silhouette_area(mask_bool):
    """Return ``(area / height**2, area, height)`` for a boolean mask.

    ``area`` comes from ``silhouette_area`` and ``height`` from
    ``mask_height``. When either is NaN, or the height is not positive, the
    first element is NaN while the raw area and height are still returned
    for diagnostics.
    """
    raw_area = silhouette_area(mask_bool)
    pixel_height = mask_height(mask_bool)

    usable = (
        not (np.isnan(raw_area) or np.isnan(pixel_height))
        and pixel_height > 0
    )
    if usable:
        return float(raw_area / (pixel_height * pixel_height)), raw_area, pixel_height
    return np.nan, raw_area, pixel_height

# -----------------------------------------
# Per-record feature computation
# -----------------------------------------

def compute_areas_for_photo_id(photo_id):
    """Compute the front and side silhouette features for one record.

    Returns a 6-tuple
    ``(front_norm, side_norm, front_raw, side_raw, front_h, side_h)``.
    The front mask is fetched first, then the side mask. If either download
    raises ``ClientError``, an error line is printed and six NaNs are
    returned without computing anything.
    """
    all_nan = (np.nan,) * 6  # result used when a mask cannot be fetched

    # Both views share the same normalized ID and differ only by key prefix.
    pid = normalize_photo_id(photo_id, pad_len)
    front_key = FRONT_MASK_PREFIX + pid + ".png"
    side_key = SIDE_MASK_PREFIX + pid + ".png"

    print(f"  Processing photo_id={photo_id} -> pid={pid}")

    try:
        front_mask = load_mask_bool_from_s3(MASK_BUCKET, front_key)
    except ClientError as e:
        print(f"    ERROR: missing front mask for pid={pid} | {e}")
        return all_nan

    try:
        side_mask = load_mask_bool_from_s3(MASK_BUCKET, side_key)
    except ClientError as e:
        print(f"    ERROR: missing side mask for pid={pid} | {e}")
        return all_nan

    front_norm, front_raw, front_h = normalized_silhouette_area(front_mask)
    side_norm, side_raw, side_h = normalized_silhouette_area(side_mask)

    # One log line with every computed value for this record.
    print(
        f"    front_raw_area={front_raw} | front_h={front_h} | front_norm={front_norm} "
        f"|| side_raw_area={side_raw} | side_h={side_h} | side_norm={side_norm}"
    )

    return front_norm, side_norm, front_raw, side_raw, front_h, side_h

# -----------------------------------------
# Run across the dataframe and add columns
# -----------------------------------------

print("Starting silhouette area computation for all records...")

# One accumulator list per value returned by compute_areas_for_photo_id,
# kept in the same order as that function's return tuple.
front_norms, side_norms, front_raws, side_raws, front_hs, side_hs = (
    [] for _ in range(6)
)
accumulators = (front_norms, side_norms, front_raws, side_raws, front_hs, side_hs)

n = len(data)
print(f"Total rows to process: {n}")

for i, photo_id in enumerate(data[PHOTO_ID_COL].tolist(), start=1):
    print(f"\nRecord {i}/{n}")

    # Route each returned value into its matching accumulator.
    for sink, value in zip(accumulators, compute_areas_for_photo_id(photo_id)):
        sink.append(value)

    if not i % 50:
        print(f"\nProcessed {i}/{n} records so far...")

# Only the two normalized areas become dataframe columns; the raw areas and
# heights remain available in their lists.
print("\nAttaching new feature columns to dataframe...")
data[FRONT_AREA_COL] = front_norms
data[SIDE_AREA_COL] = side_norms

Creating S3 client...
Preparing photo_id normalization...
Using zero-pad length = 32
Starting silhouette area computation for all records...
Total rows to process: 6134

Record 1/6134
  Processing photo_id=6ab1d061f51c6079633aeceed2faeb0b -> pid=6ab1d061f51c6079633aeceed2faeb0b
    Downloading mask: s3://amazon-bodym/train/mask/6ab1d061f51c6079633aeceed2faeb0b.png
    Downloading mask: s3://amazon-bodym/train/mask_left/6ab1d061f51c6079633aeceed2faeb0b.png
    front_raw_area=121535.0 | front_h=835.0 | front_norm=0.1743124529384345 || side_raw_area=93270.0 | side_h=829.0 | side_norm=0.13571658268351278

Record 2/6134
  Processing photo_id=e94e2e05fb8b099955bbc4fa5ce81e22 -> pid=e94e2e05fb8b099955bbc4fa5ce81e22
    Downloading mask: s3://amazon-bodym/train/mask/e94e2e05fb8b099955bbc4fa5ce81e22.png
    Downloading mask: s3://amazon-bodym/train/mask_left/e94e2e05fb8b099955bbc4fa5ce81e22.png
    front_raw_area=124875.0 | front_h=852.0 | front_norm=0.17202687958738347 || side_raw_area=84668.0

In [4]:
# Preview the first five rows, now including the two silhouette-area columns.
data.head(n=5)

Unnamed: 0,photo_id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,shoulder-breadth,shoulder-to-crotch,thigh,waist,wrist,gender,height_cm,weight_kg,front_silhouette_area_norm,side_silhouette_area_norm
0,6ab1d061f51c6079633aeceed2faeb0b,6.8e-05,0.108145,-0.138813,0.633156,0.346266,-0.046055,0.016021,-0.058632,0.097968,...,35.362858,65.993683,54.459591,88.813789,16.764332,female,170.5,72.0,0.174312,0.135717
1,e94e2e05fb8b099955bbc4fa5ce81e22,0.020843,0.026005,-0.093442,0.736929,0.240569,0.089982,-0.112391,0.000435,-0.07611,...,37.25676,65.861588,52.773052,89.176338,15.690955,male,178.3,71.8,0.172027,0.102244
2,ba6951a4f37fc9302243370e927a02e2,0.014542,-0.071332,-0.154407,0.577781,0.196485,-0.125341,-0.056713,-0.027295,0.094879,...,37.503147,66.042679,57.059261,82.201988,16.686253,male,176.25,76.5,0.168356,0.114739
3,947d16539d4702427aa74f737329ffb9,0.041775,0.075746,-0.128497,0.48501,0.120409,0.011227,0.017852,-0.089796,-0.011273,...,34.084633,60.41333,65.0,102.323845,17.693762,female,152.1,88.9,0.226599,0.153627
4,9326695bf62926ec22690f576a633bba,0.004397,0.05859,-0.154224,0.52814,0.290956,-0.108486,-0.021441,-0.099909,0.08077,...,38.086231,68.400543,57.172279,107.378578,16.594791,male,171.5,88.4,0.183343,0.114689


In [5]:
import boto3  # S3 client
from io import StringIO  # in-memory text buffer

print("Preparing to upload updated dataset to S3...")  # track process

s3_out_path = "s3://ai-bmi-predictor-v2/data/eff_training_v3.csv"  # target S3 path
print("Target:", s3_out_path)  # show target

# ---- parse s3://bucket/key ----
# Split once on the scheme separator and once on the first "/". This only
# strips the scheme at the start of the path (str.replace would remove
# "s3://" anywhere in it) and lets a malformed path fail with a clear message
# instead of an IndexError.
scheme, _, bucket_and_key = s3_out_path.partition("://")
out_bucket, _, out_key = bucket_and_key.partition("/")
if scheme != "s3" or not out_bucket or not out_key:
    raise ValueError(f"Expected an s3://bucket/key path, got: {s3_out_path!r}")
print(f"Parsed -> bucket={out_bucket}, key={out_key}")  # confirm parsing

# ---- write CSV to memory ----
# The whole table is serialized in memory (no temp file) and the dataframe
# index is not written.
csv_buffer = StringIO()
print("Serializing dataframe to CSV (in-memory)...")  # track process
data.to_csv(csv_buffer, index=False)
csv_body = csv_buffer.getvalue()
print(f"CSV size (chars): {len(csv_body):,}")  # print rough size

# ---- upload to S3 ----
s3 = boto3.client("s3")  # create S3 client
print("Uploading to S3...")  # track process
s3.put_object(
    Bucket=out_bucket,                 # destination bucket
    Key=out_key,                       # destination key
    Body=csv_body.encode("utf-8"),     # CSV text encoded to bytes
    ContentType="text/csv"             # content type stored with the object
)

print("Upload complete ✅")  # done
print(f"Saved to: {s3_out_path}")  # confirm final path


Preparing to upload updated dataset to S3...
Target: s3://ai-bmi-predictor-v2/data/eff_training_v3.csv
Parsed -> bucket=ai-bmi-predictor-v2, key=data/eff_training_v3.csv
Serializing dataframe to CSV (in-memory)...
CSV size (chars): 372,740,797
Uploading to S3...
Upload complete ✅
Saved to: s3://ai-bmi-predictor-v2/data/eff_training_v3.csv
