In [1]:
import boto3
import pandas as pd

# Location of the feature table (CSV) in S3.
bucket = "ai-bmi-predictor"
key = "test-data/eff_testingA.csv"

# Read the CSV from the S3 response body directly into a dataframe.
s3 = boto3.client("s3")
data = pd.read_csv(s3.get_object(Bucket=bucket, Key=key)["Body"])

# Preview the first rows.
data.head()


Unnamed: 0,photo_id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,hip,leg-length,shoulder-breadth,shoulder-to-crotch,thigh,waist,wrist,gender,height_cm,weight_kg
0,e5ae8fe5bbdf611a1e8d06e66e849bdf,0.073159,0.085775,-0.133776,0.881202,0.214236,0.016104,-0.180302,-0.100713,-0.117249,...,106.77469,83.279744,39.922305,70.005128,55.945992,98.25039,20.187082,male,180.0,94.6
1,605a5fd09058c48156b0ef518b63b2de,0.092031,-0.066016,-0.145132,0.687441,0.186508,-0.075221,-0.093846,-0.03584,0.033903,...,102.481633,84.876529,39.974203,73.591637,55.397032,88.003618,17.715785,male,188.9,86.75
2,909c9277309e13ee014e347603aba620,0.057046,-0.051366,-0.148253,0.675916,0.209973,-0.073485,-0.072783,-0.059395,0.00837,...,99.342301,82.275874,36.059983,66.440526,53.742692,82.100598,17.086464,male,179.7,73.85
3,bef6a68bc8dd475c124f6de2413385d3,-0.018792,0.016435,-0.148091,0.464433,0.242849,-0.106556,0.001489,-0.083478,0.096048,...,101.770144,76.081842,34.071748,62.218026,52.396573,83.999124,16.299751,female,166.95,69.05
4,6d7ed4bc4a17546447efed0ca6e2ff11,0.084419,0.065945,-0.153379,0.635377,0.285274,-0.056372,-0.139008,-0.120711,-0.002466,...,94.707063,81.328892,36.834735,64.426273,49.895157,86.020117,16.531431,male,173.2,65.55


In [2]:
# =========================
# Volume feature engineering
# =========================

import boto3  # AWS SDK for Python (S3 access)
import numpy as np  # fast array/mask operations
import pandas as pd  # dataframe ops
from PIL import Image  # read PNG images
from io import BytesIO  # convert S3 bytes -> file-like object
from botocore.exceptions import ClientError  # catch S3 missing-key errors

# -------------------------
# CONFIG (edit if needed)
# -------------------------

MASK_BUCKET = "amazon-bodym"            # S3 bucket that contains the mask images
FRONT_MASK_PREFIX = "testA/mask/"      # key prefix of the front-view masks
SIDE_MASK_PREFIX = "testA/mask_left/"  # key prefix of the side-view masks
PHOTO_ID_COL = "photo_id"              # dataframe column that maps to mask filenames
OUTPUT_COL = "volume"                  # name of the new feature column

# -------------------------
# S3 client (reused)
# -------------------------

print("Creating S3 client...")  # track progress
s3 = boto3.client("s3")  # module-level client used by load_mask_bool_from_s3

# -----------------------------------------
# Helper: standardize photo_id -> filename
# -----------------------------------------

print("Inferring photo_id padding length...")  # track progress
raw_photo_ids = data[PHOTO_ID_COL]
# Cast to string, then blank out the positions that were missing in the raw
# column. Calling fillna("") *after* astype(str) does nothing, because the cast
# has already turned NaN into the literal text "nan".
photo_id_series = raw_photo_ids.astype(str).where(raw_photo_ids.notna(), "")
photo_id_series = photo_id_series.str.strip()  # remove surrounding whitespace
photo_id_series = photo_id_series.str.replace(".png", "", regex=False)  # drop extension if present
photo_id_series = photo_id_series.str.split(".", n=1).str[0]  # keep text before the first "." ("21.0" -> "21")
# Pad length = longest cleaned id, but never less than 4 characters.
pad_len = int(photo_id_series.str.len().max()) if len(photo_id_series) else 0
pad_len = max(pad_len, 4)
print(f"Using zero-pad length = {pad_len}")  # show chosen padding

def normalize_photo_id(photo_id, pad_length):
    """Turn a raw photo id into the stem used for its mask filename.

    The value is converted to a string and stripped of surrounding whitespace,
    every ".png" occurrence is removed, and everything from the first remaining
    "." onward is discarded (so "21.0" becomes "21"). An id consisting solely of
    digits is then left-padded with zeros to ``pad_length`` characters; any
    other id is returned as-is.
    """
    stem, _, _ = str(photo_id).strip().replace(".png", "").partition(".")
    return stem.zfill(pad_length) if stem.isdigit() else stem

# -----------------------------------------
# Helper: read mask PNG from S3 into boolean
# -----------------------------------------

def load_mask_bool_from_s3(bucket, key):
    """Download a mask image from S3 and return it as a boolean array.

    The object is fetched with the module-level ``s3`` client, opened with PIL,
    converted to grayscale ("L" mode) and thresholded: an element is True
    wherever the grayscale value is greater than zero.
    """
    print(f"    Downloading: s3://{bucket}/{key}")  # one progress line per file
    payload = s3.get_object(Bucket=bucket, Key=key)["Body"].read()
    grayscale = Image.open(BytesIO(payload)).convert("L")
    return np.array(grayscale) > 0  # any non-zero pixel counts as "body"

# -----------------------------------------
# Helper: compute mean width across rows
# -----------------------------------------

def mean_row_width(mask_bool):
    """Average horizontal extent of the mask over the rows that contain body.

    For each row the width is the inclusive distance from its first True pixel
    to its last True pixel (any gap in between is counted). Rows without a True
    pixel are excluded from the average. Returns NaN when the array is empty or
    contains no True pixel at all.
    """
    if mask_bool.size == 0:
        return np.nan
    occupied = mask_bool.any(axis=1)
    if not occupied.any():
        return np.nan
    n_cols = mask_bool.shape[1]
    # argmax over booleans gives the index of the first True (0 for empty rows).
    leftmost = mask_bool.argmax(axis=1)
    # Same trick on the column-reversed mask locates the last True.
    rightmost = n_cols - 1 - mask_bool[:, ::-1].argmax(axis=1)
    spans = np.where(occupied, (rightmost - leftmost + 1).astype(float), np.nan)
    return float(np.nanmean(spans))

# -----------------------------------------
# Helper: compute height from front mask
# -----------------------------------------

def body_height(mask_bool):
    """Vertical extent of the mask in pixels.

    Counts the rows from the topmost to the bottommost row that holds a True
    pixel, inclusive. Returns NaN when the array is empty or contains no True
    pixel.
    """
    if mask_bool.size == 0:
        return np.nan
    row_idx, _ = np.nonzero(mask_bool)  # row coordinate of every True pixel
    if row_idx.size == 0:
        return np.nan
    return float(row_idx.max() - row_idx.min() + 1)

# -----------------------------------------
# Per-record feature computation
# -----------------------------------------

def compute_volume_for_photo_id(photo_id):
    """Compute the normalized "volume" feature for one record.

    The id is normalized with ``normalize_photo_id`` (using the module-level
    ``pad_len``) and used to build the front and side mask keys under
    ``MASK_BUCKET``. From the two masks:

    * ``front_w`` / ``side_w`` -- mean row width of the front / side mask
    * ``h``                    -- vertical extent of the front mask
    * volume                   -- ((front_w + side_w) / 2 * h) / h**2

    Returns the tuple ``(norm_volume, front_w, side_w, h)``. If loading either
    mask raises ``ClientError``, the error is printed and four NaNs are
    returned; the front mask is attempted first.
    """
    all_missing = (np.nan,) * 4
    pid = normalize_photo_id(photo_id, pad_len)
    front_key = FRONT_MASK_PREFIX + f"{pid}.png"
    side_key = SIDE_MASK_PREFIX + f"{pid}.png"

    try:
        front_mask = load_mask_bool_from_s3(MASK_BUCKET, front_key)
    except ClientError as e:
        print(f"    ERROR: missing front mask for {pid}: {e}")
        return all_missing

    try:
        side_mask = load_mask_bool_from_s3(MASK_BUCKET, side_key)
    except ClientError as e:
        print(f"    ERROR: missing side mask for {pid}: {e}")
        return all_missing

    front_w = mean_row_width(front_mask)
    side_w = mean_row_width(side_mask)
    h = body_height(front_mask)
    avg_w = (front_w + side_w) / 2.0

    # volume ~ average width x height, then divided by height squared
    raw_volume = avg_w * h
    if h is None or np.isnan(h) or h <= 0:
        norm_volume = np.nan  # height unusable, so the feature is undefined
    else:
        norm_volume = raw_volume / (h ** 2)

    return norm_volume, front_w, side_w, h

# -----------------------------------------
# Run across the dataframe and add column(s)
# -----------------------------------------

print("Starting volume computation for all records...")

n = len(data)  # total rows
print(f"Total rows to process: {n}")

# One (volume, front_width, side_width, height) tuple per row, in dataframe order.
results = []
for i, photo_id in enumerate(data[PHOTO_ID_COL].tolist(), start=1):
    print(f"\nRecord {i}/{n} | photo_id={photo_id}")
    v, fw, sw, h = compute_volume_for_photo_id(photo_id)
    print(f"    front_width={fw} | side_width={sw} | height={h} | volume={v}")
    results.append((v, fw, sw, h))

    if i % 50 == 0:  # periodic summary line every 50 rows
        print(f"\nProcessed {i}/{n} records so far...")

# Split the per-row tuples into one list per feature.
volumes = [row[0] for row in results]
front_ws = [row[1] for row in results]
side_ws = [row[2] for row in results]
heights = [row[3] for row in results]

print("\nAttaching new feature columns to dataframe...")
new_columns = (
    (OUTPUT_COL, volumes),        # normalized volume feature
    ("front_width", front_ws),    # debug column
    ("side_width", side_ws),      # debug column
    ("mask_height", heights),     # debug column
)
for column_name, column_values in new_columns:
    data[column_name] = column_values

print("Done.")
print("Preview of updated dataframe:")
preview_cols = [PHOTO_ID_COL, OUTPUT_COL, "front_width", "side_width", "mask_height"]
print(data[preview_cols].head())


Creating S3 client...
Inferring photo_id padding length...
Using zero-pad length = 32
Starting volume computation for all records...
Total rows to process: 1684

Record 1/1684 | photo_id=e5ae8fe5bbdf611a1e8d06e66e849bdf
    Downloading: s3://amazon-bodym/testA/mask/e5ae8fe5bbdf611a1e8d06e66e849bdf.png
    Downloading: s3://amazon-bodym/testA/mask_left/e5ae8fe5bbdf611a1e8d06e66e849bdf.png
    front_width=233.10262529832934 | side_width=108.52017448200654 | height=838.0 | volume=0.2038322194393412

Record 2/1684 | photo_id=605a5fd09058c48156b0ef518b63b2de
    Downloading: s3://amazon-bodym/testA/mask/605a5fd09058c48156b0ef518b63b2de.png
    Downloading: s3://amazon-bodym/testA/mask_left/605a5fd09058c48156b0ef518b63b2de.png
    front_width=163.38773006134969 | side_width=121.27625570776256 | height=815.0 | volume=0.17464048206694002

Record 3/1684 | photo_id=909c9277309e13ee014e347603aba620
    Downloading: s3://amazon-bodym/testA/mask/909c9277309e13ee014e347603aba620.png
    Downloading:

In [3]:
# Remove the intermediate/debug columns in place; labels that are not present
# are skipped rather than raising.
cols_to_drop = ["front_width", "side_width", "mask_height"]
print("Dropping columns:", cols_to_drop)
data.drop(labels=cols_to_drop, axis="columns", errors="ignore", inplace=True)

Dropping columns: ['front_width', 'side_width', 'mask_height']


In [4]:
# Preview the dataframe after the debug columns were dropped; the new "volume"
# column should be the last one.
data.head()

Unnamed: 0,photo_id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,leg-length,shoulder-breadth,shoulder-to-crotch,thigh,waist,wrist,gender,height_cm,weight_kg,volume
0,e5ae8fe5bbdf611a1e8d06e66e849bdf,0.073159,0.085775,-0.133776,0.881202,0.214236,0.016104,-0.180302,-0.100713,-0.117249,...,83.279744,39.922305,70.005128,55.945992,98.25039,20.187082,male,180.0,94.6,0.203832
1,605a5fd09058c48156b0ef518b63b2de,0.092031,-0.066016,-0.145132,0.687441,0.186508,-0.075221,-0.093846,-0.03584,0.033903,...,84.876529,39.974203,73.591637,55.397032,88.003618,17.715785,male,188.9,86.75,0.17464
2,909c9277309e13ee014e347603aba620,0.057046,-0.051366,-0.148253,0.675916,0.209973,-0.073485,-0.072783,-0.059395,0.00837,...,82.275874,36.059983,66.440526,53.742692,82.100598,17.086464,male,179.7,73.85,0.16868
3,bef6a68bc8dd475c124f6de2413385d3,-0.018792,0.016435,-0.148091,0.464433,0.242849,-0.106556,0.001489,-0.083478,0.096048,...,76.081842,34.071748,62.218026,52.396573,83.999124,16.299751,female,166.95,69.05,0.181684
4,6d7ed4bc4a17546447efed0ca6e2ff11,0.084419,0.065945,-0.153379,0.635377,0.285274,-0.056372,-0.139008,-0.120711,-0.002466,...,81.328892,36.834735,64.426273,49.895157,86.020117,16.531431,male,173.2,65.55,0.186667


In [5]:
import boto3  # S3 client
from io import StringIO  # in-memory text buffer

print("Preparing to upload updated dataset to S3...")

s3_out_path = "s3://ai-bmi-predictor/test-data/eff_testingA_v2.csv"  # destination URI
print("Target:", s3_out_path)

# ---- split the URI into bucket and key at the first "/" after the scheme ----
out_bucket, out_key = s3_out_path.replace("s3://", "").split("/", 1)
print(f"Parsed -> bucket={out_bucket}, key={out_key}")

# ---- render the dataframe as CSV text in memory (index not written) ----
csv_buffer = StringIO()
print("Serializing dataframe to CSV (in-memory)...")
data.to_csv(csv_buffer, index=False)
csv_body = csv_buffer.getvalue()
print(f"CSV size (chars): {len(csv_body):,}")

# ---- upload the UTF-8 encoded CSV as a single object ----
s3 = boto3.client("s3")
print("Uploading to S3...")
upload_args = {
    "Bucket": out_bucket,
    "Key": out_key,
    "Body": csv_body.encode("utf-8"),
    "ContentType": "text/csv",
}
s3.put_object(**upload_args)

print("Upload complete âœ…")
print(f"Saved to: {s3_out_path}")


Preparing to upload updated dataset to S3...
Target: s3://ai-bmi-predictor/test-data/eff_testingA_v2.csv
Parsed -> bucket=ai-bmi-predictor, key=test-data/eff_testingA_v2.csv
Serializing dataframe to CSV (in-memory)...
CSV size (chars): 102,270,627
Uploading to S3...
Upload complete âœ…
Saved to: s3://ai-bmi-predictor/test-data/eff_testingA_v2.csv
