1. load the dataset

In [1]:
import boto3
import pandas as pd

bucket = "ai-bmi-predictor"
key = "data/eff_training.csv"

s3 = boto3.client("s3")
obj = s3.get_object(Bucket=bucket, Key=key)

data = pd.read_csv(obj["Body"])

data.head()


Unnamed: 0,photo_id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,hip,leg-length,shoulder-breadth,shoulder-to-crotch,thigh,waist,wrist,gender,height_cm,weight_kg
0,6ab1d061f51c6079633aeceed2faeb0b,6.8e-05,0.108145,-0.138813,0.633156,0.346266,-0.046055,0.016021,-0.058632,0.097968,...,105.3339,76.817467,35.362858,65.993683,54.459591,88.813789,16.764332,female,170.5,72.0
1,e94e2e05fb8b099955bbc4fa5ce81e22,0.020843,0.026005,-0.093442,0.736929,0.240569,0.089982,-0.112391,0.000435,-0.07611,...,101.478989,85.154358,37.25676,65.861588,52.773052,89.176338,15.690955,male,178.3,71.8
2,ba6951a4f37fc9302243370e927a02e2,0.014542,-0.071332,-0.154407,0.577781,0.196485,-0.125341,-0.056713,-0.027295,0.094879,...,97.488243,81.410393,37.503147,66.042679,57.059261,82.201988,16.686253,male,176.25,76.5
3,947d16539d4702427aa74f737329ffb9,0.041775,0.075746,-0.128497,0.48501,0.120409,0.011227,0.017852,-0.089796,-0.011273,...,120.586845,69.361534,34.084633,60.41333,65.0,102.323845,17.693762,female,152.1,88.9
4,9326695bf62926ec22690f576a633bba,0.004397,0.05859,-0.154224,0.52814,0.290956,-0.108486,-0.021441,-0.099909,0.08077,...,110.543564,77.160583,38.086231,68.400543,57.172279,107.378578,16.594791,male,171.5,88.4


2 calculate the body volume

In [2]:
# =========================
# Volume feature engineering
# =========================

import boto3  # AWS SDK for Python (S3 access)
import numpy as np  # fast array/mask operations
import pandas as pd  # dataframe ops
from PIL import Image  # read PNG images
from io import BytesIO  # convert S3 bytes -> file-like object
from botocore.exceptions import ClientError  # catch S3 missing-key errors

# -------------------------
# CONFIG (edit if needed)
# -------------------------

MASK_BUCKET = "amazon-bodym"            # S3 bucket that contains mask images
FRONT_MASK_PREFIX = "train/mask/"      # front masks folder (from your 1st screenshot)
SIDE_MASK_PREFIX = "train/mask_left/"  # side masks folder (from your 2nd screenshot)
PHOTO_ID_COL = "photo_id"              # column in your CSV that maps to mask filenames
OUTPUT_COL = "volume"                  # name for the new feature column

# -------------------------
# S3 client (reused)
# -------------------------

print("Creating S3 client...")  # track progress
s3 = boto3.client("s3")  # create an S3 client

# -----------------------------------------
# Helper: standardize photo_id -> filename
# -----------------------------------------

print("Inferring photo_id padding length...")  # track progress
photo_id_series = data[PHOTO_ID_COL].astype(str).fillna("")  # ensure string + no NaN
photo_id_series = photo_id_series.str.strip()  # remove whitespace
photo_id_series = photo_id_series.str.replace(".png", "", regex=False)  # drop extension if present
photo_id_series = photo_id_series.str.split(".", n=1).str[0]  # drop trailing ".0" etc (common if numeric CSV)
pad_len = int(photo_id_series.str.len().max()) if len(photo_id_series) else 0  # infer max length
pad_len = max(pad_len, 4)  # default to 4 (matches your example like "0021")
print(f"Using zero-pad length = {pad_len}")  # show chosen padding

def normalize_photo_id(photo_id, pad_length):  # define normalizer
    pid = str(photo_id).strip()  # convert to string and trim spaces
    pid = pid.replace(".png", "")  # remove .png if included
    pid = pid.split(".", 1)[0]  # remove any trailing decimals (e.g., "21.0" -> "21")
    if pid.isdigit():  # only pad if it's purely digits
        pid = pid.zfill(pad_length)  # left-pad with zeros
    return pid  # return normalized id like "0021"

# -----------------------------------------
# Helper: read mask PNG from S3 into boolean
# -----------------------------------------

def load_mask_bool_from_s3(bucket, key):  # function to load a mask file
    print(f"    Downloading: s3://{bucket}/{key}")  # track downloads
    obj = s3.get_object(Bucket=bucket, Key=key)  # fetch object bytes from S3
    img = Image.open(BytesIO(obj["Body"].read()))  # open image from bytes
    img = img.convert("L")  # convert to grayscale
    arr = np.array(img)  # convert to numpy array
    mask_bool = arr > 0  # treat any non-zero pixel as "body"
    return mask_bool  # return boolean mask

# -----------------------------------------
# Helper: compute mean width across rows
# -----------------------------------------

def mean_row_width(mask_bool):  # compute average body width across horizontal rows
    if mask_bool.size == 0:  # guard: empty image
        return np.nan  # cannot compute
    row_has_body = mask_bool.any(axis=1)  # rows where body exists
    if not row_has_body.any():  # guard: no body pixels
        return np.nan  # cannot compute
    first = mask_bool.argmax(axis=1)  # first True index per row (0 if none)
    last = mask_bool.shape[1] - 1 - np.flip(mask_bool, axis=1).argmax(axis=1)  # last True index per row
    widths = (last - first + 1).astype(float)  # width per row (in pixels)
    widths[~row_has_body] = np.nan  # ignore rows with no body
    return float(np.nanmean(widths))  # average width across valid rows

# -----------------------------------------
# Helper: compute height from front mask
# -----------------------------------------

def body_height(mask_bool):  # compute vertical extent (top-to-bottom)
    if mask_bool.size == 0:  # guard: empty image
        return np.nan  # cannot compute
    ys, xs = np.where(mask_bool)  # get coordinates of body pixels
    if ys.size == 0:  # guard: no body pixels
        return np.nan  # cannot compute
    h = (ys.max() - ys.min() + 1).astype(float)  # height in pixels
    return float(h)  # return height

# -----------------------------------------
# Per-record feature computation
# -----------------------------------------

def compute_volume_for_photo_id(photo_id):  # compute volume + debug parts for one record
    pid = normalize_photo_id(photo_id, pad_len)  # normalize id to match PNG filename
    front_key = f"{FRONT_MASK_PREFIX}{pid}.png"  # build front mask key
    side_key = f"{SIDE_MASK_PREFIX}{pid}.png"  # build side mask key

    try:
        front_mask = load_mask_bool_from_s3(MASK_BUCKET, front_key)  # load front mask
    except ClientError as e:  # handle missing front file
        print(f"    ERROR: missing front mask for {pid}: {e}")  # print error
        return np.nan, np.nan, np.nan, np.nan  # return NaNs

    try:
        side_mask = load_mask_bool_from_s3(MASK_BUCKET, side_key)  # load side mask
    except ClientError as e:  # handle missing side file
        print(f"    ERROR: missing side mask for {pid}: {e}")  # print error
        return np.nan, np.nan, np.nan, np.nan  # return NaNs

    front_w = mean_row_width(front_mask)  # front width (body looks wide from front)
    side_w = mean_row_width(side_mask)  # side width (body looks thick from side)
    avg_w = (front_w + side_w) / 2.0  # average width as instructed
    h = body_height(front_mask)  # height from front mask (vertical extent)

    raw_volume = avg_w * h  # volume â‰ˆ average width Ã— height
    norm_volume = raw_volume / (h ** 2) if (h is not None and not np.isnan(h) and h > 0) else np.nan  # normalize

    return norm_volume, front_w, side_w, h  # return volume + components for debugging

# -----------------------------------------
# Run across the dataframe and add column(s)
# -----------------------------------------

print("Starting volume computation for all records...")  # track progress
volumes = []  # store normalized volumes
front_ws = []  # store front widths (debug)
side_ws = []  # store side widths (debug)
heights = []  # store heights (debug)

n = len(data)  # total rows
print(f"Total rows to process: {n}")  # show total

for i, photo_id in enumerate(data[PHOTO_ID_COL].tolist(), start=1):  # iterate through photo_ids
    print(f"\nRecord {i}/{n} | photo_id={photo_id}")  # show per-record progress
    v, fw, sw, h = compute_volume_for_photo_id(photo_id)  # compute features
    print(f"    front_width={fw} | side_width={sw} | height={h} | volume={v}")  # print computed values
    volumes.append(v)  # append volume
    front_ws.append(fw)  # append front width
    side_ws.append(sw)  # append side width
    heights.append(h)  # append height

    if i % 50 == 0:  # every 50 rows
        print(f"\nProcessed {i}/{n} records so far...")  # periodic progress update

print("\nAttaching new feature columns to dataframe...")  # track progress
data[OUTPUT_COL] = volumes  # add normalized volume feature
data["front_width"] = front_ws  # optional debug column
data["side_width"] = side_ws  # optional debug column
data["mask_height"] = heights  # optional debug column

print("Done.")  # final status
print("Preview of updated dataframe:")  # show preview message
print(data[[PHOTO_ID_COL, OUTPUT_COL, "front_width", "side_width", "mask_height"]].head())  # preview new columns


Creating S3 client...
Inferring photo_id padding length...
Using zero-pad length = 32
Starting volume computation for all records...
Total rows to process: 6134

Record 1/6134 | photo_id=6ab1d061f51c6079633aeceed2faeb0b
    Downloading: s3://amazon-bodym/train/mask/6ab1d061f51c6079633aeceed2faeb0b.png
    Downloading: s3://amazon-bodym/train/mask_left/6ab1d061f51c6079633aeceed2faeb0b.png
    front_width=199.62155688622755 | side_width=113.00723763570566 | height=835.0 | volume=0.18720287097121752

Record 2/6134 | photo_id=e94e2e05fb8b099955bbc4fa5ce81e22
    Downloading: s3://amazon-bodym/train/mask/e94e2e05fb8b099955bbc4fa5ce81e22.png
    Downloading: s3://amazon-bodym/train/mask_left/e94e2e05fb8b099955bbc4fa5ce81e22.png
    front_width=203.3661971830986 | side_width=93.07472527472528 | height=852.0 | volume=0.17396767749872294

Record 3/6134 | photo_id=ba6951a4f37fc9302243370e927a02e2
    Downloading: s3://amazon-bodym/train/mask/ba6951a4f37fc9302243370e927a02e2.png
    Downloading: 

In [3]:
data.head()

Unnamed: 0,photo_id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,thigh,waist,wrist,gender,height_cm,weight_kg,volume,front_width,side_width,mask_height
0,6ab1d061f51c6079633aeceed2faeb0b,6.8e-05,0.108145,-0.138813,0.633156,0.346266,-0.046055,0.016021,-0.058632,0.097968,...,54.459591,88.813789,16.764332,female,170.5,72.0,0.187203,199.621557,113.007238,835.0
1,e94e2e05fb8b099955bbc4fa5ce81e22,0.020843,0.026005,-0.093442,0.736929,0.240569,0.089982,-0.112391,0.000435,-0.07611,...,52.773052,89.176338,15.690955,male,178.3,71.8,0.173968,203.366197,93.074725,852.0
2,ba6951a4f37fc9302243370e927a02e2,0.014542,-0.071332,-0.154407,0.577781,0.196485,-0.125341,-0.056713,-0.027295,0.094879,...,57.059261,82.201988,16.686253,male,176.25,76.5,0.17431,197.497773,115.563284,898.0
3,947d16539d4702427aa74f737329ffb9,0.041775,0.075746,-0.128497,0.48501,0.120409,0.011227,0.017852,-0.089796,-0.011273,...,65.0,102.323845,17.693762,female,152.1,88.9,0.237156,280.037037,142.574353,891.0
4,9326695bf62926ec22690f576a633bba,0.004397,0.05859,-0.154224,0.52814,0.290956,-0.108486,-0.021441,-0.099909,0.08077,...,57.172279,107.378578,16.594791,male,171.5,88.4,0.198802,206.658344,95.91866,761.0


In [5]:
data.shape

(6134, 5142)

In [6]:
# Drop the intermediate/debug columns (ignore if they don't exist)
cols_to_drop = ["front_width", "side_width", "mask_height"]  # columns to remove
print("Dropping columns:", cols_to_drop)  # track process
data.drop(columns=cols_to_drop, inplace=True, errors="ignore")  # drop safely

Dropping columns: ['front_width', 'side_width', 'mask_height']


In [7]:
data.shape

(6134, 5139)

In [8]:
data.head()

Unnamed: 0,photo_id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,leg-length,shoulder-breadth,shoulder-to-crotch,thigh,waist,wrist,gender,height_cm,weight_kg,volume
0,6ab1d061f51c6079633aeceed2faeb0b,6.8e-05,0.108145,-0.138813,0.633156,0.346266,-0.046055,0.016021,-0.058632,0.097968,...,76.817467,35.362858,65.993683,54.459591,88.813789,16.764332,female,170.5,72.0,0.187203
1,e94e2e05fb8b099955bbc4fa5ce81e22,0.020843,0.026005,-0.093442,0.736929,0.240569,0.089982,-0.112391,0.000435,-0.07611,...,85.154358,37.25676,65.861588,52.773052,89.176338,15.690955,male,178.3,71.8,0.173968
2,ba6951a4f37fc9302243370e927a02e2,0.014542,-0.071332,-0.154407,0.577781,0.196485,-0.125341,-0.056713,-0.027295,0.094879,...,81.410393,37.503147,66.042679,57.059261,82.201988,16.686253,male,176.25,76.5,0.17431
3,947d16539d4702427aa74f737329ffb9,0.041775,0.075746,-0.128497,0.48501,0.120409,0.011227,0.017852,-0.089796,-0.011273,...,69.361534,34.084633,60.41333,65.0,102.323845,17.693762,female,152.1,88.9,0.237156
4,9326695bf62926ec22690f576a633bba,0.004397,0.05859,-0.154224,0.52814,0.290956,-0.108486,-0.021441,-0.099909,0.08077,...,77.160583,38.086231,68.400543,57.172279,107.378578,16.594791,male,171.5,88.4,0.198802


In [9]:
import boto3  # S3 client
from io import StringIO  # in-memory text buffer

print("Preparing to upload updated dataset to S3...")  # track process

s3_out_path = "s3://ai-bmi-predictor/data/eff_training_v2.csv"  # target S3 path
print("Target:", s3_out_path)  # show target

# ---- parse s3://bucket/key ----
out_bucket = s3_out_path.replace("s3://", "").split("/", 1)[0]  # extract bucket
out_key = s3_out_path.replace("s3://", "").split("/", 1)[1]     # extract key
print(f"Parsed -> bucket={out_bucket}, key={out_key}")  # confirm parsing

# ---- write CSV to memory ----
csv_buffer = StringIO()  # create in-memory buffer
print("Serializing dataframe to CSV (in-memory)...")  # track process
data.to_csv(csv_buffer, index=False)  # write dataframe as CSV text
csv_body = csv_buffer.getvalue()  # get CSV string content
print(f"CSV size (chars): {len(csv_body):,}")  # print rough size

# ---- upload to S3 ----
s3 = boto3.client("s3")  # create S3 client
print("Uploading to S3...")  # track process
s3.put_object(
    Bucket=out_bucket,                 # destination bucket
    Key=out_key,                       # destination key
    Body=csv_body.encode("utf-8"),     # file bytes
    ContentType="text/csv"             # content type
)  # upload

print("Upload complete âœ…")  # done
print(f"Saved to: {s3_out_path}")  # confirm final path


Preparing to upload updated dataset to S3...
Target: s3://ai-bmi-predictor/data/eff_training_v2.csv
Parsed -> bucket=ai-bmi-predictor, key=data/eff_training_v2.csv
Serializing dataframe to CSV (in-memory)...
CSV size (chars): 372,619,770
Uploading to S3...
Upload complete âœ…
Saved to: s3://ai-bmi-predictor/data/eff_training_v2.csv
