1. create dataframe using extracted features of front masks
    * 1.1. load the pickle file of extracted features of front masks
    * 1.2. inspect one element to understand the shape
    * 1.3. create 2D array
    * 1.4. create the dataframe

2. create dataframe using extracted features from side masks
    * 2.1. load the pickle file of extracted features of side masks
    * 2.2. inspect one element to understand the shape
    * 2.3. create 2D array
    * 2.4. create the dataframe

3. merge front masks dataframe and side masks dataframe

4. load body_measurements dataset

5. load mapped subject_id and photo_id dataset

6. map body measurements with photo_id

7. merge body measurements and feature extractions (after removing the .png suffix from photo_id in the feature-extraction dataframe)

1. create dataframe using extracted features of front masks

1.1. load the pickle file

In [1]:
import pickle
import boto3
import pandas as pd
import numpy as np

# ------------------------------------------------
# Config
# ------------------------------------------------
# Pickle holding the front-mask features; it is loaded below as a dict
# keyed by photo_id.
s3_path = "s3://ai-bmi-predictor/feature-extraction-data/training-data/mae_large_front_masked_features.pkl"

print("[INFO] Starting process...")
print(f"[INFO] S3 path: {s3_path}")

# ------------------------------------------------
# Parse S3 path
# ------------------------------------------------
# Raise explicitly instead of using `assert`: assertions are removed when
# Python runs with -O, which would let a malformed path through unchecked.
if not s3_path.startswith("s3://"):
    raise ValueError("S3 path must start with 's3://'")
path_no_scheme = s3_path[len("s3://"):]

# partition() never fails to unpack, so a path without a key produces the
# clear error below rather than a "not enough values to unpack" traceback.
bucket, _sep, key = path_no_scheme.partition("/")
if not bucket or not key:
    raise ValueError(f"S3 path must look like 's3://<bucket>/<key>', got: {s3_path!r}")

print(f"[INFO] Bucket: {bucket}")
print(f"[INFO] Key: {key}")

# ------------------------------------------------
# Read pickle directly from S3 into memory
# ------------------------------------------------
print("[INFO] Initializing S3 client...")
s3 = boto3.client("s3")

print("[INFO] Fetching object from S3 (streaming)...")
obj = s3.get_object(Bucket=bucket, Key=key)

print("[INFO] Loading pickle from StreamingBody...")
# SECURITY: unpickling can execute arbitrary code. Only load objects from a
# bucket whose contents you control and trust.
try:
    features_dict = pickle.load(obj["Body"])
finally:
    # Release the underlying HTTP connection even if unpickling fails.
    obj["Body"].close()

print("[INFO] Pickle load complete.")
print(f"[INFO] Type of loaded object: {type(features_dict)}")
print(f"[INFO] Number of photo_ids (dict keys): {len(features_dict):,}")

[INFO] Starting process...
[INFO] S3 path: s3://ai-bmi-predictor/feature-extraction-data/training-data/mae_large_front_masked_features.pkl
[INFO] Bucket: ai-bmi-predictor
[INFO] Key: feature-extraction-data/training-data/mae_large_front_masked_features.pkl
[INFO] Initializing S3 client...
[INFO] Fetching object from S3 (streaming)...
[INFO] Loading pickle from StreamingBody...
[INFO] Pickle load complete.
[INFO] Type of loaded object: <class 'dict'>
[INFO] Number of photo_ids (dict keys): 6,134


1.2. inspect one element to understand the shape

In [2]:
# ------------------------------------------------
# Peek at a single entry to learn the feature layout
# ------------------------------------------------
# Take the first (photo_id, feature) pair from the dict in one step.
first_key, first_raw_value = next(iter(features_dict.items()))
first_value = np.asarray(first_raw_value)

print(f"[DEBUG] Example photo_id: {first_key}")
print(f"[DEBUG] Raw type of feature value: {type(first_raw_value)}")
print(f"[DEBUG] Converted to np.array with shape: {first_value.shape}, ndim: {first_value.ndim}")

# Only 1D vectors and 2D arrays are supported; anything else is rejected.
if first_value.ndim not in (1, 2):
    raise ValueError(f"[ERROR] Unsupported feature value ndim: {first_value.ndim}")

# A 1D vector contributes its length; a 2D array such as (1, N) or (N, 1)
# contributes its total number of elements.
feature_length = first_value.shape[0] if first_value.ndim == 1 else first_value.size

print(f"[INFO] Inferred feature_length: {feature_length}")

[DEBUG] Example photo_id: 6ab1d061f51c6079633aeceed2faeb0b.png
[DEBUG] Raw type of feature value: <class 'numpy.ndarray'>
[DEBUG] Converted to np.array with shape: (1, 1280), ndim: 2
[INFO] Inferred feature_length: 1280


1.3. create 2D array

In [3]:
# ------------------------------------------------
# Turn every dict entry into one row of a 2D feature matrix
# ------------------------------------------------
print("[INFO] Flattening all feature vectors and building feature matrix...")

photo_ids = []
features_list = []

processed = 0
for pid, feat in features_dict.items():
    processed += 1
    arr = np.asarray(feat)

    # Guard clause: only 1D and 2D inputs can become a row vector here.
    if arr.ndim not in (1, 2):
        raise ValueError(f"[ERROR] Feature for photo_id {pid} has unsupported ndim: {arr.ndim}")

    # 1D input is used unchanged; 2D input (e.g. (1, N)) is flattened to (N,).
    vec = arr if arr.ndim == 1 else arr.reshape(-1)

    # Every row must match the length inferred from the first element.
    if vec.shape[0] != feature_length:
        raise ValueError(
            f"[ERROR] Inconsistent feature length for photo_id {pid}: "
            f"expected {feature_length}, got {vec.shape[0]}"
        )

    # The two lists grow in lockstep so row k of the matrix belongs to photo_ids[k].
    photo_ids.append(pid)
    features_list.append(vec)

    # Progress message every 1000 entries.
    if processed % 1000 == 0:
        print(f"[INFO] Processed {processed} feature vectors...")

features_matrix = np.vstack(features_list)
print(f"[INFO] Feature matrix shape: {features_matrix.shape} (rows x cols)")

[INFO] Flattening all feature vectors and building feature matrix...
[INFO] Processed 1000 feature vectors...
[INFO] Processed 2000 feature vectors...
[INFO] Processed 3000 feature vectors...
[INFO] Processed 4000 feature vectors...
[INFO] Processed 5000 feature vectors...
[INFO] Processed 6000 feature vectors...
[INFO] Feature matrix shape: (6134, 1280) (rows x cols)


1.4. create the dataframe

In [4]:
# ------------------------------------------------
# Wrap the feature matrix in a DataFrame (columns f1..fN)
# ------------------------------------------------
num_features = features_matrix.shape[1]
# Column labels are 1-based: f1, f2, ..., fN.
feature_columns = [f"f{col_no}" for col_no in range(1, num_features + 1)]

print("[INFO] Creating DataFrame eff_front_train...")
eff_front_train = pd.DataFrame(features_matrix, columns=feature_columns)
# Put the identifier first so each row starts with its photo_id.
eff_front_train.insert(0, "photo_id", photo_ids)

print("[INFO] Final DataFrame created: eff_front_train")
print(f"[INFO] Final shape (rows x cols): {eff_front_train.shape}")
print("[INFO] First 10 columns:", list(eff_front_train.columns[:10]))

print("[INFO] Preview of eff_front_train:")
# Last expression of the cell, so the notebook renders the preview table.
eff_front_train.head()


[INFO] Creating DataFrame eff_front_train...
[INFO] Final DataFrame created: eff_front_train
[INFO] Final shape (rows x cols): (6134, 1281)
[INFO] First 10 columns: ['photo_id', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9']
[INFO] Preview of eff_front_train:


Unnamed: 0,photo_id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f1271,f1272,f1273,f1274,f1275,f1276,f1277,f1278,f1279,f1280
0,6ab1d061f51c6079633aeceed2faeb0b.png,-0.793212,0.273933,0.033019,-0.243811,0.108227,-0.473733,-0.049997,0.013127,0.05402,...,0.233709,0.114627,2.293837,0.187028,-0.009033,-0.002467,-0.053492,-0.095007,-0.068402,0.135148
1,e94e2e05fb8b099955bbc4fa5ce81e22.png,-0.826627,0.319319,0.024243,-0.234189,0.108982,-0.461531,-0.048002,0.054156,0.084309,...,0.232864,0.08588,2.311957,0.165544,-0.051573,-0.008645,-0.037998,-0.080164,-0.051973,0.105797
2,ba6951a4f37fc9302243370e927a02e2.png,-0.821884,0.444237,0.006762,-0.264877,0.069731,-0.352035,-0.066946,0.016612,0.136272,...,0.228452,0.067057,2.395272,0.109724,-0.041822,-0.007469,-0.052627,-0.034859,-0.065974,0.142989
3,947d16539d4702427aa74f737329ffb9.png,-0.805589,0.41526,-0.025132,-0.285213,0.081083,-0.395155,-0.064792,0.023976,0.092922,...,0.238674,0.070381,1.948647,0.148333,-0.03315,0.013198,-0.035708,-0.030198,-0.070175,0.127971
4,9326695bf62926ec22690f576a633bba.png,-0.811305,0.412178,0.013643,-0.169168,0.06875,-0.494908,-0.081864,-0.001935,0.088432,...,0.26384,0.031341,2.362497,0.141601,-0.015102,0.025052,-0.043656,-0.08691,-0.073212,0.149565


2. create dataframe using extracted features from side masks

2.1. load the pickle file

In [5]:
# 2.1 Load the pickle file (side masks)

import pickle  # module to load pickle files
import boto3   # AWS SDK for Python to access S3
import pandas as pd  # pandas for DataFrame operations
import numpy as np   # numpy for numerical operations

# ------------------------------------------------
# Config
# ------------------------------------------------
# Pickle holding the side-mask features; it is loaded below as a dict
# keyed by photo_id.
s3_path = "s3://ai-bmi-predictor/feature-extraction-data/training-data/mae_large_side_masked_features.pkl"

print("[INFO] Starting process...")
print(f"[INFO] S3 path: {s3_path}")

# ------------------------------------------------
# Parse S3 path
# ------------------------------------------------
# Raise explicitly instead of using `assert`: assertions are removed when
# Python runs with -O, which would let a malformed path through unchecked.
if not s3_path.startswith("s3://"):
    raise ValueError("S3 path must start with 's3://'")
path_no_scheme = s3_path[len("s3://"):]

# partition() never fails to unpack, so a path without a key produces the
# clear error below rather than a "not enough values to unpack" traceback.
bucket, _sep, key = path_no_scheme.partition("/")
if not bucket or not key:
    raise ValueError(f"S3 path must look like 's3://<bucket>/<key>', got: {s3_path!r}")

print(f"[INFO] Bucket: {bucket}")
print(f"[INFO] Key: {key}")

# ------------------------------------------------
# Read pickle directly from S3 into memory
# ------------------------------------------------
print("[INFO] Initializing S3 client...")
s3 = boto3.client("s3")

print("[INFO] Fetching object from S3 (streaming)...")
obj = s3.get_object(Bucket=bucket, Key=key)

print("[INFO] Loading pickle from StreamingBody...")
# SECURITY: unpickling can execute arbitrary code. Only load objects from a
# bucket whose contents you control and trust.
# NOTE: this rebinds features_dict, replacing the front-mask dict from section 1.
try:
    features_dict = pickle.load(obj["Body"])
finally:
    # Release the underlying HTTP connection even if unpickling fails.
    obj["Body"].close()

print("[INFO] Pickle load complete.")
print(f"[INFO] Type of loaded object: {type(features_dict)}")
print(f"[INFO] Number of photo_ids (dict keys): {len(features_dict):,}")


[INFO] Starting process...
[INFO] S3 path: s3://ai-bmi-predictor/feature-extraction-data/training-data/mae_large_side_masked_features.pkl
[INFO] Bucket: ai-bmi-predictor
[INFO] Key: feature-extraction-data/training-data/mae_large_side_masked_features.pkl
[INFO] Initializing S3 client...
[INFO] Fetching object from S3 (streaming)...
[INFO] Loading pickle from StreamingBody...
[INFO] Pickle load complete.
[INFO] Type of loaded object: <class 'dict'>
[INFO] Number of photo_ids (dict keys): 6,134


2.2. inspect one element to understand the shape

In [6]:
# 2.2 Inspect one element to understand the shape (side masks)

# ------------------------------------------------
# Peek at a single entry to learn the feature layout
# ------------------------------------------------
# Take the first (photo_id, feature) pair from the dict in one step.
first_key, first_raw_value = next(iter(features_dict.items()))
first_value = np.asarray(first_raw_value)

print(f"[DEBUG] Example photo_id: {first_key}")
print(f"[DEBUG] Raw type of feature value: {type(first_raw_value)}")
print(f"[DEBUG] Converted to np.array with shape: {first_value.shape}, ndim: {first_value.ndim}")

# Only 1D vectors and 2D arrays are supported; anything else is rejected.
if first_value.ndim not in (1, 2):
    raise ValueError(f"[ERROR] Unsupported feature value ndim: {first_value.ndim}")

# A 1D vector contributes its length; a 2D array (e.g. 1 x N) contributes
# its total number of elements.
feature_length = first_value.shape[0] if first_value.ndim == 1 else first_value.size

print(f"[INFO] Inferred feature_length: {feature_length}")


[DEBUG] Example photo_id: 6ab1d061f51c6079633aeceed2faeb0b.png
[DEBUG] Raw type of feature value: <class 'numpy.ndarray'>
[DEBUG] Converted to np.array with shape: (1, 1280), ndim: 2
[INFO] Inferred feature_length: 1280


2.3. create 2D array

In [7]:
# 2.3 Create 2D array (side masks)

# ------------------------------------------------
# Turn every dict entry into one row of a 2D feature matrix
# ------------------------------------------------
print("[INFO] Flattening all feature vectors and building feature matrix...")

photo_ids = []
features_list = []

processed = 0
for pid, feat in features_dict.items():
    processed += 1
    arr = np.asarray(feat)

    # Guard clause: only 1D and 2D inputs can become a row vector here.
    if arr.ndim not in (1, 2):
        raise ValueError(f"[ERROR] Feature for photo_id {pid} has unsupported ndim: {arr.ndim}")

    # 1D input is used unchanged; 2D input is flattened to a 1D vector.
    vec = arr if arr.ndim == 1 else arr.reshape(-1)

    # Every row must match the length inferred from the first element.
    if vec.shape[0] != feature_length:
        raise ValueError(
            f"[ERROR] Inconsistent feature length for photo_id {pid}: "
            f"expected {feature_length}, got {vec.shape[0]}"
        )

    # The two lists grow in lockstep so row k of the matrix belongs to photo_ids[k].
    photo_ids.append(pid)
    features_list.append(vec)

    # Progress message every 1000 entries.
    if processed % 1000 == 0:
        print(f"[INFO] Processed {processed} feature vectors...")

features_matrix = np.vstack(features_list)
print(f"[INFO] Feature matrix shape: {features_matrix.shape} (rows x cols)")


[INFO] Flattening all feature vectors and building feature matrix...
[INFO] Processed 1000 feature vectors...
[INFO] Processed 2000 feature vectors...
[INFO] Processed 3000 feature vectors...
[INFO] Processed 4000 feature vectors...
[INFO] Processed 5000 feature vectors...
[INFO] Processed 6000 feature vectors...
[INFO] Feature matrix shape: (6134, 1280) (rows x cols)


2.4. create the dataframe

In [8]:
# 2.4 Create the DataFrame (side masks)

# ------------------------------------------------
# Wrap the feature matrix in a DataFrame (columns s1..sN)
# ------------------------------------------------
num_features = features_matrix.shape[1]
# Column labels are 1-based: s1, s2, ..., sN.
feature_columns = [f"s{col_no}" for col_no in range(1, num_features + 1)]

print("[INFO] Creating DataFrame eff_side_train...")
eff_side_train = pd.DataFrame(features_matrix, columns=feature_columns)
# Put the identifier first so each row starts with its photo_id.
eff_side_train.insert(0, "photo_id", photo_ids)

print("[INFO] Final DataFrame created: eff_side_train")
print(f"[INFO] Final shape (rows x cols): {eff_side_train.shape}")
print("[INFO] First 10 columns:", list(eff_side_train.columns[:10]))

print("[INFO] Preview of eff_side_train:")
# Last expression of the cell, so the notebook renders the preview table.
eff_side_train.head()


[INFO] Creating DataFrame eff_side_train...
[INFO] Final DataFrame created: eff_side_train
[INFO] Final shape (rows x cols): (6134, 1281)
[INFO] First 10 columns: ['photo_id', 's1', 's2', 's3', 's4', 's5', 's6', 's7', 's8', 's9']
[INFO] Preview of eff_side_train:


Unnamed: 0,photo_id,s1,s2,s3,s4,s5,s6,s7,s8,s9,...,s1271,s1272,s1273,s1274,s1275,s1276,s1277,s1278,s1279,s1280
0,6ab1d061f51c6079633aeceed2faeb0b.png,-0.549792,0.157892,0.03387,-0.220515,0.091458,-0.298959,-0.080563,0.05627,0.017978,...,0.199713,0.148138,2.441903,0.25035,0.013573,-0.002677,-0.05539,0.043681,-0.036409,0.067276
1,e94e2e05fb8b099955bbc4fa5ce81e22.png,-0.531603,0.213573,0.093421,-0.21031,0.059411,-0.294608,-0.061867,0.091439,-0.004924,...,0.226531,0.149257,2.486324,0.169767,-0.010669,-0.001358,-0.059706,-0.052934,-0.037515,0.103851
2,ba6951a4f37fc9302243370e927a02e2.png,-0.58502,0.260966,0.017228,-0.198938,-0.017891,-0.304409,-0.069098,0.097711,-0.07327,...,0.183153,0.043049,2.668457,0.140228,0.006035,0.046011,-0.003853,-0.060804,0.000488,0.055096
3,947d16539d4702427aa74f737329ffb9.png,-0.628085,0.155093,0.101718,-0.196355,0.064072,-0.332784,-0.093034,0.060503,0.070847,...,0.138245,0.128593,2.345997,0.217457,0.048299,0.004688,-0.019163,-0.043402,-0.037881,0.094496
4,9326695bf62926ec22690f576a633bba.png,-0.618357,0.156004,0.078353,-0.158116,0.083159,-0.326341,-0.084785,0.030467,0.025784,...,0.178251,0.10584,2.838005,0.218767,0.026409,-0.009528,-0.008197,-0.027993,-0.047976,0.120585


3. merge front masks dataframe and side masks dataframe

In [9]:
# 3. Merge the front and side feature DataFrames by photo_id

print("[INFO] Merging eff_front_train and eff_side_train on photo_id...")

# validate="one_to_one" makes pandas raise MergeError if a photo_id repeats
# on either side, instead of silently multiplying rows in the joined result.
merged_eff = pd.merge(
    eff_front_train,        # left: front-mask features (f1..fN)
    eff_side_train,         # right: side-mask features (s1..sN)
    on="photo_id",          # join key
    how="inner",            # keep only photo_ids present in both views
    validate="one_to_one",
)

print("[INFO] Merge complete.")
print(f"[INFO] merged_eff shape (rows x cols): {merged_eff.shape}")
print("[INFO] First 10 columns:", merged_eff.columns.tolist()[:10])

# An inner join silently discards photo_ids that exist on one side only;
# report them so a shrinking training set does not go unnoticed.
front_only = len(eff_front_train) - len(merged_eff)
side_only = len(eff_side_train) - len(merged_eff)
if front_only or side_only:
    print(
        f"[WARN] Inner join dropped {front_only} front-only and "
        f"{side_only} side-only photo_id(s)."
    )

print("[INFO] Preview of merged_eff:")
# Last expression of the cell, so the notebook renders the preview table.
merged_eff.head()


[INFO] Merging eff_front_train and eff_side_train on photo_id...
[INFO] Merge complete.
[INFO] merged_eff shape (rows x cols): (6134, 2561)
[INFO] First 10 columns: ['photo_id', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9']
[INFO] Preview of merged_eff:


Unnamed: 0,photo_id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,s1271,s1272,s1273,s1274,s1275,s1276,s1277,s1278,s1279,s1280
0,6ab1d061f51c6079633aeceed2faeb0b.png,-0.793212,0.273933,0.033019,-0.243811,0.108227,-0.473733,-0.049997,0.013127,0.05402,...,0.199713,0.148138,2.441903,0.25035,0.013573,-0.002677,-0.05539,0.043681,-0.036409,0.067276
1,e94e2e05fb8b099955bbc4fa5ce81e22.png,-0.826627,0.319319,0.024243,-0.234189,0.108982,-0.461531,-0.048002,0.054156,0.084309,...,0.226531,0.149257,2.486324,0.169767,-0.010669,-0.001358,-0.059706,-0.052934,-0.037515,0.103851
2,ba6951a4f37fc9302243370e927a02e2.png,-0.821884,0.444237,0.006762,-0.264877,0.069731,-0.352035,-0.066946,0.016612,0.136272,...,0.183153,0.043049,2.668457,0.140228,0.006035,0.046011,-0.003853,-0.060804,0.000488,0.055096
3,947d16539d4702427aa74f737329ffb9.png,-0.805589,0.41526,-0.025132,-0.285213,0.081083,-0.395155,-0.064792,0.023976,0.092922,...,0.138245,0.128593,2.345997,0.217457,0.048299,0.004688,-0.019163,-0.043402,-0.037881,0.094496
4,9326695bf62926ec22690f576a633bba.png,-0.811305,0.412178,0.013643,-0.169168,0.06875,-0.494908,-0.081864,-0.001935,0.088432,...,0.178251,0.10584,2.838005,0.218767,0.026409,-0.009528,-0.008197,-0.027993,-0.047976,0.120585


4. load body measurements dataset

In [10]:
# 4. Load body_measurements.csv from S3 as the body_measurements DataFrame

import boto3        # AWS SDK for Python (if not already imported)
import pandas as pd # pandas for DataFrame operations

s3_csv_path = "s3://ai-bmi-predictor/data/body_measurements.csv"

print("[INFO] Starting to load body_measurements.csv...")
print(f"[INFO] S3 CSV path: {s3_csv_path}")

# Raise explicitly instead of using `assert`: assertions are removed when
# Python runs with -O, which would let a malformed path through unchecked.
if not s3_csv_path.startswith("s3://"):
    raise ValueError("S3 path must start with 's3://'")

# Remove the scheme and split into bucket and key. partition() never fails
# to unpack, so a path without a key produces the clear error below.
csv_path_no_scheme = s3_csv_path[len("s3://"):]
csv_bucket, _sep, csv_key = csv_path_no_scheme.partition("/")
if not csv_bucket or not csv_key:
    raise ValueError(f"S3 path must look like 's3://<bucket>/<key>', got: {s3_csv_path!r}")

print(f"[INFO] CSV Bucket: {csv_bucket}")
print(f"[INFO] CSV Key: {csv_key}")

# Initialize S3 client
print("[INFO] Initializing S3 client for CSV...")
s3_client = boto3.client("s3")

# Fetch CSV object from S3
print("[INFO] Fetching CSV object from S3 (streaming)...")
csv_obj = s3_client.get_object(Bucket=csv_bucket, Key=csv_key)

# Read CSV directly from the S3 stream into a pandas DataFrame
print("[INFO] Reading CSV into pandas DataFrame...")
try:
    body_measurements = pd.read_csv(csv_obj["Body"])
finally:
    # Release the underlying HTTP connection even if parsing fails.
    csv_obj["Body"].close()

print("[INFO] body_measurements DataFrame loaded successfully.")
print(f"[INFO] DataFrame shape (rows x cols): {body_measurements.shape}")
print("[INFO] Preview of body_measurements:")
# Last expression of the cell, so the notebook renders the preview table.
body_measurements.head()


[INFO] Starting to load body_measurements.csv...
[INFO] S3 CSV path: s3://ai-bmi-predictor/data/body_measurements.csv
[INFO] CSV Bucket: ai-bmi-predictor
[INFO] CSV Key: data/body_measurements.csv
[INFO] Initializing S3 client for CSV...
[INFO] Fetching CSV object from S3 (streaming)...
[INFO] Reading CSV into pandas DataFrame...
[INFO] body_measurements DataFrame loaded successfully.
[INFO] DataFrame shape (rows x cols): (2018, 17)
[INFO] Preview of body_measurements:


Unnamed: 0,subject_id,ankle,arm-length,bicep,calf,chest,forearm,hip,leg-length,shoulder-breadth,shoulder-to-crotch,thigh,waist,wrist,gender,height_cm,weight_kg
0,-494U-YoXOD8e8gkCuyaRLn4MLo5P8Dm2B1s59WBGdg,24.343954,52.902378,33.294395,38.709511,102.526024,27.599094,108.047134,81.736435,35.915691,67.891273,57.872795,92.286171,16.709131,male,178.0,79.6
1,-5OHO2b5mRDY1rvAf07sYYtekEEX7ViEGmvRr7l4tfc,22.001783,46.763233,28.288971,35.649963,99.253395,24.813375,92.128685,75.127258,33.848778,60.541386,52.41832,87.307899,15.192038,female,163.0,65.0
2,-5iHTazqnmWBsnUY6ymoCw1kORMymRDaUla1eNOjgvI,26.565006,53.288597,30.800959,39.85495,102.391273,27.719795,106.044121,84.055527,37.856457,72.004379,52.934086,96.596413,17.936598,male,185.0,82.9
3,-60o95b9oEE83BDV3GpsAVPA4TFvxuPPqtXEY5PApy8,24.779566,49.077751,34.613113,38.104267,111.16407,28.550409,109.529465,74.310234,38.1978,68.536713,58.656673,103.395119,17.02986,male,166.0,88.7
4,-67TrwlJ3GSsqPeHaa2Nb0iQ-BKZfIx4ERysQPZ0-SY,20.788853,42.229557,24.527142,29.181356,93.627777,21.979038,87.134102,67.269859,31.659197,52.697277,41.844944,74.400856,15.120061,female,144.6,44.2


5. load subject_id and photo_id map dataset

In [11]:
# 5. Load subject_to_photo_map.csv as the smp DataFrame

import boto3        # AWS SDK for accessing S3
import pandas as pd # pandas for DataFrame operations

smp_s3_path = "s3://amazon-bodym/train/subject_to_photo_map.csv"

print("[INFO] Starting to load subject_to_photo_map.csv...")
print(f"[INFO] S3 CSV path: {smp_s3_path}")

# Raise explicitly instead of using `assert`: assertions are removed when
# Python runs with -O, which would let a malformed path through unchecked.
if not smp_s3_path.startswith("s3://"):
    raise ValueError("S3 path must start with 's3://'")

# Remove the scheme and split into bucket and key. partition() never fails
# to unpack, so a path without a key produces the clear error below.
smp_path_no_scheme = smp_s3_path[len("s3://"):]
smp_bucket, _sep, smp_key = smp_path_no_scheme.partition("/")
if not smp_bucket or not smp_key:
    raise ValueError(f"S3 path must look like 's3://<bucket>/<key>', got: {smp_s3_path!r}")

print(f"[INFO] CSV Bucket: {smp_bucket}")
print(f"[INFO] CSV Key: {smp_key}")

print("[INFO] Initializing S3 client for subject_to_photo_map.csv...")
s3_client = boto3.client("s3")

print("[INFO] Fetching subject_to_photo_map.csv from S3 (streaming)...")
csv_obj = s3_client.get_object(Bucket=smp_bucket, Key=smp_key)

print("[INFO] Reading CSV into pandas DataFrame (smp)...")
try:
    smp = pd.read_csv(csv_obj["Body"])
finally:
    # Release the underlying HTTP connection even if parsing fails.
    csv_obj["Body"].close()

print("[INFO] smp DataFrame loaded successfully.")
print(f"[INFO] smp shape (rows x cols): {smp.shape}")
print("[INFO] Preview of smp:")
# Last expression of the cell, so the notebook renders the preview table.
smp.head()


[INFO] Starting to load subject_to_photo_map.csv...
[INFO] S3 CSV path: s3://amazon-bodym/train/subject_to_photo_map.csv
[INFO] CSV Bucket: amazon-bodym
[INFO] CSV Key: train/subject_to_photo_map.csv
[INFO] Initializing S3 client for subject_to_photo_map.csv...
[INFO] Fetching subject_to_photo_map.csv from S3 (streaming)...
[INFO] Reading CSV into pandas DataFrame (smp)...
[INFO] smp DataFrame loaded successfully.
[INFO] smp shape (rows x cols): (6134, 2)
[INFO] Preview of smp:


Unnamed: 0,subject_id,photo_id
0,-494U-YoXOD8e8gkCuyaRLn4MLo5P8Dm2B1s59WBGdg,e6f404ebda41ebe93573d3e219c88297
1,-494U-YoXOD8e8gkCuyaRLn4MLo5P8Dm2B1s59WBGdg,39a4241cb892618a694747e224d57b9e
2,-494U-YoXOD8e8gkCuyaRLn4MLo5P8Dm2B1s59WBGdg,a00a6c0c074efbd92c894cbc77023939
3,-494U-YoXOD8e8gkCuyaRLn4MLo5P8Dm2B1s59WBGdg,7c99b3017f21f9797851e70de084bfcb
4,-494U-YoXOD8e8gkCuyaRLn4MLo5P8Dm2B1s59WBGdg,3000cb26b2090db5a7d82926af8c6a83


6. merge body_measurements with photo_id

In [12]:
# 6. Merge smp and body_measurements by subject_id
#    Keep every row from smp; one subject's measurements are repeated for
#    each of that subject's photos.

print("[INFO] Merging smp and body_measurements on subject_id...")

# validate="many_to_one" makes pandas raise MergeError if subject_id repeats
# in body_measurements, which would otherwise duplicate photo rows silently.
smp_body_measurements = pd.merge(
    smp,                     # left: one row per (subject_id, photo_id)
    body_measurements,       # right: measurements per subject_id
    on="subject_id",         # join key
    how="left",              # keep all rows from smp
    validate="many_to_one",
)

print("[INFO] Merge complete.")
print(f"[INFO] smp_body_measurements shape (rows x cols): {smp_body_measurements.shape}")
print("[INFO] First 10 columns:", smp_body_measurements.columns.tolist()[:10])

# A left join keeps photos whose subject has no measurements and fills the
# measurement columns with NaN; report them so they are not used unnoticed.
unmatched_rows = int((~smp["subject_id"].isin(body_measurements["subject_id"])).sum())
if unmatched_rows:
    print(
        f"[WARN] {unmatched_rows} row(s) in smp have no matching subject_id in "
        "body_measurements; their measurement columns are NaN."
    )

print("[INFO] Preview of smp_body_measurements:")
# Last expression of the cell, so the notebook renders the preview table.
smp_body_measurements.head()


[INFO] Merging smp and body_measurements on subject_id...
[INFO] Merge complete.
[INFO] smp_body_measurements shape (rows x cols): (6134, 18)
[INFO] First 10 columns: ['subject_id', 'photo_id', 'ankle', 'arm-length', 'bicep', 'calf', 'chest', 'forearm', 'hip', 'leg-length']
[INFO] Preview of smp_body_measurements:


Unnamed: 0,subject_id,photo_id,ankle,arm-length,bicep,calf,chest,forearm,hip,leg-length,shoulder-breadth,shoulder-to-crotch,thigh,waist,wrist,gender,height_cm,weight_kg
0,-494U-YoXOD8e8gkCuyaRLn4MLo5P8Dm2B1s59WBGdg,e6f404ebda41ebe93573d3e219c88297,24.343954,52.902378,33.294395,38.709511,102.526024,27.599094,108.047134,81.736435,35.915691,67.891273,57.872795,92.286171,16.709131,male,178.0,79.6
1,-494U-YoXOD8e8gkCuyaRLn4MLo5P8Dm2B1s59WBGdg,39a4241cb892618a694747e224d57b9e,24.343954,52.902378,33.294395,38.709511,102.526024,27.599094,108.047134,81.736435,35.915691,67.891273,57.872795,92.286171,16.709131,male,178.0,79.6
2,-494U-YoXOD8e8gkCuyaRLn4MLo5P8Dm2B1s59WBGdg,a00a6c0c074efbd92c894cbc77023939,24.343954,52.902378,33.294395,38.709511,102.526024,27.599094,108.047134,81.736435,35.915691,67.891273,57.872795,92.286171,16.709131,male,178.0,79.6
3,-494U-YoXOD8e8gkCuyaRLn4MLo5P8Dm2B1s59WBGdg,7c99b3017f21f9797851e70de084bfcb,24.343954,52.902378,33.294395,38.709511,102.526024,27.599094,108.047134,81.736435,35.915691,67.891273,57.872795,92.286171,16.709131,male,178.0,79.6
4,-494U-YoXOD8e8gkCuyaRLn4MLo5P8Dm2B1s59WBGdg,3000cb26b2090db5a7d82926af8c6a83,24.343954,52.902378,33.294395,38.709511,102.526024,27.599094,108.047134,81.736435,35.915691,67.891273,57.872795,92.286171,16.709131,male,178.0,79.6


In [13]:
# Display the (rows, columns) tuple of the merged subject/photo/measurement table.
smp_body_measurements.shape

(6134, 18)

In [14]:
# Display the column labels of the merged subject/photo/measurement table.
smp_body_measurements.columns

Index(['subject_id', 'photo_id', 'ankle', 'arm-length', 'bicep', 'calf',
       'chest', 'forearm', 'hip', 'leg-length', 'shoulder-breadth',
       'shoulder-to-crotch', 'thigh', 'waist', 'wrist', 'gender', 'height_cm',
       'weight_kg'],
      dtype='object')

In [15]:
# Preview the first rows of merged_eff; at this point photo_id still has the '.png' suffix.
merged_eff.head()

Unnamed: 0,photo_id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,s1271,s1272,s1273,s1274,s1275,s1276,s1277,s1278,s1279,s1280
0,6ab1d061f51c6079633aeceed2faeb0b.png,-0.793212,0.273933,0.033019,-0.243811,0.108227,-0.473733,-0.049997,0.013127,0.05402,...,0.199713,0.148138,2.441903,0.25035,0.013573,-0.002677,-0.05539,0.043681,-0.036409,0.067276
1,e94e2e05fb8b099955bbc4fa5ce81e22.png,-0.826627,0.319319,0.024243,-0.234189,0.108982,-0.461531,-0.048002,0.054156,0.084309,...,0.226531,0.149257,2.486324,0.169767,-0.010669,-0.001358,-0.059706,-0.052934,-0.037515,0.103851
2,ba6951a4f37fc9302243370e927a02e2.png,-0.821884,0.444237,0.006762,-0.264877,0.069731,-0.352035,-0.066946,0.016612,0.136272,...,0.183153,0.043049,2.668457,0.140228,0.006035,0.046011,-0.003853,-0.060804,0.000488,0.055096
3,947d16539d4702427aa74f737329ffb9.png,-0.805589,0.41526,-0.025132,-0.285213,0.081083,-0.395155,-0.064792,0.023976,0.092922,...,0.138245,0.128593,2.345997,0.217457,0.048299,0.004688,-0.019163,-0.043402,-0.037881,0.094496
4,9326695bf62926ec22690f576a633bba.png,-0.811305,0.412178,0.013643,-0.169168,0.06875,-0.494908,-0.081864,-0.001935,0.088432,...,0.178251,0.10584,2.838005,0.218767,0.026409,-0.009528,-0.008197,-0.027993,-0.047976,0.120585


7. merge body measurements and feature extractions

In [16]:
# 7. Normalise merged_eff["photo_id"] by dropping a trailing '.png'

print("[INFO] Cleaning photo_id column in merged_eff (removing '.png')...")

# Capture a few values first so the before/after effect is visible in the log.
sample_before = merged_eff["photo_id"].head().tolist()
print("[DEBUG] Sample photo_id values BEFORE cleaning:", sample_before)

# Cast to str, then strip the suffix; the '$' anchor limits the match to the
# very end of each value.
photo_id_as_text = merged_eff["photo_id"].astype(str)
merged_eff["photo_id"] = photo_id_as_text.str.replace(r"\.png$", "", regex=True)

sample_after = merged_eff["photo_id"].head().tolist()
print("[DEBUG] Sample photo_id values AFTER cleaning:", sample_after)

# Row count versus distinct ids: the two numbers match when no id is repeated.
total_rows = merged_eff.shape[0]
distinct_ids = merged_eff["photo_id"].nunique()
print(f"[INFO] Number of rows in merged_eff: {total_rows}")
print(f"[INFO] Number of unique photo_id values: {distinct_ids}")

print("[INFO] merged_eff photo_id column updated successfully.")
# NOTE: if eff_training was built before this cell ran, rerun the merge cell
#       that creates eff_training so it picks up the cleaned photo_id values.


[INFO] Cleaning photo_id column in merged_eff (removing '.png')...
[DEBUG] Sample photo_id values BEFORE cleaning: ['6ab1d061f51c6079633aeceed2faeb0b.png', 'e94e2e05fb8b099955bbc4fa5ce81e22.png', 'ba6951a4f37fc9302243370e927a02e2.png', '947d16539d4702427aa74f737329ffb9.png', '9326695bf62926ec22690f576a633bba.png']
[DEBUG] Sample photo_id values AFTER cleaning: ['6ab1d061f51c6079633aeceed2faeb0b', 'e94e2e05fb8b099955bbc4fa5ce81e22', 'ba6951a4f37fc9302243370e927a02e2', '947d16539d4702427aa74f737329ffb9', '9326695bf62926ec22690f576a633bba']
[INFO] Number of rows in merged_eff: 6134
[INFO] Number of unique photo_id values: 6134
[INFO] merged_eff photo_id column updated successfully.


In [17]:
# Preview the first rows of merged_eff after the '.png' suffix was removed from photo_id.
merged_eff.head()

Unnamed: 0,photo_id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,s1271,s1272,s1273,s1274,s1275,s1276,s1277,s1278,s1279,s1280
0,6ab1d061f51c6079633aeceed2faeb0b,-0.793212,0.273933,0.033019,-0.243811,0.108227,-0.473733,-0.049997,0.013127,0.05402,...,0.199713,0.148138,2.441903,0.25035,0.013573,-0.002677,-0.05539,0.043681,-0.036409,0.067276
1,e94e2e05fb8b099955bbc4fa5ce81e22,-0.826627,0.319319,0.024243,-0.234189,0.108982,-0.461531,-0.048002,0.054156,0.084309,...,0.226531,0.149257,2.486324,0.169767,-0.010669,-0.001358,-0.059706,-0.052934,-0.037515,0.103851
2,ba6951a4f37fc9302243370e927a02e2,-0.821884,0.444237,0.006762,-0.264877,0.069731,-0.352035,-0.066946,0.016612,0.136272,...,0.183153,0.043049,2.668457,0.140228,0.006035,0.046011,-0.003853,-0.060804,0.000488,0.055096
3,947d16539d4702427aa74f737329ffb9,-0.805589,0.41526,-0.025132,-0.285213,0.081083,-0.395155,-0.064792,0.023976,0.092922,...,0.138245,0.128593,2.345997,0.217457,0.048299,0.004688,-0.019163,-0.043402,-0.037881,0.094496
4,9326695bf62926ec22690f576a633bba,-0.811305,0.412178,0.013643,-0.169168,0.06875,-0.494908,-0.081864,-0.001935,0.088432,...,0.178251,0.10584,2.838005,0.218767,0.026409,-0.009528,-0.008197,-0.027993,-0.047976,0.120585


In [18]:
# 7. Merge merged_eff and smp_body_measurements on photo_id (inner join)

print("[INFO] Merging merged_eff and smp_body_measurements on photo_id...")

# validate="one_to_one" makes pandas raise MergeError if a photo_id repeats
# on either side, instead of silently multiplying rows in the training set.
eff_training = pd.merge(
    merged_eff,               # left: front + side features per photo_id
    smp_body_measurements,    # right: subject_id and body measurements per photo_id
    on="photo_id",            # join key
    how="inner",              # keep only photo_ids present on both sides
    validate="one_to_one",
)

print("[INFO] Merge complete.")
print(f"[INFO] eff_training shape (rows x cols): {eff_training.shape}")
print("[INFO] First 10 columns:", eff_training.columns.tolist()[:10])

# An inner join silently discards photo_ids without a partner (for example
# when the '.png' suffix was not stripped first); report any such loss.
features_dropped = len(merged_eff) - len(eff_training)
labels_dropped = len(smp_body_measurements) - len(eff_training)
if features_dropped or labels_dropped:
    print(
        f"[WARN] Inner join dropped {features_dropped} feature row(s) and "
        f"{labels_dropped} measurement row(s) with no matching photo_id."
    )

print("[INFO] Preview of eff_training:")
# Last expression of the cell, so the notebook renders the preview table.
eff_training.head()


[INFO] Merging merged_eff and smp_body_measurements on photo_id...
[INFO] Merge complete.
[INFO] eff_training shape (rows x cols): (6134, 2578)
[INFO] First 10 columns: ['photo_id', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9']
[INFO] Preview of eff_training:


Unnamed: 0,photo_id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,hip,leg-length,shoulder-breadth,shoulder-to-crotch,thigh,waist,wrist,gender,height_cm,weight_kg
0,6ab1d061f51c6079633aeceed2faeb0b,-0.793212,0.273933,0.033019,-0.243811,0.108227,-0.473733,-0.049997,0.013127,0.05402,...,105.3339,76.817467,35.362858,65.993683,54.459591,88.813789,16.764332,female,170.5,72.0
1,e94e2e05fb8b099955bbc4fa5ce81e22,-0.826627,0.319319,0.024243,-0.234189,0.108982,-0.461531,-0.048002,0.054156,0.084309,...,101.478989,85.154358,37.25676,65.861588,52.773052,89.176338,15.690955,male,178.3,71.8
2,ba6951a4f37fc9302243370e927a02e2,-0.821884,0.444237,0.006762,-0.264877,0.069731,-0.352035,-0.066946,0.016612,0.136272,...,97.488243,81.410393,37.503147,66.042679,57.059261,82.201988,16.686253,male,176.25,76.5
3,947d16539d4702427aa74f737329ffb9,-0.805589,0.41526,-0.025132,-0.285213,0.081083,-0.395155,-0.064792,0.023976,0.092922,...,120.586845,69.361534,34.084633,60.41333,65.0,102.323845,17.693762,female,152.1,88.9
4,9326695bf62926ec22690f576a633bba,-0.811305,0.412178,0.013643,-0.169168,0.06875,-0.494908,-0.081864,-0.001935,0.088432,...,110.543564,77.160583,38.086231,68.400543,57.172279,107.378578,16.594791,male,171.5,88.4


In [19]:
# Display the (rows, columns) tuple of the final training table.
eff_training.shape

(6134, 2578)

8. send final dataset to s3 location

In [20]:
# 8. Save the eff_training DataFrame to S3 as a CSV object
# NOTE: the log lines below say "eff_training.csv", but the object is written
#       to the key parsed from s3_out_path.

import boto3      # AWS SDK for Python (if not already imported)
import io         # for in-memory text buffer

s3_out_path = "s3://ai-bmi-predictor/data/mae-vit-h_training.csv"

print("[INFO] Starting upload of eff_training to S3...")
print(f"[INFO] Output S3 path: {s3_out_path}")

# Raise explicitly instead of using `assert`: assertions are removed when
# Python runs with -O, which would let a malformed path through unchecked.
if not s3_out_path.startswith("s3://"):
    raise ValueError("S3 path must start with 's3://'")

# Parse bucket and key. partition() never fails to unpack, so a path without
# a key produces the clear error below.
out_path_no_scheme = s3_out_path[len("s3://"):]
out_bucket, _sep, out_key = out_path_no_scheme.partition("/")
if not out_bucket or not out_key:
    raise ValueError(f"S3 path must look like 's3://<bucket>/<key>', got: {s3_out_path!r}")

print(f"[INFO] Output bucket: {out_bucket}")
print(f"[INFO] Output key: {out_key}")

# Serialise the DataFrame to CSV in memory (no local file), without the index.
csv_buffer = io.StringIO()
eff_training.to_csv(csv_buffer, index=False)
# Encode explicitly so the uploaded bytes are UTF-8 rather than relying on
# the client to convert a str body.
csv_bytes = csv_buffer.getvalue().encode("utf-8")

# Initialize S3 client
print("[INFO] Initializing S3 client for upload...")
s3_client = boto3.client("s3")

# Upload the CSV bytes to S3
print("[INFO] Uploading eff_training.csv to S3...")
s3_client.put_object(
    Bucket=out_bucket,   # target bucket
    Key=out_key,         # target key / object name
    Body=csv_bytes,      # CSV data as UTF-8 bytes
)

print("[INFO] Upload complete: eff_training.csv saved to S3.")


[INFO] Starting upload of eff_training to S3...
[INFO] Output S3 path: s3://ai-bmi-predictor/data/mae-vit-h_training.csv
[INFO] Output bucket: ai-bmi-predictor
[INFO] Output key: data/mae-vit-h_training.csv
[INFO] Initializing S3 client for upload...
[INFO] Uploading eff_training.csv to S3...
[INFO] Upload complete: eff_training.csv saved to S3.
