1. create dataframe using extracted features of front masks
    * 1.1. load the pickle file of extracted features of front masks
    * 1.2. inspect one element to understand the shape
    * 1.3. create 2D array
    * 1.4. create the dataframe

2. create dataframe using extracted features from side masks
    * 2.1. load the pickle file of extracted features of side masks
    * 2.2. inspect one element to understand the shape
    * 2.3. create 2D array
    * 2.4. create the dataframe

3. merge front masks dataframe and side masks dataframe

4. load body_measurements dataset

5. load mapped subject_id and photo_id dataset

6. map body measurements with photo_id

7. merge body measurements and feature extractions (removed .png suffix from feature extracted dataframe)

1. create dataframe using extracted features of front masks

1.1. load the pickle file

In [1]:
import pickle
import boto3
import pandas as pd
import numpy as np

# ------------------------------------------------
# Config
# ------------------------------------------------
# S3 location of the pickled front-mask feature dictionary.
s3_path = "s3://ai-bmi-predictor/feature-extraction-data/testing-data-A/dinov2_large_front_masked_features-testA.pkl"

print("[INFO] Starting process...")
print(f"[INFO] S3 path: {s3_path}")

# ------------------------------------------------
# Parse S3 path
# ------------------------------------------------
# Raise explicitly instead of using `assert`: assertions are removed when
# Python runs with -O, which would silently disable this validation.
if not s3_path.startswith("s3://"):
    raise ValueError("S3 path must start with 's3://'")

path_no_scheme = s3_path[len("s3://"):]
path_parts = path_no_scheme.split("/", 1)
# A path such as "s3://bucket" has no key; fail with a clear message rather
# than an opaque tuple-unpacking error.
if len(path_parts) != 2 or not all(path_parts):
    raise ValueError(f"S3 path must contain both a bucket and a key: {s3_path}")
bucket, key = path_parts

print(f"[INFO] Bucket: {bucket}")
print(f"[INFO] Key: {key}")

# ------------------------------------------------
# Read pickle directly from S3 into memory
# ------------------------------------------------
print("[INFO] Initializing S3 client...")
s3 = boto3.client("s3")

print("[INFO] Fetching object from S3 (streaming)...")
obj = s3.get_object(Bucket=bucket, Key=key)

print("[INFO] Loading pickle from StreamingBody...")
# SECURITY NOTE: pickle.load can execute arbitrary code embedded in the
# payload. Only load objects from a bucket whose contents are trusted.
body_stream = obj["Body"]
try:
    features_dict = pickle.load(body_stream)
finally:
    # Always close the response stream, even if unpickling fails.
    body_stream.close()

print("[INFO] Pickle load complete.")
print(f"[INFO] Type of loaded object: {type(features_dict)}")
print(f"[INFO] Number of photo_ids (dict keys): {len(features_dict):,}")

[INFO] Starting process...
[INFO] S3 path: s3://ai-bmi-predictor/feature-extraction-data/testing-data-A/dinov2_large_front_masked_features-testA.pkl
[INFO] Bucket: ai-bmi-predictor
[INFO] Key: feature-extraction-data/testing-data-A/dinov2_large_front_masked_features-testA.pkl
[INFO] Initializing S3 client...
[INFO] Fetching object from S3 (streaming)...
[INFO] Loading pickle from StreamingBody...
[INFO] Pickle load complete.
[INFO] Type of loaded object: <class 'dict'>
[INFO] Number of photo_ids (dict keys): 1,684


1.2. inspect one element to understand the shape

In [2]:
# ------------------------------------------------
# Peek at a single entry to learn the feature layout
# ------------------------------------------------
first_key, first_raw_value = next(iter(features_dict.items()))
first_value = np.asarray(first_raw_value)

print(f"[DEBUG] Example photo_id: {first_key}")
print(f"[DEBUG] Raw type of feature value: {type(first_raw_value)}")
print(f"[DEBUG] Converted to np.array with shape: {first_value.shape}, ndim: {first_value.ndim}")

# Only 1D vectors and 2D arrays (e.g. (1, N) or (N, 1)) are supported.
if first_value.ndim not in (1, 2):
    raise ValueError(f"[ERROR] Unsupported feature value ndim: {first_value.ndim}")

# For a 1D vector the element count is its length; a 2D array is flattened
# later, so its total element count is the feature length as well.
feature_length = first_value.size

print(f"[INFO] Inferred feature_length: {feature_length}")

[DEBUG] Example photo_id: e5ae8fe5bbdf611a1e8d06e66e849bdf.png
[DEBUG] Raw type of feature value: <class 'numpy.ndarray'>
[DEBUG] Converted to np.array with shape: (1, 1024), ndim: 2
[INFO] Inferred feature_length: 1024


1.3. create 2D array

In [3]:
# ------------------------------------------------
# Flatten every feature vector and stack them into one 2D matrix
# ------------------------------------------------
print("[INFO] Flattening all feature vectors and building feature matrix...")

photo_ids = []
features_list = []

for i, (pid, feat) in enumerate(features_dict.items(), start=1):
    arr = np.asarray(feat)

    # Guard clause: anything other than 1D or 2D input is rejected.
    if arr.ndim not in (1, 2):
        raise ValueError(f"[ERROR] Feature for photo_id {pid} has unsupported ndim: {arr.ndim}")

    # 1D input is used as is; 2D input (e.g. (1, N)) is flattened to (N,).
    vec = arr if arr.ndim == 1 else arr.reshape(-1)

    # Every vector must match the length inferred from the first entry.
    vec_length = vec.shape[0]
    if vec_length != feature_length:
        raise ValueError(
            f"[ERROR] Inconsistent feature length for photo_id {pid}: "
            f"expected {feature_length}, got {vec_length}"
        )

    photo_ids.append(pid)
    features_list.append(vec)

    # Progress message after each batch of 1000 vectors.
    if not i % 1000:
        print(f"[INFO] Processed {i} feature vectors...")

features_matrix = np.vstack(features_list)
print(f"[INFO] Feature matrix shape: {features_matrix.shape} (rows x cols)")

[INFO] Flattening all feature vectors and building feature matrix...
[INFO] Processed 1000 feature vectors...
[INFO] Feature matrix shape: (1684, 1024) (rows x cols)


1.4. create the dataframe

In [4]:
# ------------------------------------------------
# Wrap the feature matrix in a DataFrame with columns f1..fN
# ------------------------------------------------
num_features = features_matrix.shape[1]
feature_columns = [f"f{idx}" for idx in range(1, num_features + 1)]

print("[INFO] Creating DataFrame eff_front_train...")
eff_front_train = pd.DataFrame(features_matrix, columns=feature_columns)
# photo_ids was built in the same loop as the matrix rows, so it lines up
# row for row; place it as the leading column.
eff_front_train.insert(loc=0, column="photo_id", value=photo_ids)

print("[INFO] Final DataFrame created: eff_front_train")
print(f"[INFO] Final shape (rows x cols): {eff_front_train.shape}")
print("[INFO] First 10 columns:", eff_front_train.columns[:10].tolist())

print("[INFO] Preview of eff_front_train:")
# Last expression of the cell: rendered by the notebook as the preview.
eff_front_train.head()


[INFO] Creating DataFrame eff_front_train...
[INFO] Final DataFrame created: eff_front_train
[INFO] Final shape (rows x cols): (1684, 1025)
[INFO] First 10 columns: ['photo_id', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9']
[INFO] Preview of eff_front_train:


Unnamed: 0,photo_id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f1015,f1016,f1017,f1018,f1019,f1020,f1021,f1022,f1023,f1024
0,e5ae8fe5bbdf611a1e8d06e66e849bdf.png,-1.042094,-1.730657,1.566182,-1.60408,-0.453005,0.18847,-1.124131,-0.845571,-1.161284,...,0.347894,2.846795,-1.044655,-0.11878,0.435428,-2.357119,1.53893,0.730773,0.815797,-1.74262
1,605a5fd09058c48156b0ef518b63b2de.png,-1.472728,-0.183869,2.148343,-1.645508,-0.92586,0.131387,-1.567845,-0.359548,-1.792026,...,0.288444,2.684972,-1.136333,-0.394402,-0.348682,-1.758533,1.637419,0.481312,0.97542,-1.106456
2,909c9277309e13ee014e347603aba620.png,-1.47145,0.037155,1.973593,-1.695215,-0.308931,0.758703,-1.801447,-0.705424,-1.482358,...,0.227096,2.718798,-0.683584,0.194305,0.011414,-1.815809,1.294205,1.404522,0.725003,-1.453352
3,bef6a68bc8dd475c124f6de2413385d3.png,0.071158,-1.256408,1.118973,-2.16653,-1.075335,0.872181,-1.01387,-0.730944,-1.59996,...,-1.003827,2.95091,-0.3249,0.043176,-1.017775,-1.019191,1.542311,-0.036608,-0.151371,-1.151821
4,6d7ed4bc4a17546447efed0ca6e2ff11.png,-1.396314,-1.110229,2.116553,-1.739712,-1.533494,0.797858,-0.780543,-0.863575,-1.269442,...,-0.362187,2.566309,-0.836707,-0.458111,-0.122545,-1.50045,1.702744,0.068517,1.203996,-1.310105


2. create dataframe using extracted features from side masks

2.1. load the pickle file

In [5]:
# 2.1 Load the pickle file (side masks)

import pickle  # module to load pickle files
import boto3   # AWS SDK for Python to access S3
import pandas as pd  # pandas for DataFrame operations
import numpy as np   # numpy for numerical operations

# ------------------------------------------------
# Config
# ------------------------------------------------
# S3 location of the pickled side-mask feature dictionary.
s3_path = "s3://ai-bmi-predictor/feature-extraction-data/testing-data-A/dinov2_large_side_masked_features-testA.pkl"

print("[INFO] Starting process...")
print(f"[INFO] S3 path: {s3_path}")

# ------------------------------------------------
# Parse S3 path
# ------------------------------------------------
# Raise explicitly instead of using `assert`: assertions are removed when
# Python runs with -O, which would silently disable this validation.
if not s3_path.startswith("s3://"):
    raise ValueError("S3 path must start with 's3://'")

path_no_scheme = s3_path[len("s3://"):]
path_parts = path_no_scheme.split("/", 1)
# A path such as "s3://bucket" has no key; fail with a clear message rather
# than an opaque tuple-unpacking error.
if len(path_parts) != 2 or not all(path_parts):
    raise ValueError(f"S3 path must contain both a bucket and a key: {s3_path}")
bucket, key = path_parts

print(f"[INFO] Bucket: {bucket}")
print(f"[INFO] Key: {key}")

# ------------------------------------------------
# Read pickle directly from S3 into memory
# ------------------------------------------------
print("[INFO] Initializing S3 client...")
s3 = boto3.client("s3")

print("[INFO] Fetching object from S3 (streaming)...")
obj = s3.get_object(Bucket=bucket, Key=key)

print("[INFO] Loading pickle from StreamingBody...")
# SECURITY NOTE: pickle.load can execute arbitrary code embedded in the
# payload. Only load objects from a bucket whose contents are trusted.
body_stream = obj["Body"]
try:
    # NOTE: this rebinds features_dict, replacing the front-mask dictionary.
    features_dict = pickle.load(body_stream)
finally:
    # Always close the response stream, even if unpickling fails.
    body_stream.close()

print("[INFO] Pickle load complete.")
print(f"[INFO] Type of loaded object: {type(features_dict)}")
print(f"[INFO] Number of photo_ids (dict keys): {len(features_dict):,}")


[INFO] Starting process...
[INFO] S3 path: s3://ai-bmi-predictor/feature-extraction-data/testing-data-A/dinov2_large_side_masked_features-testA.pkl
[INFO] Bucket: ai-bmi-predictor
[INFO] Key: feature-extraction-data/testing-data-A/dinov2_large_side_masked_features-testA.pkl
[INFO] Initializing S3 client...
[INFO] Fetching object from S3 (streaming)...
[INFO] Loading pickle from StreamingBody...
[INFO] Pickle load complete.
[INFO] Type of loaded object: <class 'dict'>
[INFO] Number of photo_ids (dict keys): 1,684


2.2. inspect one element to understand the shape

In [6]:
# 2.2 Inspect one element to understand the shape

# ------------------------------------------------
# Peek at a single side-mask entry to learn the feature layout
# ------------------------------------------------
first_key, first_raw_value = next(iter(features_dict.items()))
first_value = np.asarray(first_raw_value)

print(f"[DEBUG] Example photo_id: {first_key}")
print(f"[DEBUG] Raw type of feature value: {type(first_raw_value)}")
print(f"[DEBUG] Converted to np.array with shape: {first_value.shape}, ndim: {first_value.ndim}")

# Only 1D vectors and 2D arrays (e.g. 1 x N) are supported.
if first_value.ndim not in (1, 2):
    raise ValueError(f"[ERROR] Unsupported feature value ndim: {first_value.ndim}")

# For a 1D vector the element count is its length; a 2D array is flattened
# later, so its total element count is the feature length as well.
feature_length = first_value.size

print(f"[INFO] Inferred feature_length: {feature_length}")


[DEBUG] Example photo_id: e5ae8fe5bbdf611a1e8d06e66e849bdf.png
[DEBUG] Raw type of feature value: <class 'numpy.ndarray'>
[DEBUG] Converted to np.array with shape: (1, 1024), ndim: 2
[INFO] Inferred feature_length: 1024


2.3. create 2D array

In [7]:
# 2.3 Create 2D array

# ------------------------------------------------
# Flatten every side feature vector and stack them into one 2D matrix
# ------------------------------------------------
print("[INFO] Flattening all feature vectors and building feature matrix...")

photo_ids = []
features_list = []

for i, (pid, feat) in enumerate(features_dict.items(), start=1):
    arr = np.asarray(feat)

    # Guard clause: anything other than 1D or 2D input is rejected.
    if arr.ndim not in (1, 2):
        raise ValueError(f"[ERROR] Feature for photo_id {pid} has unsupported ndim: {arr.ndim}")

    # 1D input is used as is; 2D input is flattened to a 1D vector.
    vec = arr if arr.ndim == 1 else arr.reshape(-1)

    # Every vector must match the length inferred from the first entry.
    vec_length = vec.shape[0]
    if vec_length != feature_length:
        raise ValueError(
            f"[ERROR] Inconsistent feature length for photo_id {pid}: "
            f"expected {feature_length}, got {vec_length}"
        )

    photo_ids.append(pid)
    features_list.append(vec)

    # Progress message after each batch of 1000 vectors.
    if not i % 1000:
        print(f"[INFO] Processed {i} feature vectors...")

features_matrix = np.vstack(features_list)
print(f"[INFO] Feature matrix shape: {features_matrix.shape} (rows x cols)")


[INFO] Flattening all feature vectors and building feature matrix...
[INFO] Processed 1000 feature vectors...
[INFO] Feature matrix shape: (1684, 1024) (rows x cols)


2.4. create the dataframe

In [8]:
# 2.4 Create the DataFrame

# ------------------------------------------------
# Wrap the side feature matrix in a DataFrame with columns s1..sN
# ------------------------------------------------
num_features = features_matrix.shape[1]
feature_columns = [f"s{idx}" for idx in range(1, num_features + 1)]

print("[INFO] Creating DataFrame eff_side_train...")
eff_side_train = pd.DataFrame(features_matrix, columns=feature_columns)
# photo_ids was built in the same loop as the matrix rows, so it lines up
# row for row; place it as the leading column.
eff_side_train.insert(loc=0, column="photo_id", value=photo_ids)

print("[INFO] Final DataFrame created: eff_side_train")
print(f"[INFO] Final shape (rows x cols): {eff_side_train.shape}")
print("[INFO] First 10 columns:", eff_side_train.columns[:10].tolist())

print("[INFO] Preview of eff_side_train:")
# Last expression of the cell: rendered by the notebook as the preview.
eff_side_train.head()


[INFO] Creating DataFrame eff_side_train...
[INFO] Final DataFrame created: eff_side_train
[INFO] Final shape (rows x cols): (1684, 1025)
[INFO] First 10 columns: ['photo_id', 's1', 's2', 's3', 's4', 's5', 's6', 's7', 's8', 's9']
[INFO] Preview of eff_side_train:


Unnamed: 0,photo_id,s1,s2,s3,s4,s5,s6,s7,s8,s9,...,s1015,s1016,s1017,s1018,s1019,s1020,s1021,s1022,s1023,s1024
0,e5ae8fe5bbdf611a1e8d06e66e849bdf.png,-0.807149,-0.032779,-0.560389,-0.437324,-1.986211,1.267072,-0.844637,-2.005862,-1.13212,...,-0.302019,-1.04651,1.822773,0.370059,0.210585,0.036682,0.932571,-0.502113,0.454886,-1.286794
1,605a5fd09058c48156b0ef518b63b2de.png,0.443066,-0.54886,-0.459339,-0.607113,-0.964106,-0.079236,-1.850846,0.037031,-1.736993,...,-1.375771,1.48191,0.29084,-0.721068,-0.171349,-1.032791,0.701256,-2.140274,-2.126096,-0.975578
2,909c9277309e13ee014e347603aba620.png,0.344981,0.112628,-0.445933,-0.81271,-1.611799,0.936948,-0.552384,-0.75265,-1.139424,...,-1.77907,0.532031,0.757014,-0.398765,-1.256006,-0.549223,1.210784,-1.171533,-1.008888,-0.86987
3,bef6a68bc8dd475c124f6de2413385d3.png,0.859871,0.425893,-0.099129,-0.848447,-2.333242,0.289111,-0.191757,-0.884657,-1.00621,...,-0.511344,-0.008478,1.005939,-0.706059,-0.051107,-0.803857,0.246187,-1.094311,0.056111,-0.633578
4,6d7ed4bc4a17546447efed0ca6e2ff11.png,-0.82758,-0.245938,-1.515478,-1.576778,-2.155475,1.14129,-0.992209,-0.071712,-0.260987,...,-1.235825,0.989174,0.786326,0.343737,0.153467,-0.336685,1.361712,-0.083629,-0.881862,-1.714599


3. merge front masks dataframe and side masks dataframe

In [9]:
# 3. Merge the front and side feature DataFrames by photo_id

print("[INFO] Merging eff_front_train and eff_side_train on photo_id...")

# Inner join: only photo_ids present in BOTH the front and the side frame
# survive, so each remaining row carries f1..fN and s1..sN together.
merged_eff = eff_front_train.merge(eff_side_train, on="photo_id", how="inner")

print("[INFO] Merge complete.")
print(f"[INFO] merged_eff shape (rows x cols): {merged_eff.shape}")
print("[INFO] First 10 columns:", merged_eff.columns[:10].tolist())

print("[INFO] Preview of merged_eff:")
# Last expression of the cell: rendered by the notebook as the preview.
merged_eff.head()


[INFO] Merging eff_front_train and eff_side_train on photo_id...
[INFO] Merge complete.
[INFO] merged_eff shape (rows x cols): (1684, 2049)
[INFO] First 10 columns: ['photo_id', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9']
[INFO] Preview of merged_eff:


Unnamed: 0,photo_id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,s1015,s1016,s1017,s1018,s1019,s1020,s1021,s1022,s1023,s1024
0,e5ae8fe5bbdf611a1e8d06e66e849bdf.png,-1.042094,-1.730657,1.566182,-1.60408,-0.453005,0.18847,-1.124131,-0.845571,-1.161284,...,-0.302019,-1.04651,1.822773,0.370059,0.210585,0.036682,0.932571,-0.502113,0.454886,-1.286794
1,605a5fd09058c48156b0ef518b63b2de.png,-1.472728,-0.183869,2.148343,-1.645508,-0.92586,0.131387,-1.567845,-0.359548,-1.792026,...,-1.375771,1.48191,0.29084,-0.721068,-0.171349,-1.032791,0.701256,-2.140274,-2.126096,-0.975578
2,909c9277309e13ee014e347603aba620.png,-1.47145,0.037155,1.973593,-1.695215,-0.308931,0.758703,-1.801447,-0.705424,-1.482358,...,-1.77907,0.532031,0.757014,-0.398765,-1.256006,-0.549223,1.210784,-1.171533,-1.008888,-0.86987
3,bef6a68bc8dd475c124f6de2413385d3.png,0.071158,-1.256408,1.118973,-2.16653,-1.075335,0.872181,-1.01387,-0.730944,-1.59996,...,-0.511344,-0.008478,1.005939,-0.706059,-0.051107,-0.803857,0.246187,-1.094311,0.056111,-0.633578
4,6d7ed4bc4a17546447efed0ca6e2ff11.png,-1.396314,-1.110229,2.116553,-1.739712,-1.533494,0.797858,-0.780543,-0.863575,-1.269442,...,-1.235825,0.989174,0.786326,0.343737,0.153467,-0.336685,1.361712,-0.083629,-0.881862,-1.714599


4. load body measurements dataset

In [10]:
# 4. Load body_measurements-testA.csv from S3 as body_measurements DataFrame

import boto3        # AWS SDK for Python (if not already imported)
import pandas as pd # pandas for DataFrame operations

# S3 location of the per-subject body measurements CSV.
s3_csv_path = "s3://ai-bmi-predictor/test-data/body_measurements-testA.csv"

print("[INFO] Starting to load body_measurements.csv...")
print(f"[INFO] S3 CSV path: {s3_csv_path}")

# Raise explicitly instead of using `assert`: assertions are removed when
# Python runs with -O, which would silently disable this validation.
if not s3_csv_path.startswith("s3://"):
    raise ValueError("S3 path must start with 's3://'")

# Remove the scheme and split into bucket and key.
csv_path_no_scheme = s3_csv_path[len("s3://"):]
csv_path_parts = csv_path_no_scheme.split("/", 1)
# A path without a key would otherwise fail with an opaque unpacking error.
if len(csv_path_parts) != 2 or not all(csv_path_parts):
    raise ValueError(f"S3 path must contain both a bucket and a key: {s3_csv_path}")
csv_bucket, csv_key = csv_path_parts

print(f"[INFO] CSV Bucket: {csv_bucket}")
print(f"[INFO] CSV Key: {csv_key}")

print("[INFO] Initializing S3 client for CSV...")
s3_client = boto3.client("s3")

print("[INFO] Fetching CSV object from S3 (streaming)...")
csv_obj = s3_client.get_object(Bucket=csv_bucket, Key=csv_key)

# Parse the CSV straight from the response stream (no local file).
print("[INFO] Reading CSV into pandas DataFrame...")
csv_stream = csv_obj["Body"]
try:
    body_measurements = pd.read_csv(csv_stream)
finally:
    # Always close the response stream, even if parsing fails.
    csv_stream.close()

print("[INFO] body_measurements DataFrame loaded successfully.")
print(f"[INFO] DataFrame shape (rows x cols): {body_measurements.shape}")
print("[INFO] Preview of body_measurements:")
# Last expression of the cell: rendered by the notebook as the preview.
body_measurements.head()


[INFO] Starting to load body_measurements.csv...
[INFO] S3 CSV path: s3://ai-bmi-predictor/test-data/body_measurements-testA.csv
[INFO] CSV Bucket: ai-bmi-predictor
[INFO] CSV Key: test-data/body_measurements-testA.csv
[INFO] Initializing S3 client for CSV...
[INFO] Fetching CSV object from S3 (streaming)...
[INFO] Reading CSV into pandas DataFrame...
[INFO] body_measurements DataFrame loaded successfully.
[INFO] DataFrame shape (rows x cols): (87, 17)
[INFO] Preview of body_measurements:


Unnamed: 0,subject_id,ankle,arm-length,bicep,calf,chest,forearm,hip,leg-length,shoulder-breadth,shoulder-to-crotch,thigh,waist,wrist,gender,height_cm,weight_kg
0,-JtWlHEvrLPWagPaAhm0eQ_UKMSg8By1yoe-SkdrKSs,22.739975,46.199041,26.398842,35.114396,97.744168,23.393059,98.066856,72.135391,34.749683,59.790292,52.042527,80.647595,15.284669,female,160.9,61.95
1,1T9SoRhbCpQ0zXf4Le6WjwTOPdfnmEDebvWaECZg_HM,25.477705,48.231339,31.915012,38.683422,107.267212,28.925291,103.848091,77.136208,38.38118,68.413773,55.454933,90.152718,17.260105,male,170.7,80.0
2,2LAYE-qYtPZMSeH-gUgP-H4o-WRPzC0Cg9Nf5_AkLBE,25.254661,50.602189,29.626611,37.51582,100.673547,27.386687,99.342301,82.275874,36.059983,66.440526,53.742692,82.100598,17.086464,male,179.7,73.85
3,3KCeTD5hLgY5qDa-Dhg0lcrnsRM_EndCLV4DMSOQfQM,23.995326,51.213376,25.230598,35.231322,92.436885,24.257676,95.405576,79.827815,36.394586,67.045784,48.81131,82.786952,17.30264,male,176.1,63.3
4,3beF_SfHubarfkfoyCeA6EDR7UqE8j-bgj4bzBFzGFM,24.038458,53.719632,28.889543,35.938774,107.578957,26.352268,97.628494,85.352661,37.451279,66.207982,53.045108,89.971087,16.362741,male,179.7,73.8


5. subject_id and photo_id map dataset

In [11]:
# 5. Load subject_to_photo_map.csv as smp DataFrame

import boto3        # AWS SDK for accessing S3
import pandas as pd # pandas for DataFrame operations

# S3 location of the subject_id -> photo_id mapping CSV.
smp_s3_path = "s3://amazon-bodym/testA/subject_to_photo_map.csv"

print("[INFO] Starting to load subject_to_photo_map.csv...")
print(f"[INFO] S3 CSV path: {smp_s3_path}")

# Raise explicitly instead of using `assert`: assertions are removed when
# Python runs with -O, which would silently disable this validation.
if not smp_s3_path.startswith("s3://"):
    raise ValueError("S3 path must start with 's3://'")

# Remove the scheme and split into bucket and key.
smp_path_no_scheme = smp_s3_path[len("s3://"):]
smp_path_parts = smp_path_no_scheme.split("/", 1)
# A path without a key would otherwise fail with an opaque unpacking error.
if len(smp_path_parts) != 2 or not all(smp_path_parts):
    raise ValueError(f"S3 path must contain both a bucket and a key: {smp_s3_path}")
smp_bucket, smp_key = smp_path_parts

print(f"[INFO] CSV Bucket: {smp_bucket}")
print(f"[INFO] CSV Key: {smp_key}")

print("[INFO] Initializing S3 client for subject_to_photo_map.csv...")
s3_client = boto3.client("s3")

print("[INFO] Fetching subject_to_photo_map.csv from S3 (streaming)...")
csv_obj = s3_client.get_object(Bucket=smp_bucket, Key=smp_key)

# Parse the CSV straight from the response stream (no local file).
print("[INFO] Reading CSV into pandas DataFrame (smp)...")
smp_stream = csv_obj["Body"]
try:
    smp = pd.read_csv(smp_stream)
finally:
    # Always close the response stream, even if parsing fails.
    smp_stream.close()

print("[INFO] smp DataFrame loaded successfully.")
print(f"[INFO] smp shape (rows x cols): {smp.shape}")
print("[INFO] Preview of smp:")
# Last expression of the cell: rendered by the notebook as the preview.
smp.head()


[INFO] Starting to load subject_to_photo_map.csv...
[INFO] S3 CSV path: s3://amazon-bodym/testA/subject_to_photo_map.csv
[INFO] CSV Bucket: amazon-bodym
[INFO] CSV Key: testA/subject_to_photo_map.csv
[INFO] Initializing S3 client for subject_to_photo_map.csv...
[INFO] Fetching subject_to_photo_map.csv from S3 (streaming)...
[INFO] Reading CSV into pandas DataFrame (smp)...
[INFO] smp DataFrame loaded successfully.
[INFO] smp shape (rows x cols): (1684, 2)
[INFO] Preview of smp:


Unnamed: 0,subject_id,photo_id
0,-JtWlHEvrLPWagPaAhm0eQ_UKMSg8By1yoe-SkdrKSs,1982951758ea65ae715d9d1a95a8ecd1
1,-JtWlHEvrLPWagPaAhm0eQ_UKMSg8By1yoe-SkdrKSs,dc567997daef596835cce05a0c6f770f
2,-JtWlHEvrLPWagPaAhm0eQ_UKMSg8By1yoe-SkdrKSs,f72236f1ab2ddb4a7b2863019c1c943c
3,-JtWlHEvrLPWagPaAhm0eQ_UKMSg8By1yoe-SkdrKSs,f6aff3e542d4101fe9ac7cfa6ee4439a
4,-JtWlHEvrLPWagPaAhm0eQ_UKMSg8By1yoe-SkdrKSs,00e11c860f7b081a01e9c7f04405c40f


6. merge body_measurements with photo_id

In [12]:
# 6. Attach body measurements to every photo_id via subject_id
#    (one subject has many photos, so measurement rows repeat per photo)

print("[INFO] Merging smp and body_measurements on subject_id...")

# Left join keeps every row of smp; measurement columns are filled from the
# matching subject_id in body_measurements.
smp_body_measurements = smp.merge(body_measurements, on="subject_id", how="left")

print("[INFO] Merge complete.")
print(f"[INFO] smp_body_measurements shape (rows x cols): {smp_body_measurements.shape}")
print("[INFO] First 10 columns:", smp_body_measurements.columns[:10].tolist())

print("[INFO] Preview of smp_body_measurements:")
# Last expression of the cell: rendered by the notebook as the preview.
smp_body_measurements.head()


[INFO] Merging smp and body_measurements on subject_id...
[INFO] Merge complete.
[INFO] smp_body_measurements shape (rows x cols): (1684, 18)
[INFO] First 10 columns: ['subject_id', 'photo_id', 'ankle', 'arm-length', 'bicep', 'calf', 'chest', 'forearm', 'hip', 'leg-length']
[INFO] Preview of smp_body_measurements:


Unnamed: 0,subject_id,photo_id,ankle,arm-length,bicep,calf,chest,forearm,hip,leg-length,shoulder-breadth,shoulder-to-crotch,thigh,waist,wrist,gender,height_cm,weight_kg
0,-JtWlHEvrLPWagPaAhm0eQ_UKMSg8By1yoe-SkdrKSs,1982951758ea65ae715d9d1a95a8ecd1,22.739975,46.199041,26.398842,35.114396,97.744168,23.393059,98.066856,72.135391,34.749683,59.790292,52.042527,80.647595,15.284669,female,160.9,61.95
1,-JtWlHEvrLPWagPaAhm0eQ_UKMSg8By1yoe-SkdrKSs,dc567997daef596835cce05a0c6f770f,22.739975,46.199041,26.398842,35.114396,97.744168,23.393059,98.066856,72.135391,34.749683,59.790292,52.042527,80.647595,15.284669,female,160.9,61.95
2,-JtWlHEvrLPWagPaAhm0eQ_UKMSg8By1yoe-SkdrKSs,f72236f1ab2ddb4a7b2863019c1c943c,22.739975,46.199041,26.398842,35.114396,97.744168,23.393059,98.066856,72.135391,34.749683,59.790292,52.042527,80.647595,15.284669,female,160.9,61.95
3,-JtWlHEvrLPWagPaAhm0eQ_UKMSg8By1yoe-SkdrKSs,f6aff3e542d4101fe9ac7cfa6ee4439a,22.739975,46.199041,26.398842,35.114396,97.744168,23.393059,98.066856,72.135391,34.749683,59.790292,52.042527,80.647595,15.284669,female,160.9,61.95
4,-JtWlHEvrLPWagPaAhm0eQ_UKMSg8By1yoe-SkdrKSs,00e11c860f7b081a01e9c7f04405c40f,22.739975,46.199041,26.398842,35.114396,97.744168,23.393059,98.066856,72.135391,34.749683,59.790292,52.042527,80.647595,15.284669,female,160.9,61.95


In [13]:
# Show the (rows, cols) shape of smp_body_measurements as the cell output.
smp_body_measurements.shape

(1684, 18)

In [14]:
# List the column labels of smp_body_measurements as the cell output.
smp_body_measurements.columns

Index(['subject_id', 'photo_id', 'ankle', 'arm-length', 'bicep', 'calf',
       'chest', 'forearm', 'hip', 'leg-length', 'shoulder-breadth',
       'shoulder-to-crotch', 'thigh', 'waist', 'wrist', 'gender', 'height_cm',
       'weight_kg'],
      dtype='object')

In [15]:
# Preview merged_eff before cleaning; photo_id values still end in '.png' here.
merged_eff.head()

Unnamed: 0,photo_id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,s1015,s1016,s1017,s1018,s1019,s1020,s1021,s1022,s1023,s1024
0,e5ae8fe5bbdf611a1e8d06e66e849bdf.png,-1.042094,-1.730657,1.566182,-1.60408,-0.453005,0.18847,-1.124131,-0.845571,-1.161284,...,-0.302019,-1.04651,1.822773,0.370059,0.210585,0.036682,0.932571,-0.502113,0.454886,-1.286794
1,605a5fd09058c48156b0ef518b63b2de.png,-1.472728,-0.183869,2.148343,-1.645508,-0.92586,0.131387,-1.567845,-0.359548,-1.792026,...,-1.375771,1.48191,0.29084,-0.721068,-0.171349,-1.032791,0.701256,-2.140274,-2.126096,-0.975578
2,909c9277309e13ee014e347603aba620.png,-1.47145,0.037155,1.973593,-1.695215,-0.308931,0.758703,-1.801447,-0.705424,-1.482358,...,-1.77907,0.532031,0.757014,-0.398765,-1.256006,-0.549223,1.210784,-1.171533,-1.008888,-0.86987
3,bef6a68bc8dd475c124f6de2413385d3.png,0.071158,-1.256408,1.118973,-2.16653,-1.075335,0.872181,-1.01387,-0.730944,-1.59996,...,-0.511344,-0.008478,1.005939,-0.706059,-0.051107,-0.803857,0.246187,-1.094311,0.056111,-0.633578
4,6d7ed4bc4a17546447efed0ca6e2ff11.png,-1.396314,-1.110229,2.116553,-1.739712,-1.533494,0.797858,-0.780543,-0.863575,-1.269442,...,-1.235825,0.989174,0.786326,0.343737,0.153467,-0.336685,1.361712,-0.083629,-0.881862,-1.714599


7. merge body measurements and feature extractions

In [16]:
# 7. Normalise photo_id in merged_eff by dropping the '.png' suffix (in place)
#    so the ids match the suffix-free photo_id values in the mapping data.

print("[INFO] Cleaning photo_id column in merged_eff (removing '.png')...")

# Sample of the ids before the change.
print("[DEBUG] Sample photo_id values BEFORE cleaning:",
      merged_eff["photo_id"].head().tolist())

# Cast to str first, then strip '.png' only where it terminates the value.
photo_id_as_text = merged_eff["photo_id"].astype(str)
merged_eff["photo_id"] = photo_id_as_text.str.replace(r"\.png$", "", regex=True)

# Sample of the ids after the change.
print("[DEBUG] Sample photo_id values AFTER cleaning:",
      merged_eff["photo_id"].head().tolist())

# Row count vs. unique-id count: equal numbers mean no ids collided.
print(f"[INFO] Number of rows in merged_eff: {merged_eff.shape[0]}")
print(f"[INFO] Number of unique photo_id values: {merged_eff['photo_id'].nunique()}")

print("[INFO] merged_eff photo_id column updated successfully.")
# NOTE: If eff_training was already built before running this cell,
#       rerun the merge that creates eff_training so it picks up the cleaned ids.


[INFO] Cleaning photo_id column in merged_eff (removing '.png')...
[DEBUG] Sample photo_id values BEFORE cleaning: ['e5ae8fe5bbdf611a1e8d06e66e849bdf.png', '605a5fd09058c48156b0ef518b63b2de.png', '909c9277309e13ee014e347603aba620.png', 'bef6a68bc8dd475c124f6de2413385d3.png', '6d7ed4bc4a17546447efed0ca6e2ff11.png']
[DEBUG] Sample photo_id values AFTER cleaning: ['e5ae8fe5bbdf611a1e8d06e66e849bdf', '605a5fd09058c48156b0ef518b63b2de', '909c9277309e13ee014e347603aba620', 'bef6a68bc8dd475c124f6de2413385d3', '6d7ed4bc4a17546447efed0ca6e2ff11']
[INFO] Number of rows in merged_eff: 1684
[INFO] Number of unique photo_id values: 1684
[INFO] merged_eff photo_id column updated successfully.


In [17]:
# Preview merged_eff after the '.png' suffix was removed from photo_id.
merged_eff.head()

Unnamed: 0,photo_id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,s1015,s1016,s1017,s1018,s1019,s1020,s1021,s1022,s1023,s1024
0,e5ae8fe5bbdf611a1e8d06e66e849bdf,-1.042094,-1.730657,1.566182,-1.60408,-0.453005,0.18847,-1.124131,-0.845571,-1.161284,...,-0.302019,-1.04651,1.822773,0.370059,0.210585,0.036682,0.932571,-0.502113,0.454886,-1.286794
1,605a5fd09058c48156b0ef518b63b2de,-1.472728,-0.183869,2.148343,-1.645508,-0.92586,0.131387,-1.567845,-0.359548,-1.792026,...,-1.375771,1.48191,0.29084,-0.721068,-0.171349,-1.032791,0.701256,-2.140274,-2.126096,-0.975578
2,909c9277309e13ee014e347603aba620,-1.47145,0.037155,1.973593,-1.695215,-0.308931,0.758703,-1.801447,-0.705424,-1.482358,...,-1.77907,0.532031,0.757014,-0.398765,-1.256006,-0.549223,1.210784,-1.171533,-1.008888,-0.86987
3,bef6a68bc8dd475c124f6de2413385d3,0.071158,-1.256408,1.118973,-2.16653,-1.075335,0.872181,-1.01387,-0.730944,-1.59996,...,-0.511344,-0.008478,1.005939,-0.706059,-0.051107,-0.803857,0.246187,-1.094311,0.056111,-0.633578
4,6d7ed4bc4a17546447efed0ca6e2ff11,-1.396314,-1.110229,2.116553,-1.739712,-1.533494,0.797858,-0.780543,-0.863575,-1.269442,...,-1.235825,0.989174,0.786326,0.343737,0.153467,-0.336685,1.361712,-0.083629,-0.881862,-1.714599


In [18]:
# 7. (continued) Join features with body measurements on photo_id (inner join)

print("[INFO] Merging merged_eff and smp_body_measurements on photo_id...")

# Inner join: a row is kept only when its photo_id exists on both sides, i.e.
# it has front+side features AND subject/body-measurement data.
eff_training = merged_eff.merge(smp_body_measurements, on="photo_id", how="inner")

print("[INFO] Merge complete.")
print(f"[INFO] eff_training shape (rows x cols): {eff_training.shape}")
print("[INFO] First 10 columns:", eff_training.columns[:10].tolist())

print("[INFO] Preview of eff_training:")
# Last expression of the cell: rendered by the notebook as the preview.
eff_training.head()


[INFO] Merging merged_eff and smp_body_measurements on photo_id...
[INFO] Merge complete.
[INFO] eff_training shape (rows x cols): (1684, 2066)
[INFO] First 10 columns: ['photo_id', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9']
[INFO] Preview of eff_training:


Unnamed: 0,photo_id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,hip,leg-length,shoulder-breadth,shoulder-to-crotch,thigh,waist,wrist,gender,height_cm,weight_kg
0,e5ae8fe5bbdf611a1e8d06e66e849bdf,-1.042094,-1.730657,1.566182,-1.60408,-0.453005,0.18847,-1.124131,-0.845571,-1.161284,...,106.77469,83.279744,39.922305,70.005128,55.945992,98.25039,20.187082,male,180.0,94.6
1,605a5fd09058c48156b0ef518b63b2de,-1.472728,-0.183869,2.148343,-1.645508,-0.92586,0.131387,-1.567845,-0.359548,-1.792026,...,102.481633,84.876529,39.974203,73.591637,55.397032,88.003618,17.715785,male,188.9,86.75
2,909c9277309e13ee014e347603aba620,-1.47145,0.037155,1.973593,-1.695215,-0.308931,0.758703,-1.801447,-0.705424,-1.482358,...,99.342301,82.275874,36.059983,66.440526,53.742692,82.100598,17.086464,male,179.7,73.85
3,bef6a68bc8dd475c124f6de2413385d3,0.071158,-1.256408,1.118973,-2.16653,-1.075335,0.872181,-1.01387,-0.730944,-1.59996,...,101.770144,76.081842,34.071748,62.218026,52.396573,83.999124,16.299751,female,166.95,69.05
4,6d7ed4bc4a17546447efed0ca6e2ff11,-1.396314,-1.110229,2.116553,-1.739712,-1.533494,0.797858,-0.780543,-0.863575,-1.269442,...,94.707063,81.328892,36.834735,64.426273,49.895157,86.020117,16.531431,male,173.2,65.55


In [19]:
# Show the (rows, cols) shape of the final eff_training DataFrame.
eff_training.shape

(1684, 2066)

8. send final dataset to s3 location

In [20]:
# 8. Save eff_training DataFrame to S3 as dinov2-large_testingA.csv

import boto3      # AWS SDK for Python (if not already imported)
import io         # for in-memory text buffer

s3_out_path = "s3://ai-bmi-predictor/test-data/dinov2-large_testingA.csv"  # target S3 path for output CSV

print("[INFO] Starting upload of eff_training to S3...")  # log start
print(f"[INFO] Output S3 path: {s3_out_path}")            # log S3 path

# Ensure S3 path format is correct
assert s3_out_path.startswith("s3://"), "S3 path must start with 's3://'"  # basic validation

# Parse bucket and key from S3 path
out_path_no_scheme = s3_out_path[len("s3://"):]     # remove 's3://' prefix
out_bucket, out_key = out_path_no_scheme.split("/", 1)  # split into bucket and key

print(f"[INFO] Output bucket: {out_bucket}")  # log bucket name
print(f"[INFO] Output key: {out_key}")        # log key (object path)

# Convert DataFrame to CSV in memory (no local file)
csv_buffer = io.StringIO()                           # create in-memory text buffer
eff_training.to_csv(csv_buffer, index=False)         # write DataFrame as CSV into buffer

# Initialize S3 client
print("[INFO] Initializing S3 client for upload...")  # log client creation
s3_client = boto3.client("s3")                        # create S3 client

# Upload CSV content from buffer to S3
print("[INFO] Uploading eff_training.csv to S3...")   # log upload start
s3_client.put_object(
    Bucket=out_bucket,                                # target bucket
    Key=out_key,                                      # target key / object name
    Body=csv_buffer.getvalue()                        # CSV data as string
)

print("[INFO] Upload complete: eff_training.csv saved to S3.")  # log completion


[INFO] Starting upload of eff_training to S3...
[INFO] Output S3 path: s3://ai-bmi-predictor/test-data/dinov2-large_testingA.csv
[INFO] Output bucket: ai-bmi-predictor
[INFO] Output key: test-data/dinov2-large_testingA.csv
[INFO] Initializing S3 client for upload...
[INFO] Uploading eff_training.csv to S3...
[INFO] Upload complete: eff_training.csv saved to S3.
