1. create dataframe using extracted features of front masks
    * 1.1. load the pickle file of extracted features of front masks
    * 1.2. inspect one element to understand the shape
    * 1.3. create 2D array
    * 1.4. create the dataframe

2. create dataframe using extracted features from side masks
    * 2.1. load the pickle file of extracted features of side masks
    * 2.2. inspect one element to understand the shape
    * 2.3. create 2D array
    * 2.4. create the dataframe

3. merge front masks dataframe and side masks dataframe

4. load body_measurements dataset

5. load mapped subject_id and photo_id dataset

6. map body measurements with photo_id

7. merge body measurements and feature extractions (after removing the .png suffix from photo_id in the feature dataframe)

1. create dataframe using extracted features of front masks

1.1. load the pickle file

In [1]:
import pickle
import boto3
import pandas as pd
import numpy as np

# ------------------------------------------------
# Config
# ------------------------------------------------
# Location of the pickled front-mask feature file (ResNet50, testB per its name).
s3_path = "s3://ai-bmi-predictor/feature-extraction-data/testing-data-B/testB - resnet50_front_masked_features.pkl"

print("[INFO] Starting process...")
print(f"[INFO] S3 path: {s3_path}")

# ------------------------------------------------
# Parse S3 path
# ------------------------------------------------
# Validate with an explicit exception: `assert` statements are removed when
# Python runs with -O, which would silently skip this check.
if not s3_path.startswith("s3://"):
    raise ValueError("S3 path must start with 's3://'")
path_no_scheme = s3_path[len("s3://"):]
# Split on the first "/" only: the bucket comes first, the rest is the object key.
bucket, key = path_no_scheme.split("/", 1)

print(f"[INFO] Bucket: {bucket}")
print(f"[INFO] Key: {key}")

# ------------------------------------------------
# Read pickle directly from S3 into memory
# ------------------------------------------------
print("[INFO] Initializing S3 client...")
s3 = boto3.client("s3")

print("[INFO] Fetching object from S3 (streaming)...")
obj = s3.get_object(Bucket=bucket, Key=key)

print("[INFO] Loading pickle from StreamingBody...")
# SECURITY: unpickling can execute arbitrary code. Only load this object if the
# bucket/key is trusted and write access to it is controlled.
body = obj["Body"]
try:
    features_dict = pickle.load(body)
finally:
    # Release the underlying HTTP connection even if unpickling fails.
    body.close()

print("[INFO] Pickle load complete.")
print(f"[INFO] Type of loaded object: {type(features_dict)}")
# len() counts the top-level entries; the notebook treats the keys as photo_ids.
print(f"[INFO] Number of photo_ids (dict keys): {len(features_dict):,}")

[INFO] Starting process...
[INFO] S3 path: s3://ai-bmi-predictor/feature-extraction-data/testing-data-B/testB - resnet50_front_masked_features.pkl
[INFO] Bucket: ai-bmi-predictor
[INFO] Key: feature-extraction-data/testing-data-B/testB - resnet50_front_masked_features.pkl
[INFO] Initializing S3 client...
[INFO] Fetching object from S3 (streaming)...
[INFO] Loading pickle from StreamingBody...
[INFO] Pickle load complete.
[INFO] Type of loaded object: <class 'dict'>
[INFO] Number of photo_ids (dict keys): 1,160


1.2. inspect one element to understand the shape

In [2]:
# ------------------------------------------------
# Peek at a single entry to learn how features are stored
# ------------------------------------------------
# Take the first (key, value) pair in the dict's iteration order.
first_key, first_raw_value = next(iter(features_dict.items()))
first_value = np.asarray(first_raw_value)

print(f"[DEBUG] Example photo_id: {first_key}")
print(f"[DEBUG] Raw type of feature value: {type(first_raw_value)}")
print(f"[DEBUG] Converted to np.array with shape: {first_value.shape}, ndim: {first_value.ndim}")

# Guard clause: only 1-D vectors and 2-D arrays are handled.
if first_value.ndim not in (1, 2):
    raise ValueError(f"[ERROR] Unsupported feature value ndim: {first_value.ndim}")

# 1-D: length of the vector. 2-D (e.g. (1, N) or (N, 1)): total element count,
# i.e. the length the array will have once flattened.
feature_length = first_value.shape[0] if first_value.ndim == 1 else first_value.size

print(f"[INFO] Inferred feature_length: {feature_length}")

[DEBUG] Example photo_id: 5e09e1b0d43b6c430709a513f594c591.png
[DEBUG] Raw type of feature value: <class 'numpy.ndarray'>
[DEBUG] Converted to np.array with shape: (1, 2048), ndim: 2
[INFO] Inferred feature_length: 2048


1.3. create 2D array

In [3]:
# ------------------------------------------------
# Turn the dict into parallel lists, then one 2D matrix
# ------------------------------------------------
print("[INFO] Flattening all feature vectors and building feature matrix...")

# photo_ids[k] is the key whose flattened features are features_list[k].
photo_ids = []
features_list = []

for i, (pid, feat) in enumerate(features_dict.items(), start=1):
    arr = np.asarray(feat)

    # Reject anything that is neither a vector nor a 2-D array.
    if arr.ndim not in (1, 2):
        raise ValueError(f"[ERROR] Feature for photo_id {pid} has unsupported ndim: {arr.ndim}")

    # A vector is used unchanged; a 2-D array (e.g. (1, N)) is flattened to (N,).
    vec = arr if arr.ndim == 1 else arr.reshape(-1)

    # Every row must match the length inferred from the first entry,
    # otherwise the rows could not be stacked into a rectangular matrix.
    if vec.shape[0] != feature_length:
        raise ValueError(
            f"[ERROR] Inconsistent feature length for photo_id {pid}: "
            f"expected {feature_length}, got {vec.shape[0]}"
        )

    photo_ids.append(pid)
    features_list.append(vec)

    # Progress line once per 1000 processed entries.
    if i % 1000 == 0:
        print(f"[INFO] Processed {i} feature vectors...")

# One row per photo, one column per feature value.
features_matrix = np.vstack(features_list)
print(f"[INFO] Feature matrix shape: {features_matrix.shape} (rows x cols)")

[INFO] Flattening all feature vectors and building feature matrix...
[INFO] Processed 1000 feature vectors...
[INFO] Feature matrix shape: (1160, 2048) (rows x cols)


1.4. create the dataframe

In [4]:
# ------------------------------------------------
# Wrap the feature matrix in a DataFrame: photo_id, f1..fN
# ------------------------------------------------
# NOTE(review): the `eff_` / `_train` naming does not match the S3 path
# (ResNet50 features, testB split) — presumably inherited from another
# notebook; confirm before renaming, later cells depend on this name.
_, num_features = features_matrix.shape
feature_columns = [f"f{n}" for n in range(1, num_features + 1)]  # 1-based labels

print("[INFO] Creating DataFrame eff_front_train...")
eff_front_train = pd.DataFrame(features_matrix, columns=feature_columns)
# photo_ids was built in the same loop as the matrix rows, so positions line up.
eff_front_train.insert(0, "photo_id", photo_ids)

print("[INFO] Final DataFrame created: eff_front_train")
print(f"[INFO] Final shape (rows x cols): {eff_front_train.shape}")
print("[INFO] First 10 columns:", eff_front_train.columns.tolist()[:10])

print("[INFO] Preview of eff_front_train:")
# Last expression in the cell so the notebook renders the preview table.
eff_front_train.head()


[INFO] Creating DataFrame eff_front_train...
[INFO] Final DataFrame created: eff_front_train
[INFO] Final shape (rows x cols): (1160, 2049)
[INFO] First 10 columns: ['photo_id', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9']
[INFO] Preview of eff_front_train:


Unnamed: 0,photo_id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f2039,f2040,f2041,f2042,f2043,f2044,f2045,f2046,f2047,f2048
0,5e09e1b0d43b6c430709a513f594c591.png,2.651107,0.725831,0.294534,0.245607,0.417708,0.241973,0.957666,0.154901,0.044594,...,0.0,0.136274,1.113357,2.759108,1.035532,0.421357,3.5888,0.016119,0.862712,0.502704
1,7e6a57e2fcabf518e9007a77d4cc4960.png,1.202348,1.214229,0.113891,0.388146,0.679341,0.054397,0.785075,0.297846,0.279252,...,0.007131,0.143738,1.977156,3.021626,0.460767,0.110417,1.789404,0.02339,0.67342,0.0
2,4ee8251ae7e4ad42c75644db390fc5c5.png,1.778628,1.157071,0.641115,0.310468,0.513974,0.218655,0.599819,0.0,0.534557,...,0.060644,0.02369,0.95632,2.083938,0.494635,0.140661,2.635852,0.0,0.396027,0.180219
3,6367d327f96b951e4a498c88d615e8a4.png,1.980412,1.035453,0.127889,0.279539,0.935939,0.079369,0.707042,0.181815,0.006486,...,0.008424,0.287363,1.472589,2.959509,0.986431,0.040627,2.103104,0.023831,0.519398,0.23883
4,8b16daea6768b92fb27997f76215aa5f.png,1.837158,1.586316,0.240891,0.529174,0.268506,0.287906,0.780661,0.442943,0.037388,...,0.0,0.056634,1.007222,3.526659,1.075611,0.566785,2.532306,0.0,0.473031,0.243284


2. create dataframe using extracted features from side masks

2.1. load the pickle file

In [5]:
# 2.1 Load the pickle file (side masks)

import pickle  # module to load pickle files
import boto3   # AWS SDK for Python to access S3
import pandas as pd  # pandas for DataFrame operations
import numpy as np   # numpy for numerical operations

# ------------------------------------------------
# Config
# ------------------------------------------------
# Location of the pickled side-mask feature file (ResNet50, testB per its name).
s3_path = "s3://ai-bmi-predictor/feature-extraction-data/testing-data-B/testB - resnet50_side_masked_features.pkl"

print("[INFO] Starting process...")
print(f"[INFO] S3 path: {s3_path}")

# ------------------------------------------------
# Parse S3 path
# ------------------------------------------------
# Validate with an explicit exception: `assert` statements are removed when
# Python runs with -O, which would silently skip this check.
if not s3_path.startswith("s3://"):
    raise ValueError("S3 path must start with 's3://'")
path_no_scheme = s3_path[len("s3://"):]
# Split on the first "/" only: the bucket comes first, the rest is the object key.
bucket, key = path_no_scheme.split("/", 1)

print(f"[INFO] Bucket: {bucket}")
print(f"[INFO] Key: {key}")

# ------------------------------------------------
# Read pickle directly from S3 into memory
# ------------------------------------------------
print("[INFO] Initializing S3 client...")
s3 = boto3.client("s3")

print("[INFO] Fetching object from S3 (streaming)...")
obj = s3.get_object(Bucket=bucket, Key=key)

print("[INFO] Loading pickle from StreamingBody...")
# SECURITY: unpickling can execute arbitrary code. Only load this object if the
# bucket/key is trusted and write access to it is controlled.
# NOTE: this rebinds features_dict, replacing the front-mask dict loaded earlier.
body = obj["Body"]
try:
    features_dict = pickle.load(body)
finally:
    # Release the underlying HTTP connection even if unpickling fails.
    body.close()

print("[INFO] Pickle load complete.")
print(f"[INFO] Type of loaded object: {type(features_dict)}")
# len() counts the top-level entries; the notebook treats the keys as photo_ids.
print(f"[INFO] Number of photo_ids (dict keys): {len(features_dict):,}")


[INFO] Starting process...
[INFO] S3 path: s3://ai-bmi-predictor/feature-extraction-data/testing-data-B/testB - resnet50_side_masked_features.pkl
[INFO] Bucket: ai-bmi-predictor
[INFO] Key: feature-extraction-data/testing-data-B/testB - resnet50_side_masked_features.pkl
[INFO] Initializing S3 client...
[INFO] Fetching object from S3 (streaming)...
[INFO] Loading pickle from StreamingBody...
[INFO] Pickle load complete.
[INFO] Type of loaded object: <class 'dict'>
[INFO] Number of photo_ids (dict keys): 1,160


2.2. inspect one element to understand the shape

In [6]:
# 2.2 Inspect one element to understand the shape

# ------------------------------------------------
# Peek at a single entry to learn how features are stored
# ------------------------------------------------
# Take the first (key, value) pair in the dict's iteration order.
first_key, first_raw_value = next(iter(features_dict.items()))
first_value = np.asarray(first_raw_value)

print(f"[DEBUG] Example photo_id: {first_key}")
print(f"[DEBUG] Raw type of feature value: {type(first_raw_value)}")
print(f"[DEBUG] Converted to np.array with shape: {first_value.shape}, ndim: {first_value.ndim}")

# Guard clause: only 1-D vectors and 2-D arrays are handled.
if first_value.ndim not in (1, 2):
    raise ValueError(f"[ERROR] Unsupported feature value ndim: {first_value.ndim}")

# 1-D: length of the vector. 2-D (e.g. 1 x N): total element count,
# i.e. the length the array will have once flattened.
feature_length = first_value.shape[0] if first_value.ndim == 1 else first_value.size

print(f"[INFO] Inferred feature_length: {feature_length}")


[DEBUG] Example photo_id: 5e09e1b0d43b6c430709a513f594c591.png
[DEBUG] Raw type of feature value: <class 'numpy.ndarray'>
[DEBUG] Converted to np.array with shape: (1, 2048), ndim: 2
[INFO] Inferred feature_length: 2048


2.3. create 2D array

In [7]:
# 2.3 Create 2D array

# ------------------------------------------------
# Turn the dict into parallel lists, then one 2D matrix
# ------------------------------------------------
print("[INFO] Flattening all feature vectors and building feature matrix...")

# photo_ids[k] is the key whose flattened features are features_list[k].
photo_ids = []
features_list = []

for i, (pid, feat) in enumerate(features_dict.items(), start=1):
    arr = np.asarray(feat)

    # Reject anything that is neither a vector nor a 2-D array.
    if arr.ndim not in (1, 2):
        raise ValueError(f"[ERROR] Feature for photo_id {pid} has unsupported ndim: {arr.ndim}")

    # A vector is used unchanged; a 2-D array is flattened to a 1-D vector.
    vec = arr if arr.ndim == 1 else arr.reshape(-1)

    # Every row must match the length inferred from the first entry,
    # otherwise the rows could not be stacked into a rectangular matrix.
    if vec.shape[0] != feature_length:
        raise ValueError(
            f"[ERROR] Inconsistent feature length for photo_id {pid}: "
            f"expected {feature_length}, got {vec.shape[0]}"
        )

    photo_ids.append(pid)
    features_list.append(vec)

    # Progress line once per 1000 processed entries.
    if i % 1000 == 0:
        print(f"[INFO] Processed {i} feature vectors...")

# One row per photo, one column per feature value.
features_matrix = np.vstack(features_list)
print(f"[INFO] Feature matrix shape: {features_matrix.shape} (rows x cols)")


[INFO] Flattening all feature vectors and building feature matrix...
[INFO] Processed 1000 feature vectors...
[INFO] Feature matrix shape: (1160, 2048) (rows x cols)


2.4. create the dataframe

In [8]:
# 2.4 Create the DataFrame

# ------------------------------------------------
# Wrap the feature matrix in a DataFrame: photo_id, s1..sN
# ------------------------------------------------
# The "s" prefix keeps side-feature columns distinct from the front "f" columns,
# so the two tables can be joined on photo_id without column-name clashes.
_, num_features = features_matrix.shape
feature_columns = [f"s{n}" for n in range(1, num_features + 1)]  # 1-based labels

print("[INFO] Creating DataFrame eff_side_train...")
eff_side_train = pd.DataFrame(features_matrix, columns=feature_columns)
# photo_ids was built in the same loop as the matrix rows, so positions line up.
eff_side_train.insert(0, "photo_id", photo_ids)

print("[INFO] Final DataFrame created: eff_side_train")
print(f"[INFO] Final shape (rows x cols): {eff_side_train.shape}")
print("[INFO] First 10 columns:", eff_side_train.columns.tolist()[:10])

print("[INFO] Preview of eff_side_train:")
# Last expression in the cell so the notebook renders the preview table.
eff_side_train.head()


[INFO] Creating DataFrame eff_side_train...
[INFO] Final DataFrame created: eff_side_train
[INFO] Final shape (rows x cols): (1160, 2049)
[INFO] First 10 columns: ['photo_id', 's1', 's2', 's3', 's4', 's5', 's6', 's7', 's8', 's9']
[INFO] Preview of eff_side_train:


Unnamed: 0,photo_id,s1,s2,s3,s4,s5,s6,s7,s8,s9,...,s2039,s2040,s2041,s2042,s2043,s2044,s2045,s2046,s2047,s2048
0,5e09e1b0d43b6c430709a513f594c591.png,1.631846,1.332057,0.154007,0.036712,0.469591,0.0,0.192053,0.53644,0.043505,...,0.007417,0.045765,0.189162,2.731008,2.057057,0.11977,2.457719,0.528056,0.530376,0.480334
1,7e6a57e2fcabf518e9007a77d4cc4960.png,0.377816,1.072008,0.12811,0.0,0.271439,0.0,0.017754,0.0,0.004563,...,0.0,0.007807,1.382514,1.436955,0.199988,0.167083,3.044427,0.157858,0.063795,0.211922
2,4ee8251ae7e4ad42c75644db390fc5c5.png,1.367023,0.643553,0.356323,0.006441,0.55725,0.173831,0.113309,0.358639,0.011668,...,0.008276,0.0,0.686753,1.189021,0.689507,0.0,2.191277,0.307915,0.322818,0.555965
3,6367d327f96b951e4a498c88d615e8a4.png,1.923406,0.252396,0.011254,0.158228,0.807402,0.0,0.604845,0.528208,0.0,...,0.0,0.018173,0.310373,2.886946,0.452101,0.029239,1.629398,0.696043,0.603521,0.435756
4,8b16daea6768b92fb27997f76215aa5f.png,1.609097,0.638758,0.075071,0.005283,0.098552,0.0,0.025141,0.81076,0.001704,...,0.006192,0.0,0.327299,3.130944,1.115668,0.000732,3.103849,0.279516,0.820196,0.04031


3. merge front masks dataframe and side masks dataframe

In [9]:
# 3. Join front-mask and side-mask features into one row per photo_id

print("[INFO] Merging eff_front_train and eff_side_train on photo_id...")

# Inner join: a photo_id present in only one of the two tables is dropped.
# Front columns (f*) come first, side columns (s*) follow.
merged_eff = eff_front_train.merge(eff_side_train, how="inner", on="photo_id")

print("[INFO] Merge complete.")
print(f"[INFO] merged_eff shape (rows x cols): {merged_eff.shape}")
print("[INFO] First 10 columns:", merged_eff.columns.tolist()[:10])

print("[INFO] Preview of merged_eff:")
# Last expression in the cell so the notebook renders the preview table.
merged_eff.head()


[INFO] Merging eff_front_train and eff_side_train on photo_id...
[INFO] Merge complete.
[INFO] merged_eff shape (rows x cols): (1160, 4097)
[INFO] First 10 columns: ['photo_id', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9']
[INFO] Preview of merged_eff:


Unnamed: 0,photo_id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,s2039,s2040,s2041,s2042,s2043,s2044,s2045,s2046,s2047,s2048
0,5e09e1b0d43b6c430709a513f594c591.png,2.651107,0.725831,0.294534,0.245607,0.417708,0.241973,0.957666,0.154901,0.044594,...,0.007417,0.045765,0.189162,2.731008,2.057057,0.11977,2.457719,0.528056,0.530376,0.480334
1,7e6a57e2fcabf518e9007a77d4cc4960.png,1.202348,1.214229,0.113891,0.388146,0.679341,0.054397,0.785075,0.297846,0.279252,...,0.0,0.007807,1.382514,1.436955,0.199988,0.167083,3.044427,0.157858,0.063795,0.211922
2,4ee8251ae7e4ad42c75644db390fc5c5.png,1.778628,1.157071,0.641115,0.310468,0.513974,0.218655,0.599819,0.0,0.534557,...,0.008276,0.0,0.686753,1.189021,0.689507,0.0,2.191277,0.307915,0.322818,0.555965
3,6367d327f96b951e4a498c88d615e8a4.png,1.980412,1.035453,0.127889,0.279539,0.935939,0.079369,0.707042,0.181815,0.006486,...,0.0,0.018173,0.310373,2.886946,0.452101,0.029239,1.629398,0.696043,0.603521,0.435756
4,8b16daea6768b92fb27997f76215aa5f.png,1.837158,1.586316,0.240891,0.529174,0.268506,0.287906,0.780661,0.442943,0.037388,...,0.006192,0.0,0.327299,3.130944,1.115668,0.000732,3.103849,0.279516,0.820196,0.04031


4. load body measurements dataset

In [10]:
# 4. Load body_measurements-testB.csv from S3 as body_measurements DataFrame

import boto3        # AWS SDK for Python (if not already imported)
import pandas as pd # pandas for DataFrame operations

# S3 location of the body measurements CSV for the testB split.
s3_csv_path = "s3://ai-bmi-predictor/test-data/body_measurements-testB.csv"

print("[INFO] Starting to load body_measurements.csv...")
print(f"[INFO] S3 CSV path: {s3_csv_path}")

# Validate with an explicit exception: `assert` statements are removed when
# Python runs with -O, which would silently skip this check.
if not s3_csv_path.startswith("s3://"):
    raise ValueError("S3 path must start with 's3://'")

# Drop the scheme, then split on the first "/" into bucket and object key.
csv_path_no_scheme = s3_csv_path[len("s3://"):]
csv_bucket, csv_key = csv_path_no_scheme.split("/", 1)

print(f"[INFO] CSV Bucket: {csv_bucket}")
print(f"[INFO] CSV Key: {csv_key}")

print("[INFO] Initializing S3 client for CSV...")
s3_client = boto3.client("s3")

print("[INFO] Fetching CSV object from S3 (streaming)...")
csv_obj = s3_client.get_object(Bucket=csv_bucket, Key=csv_key)

# Parse the CSV straight from the response stream (no local file).
print("[INFO] Reading CSV into pandas DataFrame...")
csv_body = csv_obj["Body"]
try:
    body_measurements = pd.read_csv(csv_body)
finally:
    # pandas does not close streams it did not open; release the connection here.
    csv_body.close()

print("[INFO] body_measurements DataFrame loaded successfully.")
print(f"[INFO] DataFrame shape (rows x cols): {body_measurements.shape}")
print("[INFO] Preview of body_measurements:")
# Last expression in the cell so the notebook renders the preview table.
body_measurements.head()


[INFO] Starting to load body_measurements.csv...
[INFO] S3 CSV path: s3://ai-bmi-predictor/test-data/body_measurements-testB.csv
[INFO] CSV Bucket: ai-bmi-predictor
[INFO] CSV Key: test-data/body_measurements-testB.csv
[INFO] Initializing S3 client for CSV...
[INFO] Fetching CSV object from S3 (streaming)...
[INFO] Reading CSV into pandas DataFrame...
[INFO] body_measurements DataFrame loaded successfully.
[INFO] DataFrame shape (rows x cols): (400, 17)
[INFO] Preview of body_measurements:


Unnamed: 0,subject_id,ankle,arm-length,bicep,calf,chest,forearm,hip,leg-length,shoulder-breadth,shoulder-to-crotch,thigh,waist,wrist,gender,height_cm,weight_kg
0,07P1Yi2alvpi0MtJTu7-TmoT5E3x2dABQKiuu_jcnLU,23.424223,47.128738,29.744926,37.697201,98.719429,25.687422,101.086189,74.17186,37.112251,64.66703,52.378044,87.121025,16.179781,male,167.0,70.0
1,0AQqSm49JJoFaYTzjDcmRHM4gdwcYZyOqXBZ_yvNy0E,23.465136,48.513802,28.611294,33.724255,96.476715,24.859922,98.461998,76.29821,35.401356,66.202171,48.386967,85.32827,16.165899,male,171.5,63.8
2,0PghUpgAgKJiVgHZpWydh6WqIYKYTlzV-5QR7eSR318,21.773336,45.14246,25.588623,32.983479,85.886726,21.98181,89.685318,72.312012,31.207792,55.746342,46.059982,72.437881,14.523148,female,155.5,49.8
3,0QW7SNzqmRooRGXql2nRJG5HmfgDtvxaxDcovjh9DRQ,23.121952,47.717579,27.691643,36.397129,85.084686,23.563515,100.694473,76.831047,31.613873,59.997318,52.054901,76.603951,15.654763,female,163.5,59.3
4,0RRTDLueddykSM_-MbQYeCfx43VR1uf19kN8n6L7sJA,22.359692,46.879379,33.301228,35.578388,114.84845,25.253469,106.579338,75.1987,33.318371,63.201122,57.078968,100.175446,15.805896,female,164.0,77.3


5. load subject_id and photo_id map dataset

In [11]:
# 5. Load subject_to_photo_map.csv as smp DataFrame

import boto3        # AWS SDK for accessing S3
import pandas as pd # pandas for DataFrame operations

# S3 location of the subject_id -> photo_id mapping for the testB split.
smp_s3_path = "s3://amazon-bodym/testB/subject_to_photo_map.csv"

print("[INFO] Starting to load subject_to_photo_map.csv...")
print(f"[INFO] S3 CSV path: {smp_s3_path}")

# Validate with an explicit exception: `assert` statements are removed when
# Python runs with -O, which would silently skip this check.
if not smp_s3_path.startswith("s3://"):
    raise ValueError("S3 path must start with 's3://'")

# Drop the scheme, then split on the first "/" into bucket and object key.
smp_path_no_scheme = smp_s3_path[len("s3://"):]
smp_bucket, smp_key = smp_path_no_scheme.split("/", 1)

print(f"[INFO] CSV Bucket: {smp_bucket}")
print(f"[INFO] CSV Key: {smp_key}")

print("[INFO] Initializing S3 client for subject_to_photo_map.csv...")
s3_client = boto3.client("s3")

print("[INFO] Fetching subject_to_photo_map.csv from S3 (streaming)...")
csv_obj = s3_client.get_object(Bucket=smp_bucket, Key=smp_key)

# Parse the CSV straight from the response stream (no local file).
print("[INFO] Reading CSV into pandas DataFrame (smp)...")
csv_body = csv_obj["Body"]
try:
    smp = pd.read_csv(csv_body)
finally:
    # pandas does not close streams it did not open; release the connection here.
    csv_body.close()

print("[INFO] smp DataFrame loaded successfully.")
print(f"[INFO] smp shape (rows x cols): {smp.shape}")
print("[INFO] Preview of smp:")
# Last expression in the cell so the notebook renders the preview table.
smp.head()


[INFO] Starting to load subject_to_photo_map.csv...
[INFO] S3 CSV path: s3://amazon-bodym/testB/subject_to_photo_map.csv
[INFO] CSV Bucket: amazon-bodym
[INFO] CSV Key: testB/subject_to_photo_map.csv
[INFO] Initializing S3 client for subject_to_photo_map.csv...
[INFO] Fetching subject_to_photo_map.csv from S3 (streaming)...
[INFO] Reading CSV into pandas DataFrame (smp)...
[INFO] smp DataFrame loaded successfully.
[INFO] smp shape (rows x cols): (1160, 2)
[INFO] Preview of smp:


Unnamed: 0,subject_id,photo_id
0,ikt5hpg4u--nWYDSMAfZXOVZhhah-YtX7p6p7q8Hg6I,d151534ec2286935a58485f594f95464
1,ikt5hpg4u--nWYDSMAfZXOVZhhah-YtX7p6p7q8Hg6I,8659001c5f3d675871b418c374b886d0
2,ikt5hpg4u--nWYDSMAfZXOVZhhah-YtX7p6p7q8Hg6I,0cf8bc3ab37436850dd21e301f6a2b1a
3,AcXtVN5DaisfM7J3T6N5F-ZePD30WawSdWJa-8IEPpA,38626dc8402b497be1796c5668f5f4eb
4,AcXtVN5DaisfM7J3T6N5F-ZePD30WawSdWJa-8IEPpA,020ee6e3d2db84419b49e08f7da4ecbc


6. merge body_measurements with photo_id

In [12]:
# 6. Attach body measurements to every photo via subject_id
#    Left join keeps every row of smp; a subject's measurements are repeated
#    on each of that subject's photo rows.

print("[INFO] Merging smp and body_measurements on subject_id...")

# A subject_id in smp with no match in body_measurements keeps its row,
# with the measurement columns left empty (NaN).
smp_body_measurements = smp.merge(body_measurements, how="left", on="subject_id")

print("[INFO] Merge complete.")
print(f"[INFO] smp_body_measurements shape (rows x cols): {smp_body_measurements.shape}")
print("[INFO] First 10 columns:", smp_body_measurements.columns.tolist()[:10])

print("[INFO] Preview of smp_body_measurements:")
# Last expression in the cell so the notebook renders the preview table.
smp_body_measurements.head()


[INFO] Merging smp and body_measurements on subject_id...
[INFO] Merge complete.
[INFO] smp_body_measurements shape (rows x cols): (1160, 18)
[INFO] First 10 columns: ['subject_id', 'photo_id', 'ankle', 'arm-length', 'bicep', 'calf', 'chest', 'forearm', 'hip', 'leg-length']
[INFO] Preview of smp_body_measurements:


Unnamed: 0,subject_id,photo_id,ankle,arm-length,bicep,calf,chest,forearm,hip,leg-length,shoulder-breadth,shoulder-to-crotch,thigh,waist,wrist,gender,height_cm,weight_kg
0,ikt5hpg4u--nWYDSMAfZXOVZhhah-YtX7p6p7q8Hg6I,d151534ec2286935a58485f594f95464,21.672325,50.716846,24.574076,30.899563,87.887985,22.8797,96.490822,82.649368,33.166767,61.04874,46.537296,73.152466,15.133351,female,174.0,56.7
1,ikt5hpg4u--nWYDSMAfZXOVZhhah-YtX7p6p7q8Hg6I,8659001c5f3d675871b418c374b886d0,21.672325,50.716846,24.574076,30.899563,87.887985,22.8797,96.490822,82.649368,33.166767,61.04874,46.537296,73.152466,15.133351,female,174.0,56.7
2,ikt5hpg4u--nWYDSMAfZXOVZhhah-YtX7p6p7q8Hg6I,0cf8bc3ab37436850dd21e301f6a2b1a,21.672325,50.716846,24.574076,30.899563,87.887985,22.8797,96.490822,82.649368,33.166767,61.04874,46.537296,73.152466,15.133351,female,174.0,56.7
3,AcXtVN5DaisfM7J3T6N5F-ZePD30WawSdWJa-8IEPpA,38626dc8402b497be1796c5668f5f4eb,26.156717,51.795052,34.60302,38.637165,108.793091,30.038279,112.588089,82.382767,38.854736,73.14167,56.130325,101.958183,18.90831,male,186.0,93.6
4,AcXtVN5DaisfM7J3T6N5F-ZePD30WawSdWJa-8IEPpA,020ee6e3d2db84419b49e08f7da4ecbc,26.156717,51.795052,34.60302,38.637165,108.793091,30.038279,112.588089,82.382767,38.854736,73.14167,56.130325,101.958183,18.90831,male,186.0,93.6


In [13]:
# Show the (rows, cols) shape of the photo-map + body-measurements table.
smp_body_measurements.shape

(1160, 18)

In [14]:
# List the column names of the photo-map + body-measurements table.
smp_body_measurements.columns

Index(['subject_id', 'photo_id', 'ankle', 'arm-length', 'bicep', 'calf',
       'chest', 'forearm', 'hip', 'leg-length', 'shoulder-breadth',
       'shoulder-to-crotch', 'thigh', 'waist', 'wrist', 'gender', 'height_cm',
       'weight_kg'],
      dtype='object')

In [15]:
# Preview the combined front+side feature table (first rows) to inspect its photo_id values.
merged_eff.head()

Unnamed: 0,photo_id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,s2039,s2040,s2041,s2042,s2043,s2044,s2045,s2046,s2047,s2048
0,5e09e1b0d43b6c430709a513f594c591.png,2.651107,0.725831,0.294534,0.245607,0.417708,0.241973,0.957666,0.154901,0.044594,...,0.007417,0.045765,0.189162,2.731008,2.057057,0.11977,2.457719,0.528056,0.530376,0.480334
1,7e6a57e2fcabf518e9007a77d4cc4960.png,1.202348,1.214229,0.113891,0.388146,0.679341,0.054397,0.785075,0.297846,0.279252,...,0.0,0.007807,1.382514,1.436955,0.199988,0.167083,3.044427,0.157858,0.063795,0.211922
2,4ee8251ae7e4ad42c75644db390fc5c5.png,1.778628,1.157071,0.641115,0.310468,0.513974,0.218655,0.599819,0.0,0.534557,...,0.008276,0.0,0.686753,1.189021,0.689507,0.0,2.191277,0.307915,0.322818,0.555965
3,6367d327f96b951e4a498c88d615e8a4.png,1.980412,1.035453,0.127889,0.279539,0.935939,0.079369,0.707042,0.181815,0.006486,...,0.0,0.018173,0.310373,2.886946,0.452101,0.029239,1.629398,0.696043,0.603521,0.435756
4,8b16daea6768b92fb27997f76215aa5f.png,1.837158,1.586316,0.240891,0.529174,0.268506,0.287906,0.780661,0.442943,0.037388,...,0.006192,0.0,0.327299,3.130944,1.115668,0.000732,3.103849,0.279516,0.820196,0.04031


7. merge body measurements and feature extractions

In [16]:
# 7. Normalise merged_eff.photo_id by dropping a trailing '.png' (column is overwritten)

print("[INFO] Cleaning photo_id column in merged_eff (removing '.png')...")

# Snapshot a few values first so the effect of the cleanup is visible in the log.
sample_before = merged_eff["photo_id"].head().tolist()
print("[DEBUG] Sample photo_id values BEFORE cleaning:", sample_before)

# Cast to str, then strip the suffix. The pattern is anchored with `$`, so a
# '.png' occurring elsewhere in an id is left alone.
merged_eff["photo_id"] = merged_eff["photo_id"].astype(str).str.replace(r"\.png$", "", regex=True)

sample_after = merged_eff["photo_id"].head().tolist()
print("[DEBUG] Sample photo_id values AFTER cleaning:", sample_after)

# Row count vs. unique-id count: equal numbers mean no duplicate photo_ids.
print(f"[INFO] Number of rows in merged_eff: {merged_eff.shape[0]}")
print(f"[INFO] Number of unique photo_id values: {merged_eff['photo_id'].nunique()}")

print("[INFO] merged_eff photo_id column updated successfully.")
# NOTE: eff_training is built from merged_eff. If it was created before this
#       cell ran, re-run that merge so it picks up the cleaned photo_id values.


[INFO] Cleaning photo_id column in merged_eff (removing '.png')...
[DEBUG] Sample photo_id values BEFORE cleaning: ['5e09e1b0d43b6c430709a513f594c591.png', '7e6a57e2fcabf518e9007a77d4cc4960.png', '4ee8251ae7e4ad42c75644db390fc5c5.png', '6367d327f96b951e4a498c88d615e8a4.png', '8b16daea6768b92fb27997f76215aa5f.png']
[DEBUG] Sample photo_id values AFTER cleaning: ['5e09e1b0d43b6c430709a513f594c591', '7e6a57e2fcabf518e9007a77d4cc4960', '4ee8251ae7e4ad42c75644db390fc5c5', '6367d327f96b951e4a498c88d615e8a4', '8b16daea6768b92fb27997f76215aa5f']
[INFO] Number of rows in merged_eff: 1160
[INFO] Number of unique photo_id values: 1160
[INFO] merged_eff photo_id column updated successfully.


In [17]:
# Preview merged_eff again to check its photo_id column.
merged_eff.head()

Unnamed: 0,photo_id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,s2039,s2040,s2041,s2042,s2043,s2044,s2045,s2046,s2047,s2048
0,5e09e1b0d43b6c430709a513f594c591,2.651107,0.725831,0.294534,0.245607,0.417708,0.241973,0.957666,0.154901,0.044594,...,0.007417,0.045765,0.189162,2.731008,2.057057,0.11977,2.457719,0.528056,0.530376,0.480334
1,7e6a57e2fcabf518e9007a77d4cc4960,1.202348,1.214229,0.113891,0.388146,0.679341,0.054397,0.785075,0.297846,0.279252,...,0.0,0.007807,1.382514,1.436955,0.199988,0.167083,3.044427,0.157858,0.063795,0.211922
2,4ee8251ae7e4ad42c75644db390fc5c5,1.778628,1.157071,0.641115,0.310468,0.513974,0.218655,0.599819,0.0,0.534557,...,0.008276,0.0,0.686753,1.189021,0.689507,0.0,2.191277,0.307915,0.322818,0.555965
3,6367d327f96b951e4a498c88d615e8a4,1.980412,1.035453,0.127889,0.279539,0.935939,0.079369,0.707042,0.181815,0.006486,...,0.0,0.018173,0.310373,2.886946,0.452101,0.029239,1.629398,0.696043,0.603521,0.435756
4,8b16daea6768b92fb27997f76215aa5f,1.837158,1.586316,0.240891,0.529174,0.268506,0.287906,0.780661,0.442943,0.037388,...,0.006192,0.0,0.327299,3.130944,1.115668,0.000732,3.103849,0.279516,0.820196,0.04031


In [18]:
# 7. Join image features with subject/body data on photo_id

print("[INFO] Merging merged_eff and smp_body_measurements on photo_id...")

# Inner join: only photo_ids present in both tables survive. Feature columns
# come first, followed by subject_id and the measurement columns.
eff_training = merged_eff.merge(smp_body_measurements, how="inner", on="photo_id")

print("[INFO] Merge complete.")
print(f"[INFO] eff_training shape (rows x cols): {eff_training.shape}")
print("[INFO] First 10 columns:", eff_training.columns.tolist()[:10])

print("[INFO] Preview of eff_training:")
# Last expression in the cell so the notebook renders the preview table.
eff_training.head()


[INFO] Merging merged_eff and smp_body_measurements on photo_id...
[INFO] Merge complete.
[INFO] eff_training shape (rows x cols): (1160, 4114)
[INFO] First 10 columns: ['photo_id', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9']
[INFO] Preview of eff_training:


Unnamed: 0,photo_id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,hip,leg-length,shoulder-breadth,shoulder-to-crotch,thigh,waist,wrist,gender,height_cm,weight_kg
0,5e09e1b0d43b6c430709a513f594c591,2.651107,0.725831,0.294534,0.245607,0.417708,0.241973,0.957666,0.154901,0.044594,...,91.273094,77.553963,36.633358,64.253914,47.014259,77.113548,16.389431,male,170.0,61.8
1,7e6a57e2fcabf518e9007a77d4cc4960,1.202348,1.214229,0.113891,0.388146,0.679341,0.054397,0.785075,0.297846,0.279252,...,91.70752,76.172325,34.205143,61.72636,48.404037,73.617821,14.480244,female,169.0,54.5
2,4ee8251ae7e4ad42c75644db390fc5c5,1.778628,1.157071,0.641115,0.310468,0.513974,0.218655,0.599819,0.0,0.534557,...,89.962387,74.400047,31.023907,56.728245,47.468864,72.155304,15.670779,female,164.59,52.1
3,6367d327f96b951e4a498c88d615e8a4,1.980412,1.035453,0.127889,0.279539,0.935939,0.079369,0.707042,0.181815,0.006486,...,96.964554,76.552322,36.380489,63.264416,47.635548,82.428825,16.714935,male,168.0,67.4
4,8b16daea6768b92fb27997f76215aa5f,1.837158,1.586316,0.240891,0.529174,0.268506,0.287906,0.780661,0.442943,0.037388,...,115.346634,85.163284,39.959301,72.7239,61.20694,115.251938,19.627054,male,184.3,111.3


In [19]:
# Show the (rows, cols) shape of the final merged table.
eff_training.shape

(1160, 4114)

8. send final dataset to s3 location

In [None]:
# 8. Save eff_training DataFrame to S3 as a CSV file (res_testingB.csv)

import boto3      # AWS SDK for Python (if not already imported)
import io         # for in-memory text buffer

# Destination object for the final merged dataset.
s3_out_path = "s3://ai-bmi-predictor/test-data/res_testingB.csv"

print("[INFO] Starting upload of eff_training to S3...")
print(f"[INFO] Output S3 path: {s3_out_path}")

# Validate with an explicit exception: `assert` statements are removed when
# Python runs with -O, which would silently skip this check.
if not s3_out_path.startswith("s3://"):
    raise ValueError("S3 path must start with 's3://'")

# Drop the scheme, then split on the first "/" into bucket and object key.
out_path_no_scheme = s3_out_path[len("s3://"):]
out_bucket, out_key = out_path_no_scheme.split("/", 1)

print(f"[INFO] Output bucket: {out_bucket}")
print(f"[INFO] Output key: {out_key}")

# Serialise the DataFrame to CSV entirely in memory (no local file).
# index=False: the positional row index is not written as a column.
csv_buffer = io.StringIO()
eff_training.to_csv(csv_buffer, index=False)

print("[INFO] Initializing S3 client for upload...")
s3_client = boto3.client("s3")

# Log the real destination: the object key is not "eff_training.csv".
print(f"[INFO] Uploading eff_training to s3://{out_bucket}/{out_key} ...")
s3_client.put_object(
    Bucket=out_bucket,
    Key=out_key,
    # Encode explicitly so the bytes stored in S3 are unambiguously UTF-8.
    Body=csv_buffer.getvalue().encode("utf-8"),
)

print(f"[INFO] Upload complete: eff_training saved to {s3_out_path}")
