1. load the dataset

In [None]:
import boto3
import pandas as pd

# S3 location of the CSV that is evaluated in this notebook.
bucket, key = "ai-bmi-predictor", "test-data/eff_testingA.csv"

# Fetch the object and let pandas parse its streaming body directly as CSV.
s3 = boto3.client("s3")
obj = s3.get_object(Key=key, Bucket=bucket)
data = pd.read_csv(obj["Body"])

# Show the first rows as the cell output.
data.head()


2. data preprocessing

2.1. categorical encoding for 'gender' feature

In [None]:
import pandas as pd                 # import pandas for data handling

# Replace 'gender' with its integer category codes in one chained step.
# pandas assigns codes by the sorted order of the categories present in this
# column; missing values are encoded as -1.
# NOTE(review): presumably the model was trained with the same code<->label
# mapping — confirm, especially if this file may lack one of the categories.
data['gender'] = data['gender'].astype('category').cat.codes

In [None]:
# Display the first few values of the 'gender' column as the cell output.
data['gender'].head()

In [None]:
#data['height_cm'].head()

2.2. define weight frequencies for class imbalance issue for weight_kg

In [None]:
'''import pandas as pd                     # import pandas for data handling
import numpy as np                      # import numpy to help with safe division

# Assume 'data' is your DataFrame and already loaded
#print("Preview of data:\n", data.head())  # print first few rows to check data
print("\nTotal samples in dataset:", len(data))  # print total number of rows

# -----------------------------
# 1. Create boolean masks for the three weight_kg classes
# -----------------------------
class_1_mask = data['weight_kg'] < 60                      # True where weight_kg is less than 60
class_2_mask = data['weight_kg'] > 100                     # True where weight_kg is greater than 100
class_3_mask = (data['weight_kg'] >= 60) & (data['weight_kg'] <= 100)  # True where weight is between 60 and 100

# -----------------------------
# 2. Calculate class frequencies (counts)
# -----------------------------
freq_class_1 = class_1_mask.sum()          # number of samples with weight_kg < 60
freq_class_2 = class_2_mask.sum()          # number of samples with weight_kg > 100
freq_class_3 = class_3_mask.sum()          # number of samples with 60 <= weight_kg <= 100

print("\nClass frequencies:")              # header for clarity
print("Class 1 (weight_kg < 60):", freq_class_1)   # print frequency of class 1
print("Class 2 (weight_kg > 100):", freq_class_2)  # print frequency of class 2
print("Class 3 (60 <= weight_kg <= 100):", freq_class_3)  # print frequency of class 3

# -----------------------------
# 3. Number of classes according to the strategy
# -----------------------------
num_classes = 3                             # we defined three classes by the rules above
print("\nNumber of classes:", num_classes)  # print number of classes

# -----------------------------
# 4. Compute inverse-frequency weights for each class
#    Formula: w = total_samples / (num_classes * class_frequency)
# -----------------------------
total_samples = len(data)                   # total number of rows in the dataset

def safe_weight(class_freq):                # helper function to avoid division by zero
    if class_freq == 0:                     # check if a class has zero samples
        return np.nan                       # return NaN if no samples exist for that class
    return total_samples / (num_classes * class_freq)  # apply weighting formula

weight_class_1 = safe_weight(freq_class_1)  # compute weight for class 1
weight_class_2 = safe_weight(freq_class_2)  # compute weight for class 2
weight_class_3 = safe_weight(freq_class_3)  # compute weight for class 3

print("\nClass weights (inverse frequency):")          # header for class weights
print("Weight for Class 1 (weight_kg < 60):", weight_class_1)   # print weight of class 1
print("Weight for Class 2 (weight_kg > 100):", weight_class_2)  # print weight of class 2
print("Weight for Class 3 (60 <= weight_kg <= 100):", weight_class_3)  # print weight of class 3
'''

2.3. define weight frequencies for class imbalance issue for gender feature

In [None]:
'''import numpy as np                                      # import numpy for numeric utilities (like NaN)

print("Preview of gender column:\n", data['gender'].head())  # show first few gender values to inspect

# -----------------------------------
# 1. Calculate class frequencies for gender
# -----------------------------------
gender_counts = data['gender'].value_counts()           # count how many samples belong to each gender class

print("\nClass frequencies for gender:")                # header for class frequency output
for gender_class, freq in gender_counts.items():        # loop over each gender class and its frequency
    print(f"Class {gender_class}: {freq}")              # print the class label and its frequency

# -----------------------------------
# 2. Number of gender classes
# -----------------------------------
num_gender_classes = len(gender_counts)                 # compute how many distinct gender classes we have
print("\nNumber of gender classes:", num_gender_classes)  # print number of gender classes

# -----------------------------------
# 3. Compute inverse-frequency weights for each gender class
#    Formula: w = total_samples / (num_classes * class_frequency)
# -----------------------------------
total_samples = len(data)                               # total number of samples in the dataset

def safe_weight(class_freq):                            # define helper function to compute class weight safely
    if class_freq == 0:                                 # check for zero frequency to avoid division by zero
        return np.nan                                   # return NaN if a class somehow has zero samples
    return total_samples / (num_gender_classes * class_freq)  # apply the inverse-frequency weight formula

gender_weights = {}                                     # create an empty dictionary to store weights per class
for gender_class, freq in gender_counts.items():        # loop through each gender class and its frequency
    gender_weights[gender_class] = safe_weight(freq)    # compute and store the weight for this gender class

print("\nClass weights (inverse frequency) for gender:")  # header for weight output
for gender_class, weight in gender_weights.items():     # loop over each class and its weight
    print(f"Weight for class {gender_class}: {weight}") # print the computed weight for this gender class
'''

2.4. weight frequencies for weight classes and gender classes

In [None]:
'''import numpy as np   # import numpy for numeric operations

# -------------------------------------------------
# 1. Store the already-computed weights for weight classes
#    (use the variables you created when handling weight_kg)
# -------------------------------------------------
weight_class_weights = {                          # dictionary to hold weight-class weights
    'weight_<60':  weight_class_1,                # weight for class: weight_kg < 60
    'weight_>100': weight_class_2,                # weight for class: weight_kg > 100
    'weight_60_100': weight_class_3               # weight for class: 60 <= weight_kg <= 100
}

print("Weight-class weights:", weight_class_weights)  # print weight-class weights to check

# gender_weights dict is assumed from previous step, e.g. {0: w0, 1: w1}
print("Gender-class weights:", gender_weights)        # print gender-class weights to check

# -------------------------------------------------
# 2. Multiply each gender class with each weight class
#    wi = w_weight * w_gender
# -------------------------------------------------
combined_weights = {}                                # dictionary to store combined class weights

print("\nCombined weights for each (weight_class, gender_class):")  # header
for w_label, w_w in weight_class_weights.items():    # loop over weight classes
    for g_label, w_g in gender_weights.items():      # loop over gender classes
        wi = w_w * w_g                               # multiply weight and gender class weights
        combined_weights[(w_label, g_label)] = wi    # store in dictionary
        print(f"{w_label} & gender {g_label}: {wi}") # print each combination'''

2.5. create a dictionary for weights and row index

In [None]:
'''# Check current columns in the DataFrame
print("Columns before adding index column:\n", data.columns)

# Add a new column named 'index' with values from 0 to number_of_rows-1
data['index'] = range(len(data))

# Move 'index' to the front (optional, just for nicer viewing)
cols = ['index'] + [c for c in data.columns if c != 'index']  # build new column order
data = data[cols]                                            # reorder columns

# Show first few rows to verify the new indexing column
#print("\nDataFrame after adding 'index' column:\n", data.head())
'''

In [None]:
'''import numpy as np               # import numpy for numeric operations
import pickle                    # import pickle to save Python objects

# -------------------------------------------------
# 0. We assume these already exist:
#    - weight_class_1, weight_class_2, weight_class_3
#    - gender_weights   (dict: {gender_class: weight})
# -------------------------------------------------

# create a dictionary of weight-class weights (same as before)
weight_class_weights = {         # dictionary mapping weight class labels to their weights
    'weight_<60':  weight_class_1,      # weight for class: weight_kg < 60
    'weight_>100': weight_class_2,      # weight for class: weight_kg > 100
    'weight_60_100': weight_class_3     # weight for class: 60 <= weight_kg <= 100
}

print("Weight-class weights:", weight_class_weights)  # print weight-class weights
print("Gender-class weights:", gender_weights)        # print gender-class weights

# -------------------------------------------------
# 1. Helper function to get the weight class label for a given weight_kg
# -------------------------------------------------
def get_weight_class(w):         # define a function that receives a single weight value
    if w < 60:                   # check if weight is less than 60
        return 'weight_<60'      # return label for class 1
    elif w > 100:                # check if weight is greater than 100
        return 'weight_>100'     # return label for class 2
    else:                        # otherwise weight is between 60 and 100 (inclusive)
        return 'weight_60_100'   # return label for class 3

# -------------------------------------------------
# 2. Build dictionary: keys = index values, values = combined weights
# -------------------------------------------------
final_weights = {}               # create empty dictionary to store final weights

print("\nBuilding final_weights dictionary...")  # message to track progress

for _, row in data.iterrows():   # loop over each row of the DataFrame
    idx_val = row['index']       # get the value from the 'index' column for this row
    gender_val = row['gender']   # get the gender class value for this row
    weight_val = row['weight_kg']# get the weight_kg value for this row

    w_class = get_weight_class(weight_val)        # determine weight class label from weight_kg
    w_weight = weight_class_weights[w_class]      # look up the weight-class weight
    w_gender = gender_weights[gender_val]         # look up the gender-class weight

    combined_w = w_weight * w_gender             # multiply to get combined weight w_i
    final_weights[idx_val] = combined_w          # store combined weight in dictionary with key=index

print("Number of entries in final_weights:", len(final_weights))  # print number of entries
print("First 5 items in final_weights:", list(final_weights.items())[:5])  # show first few items

# -------------------------------------------------
# 3. Check index 0: gender, weight_kg, and combined weight
# -------------------------------------------------
print("\nChecking entry with index 0...")        # message to show what we're doing

row0 = data.loc[data['index'] == 0].iloc[0]      # select the row where 'index' column equals 0

gender0 = row0['gender']                         # get gender value for index 0
weight0 = row0['weight_kg']                      # get weight_kg value for index 0
w_class0 = get_weight_class(weight0)             # get weight class label for index 0

w_weight0 = weight_class_weights[w_class0]       # get weight-class weight for index 0
w_gender0 = gender_weights[gender0]              # get gender-class weight for index 0
combined0_calc = w_weight0 * w_gender0           # calculate combined weight for index 0

print("Row 0 -> gender:", gender0)               # print gender class for index 0
print("Row 0 -> weight_kg:", weight0)            # print weight_kg for index 0
print("Row 0 -> weight class:", w_class0)        # print weight class label for index 0
print("w_weight for row 0:", w_weight0)          # print weight-class weight for index 0
print("w_gender for row 0:", w_gender0)          # print gender-class weight for index 0
print("Combined weight (calculated):", combined0_calc)        # print calculated combined weight
print("Combined weight from final_weights[0]:", final_weights[0])  # print value from dictionary

# -------------------------------------------------
# 4. Save final_weights dictionary as a pickle file
# -------------------------------------------------
print("\nSaving final_weights dictionary as pickle file...")   # message to track saving step

with open('final_weights.pkl', 'wb') as f:       # open a file named 'final_weights.pkl' in binary write mode
    pickle.dump(final_weights, f)                # write dictionary to the file using pickle

print("Dictionary saved to 'final_weights.pkl'.")# confirmation message
'''

2.6. apply min-max scaling with range -1 to 1 for weight_kg (target) and height_cm (input)

In [None]:
from sklearn.preprocessing import MinMaxScaler   # import the scaler for min-max normalization

# -----------------------------
# Min-max scaling to the range [-1, 1]
# -----------------------------
# Two independent scalers are kept: scaler_targets is used again further down
# to inverse-transform the target, while scaler_height only touches the input.
#
# NOTE(review): both scalers are fitted on the data loaded in this notebook.
# If the model was trained with scalers fitted on a different (training) set,
# those fitted scalers should be reused here instead — confirm.
# NOTE(review): the columns are overwritten in place, so re-running this cell
# refits the scalers on already-scaled values; reload `data` before re-running.

# target column(s) handled by scaler_targets
cols_to_scale_targets = ['weight_kg']

# input-only column handled by scaler_height
height_col = ['height_cm']

# fit each scaler on its own (still unscaled) column group
scaler_targets = MinMaxScaler(feature_range=(-1, 1)).fit(data[cols_to_scale_targets])
scaler_height = MinMaxScaler(feature_range=(-1, 1)).fit(data[height_col])

# overwrite the raw columns with their scaled values
data[cols_to_scale_targets] = scaler_targets.transform(data[cols_to_scale_targets])
data[height_col] = scaler_height.transform(data[height_col])


3. model validation

3.1. split the data for independent and dependent features

In [None]:
# Dependent (target) column(s) the model is evaluated on
target_cols = ['weight_kg']

# Keep Y as a DataFrame (one column per target) so the column names remain
# available when the metric tables are labelled later on.
Y = data.loc[:, target_cols]

print("Selected target columns:", target_cols)
print("Shape of Y (samples, targets):", Y.shape)

In [None]:
# Columns excluded from the independent features (X):
#   - the two ID columns
#   - the listed body-measurement columns
#   - the target column(s) from target_cols
drop_cols = [
    'photo_id', 'subject_id',
    'ankle', 'arm-length', 'bicep', 'calf', 'chest', 'forearm', 'hip',
    'leg-length', 'shoulder-breadth', 'shoulder-to-crotch', 'thigh',
    'waist', 'wrist',
]
drop_cols.extend(target_cols)

print("Columns to drop for X:\n", drop_cols)

# Everything that remains after the drop is used as model input
X = data.drop(drop_cols, axis=1)

print("\nShape of X (samples, independent features):", X.shape)
#print("\nColumns in X:\n", X.columns.tolist())         # list all feature names in X

3.2. load the model for inferencing

In [None]:
import boto3                                             # import boto3 to read from S3
import io                                                # import io for in-memory byte streams
import h5py                                              # import h5py to open HDF5 file objects
import tensorflow as tf                                  # import tensorflow for loading the model

# Location of the trained Keras model (HDF5 file) in S3
bucket_name = "ai-bmi-predictor"
model_key = "trained-models/efficientnet-models/eff_ann_version3-weight.h5"

print("Creating S3 client...")
s3 = boto3.client("s3")            # credentials are resolved by boto3 itself

print("Reading model bytes from s3://{}/{}".format(bucket_name, model_key))
obj = s3.get_object(Key=model_key, Bucket=bucket_name)
model_bytes = obj["Body"].read()   # the whole file is held in memory
byte_stream = io.BytesIO(model_bytes)  # file-like wrapper that h5py can open

print("Opening HDF5 file from memory...")
# NOTE(review): passing an open h5py.File to load_model relies on the installed
# Keras version accepting HDF5 file objects — confirm for the target TF/Keras.
with h5py.File(byte_stream, mode='r') as h5file:
    best_model = tf.keras.models.load_model(h5file)

print("Model loaded successfully from S3 (in-memory)!")

In [None]:
# Print the Keras summary of the model loaded in the previous cell.
best_model.summary()

3.3. calculate performance metrics

In [None]:
# -------------------------------------------
# 1. Imports
# -------------------------------------------
import numpy as np                               # numerical operations
import pandas as pd                              # to build a nice results table
import tensorflow as tf                          # to load and run the Keras model
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error  # metrics

# -------------------------------------------
# 3. Prepare evaluation arrays
#    X : DataFrame of independent features built in section 3.1
#    Y : DataFrame of target column(s) built in section 3.1
# -------------------------------------------

print("Shape of validation X:", X.shape)
print("Shape of validation Y:", Y.shape)

# Remember the target names; they label the rows of the metric tables
target_cols = Y.columns.tolist()
print("\nTarget columns (body measurements):")
print(target_cols)

# float32 NumPy arrays for model.predict and the sklearn metric functions
X_val = X.to_numpy().astype("float32")
Y_val = Y.to_numpy().astype("float32")

In [None]:

# -------------------------------------------
# 4. Run the loaded model on the prepared feature array
# -------------------------------------------
print("\nRunning inference on validation data ...")

# verbose=0 suppresses Keras' per-batch progress output
Y_pred = best_model.predict(x=X_val, verbose=0)

print("Inference completed.")
print("Predictions shape:", Y_pred.shape)

In [None]:
'''# -------------------------------------------
# 4.1 Inverse-transform targets to original units
#      Assumes scaler_targets was fitted on target_cols during preprocessing
# -------------------------------------------

# Y_val and Y_pred are currently in [-1, 1] scaled space
# Convert both back to original measurement units (cm, kg, etc.)
Y_true_orig = scaler_targets.inverse_transform(Y_val)
Y_pred_orig = scaler_targets.inverse_transform(Y_pred)'''

In [None]:

# -------------------------------------------
# 5. Per-target metrics, computed directly on Y_val / Y_pred
#    (i.e. in the scaled space the model works in)
# -------------------------------------------
print("\nComputing metrics (R^2, MSE, MAE) for each body measurement ...")

results = []    # one metrics dict per target column

for i, name in enumerate(target_cols):
    # column i of both arrays belongs to target `name`
    y_true, y_pred = Y_val[:, i], Y_pred[:, i]

    r2 = r2_score(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)

    # one table row for this target
    results.append(dict(body_measurement=name, r2=r2, mse=mse, mae=mae))

    # one-line progress summary for this target
    print("{:20s} -> R^2: {:.4f}, MSE: {:.6f}, MAE: {:.6f}".format(name, r2, mse, mae))

# Tabular view: one row per target, one column per metric
results_df = pd.DataFrame(results)

print("\nPer-measurement metrics table:")
print(results_df)


In [None]:
# -------------------------------------------
# 6. Compute overall (mean) scores across all measurements
# -------------------------------------------
# Average only the per-measurement rows. This cell appends an "OVERALL_MEAN"
# row to results_df, so without this filter every re-run of the cell would
# average over the previous summary row and append yet another duplicate.
per_measurement_df = results_df[results_df["body_measurement"] != "OVERALL_MEAN"]

overall_r2  = per_measurement_df["r2"].mean()       # mean R^2 over all body measurements
overall_mse = per_measurement_df["mse"].mean()      # mean MSE over all body measurements
overall_mae = per_measurement_df["mae"].mean()      # mean MAE over all body measurements

print("\nOverall (mean) scores across all body measurements:")
print(f"Mean R^2 : {overall_r2:.4f}")
print(f"Mean MSE : {overall_mse:.6f}")
print(f"Mean MAE : {overall_mae:.6f}")

# Summary row appended below the per-measurement rows
overall_row = {
    "body_measurement": "OVERALL_MEAN",             # label row as overall
    "r2": overall_r2,
    "mse": overall_mse,
    "mae": overall_mae
}

# Rebuild the table from the per-measurement rows plus exactly one summary row,
# which keeps the cell idempotent.
results_df = pd.concat(
    [per_measurement_df, pd.DataFrame([overall_row])],
    ignore_index=True,
)

print("\nMetrics table including overall mean row:")
print(results_df)                                   # final table with per-measurement + overall row


3.4. performance based on real values after inverse transforming

In [None]:
# -------------------------------------------
# 4.1 Map targets back to original units
#      Uses scaler_targets, the scaler fitted on the target column(s) in
#      section 2.6, so true and predicted values share the same mapping.
# -------------------------------------------

# Y_val and Y_pred are both in the scaled space; undo the scaling for each
to_original_units = scaler_targets.inverse_transform
Y_true_orig = to_original_units(Y_val)
Y_pred_orig = to_original_units(Y_pred)

In [None]:
# -------------------------------------------
# 5. Per-target metrics on the inverse-transformed (original-unit) values
# -------------------------------------------
print("\nComputing metrics (R^2, MSE, MAE) for each body measurement in ORIGINAL UNITS ...")

results = []    # one metrics dict per target column

for i, name in enumerate(target_cols):
    # column i of both inverse-transformed arrays belongs to target `name`
    y_true, y_pred = Y_true_orig[:, i], Y_pred_orig[:, i]

    r2 = r2_score(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)     # in squared original units
    mae = mean_absolute_error(y_true, y_pred)    # in original units

    # one table row for this target
    results.append(dict(body_measurement=name, r2=r2, mse=mse, mae=mae))

    # one-line progress summary for this target
    print("{:20s} -> R^2: {:.4f}, MSE: {:.6f}, MAE: {:.6f}".format(name, r2, mse, mae))

# Tabular view: one row per target, one column per metric
results_df = pd.DataFrame(results)

print("\nPer-measurement metrics table (original units):")
print(results_df)

In [None]:
# -------------------------------------------
# 6. Compute overall (mean) scores across all measurements (original units)
# -------------------------------------------
# Average only the per-measurement rows. This cell appends an "OVERALL_MEAN"
# row to results_df, so without this filter every re-run of the cell would
# average over the previous summary row and append yet another duplicate.
per_measurement_df = results_df[results_df["body_measurement"] != "OVERALL_MEAN"]

overall_r2  = per_measurement_df["r2"].mean()       # mean R^2 over all body measurements
overall_mse = per_measurement_df["mse"].mean()      # mean MSE over all body measurements
overall_mae = per_measurement_df["mae"].mean()      # mean MAE over all body measurements

print("\nOverall (mean) scores across all body measurements (original units):")
print(f"Mean R^2 : {overall_r2:.4f}")
print(f"Mean MSE : {overall_mse:.6f}")
print(f"Mean MAE : {overall_mae:.6f}")

# Summary row appended below the per-measurement rows
overall_row = {
    "body_measurement": "OVERALL_MEAN",             # label row as overall
    "r2": overall_r2,
    "mse": overall_mse,
    "mae": overall_mae
}

# Rebuild the table from the per-measurement rows plus exactly one summary row,
# which keeps the cell idempotent.
results_df = pd.concat(
    [per_measurement_df, pd.DataFrame([overall_row])],
    ignore_index=True,
)

print("\nMetrics table including overall mean row (original units):")
print(results_df)                                   # final table with per-measurement + overall row


* MAE = 2.5 -> on average, the model's predictions are off by about 2.5 units from the true values. An individual error can be above the true value (overestimate) or below it (underestimate); MAE measures only the average size of the error.
* R Squared = 0.55 -> the model explains about 55% of the variation (variance) in the body measurements
* MSE = 12.25 -> the average squared difference between the predictions and the true values is 12.25 squared units (equivalent to an RMSE of 3.5 units)
* R Squared < 0 -> you would get a smaller overall error by ignoring all inputs and just predicting the average value (e.g. the average hip size) for everyone, instead of using your model's predictions. (MSE itself can never be negative.)
* R Squared = -0.5 -> worse than predicting the mean: the model has 50% more squared error than the mean predictor