1. load the dataset

In [1]:
import boto3
import pandas as pd

# S3 location of the training CSV.
bucket, key = "ai-bmi-predictor", "data/vit-base_training.csv"

# Fetch the object and parse its streaming body straight into a DataFrame.
s3 = boto3.client("s3")
obj = s3.get_object(Key=key, Bucket=bucket)
data = pd.read_csv(obj["Body"])

data.head()


Unnamed: 0,photo_id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,hip,leg-length,shoulder-breadth,shoulder-to-crotch,thigh,waist,wrist,gender,height_cm,weight_kg
0,6ab1d061f51c6079633aeceed2faeb0b,0.010041,-0.077473,0.191211,-0.400731,-0.083509,-0.096019,-0.347934,-0.292979,-0.029243,...,105.3339,76.817467,35.362858,65.993683,54.459591,88.813789,16.764332,female,170.5,72.0
1,e94e2e05fb8b099955bbc4fa5ce81e22,-0.041535,-0.283483,0.275583,-0.336727,-0.146229,-0.091574,-0.272296,-0.381968,-0.09993,...,101.478989,85.154358,37.25676,65.861588,52.773052,89.176338,15.690955,male,178.3,71.8
2,ba6951a4f37fc9302243370e927a02e2,0.104057,-0.015163,0.286,-0.336543,-0.161712,-0.193861,-0.27933,-0.254885,-0.097205,...,97.488243,81.410393,37.503147,66.042679,57.059261,82.201988,16.686253,male,176.25,76.5
3,947d16539d4702427aa74f737329ffb9,-0.042351,-0.011576,0.245237,-0.474828,-0.042683,-0.17796,-0.225197,-0.377451,-0.111781,...,120.586845,69.361534,34.084633,60.41333,65.0,102.323845,17.693762,female,152.1,88.9
4,9326695bf62926ec22690f576a633bba,0.109468,-0.109405,0.156043,-0.455113,-0.159429,-0.143344,-0.280409,-0.284992,-0.099507,...,110.543564,77.160583,38.086231,68.400543,57.172279,107.378578,16.594791,male,171.5,88.4


2. data preprocessing

2.1. categorical encoding for 'gender' feature

In [2]:
import pandas as pd                 # import pandas for data handling

# Replace the 'gender' strings with their integer category codes
# (in the previews, 'female' rows become 0 and 'male' rows become 1).
data['gender'] = data['gender'].astype('category').cat.codes

In [3]:
# Preview the encoded gender column (integer category codes).
data['gender'].head()

0    0
1    1
2    1
3    0
4    1
Name: gender, dtype: int8

In [4]:
#data['height_cm'].head()

2.2. compute inverse-frequency class weights to address class imbalance in weight_kg

In [5]:
import pandas as pd                     # import pandas for data handling
import numpy as np                      # import numpy to help with safe division

# Assume 'data' is your DataFrame and already loaded
#print("Preview of data:\n", data.head())  # print first few rows to check data
print("\nTotal samples in dataset:", len(data))

# -----------------------------
# 1. Split weight_kg into three bands:
#    class 1: below 60, class 2: above 100, class 3: 60..100 (both ends included)
# -----------------------------
class_1_mask = data['weight_kg'] < 60
class_2_mask = data['weight_kg'] > 100
class_3_mask = data['weight_kg'].between(60, 100)

# -----------------------------
# 2. Count the rows in each band (sum of a boolean mask = number of True values)
# -----------------------------
freq_class_1, freq_class_2, freq_class_3 = (
    mask.sum() for mask in (class_1_mask, class_2_mask, class_3_mask)
)

print("\nClass frequencies:")
print("Class 1 (weight_kg < 60):", freq_class_1)
print("Class 2 (weight_kg > 100):", freq_class_2)
print("Class 3 (60 <= weight_kg <= 100):", freq_class_3)

# -----------------------------
# 3. The banding above defines exactly three classes
# -----------------------------
num_classes = 3
print("\nNumber of classes:", num_classes)

# -----------------------------
# 4. Inputs for the inverse-frequency weights
#    w = total_samples / (num_classes * class_frequency)
# -----------------------------
total_samples = len(data)

def safe_weight(class_freq, total=None, n_classes=None):
    """Inverse-frequency class weight: total / (n_classes * class_freq).

    Args:
        class_freq: number of samples in the class.
        total: total number of samples; defaults to the notebook-level
            ``total_samples`` when omitted.
        n_classes: number of classes; defaults to the notebook-level
            ``num_classes`` when omitted.

    Returns:
        The weight as a float, or ``np.nan`` when ``class_freq`` is 0
        (avoids a division by zero for an empty class).
    """
    if class_freq == 0:
        return np.nan
    # Fall back to the notebook globals only when the caller did not pass
    # explicit values, so existing one-argument calls keep working while
    # callers can avoid depending on cell execution order.
    if total is None:
        total = total_samples
    if n_classes is None:
        n_classes = num_classes
    return total / (n_classes * class_freq)

# Inverse-frequency weight for each band, in the same order as the frequencies.
weight_class_1, weight_class_2, weight_class_3 = map(
    safe_weight, (freq_class_1, freq_class_2, freq_class_3)
)

print("\nClass weights (inverse frequency):")
print("Weight for Class 1 (weight_kg < 60):", weight_class_1)
print("Weight for Class 2 (weight_kg > 100):", weight_class_2)
print("Weight for Class 3 (60 <= weight_kg <= 100):", weight_class_3)



Total samples in dataset: 6134

Class frequencies:
Class 1 (weight_kg < 60): 1049
Class 2 (weight_kg > 100): 514
Class 3 (60 <= weight_kg <= 100): 4571

Number of classes: 3

Class weights (inverse frequency):
Weight for Class 1 (weight_kg < 60): 1.9491579281855735
Weight for Class 2 (weight_kg > 100): 3.9779507133592737
Weight for Class 3 (60 <= weight_kg <= 100): 0.4473127689054182


2.3. compute inverse-frequency class weights to address class imbalance in the gender feature

In [6]:
import numpy as np                                      # import numpy for numeric utilities (like NaN)

print("Preview of gender column:\n", data['gender'].head())

# -----------------------------------
# 1. Samples per encoded gender class
# -----------------------------------
gender_counts = data['gender'].value_counts()

print("\nClass frequencies for gender:")
for gender_class, freq in gender_counts.items():
    print(f"Class {gender_class}: {freq}")

# -----------------------------------
# 2. One class per distinct value present in the column
# -----------------------------------
num_gender_classes = len(gender_counts)
print("\nNumber of gender classes:", num_gender_classes)

# -----------------------------------
# 3. Inputs for the inverse-frequency weights
#    w = total_samples / (num_classes * class_frequency)
# -----------------------------------
total_samples = len(data)

def safe_weight(class_freq, total=None, n_classes=None):
    """Inverse-frequency class weight: total / (n_classes * class_freq).

    Args:
        class_freq: number of samples in the class.
        total: total number of samples; defaults to the notebook-level
            ``total_samples`` when omitted.
        n_classes: number of classes; defaults to the notebook-level
            ``num_gender_classes`` when omitted.

    Returns:
        The weight as a float, or ``np.nan`` when ``class_freq`` is 0
        (avoids a division by zero for an empty class).
    """
    if class_freq == 0:
        return np.nan
    # Fall back to the notebook globals only when the caller did not pass
    # explicit values, so existing one-argument calls keep working while
    # callers can avoid depending on cell execution order.
    if total is None:
        total = total_samples
    if n_classes is None:
        n_classes = num_gender_classes
    return total / (n_classes * class_freq)

# Inverse-frequency weight per gender class, keyed by the encoded class value.
gender_weights = {
    gender_class: safe_weight(freq)
    for gender_class, freq in gender_counts.items()
}

print("\nClass weights (inverse frequency) for gender:")
for gender_class, weight in gender_weights.items():
    print(f"Weight for class {gender_class}: {weight}")


Preview of gender column:
 0    0
1    1
2    1
3    0
4    1
Name: gender, dtype: int8

Class frequencies for gender:
Class 1: 3650
Class 0: 2484

Number of gender classes: 2

Class weights (inverse frequency) for gender:
Weight for class 1: 0.8402739726027397
Weight for class 0: 1.2347020933977455


2.4. combine the weight-class weights with the gender-class weights

In [7]:
import numpy as np   # import numpy for numeric operations

# -------------------------------------------------
# 1. Store the already-computed weights for weight classes
#    (use the variables you created when handling weight_kg)
# -------------------------------------------------
# Weight-band weights computed earlier, keyed by a readable band label.
weight_class_weights = {
    'weight_<60': weight_class_1,
    'weight_>100': weight_class_2,
    'weight_60_100': weight_class_3,
}

print("Weight-class weights:", weight_class_weights)

# gender_weights comes from the gender step: {encoded gender class: weight}
print("Gender-class weights:", gender_weights)

# -------------------------------------------------
# Combined weight for every (weight band, gender class) pair:
#    wi = w_weight * w_gender
# -------------------------------------------------
combined_weights = {
    (w_label, g_label): w_w * w_g
    for w_label, w_w in weight_class_weights.items()
    for g_label, w_g in gender_weights.items()
}

print("\nCombined weights for each (weight_class, gender_class):")
for (w_label, g_label), wi in combined_weights.items():
    print(f"{w_label} & gender {g_label}: {wi}")

Weight-class weights: {'weight_<60': 1.9491579281855735, 'weight_>100': 3.9779507133592737, 'weight_60_100': 0.4473127689054182}
Gender-class weights: {1: 0.8402739726027397, 0: 1.2347020933977455}

Combined weights for each (weight_class, gender_class):
weight_<60 & gender 1: 1.6378266755466175
weight_<60 & gender 0: 2.4066293742935403
weight_>100 & gender 1: 3.3425684487322993
weight_>100 & gender 0: 4.9115840732177505
weight_60_100 & gender 1: 0.37586527732408703
weight_60_100 & gender 0: 0.5522980121710618


2.5. create a dictionary for weights and row index

In [8]:
# Check current columns in the DataFrame
print("Columns before adding index column:\n", data.columns)

# Positional row id (0 .. n_rows-1) stored as a regular column; the per-sample
# weight dictionary built later is keyed on it.
data['index'] = range(len(data))

# Put 'index' first purely for readability when previewing the frame.
cols = ['index', *(c for c in data.columns if c != 'index')]
data = data[cols]

# Show first few rows to verify the new indexing column
#print("\nDataFrame after adding 'index' column:\n", data.head())


Columns before adding index column:
 Index(['photo_id', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9',
       ...
       'hip', 'leg-length', 'shoulder-breadth', 'shoulder-to-crotch', 'thigh',
       'waist', 'wrist', 'gender', 'height_cm', 'weight_kg'],
      dtype='object', length=1554)


In [9]:
import numpy as np               # import numpy for numeric operations
import pickle                    # import pickle to save Python objects

# -------------------------------------------------
# 0. We assume these already exist:
#    - weight_class_1, weight_class_2, weight_class_3
#    - gender_weights   (dict: {gender_class: weight})
# -------------------------------------------------

# create a dictionary of weight-class weights (same as before)
weight_class_weights = dict(zip(
    ('weight_<60', 'weight_>100', 'weight_60_100'),          # band labels
    (weight_class_1, weight_class_2, weight_class_3),        # matching weights
))

print("Weight-class weights:", weight_class_weights)
print("Gender-class weights:", gender_weights)

# -------------------------------------------------
# 1. Helper function to get the weight class label for a given weight_kg
# -------------------------------------------------
def get_weight_class(w):
    """Return the weight-band label for a single weight_kg value.

    Bands match the boolean masks used for the class frequencies:
    ``w < 60`` -> 'weight_<60', ``w > 100`` -> 'weight_>100',
    otherwise (60 <= w <= 100) -> 'weight_60_100'.

    Raises:
        ValueError: if ``w`` is NaN. Both comparisons are False for NaN, so it
            would otherwise be silently labelled 'weight_60_100' even though
            the frequency masks count it in no band.
    """
    if np.isnan(w):
        raise ValueError("weight_kg is NaN; cannot assign a weight class")
    if w < 60:
        return 'weight_<60'
    if w > 100:
        return 'weight_>100'
    return 'weight_60_100'

# -------------------------------------------------
# 2. Build dictionary: keys = index values, values = combined weights
# -------------------------------------------------
print("\nBuilding final_weights dictionary...")

# A per-row dict lookup raises KeyError for an unknown gender code, whereas
# Series.map would quietly produce NaN. Check up front so the failure stays loud.
if not data['gender'].isin(list(gender_weights)).all():
    raise KeyError("gender column contains codes missing from gender_weights")

# Vectorised form of "for each row: weight-band weight * gender weight".
# This avoids iterrows(), which builds a full-width Series for every row.
band_weight_per_row = data['weight_kg'].map(get_weight_class).map(weight_class_weights)
gender_weight_per_row = data['gender'].map(gender_weights)
combined_per_row = band_weight_per_row.to_numpy() * gender_weight_per_row.to_numpy()

# .tolist() yields plain Python ints/floats, so the pickled dict holds
# {index value: combined weight} with no NumPy scalar types inside.
final_weights = dict(zip(data['index'].tolist(), combined_per_row.tolist()))

print("Number of entries in final_weights:", len(final_weights))
print("First 5 items in final_weights:", list(final_weights.items())[:5])

# -------------------------------------------------
# 3. Check index 0: recompute its combined weight by hand and compare
# -------------------------------------------------
print("\nChecking entry with index 0...")

row0 = data.loc[data['index'] == 0].iloc[0]      # the row whose 'index' column equals 0

gender0 = row0['gender']
weight0 = row0['weight_kg']
w_class0 = get_weight_class(weight0)

w_weight0 = weight_class_weights[w_class0]
w_gender0 = gender_weights[gender0]
combined0_calc = w_weight0 * w_gender0

print("Row 0 -> gender:", gender0)
print("Row 0 -> weight_kg:", weight0)
print("Row 0 -> weight class:", w_class0)
print("w_weight for row 0:", w_weight0)
print("w_gender for row 0:", w_gender0)
print("Combined weight (calculated):", combined0_calc)
print("Combined weight from final_weights[0]:", final_weights[0])

# -------------------------------------------------
# 4. Persist the dictionary; a later cell reloads it from this file
# -------------------------------------------------
print("\nSaving final_weights dictionary as pickle file...")

with open('final_weights.pkl', 'wb') as f:
    pickle.dump(final_weights, f)

print("Dictionary saved to 'final_weights.pkl'.")


Weight-class weights: {'weight_<60': 1.9491579281855735, 'weight_>100': 3.9779507133592737, 'weight_60_100': 0.4473127689054182}
Gender-class weights: {1: 0.8402739726027397, 0: 1.2347020933977455}

Building final_weights dictionary...
Number of entries in final_weights: 6134
First 5 items in final_weights: [(0, 0.5522980121710618), (1, 0.37586527732408703), (2, 0.37586527732408703), (3, 0.5522980121710618), (4, 0.37586527732408703)]

Checking entry with index 0...
Row 0 -> gender: 0
Row 0 -> weight_kg: 72.0
Row 0 -> weight class: weight_60_100
w_weight for row 0: 0.4473127689054182
w_gender for row 0: 1.2347020933977455
Combined weight (calculated): 0.5522980121710618
Combined weight from final_weights[0]: 0.5522980121710618

Saving final_weights dictionary as pickle file...
Dictionary saved to 'final_weights.pkl'.


In [10]:
# Preview the frame now that 'index' is the first column and gender is integer-coded.
data.head()

Unnamed: 0,index,photo_id,f1,f2,f3,f4,f5,f6,f7,f8,...,hip,leg-length,shoulder-breadth,shoulder-to-crotch,thigh,waist,wrist,gender,height_cm,weight_kg
0,0,6ab1d061f51c6079633aeceed2faeb0b,0.010041,-0.077473,0.191211,-0.400731,-0.083509,-0.096019,-0.347934,-0.292979,...,105.3339,76.817467,35.362858,65.993683,54.459591,88.813789,16.764332,0,170.5,72.0
1,1,e94e2e05fb8b099955bbc4fa5ce81e22,-0.041535,-0.283483,0.275583,-0.336727,-0.146229,-0.091574,-0.272296,-0.381968,...,101.478989,85.154358,37.25676,65.861588,52.773052,89.176338,15.690955,1,178.3,71.8
2,2,ba6951a4f37fc9302243370e927a02e2,0.104057,-0.015163,0.286,-0.336543,-0.161712,-0.193861,-0.27933,-0.254885,...,97.488243,81.410393,37.503147,66.042679,57.059261,82.201988,16.686253,1,176.25,76.5
3,3,947d16539d4702427aa74f737329ffb9,-0.042351,-0.011576,0.245237,-0.474828,-0.042683,-0.17796,-0.225197,-0.377451,...,120.586845,69.361534,34.084633,60.41333,65.0,102.323845,17.693762,0,152.1,88.9
4,4,9326695bf62926ec22690f576a633bba,0.109468,-0.109405,0.156043,-0.455113,-0.159429,-0.143344,-0.280409,-0.284992,...,110.543564,77.160583,38.086231,68.400543,57.172279,107.378578,16.594791,1,171.5,88.4


2.6. apply standard scaling to body measurements and robust scaling to CNN-extracted features

In [11]:
from sklearn.preprocessing import StandardScaler, RobustScaler

# columns to exclude from any scaling
# Identifier-like columns and the integer-coded gender flag are never scaled.
exclude_cols = ['photo_id', 'subject_id', 'index', 'gender']

# Body measurements plus height/weight go through StandardScaler;
# names that are absent from the frame are skipped.
standard_cols = [
    col
    for col in (
        'ankle', 'arm-length', 'bicep', 'calf', 'chest', 'forearm', 'hip',
        'leg-length', 'shoulder-breadth', 'shoulder-to-crotch', 'thigh',
        'waist', 'wrist', 'weight_kg', 'height_cm',
    )
    if col in data.columns
]

# Every remaining column (neither excluded nor standard-scaled) goes through
# RobustScaler, in the frame's own column order.
robust_cols = [
    col
    for col in data.columns
    if not (col in exclude_cols or col in standard_cols)
]

standard_scaler, robust_scaler = StandardScaler(), RobustScaler()

# NOTE(review): both scalers are fit on the full table, before the
# train/validation split performed later in the notebook, so validation rows
# contribute to the fitted statistics. The columns later selected as targets
# (body measurements, weight_kg) are scaled here too, so the model is trained
# on standardized target values.
data[standard_cols] = standard_scaler.fit_transform(data[standard_cols])
data[robust_cols] = robust_scaler.fit_transform(data[robust_cols])


In [12]:
# Preview the frame after scaling; 'index', 'photo_id' and 'gender' are left unscaled.
data.head()

Unnamed: 0,index,photo_id,f1,f2,f3,f4,f5,f6,f7,f8,...,hip,leg-length,shoulder-breadth,shoulder-to-crotch,thigh,waist,wrist,gender,height_cm,weight_kg
0,0,6ab1d061f51c6079633aeceed2faeb0b,-0.653595,-0.524887,-1.128474,0.260144,0.880355,0.61675,-1.091702,0.128416,...,0.337897,-0.307665,-0.180657,0.190893,0.065198,-0.081858,0.050458,0,-0.081172,-0.23181
1,1,e94e2e05fb8b099955bbc4fa5ce81e22,-1.127302,-2.730089,-0.208672,0.982963,0.022948,0.681387,-0.21143,-0.908509,...,-0.131012,1.361007,0.516049,0.164107,-0.280967,-0.051067,-0.705275,1,0.72165,-0.243989
2,2,ba6951a4f37fc9302243370e927a02e2,0.20991,0.142095,-0.095101,0.985036,-0.188705,-0.806271,-0.293292,0.572296,...,-0.616444,0.611633,0.606687,0.200829,0.598786,-0.643388,-0.004515,1,0.510652,0.042217
3,3,947d16539d4702427aa74f737329ffb9,-1.134798,0.180492,-0.539487,-0.576654,1.438467,-0.575016,0.33672,-0.855886,...,2.193258,-1.800009,-0.650875,-0.940681,2.228637,1.06553,0.704842,0,-1.975009,0.797312
4,4,9326695bf62926ec22690f576a633bba,0.259606,-0.866703,-1.51186,-0.35401,-0.15749,-0.071556,-0.305843,0.221484,...,0.971598,-0.238989,0.821185,0.678952,0.621983,1.494821,-0.068911,1,0.021754,0.766864


3. model training

3.1. split the data for independent and dependent features

In [13]:
# List of columns to be used as dependent (target) features
target_cols = [
    'ankle', 'arm-length', 'bicep', 'calf', 'chest', 'forearm', 'hip',
    'leg-length', 'shoulder-breadth', 'shoulder-to-crotch', 'thigh',
    'waist', 'wrist', 'weight_kg',
]

# Multi-target regression matrix: one column per entry in target_cols.
Y = data.loc[:, target_cols]

print("Selected target columns:", target_cols)
print("Shape of Y (samples, targets):", Y.shape)

Selected target columns: ['ankle', 'arm-length', 'bicep', 'calf', 'chest', 'forearm', 'hip', 'leg-length', 'shoulder-breadth', 'shoulder-to-crotch', 'thigh', 'waist', 'wrist', 'weight_kg']
Shape of Y (samples, targets): (6134, 14)


In [14]:
# Columns to drop for building independent features (X)
drop_cols = ['photo_id', 'subject_id', 'index', *target_cols]   # ID columns followed by every target

print("Columns to drop for X:\n", drop_cols)

# Whatever is left after removing IDs and targets is the feature matrix.
X = data.drop(drop_cols, axis=1)

print("\nShape of X (samples, independent features):", X.shape)
#print("\nColumns in X:\n", X.columns.tolist())         # list all feature names in X

Columns to drop for X:
 ['photo_id', 'subject_id', 'index', 'ankle', 'arm-length', 'bicep', 'calf', 'chest', 'forearm', 'hip', 'leg-length', 'shoulder-breadth', 'shoulder-to-crotch', 'thigh', 'waist', 'wrist', 'weight_kg']

Shape of X (samples, independent features): (6134, 1538)


3.2. import necessary libraries for model training

In [15]:
# -----------------------------
# Imports
# -----------------------------
import numpy as np                          # numerical operations
import pickle                               # to load the final_weights.pkl file
import matplotlib.pyplot as plt             # for plotting loss curves

from sklearn.model_selection import train_test_split  # to create train/validation sets

import tensorflow as tf                     # main deep learning library
from tensorflow.keras.models import Sequential          # model container
from tensorflow.keras.layers import Dense, Dropout, InputLayer, LeakyReLU, Activation
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import SGD, Adam, RMSprop

# -----------------------------
# Reproducibility (optional)
# -----------------------------
# NOTE(review): Python's built-in `random` module is not seeded here.
np.random.seed(42)                          # seed NumPy's global random state
tf.random.set_seed(42)                      # seed TensorFlow's global random state

# -----------------------------
# 1. Assume you already have:
#    - data DataFrame
#    - X (independent features)
#    - Y (multi-output targets)
# If not, you can recreate X, Y here.
# -----------------------------

# Example (uncomment if you want everything in one place):
# target_cols = [
#     'ankle', 'arm-length', 'bicep', 'calf', 'chest', 'forearm', 'hip',
#     'leg-length', 'shoulder-breadth', 'shoulder-to-crotch', 'thigh',
#     'waist', 'wrist', 'weight_kg'
# ]
# Y = data[target_cols]                                            # select target columns
# drop_cols = ['photo_id', 'subject_id'] + target_cols             # columns not used as features
# X = data.drop(columns=drop_cols + ['index'])                     # drop also 'index' from features

print("Shape of X (features):", X.shape)
print("Shape of Y (targets):", Y.shape)

# -----------------------------
# 2. Reload the per-row weights written earlier by this notebook
#    (final_weights.pkl maps each 'index' value to its sample weight)
# -----------------------------
print("\nLoading final_weights.pkl ...")

with open('final_weights.pkl', mode='rb') as f:
    final_weights_dict = pickle.load(f)

preview_items = list(final_weights_dict.items())[:5]
print("Number of entries in final_weights_dict:", len(final_weights_dict))
print("First 5 entries in final_weights_dict:", preview_items)


2025-12-12 14:11:53.585177: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-12-12 14:11:53.600946: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-12-12 14:11:53.625536: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-12-12 14:11:53.625575: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-12-12 14:11:53.640629: I tensorflow/core/platform/cpu_feature_gua

Shape of X (features): (6134, 1538)
Shape of Y (targets): (6134, 14)

Loading final_weights.pkl ...
Number of entries in final_weights_dict: 6134
First 5 entries in final_weights_dict: [(0, 0.5522980121710618), (1, 0.37586527732408703), (2, 0.37586527732408703), (3, 0.5522980121710618), (4, 0.37586527732408703)]


3.3. convert the weight dictionary to an array of per-sample weights to offset class imbalance in the regression problem

In [16]:
# -----------------------------
# 3. Build sample_weight array based on DataFrame 'index' column
# -----------------------------
print("\nBuilding sample_weight array ...")

# Look up each row's weight through its 'index' value, then materialise the
# result as a float32 NumPy array aligned with the DataFrame rows.
sample_weights = (
    data['index']
    .map(final_weights_dict)
    .to_numpy(dtype='float32')
)

print("Sample weights shape:", sample_weights.shape)
print("First 10 sample weights:", sample_weights[:10])


Building sample_weight array ...
Sample weights shape: (6134,)
First 10 sample weights: [0.552298   0.37586528 0.37586528 0.552298   0.37586528 0.552298
 0.37586528 0.37586528 0.552298   0.37586528]


3.5. split the data into train and validation

In [17]:
# -----------------------------
# 4. Train/validation split (X, Y, and weights)
# -----------------------------
print("\nSplitting into train and validation sets ...")

# Features, targets and sample weights are split in one call so that the
# three stay row-aligned; 20% of the rows are held out for validation.
X_train, X_val, Y_train, Y_val, w_train, w_val = train_test_split(
    X,
    Y,
    sample_weights,
    shuffle=True,
    random_state=42,
    test_size=0.2,
)

print("X_train shape:", X_train.shape)
print("Y_train shape:", Y_train.shape)
print("X_val shape:", X_val.shape)
print("Y_val shape:", Y_val.shape)


Splitting into train and validation sets ...
X_train shape: (4907, 1538)
Y_train shape: (4907, 14)
X_val shape: (1227, 1538)
Y_val shape: (1227, 14)


In [18]:
%%writefile train_vit_base_ann.py
# import os for paths and environment variables
import os  # import os module
# import numpy for array operations
import numpy as np  # import numpy
# import boto3 for S3 interaction
import boto3  # import boto3 for AWS S3

# import tensorflow main package
import tensorflow as tf  # import tensorflow
# import Keras model and layers
from tensorflow.keras.models import Sequential  # import Sequential model
from tensorflow.keras.layers import Dense, Dropout, InputLayer, LeakyReLU, Activation  # import layers
# import optimizers
from tensorflow.keras.optimizers import SGD, Adam, RMSprop  # import optimizers
# import callbacks
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau  # import callbacks


def build_model(num_hidden_layers, num_neurons, activation_name,
                learning_rate, optimizer_name, input_dim, output_dim,
                dropout_rate=0.3):
    """
    Assemble and compile a fully connected Keras model for multi-output regression.

    The network is: input -> num_hidden_layers x (Dense(num_neurons) ->
    activation -> Dropout(dropout_rate)) -> Dense(output_dim, linear).
    It is compiled with MSE loss and a weighted MSE metric named 'mse'.

    activation_name: 'leakyrelu', 'gelu' or 'tanh' (case-insensitive).
    optimizer_name:  'sgd', 'adam' or 'rmsprop' (case-insensitive).
    Raises ValueError for any other activation or optimizer name.
    """
    # Each factory returns a fresh layer instance, one per hidden block.
    activation_factories = {
        'leakyrelu': lambda: LeakyReLU(alpha=0.1),
        'gelu': lambda: Activation(tf.keras.activations.gelu),
        'tanh': lambda: Activation('tanh'),
    }
    optimizer_factories = {
        'sgd': lambda: SGD(learning_rate=learning_rate, momentum=0.9),
        'adam': lambda: Adam(learning_rate=learning_rate),
        'rmsprop': lambda: RMSprop(learning_rate=learning_rate),
    }

    model = Sequential()
    model.add(InputLayer(input_shape=(input_dim,)))

    for _ in range(num_hidden_layers):
        model.add(Dense(num_neurons))

        # The name is resolved inside the loop, so an unknown activation is
        # only reported when at least one hidden layer is requested.
        make_activation = activation_factories.get(activation_name.lower())
        if make_activation is None:
            raise ValueError(f"Unknown activation: {activation_name}")
        model.add(make_activation())

        model.add(Dropout(dropout_rate))

    model.add(Dense(output_dim, activation='linear'))

    make_optimizer = optimizer_factories.get(optimizer_name.lower())
    if make_optimizer is None:
        raise ValueError(f"Unknown optimizer: {optimizer_name}")
    optimizer = make_optimizer()

    model.compile(
        optimizer=optimizer,
        loss='mse',
        metrics=[],  # no unweighted metrics
        weighted_metrics=[tf.keras.metrics.MeanSquaredError(name='mse')],
    )

    return model


def load_data():
    """Load the train/validation arrays from the SageMaker input channels.

    Reads the channel directories from the SM_CHANNEL_TRAIN and
    SM_CHANNEL_VALIDATION environment variables and loads
    X_train.npy / Y_train.npy / w_train.npy from the former and
    X_val.npy / Y_val.npy / w_val.npy from the latter.

    Returns:
        Tuple (X_train, Y_train, w_train, X_val, Y_val, w_val) of NumPy arrays.

    Raises:
        RuntimeError: if either environment variable is not set. Without this
            check the failure surfaces as an opaque TypeError from
            os.path.join(None, ...).
    """
    train_dir = os.environ.get('SM_CHANNEL_TRAIN')
    val_dir = os.environ.get('SM_CHANNEL_VALIDATION')

    print(f"Train dir: {train_dir}")
    print(f"Validation dir: {val_dir}")

    for var_name, channel_dir in (('SM_CHANNEL_TRAIN', train_dir),
                                  ('SM_CHANNEL_VALIDATION', val_dir)):
        if channel_dir is None:
            raise RuntimeError(
                f"Environment variable {var_name} is not set; "
                "cannot locate the input channel directory"
            )

    def _load(channel_dir, stem):
        # Load '<stem>.npy' from the given channel directory.
        return np.load(os.path.join(channel_dir, f'{stem}.npy'))

    X_train, Y_train, w_train = (
        _load(train_dir, stem) for stem in ('X_train', 'Y_train', 'w_train')
    )
    X_val, Y_val, w_val = (
        _load(val_dir, stem) for stem in ('X_val', 'Y_val', 'w_val')
    )

    return X_train, Y_train, w_train, X_val, Y_val, w_val


def main():  # define main function
    """Grid-search the ANN hyper-parameters and persist the best model.

    Steps:
      1. Load the train/validation arrays via ``load_data``.
      2. Train one model per combination of hidden-layer count, layer width,
         activation, learning rate and optimizer, using early stopping on
         ``val_loss`` and a plateau-based learning-rate scheduler.
      3. Keep the model whose lowest finite ``val_loss`` is the smallest.
      4. Save that model as ``vit_base_ann_version2.h5`` under
         ``SM_MODEL_DIR`` and upload the file to S3.

    Raises:
        RuntimeError: if no combination produced a finite validation loss,
            so there is no model to save.
    """
    import itertools  # local import: only needed to enumerate the grid

    print("Loading data ...")  # print status message
    X_train, Y_train, w_train, X_val, Y_val, w_val = load_data()  # call load_data

    input_dim = X_train.shape[1]  # number of input features
    output_dim = Y_train.shape[1]  # number of output targets

    num_hidden_layers_list = [3, 5, 7]  # hidden layer counts to try
    num_neurons_list = [64, 128, 256]  # neurons per hidden layer to try
    activation_list = ['gelu', 'leakyrelu', 'tanh']  # activation functions to try
    learning_rates = [1e-4, 1e-3, 1e-5]  # initial learning rates to try
    optimizer_list = ['adam', 'sgd', 'rmsprop']  # optimizers to try

    batch_size = 200  # batch size for training
    num_epochs = 300  # maximum number of epochs per combination

    # Cartesian product of all options; the right-most list varies fastest,
    # which matches the order of the equivalent nested for-loops.
    grid = list(itertools.product(
        num_hidden_layers_list,
        num_neurons_list,
        activation_list,
        learning_rates,
        optimizer_list,
    ))
    total_combinations = len(grid)

    print(f"Total hyper-parameter combinations: {total_combinations}")  # print total combinations

    best_val_loss = np.inf  # best validation loss seen so far
    best_params = None  # hyper-parameters of the best model
    best_model = None  # best trained model

    for combo_counter, combo in enumerate(grid, start=1):
        num_hidden_layers, num_neurons, activation_name, lr, optimizer_name = combo

        print("\n======================================")  # separator line
        print(f"Training combination {combo_counter}/{total_combinations}")  # print combo index
        print(f"Hidden layers: {num_hidden_layers}")  # print hidden layers
        print(f"Neurons per layer: {num_neurons}")  # print neurons per layer
        print(f"Activation: {activation_name}")  # print activation
        print(f"Learning rate: {lr}")  # print learning rate
        print(f"Optimizer: {optimizer_name}")  # print optimizer
        print("======================================")  # separator line

        tf.keras.backend.clear_session()  # drop Keras global state left by the previous combination

        model = build_model(  # build model with current configuration
            num_hidden_layers=num_hidden_layers,
            num_neurons=num_neurons,
            activation_name=activation_name,
            learning_rate=lr,
            optimizer_name=optimizer_name,
            input_dim=input_dim,
            output_dim=output_dim,
            dropout_rate=0.3
        )

        early_stop = EarlyStopping(  # stop when validation loss stalls
            monitor='val_loss',
            patience=10,
            restore_best_weights=True,
            verbose=1
        )

        # NOTE(review): the scheduler tracks the *training* loss while early
        # stopping tracks val_loss — confirm this asymmetry is intended.
        lr_scheduler = ReduceLROnPlateau(  # halve the LR when training loss plateaus
            monitor='loss',
            factor=0.5,
            patience=5,
            min_lr=1e-6,
            verbose=1
        )

        history = model.fit(  # train the model
            X_train,
            Y_train,
            sample_weight=w_train,
            validation_data=(X_val, Y_val, w_val),
            epochs=num_epochs,
            batch_size=batch_size,
            callbacks=[early_stop, lr_scheduler],
            verbose=1
        )

        # Built-in min() is order-dependent when NaN is present, so select on
        # finite values only and skip runs that never produced one.
        val_losses = np.asarray(history.history['val_loss'], dtype=float)
        finite_val_losses = val_losses[np.isfinite(val_losses)]
        if finite_val_losses.size == 0:
            print(f"Finished combination {combo_counter}. "
                  "No finite val_loss recorded; skipping this model.")
            continue

        min_val_loss = float(finite_val_losses.min())  # best val_loss for this combo

        print(f"Finished combination {combo_counter}. "  # print summary for combo
              f"Best val_loss for this model: {min_val_loss:.6f}")  # print best val_loss

        if min_val_loss < best_val_loss:  # check if this is the best so far
            print(">>> New best model found! Updating best model ...")  # log improvement

            best_val_loss = min_val_loss
            best_params = {
                'num_hidden_layers': num_hidden_layers,
                'num_neurons': num_neurons,
                'activation': activation_name,
                'learning_rate': lr,
                'optimizer': optimizer_name
            }
            best_model = model

    if best_model is None:
        # Without this guard the summary below would fail on None.items().
        raise RuntimeError(
            "No hyper-parameter combination produced a finite validation loss; "
            "there is no model to save."
        )

    print("\n================ BEST MODEL =================")  # print header for best model
    print(f"Best validation loss: {best_val_loss}")  # print best validation loss
    print("Best hyper-parameters:")  # header for hyper-parameters
    for k, v in best_params.items():  # loop over best_params
        print(f"  {k}: {v}")  # print each parameter
    print("=============================================")  # footer line

    model_dir = os.environ.get('SM_MODEL_DIR', '/opt/ml/model')  # get model directory
    os.makedirs(model_dir, exist_ok=True)  # create model directory if needed

    local_model_path = os.path.join(model_dir, 'vit_base_ann_version2.h5')  # define local model path

    print(f"\nSaving best model to {local_model_path} ...")  # log save path
    best_model.save(local_model_path)  # save best model as .h5
    print("Model saved.")  # confirm save

    target_bucket = 'ai-bmi-predictor'  # set target S3 bucket
    target_key = 'trained-models/vit-base-models/vit_base_ann_version2.h5'  # set S3 object key

    print(f"Uploading best model to s3://{target_bucket}/{target_key} ...")  # log upload target
    s3_client = boto3.client('s3')  # create S3 client
    s3_client.upload_file(local_model_path, target_bucket, target_key)  # upload model file
    print("Upload complete.")  # confirm upload


if __name__ == '__main__':  # only start training when run as a script, not when imported
    main()  # entry point: runs main() defined above


Writing train_vit_base_ann.py


In [None]:
# Stage the train/validation arrays on S3 and launch the SageMaker training job.
# Expects X_train, Y_train, w_train, X_val, Y_val and w_val to already exist in
# the notebook namespace (created by earlier cells).
import os  # directory handling

import numpy as np  # saving arrays as .npy
import sagemaker  # SageMaker SDK
from sagemaker.inputs import TrainingInput  # data channel definition
from sagemaker.tensorflow import TensorFlow  # TensorFlow estimator


def _save_arrays(target_dir, arrays):
    """Write every ``name -> array`` pair to ``<target_dir>/<name>.npy``."""
    for array_name, array in arrays.items():
        np.save(os.path.join(target_dir, f"{array_name}.npy"), array)


# Local staging folders: one subfolder per SageMaker channel.
local_data_root = "local_np_data"
local_train_dir = os.path.join(local_data_root, "train")
local_val_dir = os.path.join(local_data_root, "validation")
os.makedirs(local_data_root, exist_ok=True)
os.makedirs(local_train_dir, exist_ok=True)
os.makedirs(local_val_dir, exist_ok=True)

# File names must match what the training script loads from each channel.
_save_arrays(local_train_dir, {"X_train": X_train, "Y_train": Y_train, "w_train": w_train})
_save_arrays(local_val_dir, {"X_val": X_val, "Y_val": Y_val, "w_val": w_val})

# SageMaker session and the IAM role this notebook runs under.
sess = sagemaker.Session()
role = sagemaker.get_execution_role()

# S3 layout: arrays live under <model_prefix>/train-val-arrays/{train,validation};
# the training job output is written under <model_prefix>/ in the same bucket.
bucket = "ai-bmi-predictor"
model_prefix = "trained-models/vit-base-models"
base_prefix = f"{model_prefix}/train-val-arrays"
train_prefix = f"{base_prefix}/train"
val_prefix = f"{base_prefix}/validation"

# Upload both staging folders; upload_data returns the S3 URI of each upload.
train_s3_path = sess.upload_data(
    path=local_train_dir,
    bucket=bucket,
    key_prefix=train_prefix
)
val_s3_path = sess.upload_data(
    path=local_val_dir,
    bucket=bucket,
    key_prefix=val_prefix
)

# One TrainingInput per channel, both carrying numpy files.
train_input = TrainingInput(
    s3_data=train_s3_path,
    content_type="application/x-npy"
)
validation_input = TrainingInput(
    s3_data=val_s3_path,
    content_type="application/x-npy"
)

# Estimator that runs train_vit_base_ann.py on a single GPU instance.
estimator = TensorFlow(
    entry_point="train_vit_base_ann.py",  # training script file
    role=role,  # IAM role for training job
    instance_type="ml.g4dn.12xlarge",  # instance type to use
    instance_count=1,  # number of instances
    framework_version="2.12",  # TensorFlow version
    py_version="py310",  # Python version
    sagemaker_session=sess,  # attached SageMaker session
    base_job_name="vit-base-ann-v2-training",  # base name for job
    output_path=f"s3://{bucket}/{model_prefix}/",  # S3 output path, derived from bucket/prefix above
    script_mode=True  # enable script mode
)

# Channel names become SM_CHANNEL_TRAIN / SM_CHANNEL_VALIDATION in the job,
# which is where the training script looks for its data.
estimator.fit(
    inputs={"train": train_input, "validation": validation_input},
    wait=True,  # block until job finishes
    logs="All"  # stream all training logs to this notebook
)


INFO:sagemaker.telemetry.telemetry_logging:SageMaker Python SDK will collect telemetry to help us better understand our user's needs, diagnose issues, and deliver additional features.
To opt out of telemetry, please disable via TelemetryOptOut parameter in SDK defaults config. For more information, refer to https://sagemaker.readthedocs.io/en/stable/overview.html#configuring-and-using-defaults-with-the-sagemaker-python-sdk.
INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: vit-base-ann-v2-training-2025-12-12-14-14-15-747


2025-12-12 14:14:16 Starting - Starting the training job
2025-12-12 14:14:16 Pending - Training job waiting for capacity............
2025-12-12 14:15:57 Pending - Preparing the instances for training...
2025-12-12 14:16:36 Downloading - Downloading the training image...............
2025-12-12 14:18:57 Training - Training image download completed. Training in progress....[34m2025-12-12 14:19:34.978114: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.[0m
[34m2025-12-12 14:19:35.026073: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.[0m
[34mTo enable the following instructions: AVX512F AVX512_VNNI, in other operations, rebuild TensorFlow with the appropriate com