1. load the dataset

In [11]:
import boto3
import pandas as pd

# S3 location of the training CSV: bucket name and object key.
bucket, key = "ai-bmi-predictor", "data/eff_training.csv"

# Fetch the object and parse its streaming body directly into a DataFrame.
s3 = boto3.client("s3")
obj = s3.get_object(Key=key, Bucket=bucket)
data = pd.read_csv(obj["Body"])

# Last expression of the cell: render the first rows as the cell output.
data.head()


Unnamed: 0,photo_id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,hip,leg-length,shoulder-breadth,shoulder-to-crotch,thigh,waist,wrist,gender,height_cm,weight_kg
0,6ab1d061f51c6079633aeceed2faeb0b,6.8e-05,0.108145,-0.138813,0.633156,0.346266,-0.046055,0.016021,-0.058632,0.097968,...,105.3339,76.817467,35.362858,65.993683,54.459591,88.813789,16.764332,female,170.5,72.0
1,e94e2e05fb8b099955bbc4fa5ce81e22,0.020843,0.026005,-0.093442,0.736929,0.240569,0.089982,-0.112391,0.000435,-0.07611,...,101.478989,85.154358,37.25676,65.861588,52.773052,89.176338,15.690955,male,178.3,71.8
2,ba6951a4f37fc9302243370e927a02e2,0.014542,-0.071332,-0.154407,0.577781,0.196485,-0.125341,-0.056713,-0.027295,0.094879,...,97.488243,81.410393,37.503147,66.042679,57.059261,82.201988,16.686253,male,176.25,76.5
3,947d16539d4702427aa74f737329ffb9,0.041775,0.075746,-0.128497,0.48501,0.120409,0.011227,0.017852,-0.089796,-0.011273,...,120.586845,69.361534,34.084633,60.41333,65.0,102.323845,17.693762,female,152.1,88.9
4,9326695bf62926ec22690f576a633bba,0.004397,0.05859,-0.154224,0.52814,0.290956,-0.108486,-0.021441,-0.099909,0.08077,...,110.543564,77.160583,38.086231,68.400543,57.172279,107.378578,16.594791,male,171.5,88.4


2. data preprocessing

2.1. categorical encoding for 'gender' feature

In [12]:
import pandas as pd                 # import pandas for data handling

# Replace the 'gender' labels with their integer category codes in one step.
# With the labels previewed above this yields 'female' -> 0 and 'male' -> 1.
data['gender'] = data['gender'].astype('category').cat.codes

In [13]:
# Spot-check the encoded column: the first five values should be integer codes.
data['gender'].head(5)

0    0
1    1
2    1
3    0
4    1
Name: gender, dtype: int8

In [14]:
#data['height_cm'].head()

2.2. compute inverse-frequency class weights to address class imbalance in weight_kg

In [15]:
import pandas as pd                     # import pandas for data handling
import numpy as np                      # import numpy to help with safe division

# Expects the DataFrame `data` from the loading step. The 60 / 100 thresholds are
# in kilograms, so this cell must run before step 2.6 rescales 'weight_kg'.
print("\nTotal samples in dataset:", len(data))

# -----------------------------
# 1. Boolean masks for the three weight_kg bands
# -----------------------------
class_1_mask = data['weight_kg'].lt(60)            # strictly below 60
class_2_mask = data['weight_kg'].gt(100)           # strictly above 100
class_3_mask = data['weight_kg'].between(60, 100)  # 60..100, both ends included

# -----------------------------
# 2. Per-band sample counts (True counts of each mask)
# -----------------------------
freq_class_1, freq_class_2, freq_class_3 = (
    class_1_mask.sum(),
    class_2_mask.sum(),
    class_3_mask.sum(),
)

print("\nClass frequencies:")
print("Class 1 (weight_kg < 60):", freq_class_1)
print("Class 2 (weight_kg > 100):", freq_class_2)
print("Class 3 (60 <= weight_kg <= 100):", freq_class_3)

# -----------------------------
# 3. Number of bands defined by the rules above
# -----------------------------
num_classes = 3
print("\nNumber of classes:", num_classes)

# -----------------------------
# 4. Inverse-frequency weight per band
#    w = total_samples / (num_classes * class_frequency)
# -----------------------------
total_samples = len(data)


def safe_weight(class_freq):
    """Return ``total_samples / (num_classes * class_freq)`` for one class.

    Reads the module-level ``total_samples`` and ``num_classes`` at call time.
    Returns ``np.nan`` instead of dividing when ``class_freq`` is zero.
    """
    return np.nan if class_freq == 0 else total_samples / (num_classes * class_freq)


weight_class_1, weight_class_2, weight_class_3 = map(
    safe_weight, (freq_class_1, freq_class_2, freq_class_3)
)

print("\nClass weights (inverse frequency):")
print("Weight for Class 1 (weight_kg < 60):", weight_class_1)
print("Weight for Class 2 (weight_kg > 100):", weight_class_2)
print("Weight for Class 3 (60 <= weight_kg <= 100):", weight_class_3)



Total samples in dataset: 6134

Class frequencies:
Class 1 (weight_kg < 60): 1049
Class 2 (weight_kg > 100): 514
Class 3 (60 <= weight_kg <= 100): 4571

Number of classes: 3

Class weights (inverse frequency):
Weight for Class 1 (weight_kg < 60): 1.9491579281855735
Weight for Class 2 (weight_kg > 100): 3.9779507133592737
Weight for Class 3 (60 <= weight_kg <= 100): 0.4473127689054182


2.3. compute inverse-frequency class weights to address class imbalance in the gender feature

In [16]:
import numpy as np                                      # import numpy for numeric utilities (like NaN)

print("Preview of gender column:\n", data['gender'].head())

# -----------------------------------
# 1. Sample count per encoded gender class
# -----------------------------------
gender_counts = data['gender'].value_counts()

print("\nClass frequencies for gender:")
for gender_class, freq in gender_counts.items():
    print(f"Class {gender_class}: {freq}")

# -----------------------------------
# 2. Number of distinct gender classes present in the data
# -----------------------------------
num_gender_classes = len(gender_counts)
print("\nNumber of gender classes:", num_gender_classes)

# -----------------------------------
# 3. Inverse-frequency weight per gender class
#    w = total_samples / (num_classes * class_frequency)
# -----------------------------------
total_samples = len(data)


def safe_weight(class_freq):
    """Return ``total_samples / (num_gender_classes * class_freq)`` for one class.

    Note: this redefines the ``safe_weight`` helper from step 2.2; from here on
    the name divides by ``num_gender_classes`` rather than ``num_classes``.
    Both globals are read at call time. A zero frequency yields ``np.nan``.
    """
    if class_freq != 0:
        return total_samples / (num_gender_classes * class_freq)
    return np.nan


# Map every gender class to its weight, preserving the order of gender_counts.
gender_weights = {
    gender_class: safe_weight(freq)
    for gender_class, freq in gender_counts.items()
}

print("\nClass weights (inverse frequency) for gender:")
for gender_class, weight in gender_weights.items():
    print(f"Weight for class {gender_class}: {weight}")


Preview of gender column:
 0    0
1    1
2    1
3    0
4    1
Name: gender, dtype: int8

Class frequencies for gender:
Class 1: 3650
Class 0: 2484

Number of gender classes: 2

Class weights (inverse frequency) for gender:
Weight for class 1: 0.8402739726027397
Weight for class 0: 1.2347020933977455


2.4. combine the weight-class weights and the gender-class weights

In [17]:
import numpy as np   # import numpy for numeric operations

# -------------------------------------------------
# 1. Gather the per-band weights computed in step 2.2 under readable labels
# -------------------------------------------------
weight_class_weights = dict([
    ('weight_<60', weight_class_1),      # weight_kg < 60
    ('weight_>100', weight_class_2),     # weight_kg > 100
    ('weight_60_100', weight_class_3),   # 60 <= weight_kg <= 100
])

print("Weight-class weights:", weight_class_weights)

# gender_weights comes from step 2.3 and maps encoded gender -> weight.
print("Gender-class weights:", gender_weights)

# -------------------------------------------------
# 2. Cross every weight band with every gender class
#    wi = w_weight * w_gender
# -------------------------------------------------
combined_weights = {}

print("\nCombined weights for each (weight_class, gender_class):")
for w_label in weight_class_weights:
    w_w = weight_class_weights[w_label]
    for g_label in gender_weights:
        w_g = gender_weights[g_label]
        wi = w_w * w_g
        combined_weights[w_label, g_label] = wi   # key is the (band, gender) tuple
        print(f"{w_label} & gender {g_label}: {wi}")

Weight-class weights: {'weight_<60': 1.9491579281855735, 'weight_>100': 3.9779507133592737, 'weight_60_100': 0.4473127689054182}
Gender-class weights: {1: 0.8402739726027397, 0: 1.2347020933977455}

Combined weights for each (weight_class, gender_class):
weight_<60 & gender 1: 1.6378266755466175
weight_<60 & gender 0: 2.4066293742935403
weight_>100 & gender 1: 3.3425684487322993
weight_>100 & gender 0: 4.9115840732177505
weight_60_100 & gender 1: 0.37586527732408703
weight_60_100 & gender 0: 0.5522980121710618


2.5. create a dictionary mapping each row index to its combined sample weight

In [18]:
# Show the column labels as they are before the row-id column is added.
print("Columns before adding index column:\n", data.columns)

# Give every row an explicit 0..n-1 identifier stored as an ordinary column.
data['index'] = range(len(data))

# Put 'index' first and keep every other column in its existing order.
cols = ['index', *(c for c in data.columns if c != 'index')]
data = data[cols]

# Show first few rows to verify the new indexing column
#print("\nDataFrame after adding 'index' column:\n", data.head())


Columns before adding index column:
 Index(['photo_id', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9',
       ...
       'hip', 'leg-length', 'shoulder-breadth', 'shoulder-to-crotch', 'thigh',
       'waist', 'wrist', 'gender', 'height_cm', 'weight_kg'],
      dtype='object', length=5138)


In [19]:
import numpy as np               # import numpy for numeric operations
import pickle                    # import pickle to save Python objects

# -------------------------------------------------
# 0. We assume these already exist:
#    - weight_class_1, weight_class_2, weight_class_3
#    - gender_weights   (dict: {gender_class: weight})
# -------------------------------------------------

# Rebuild the label -> weight lookup for the three weight_kg bands
# (same labels and values as the dictionary built in step 2.4).
weight_class_weights = dict(zip(
    ('weight_<60', 'weight_>100', 'weight_60_100'),
    (weight_class_1, weight_class_2, weight_class_3),
))

print("Weight-class weights:", weight_class_weights)
print("Gender-class weights:", gender_weights)

# -------------------------------------------------
# 1. Helper function to get the weight class label for a given weight_kg
# -------------------------------------------------
def get_weight_class(w):
    """Return the weight-band label for a single ``weight_kg`` value.

    ``w < 60`` maps to ``'weight_<60'`` and ``w > 100`` maps to ``'weight_>100'``.
    Every other value, including exactly 60 and exactly 100, maps to
    ``'weight_60_100'``.
    """
    if w < 60:
        return 'weight_<60'
    if w > 100:
        return 'weight_>100'
    # Neither comparison held: treat the value as belonging to the middle band.
    return 'weight_60_100'

# -------------------------------------------------
# 2. Build dictionary: keys = index values, values = combined weights
# -------------------------------------------------
final_weights = {}

print("\nBuilding final_weights dictionary...")

# Walk only the three columns that are needed, as plain Python lists.
# DataFrame.iterrows() would build a full object-dtype Series (one entry per
# column of this very wide frame) for every row just to read three values.
# .tolist() yields native Python scalars, so the dictionary keys stay plain ints.
for idx_val, gender_val, weight_val in zip(
    data['index'].tolist(),
    data['gender'].tolist(),
    data['weight_kg'].tolist(),
):
    w_class = get_weight_class(weight_val)        # band label for this row's weight_kg
    w_weight = weight_class_weights[w_class]      # weight of that band
    w_gender = gender_weights[gender_val]         # weight of this row's gender class (KeyError if unseen)

    combined_w = w_weight * w_gender              # combined per-sample weight w_i
    final_weights[idx_val] = combined_w           # keyed by the row's 'index' value

print("Number of entries in final_weights:", len(final_weights))
print("First 5 items in final_weights:", list(final_weights.items())[:5])

# -------------------------------------------------
# 3. Sanity check: recompute the weight for index 0 and compare with the dictionary
# -------------------------------------------------
print("\nChecking entry with index 0...")

row0 = data.loc[data['index'] == 0].iloc[0]      # the row whose 'index' column equals 0

gender0 = row0['gender']
weight0 = row0['weight_kg']
w_class0 = get_weight_class(weight0)

w_weight0 = weight_class_weights[w_class0]
w_gender0 = gender_weights[gender0]
combined0_calc = w_weight0 * w_gender0           # should match final_weights[0]

print("Row 0 -> gender:", gender0)
print("Row 0 -> weight_kg:", weight0)
print("Row 0 -> weight class:", w_class0)
print("w_weight for row 0:", w_weight0)
print("w_gender for row 0:", w_gender0)
print("Combined weight (calculated):", combined0_calc)
print("Combined weight from final_weights[0]:", final_weights[0])

# -------------------------------------------------
# 4. Persist the dictionary so step 3.2 can reload it
# -------------------------------------------------
print("\nSaving final_weights dictionary as pickle file...")

with open('final_weights.pkl', 'wb') as f:       # binary write mode is required by pickle
    pickle.dump(final_weights, f)

print("Dictionary saved to 'final_weights.pkl'.")


Weight-class weights: {'weight_<60': 1.9491579281855735, 'weight_>100': 3.9779507133592737, 'weight_60_100': 0.4473127689054182}
Gender-class weights: {1: 0.8402739726027397, 0: 1.2347020933977455}

Building final_weights dictionary...
Number of entries in final_weights: 6134
First 5 items in final_weights: [(0, 0.5522980121710618), (1, 0.37586527732408703), (2, 0.37586527732408703), (3, 0.5522980121710618), (4, 0.37586527732408703)]

Checking entry with index 0...
Row 0 -> gender: 0
Row 0 -> weight_kg: 72.0
Row 0 -> weight class: weight_60_100
w_weight for row 0: 0.4473127689054182
w_gender for row 0: 1.2347020933977455
Combined weight (calculated): 0.5522980121710618
Combined weight from final_weights[0]: 0.5522980121710618

Saving final_weights dictionary as pickle file...
Dictionary saved to 'final_weights.pkl'.


2.6. apply min-max scaling with range -1 to 1 for body measurements

In [20]:
from sklearn.preprocessing import MinMaxScaler   # import the scaler for min-max normalization

# Body measurements plus height and weight; each column is rescaled
# independently to the range [-1, 1].
cols_to_scale = [
    'ankle', 'arm-length', 'bicep', 'calf', 'chest',
    'forearm', 'hip', 'leg-length', 'shoulder-breadth', 'shoulder-to-crotch',
    'thigh', 'waist', 'wrist', 'height_cm', 'weight_kg',
]

# NOTE(review): the scaler is fit on the full dataset, i.e. before the
# train/validation split in step 3.2, so validation rows influence the
# min/max statistics. Confirm this is acceptable for the evaluation.
# After this cell 'weight_kg' is no longer in kilograms, so the kg-based
# thresholds of steps 2.2 / 2.5 must not be re-run against it.
scaler = MinMaxScaler(feature_range=(-1, 1))
scaler.fit(data[cols_to_scale])
data[cols_to_scale] = scaler.transform(data[cols_to_scale])

3. model training

3.1. Split the data for independent and dependent features

In [21]:
# 3.1. split the data for independent and dependent features

# Multi-target regression outputs: thirteen body measurements plus body weight.
# All of them were min-max scaled in step 2.6.
target_cols = [
    'ankle', 'arm-length', 'bicep', 'calf', 'chest', 'forearm', 'hip',
    'leg-length', 'shoulder-breadth', 'shoulder-to-crotch', 'thigh',
    'waist', 'wrist',
    'weight_kg',
]

# Y: one column per target.
Y = data[target_cols]

# Identifier columns are not features, and the targets must not appear in X.
drop_cols = ['photo_id', 'subject_id', 'index', *target_cols]

# X: every remaining column (e.g. the f* feature columns, gender, height_cm).
X = data.drop(columns=drop_cols)

# Shape check: X and Y must have the same number of rows.
print("Selected target columns:", target_cols)
print("Shape of Y (samples, targets):", Y.shape)
print("Shape of X (samples, features):", X.shape)


Selected target columns: ['ankle', 'arm-length', 'bicep', 'calf', 'chest', 'forearm', 'hip', 'leg-length', 'shoulder-breadth', 'shoulder-to-crotch', 'thigh', 'waist', 'wrist', 'weight_kg']
Shape of Y (samples, targets): (6134, 14)
Shape of X (samples, features): (6134, 5122)


3.2. Prepare sample weights and train/validation split

In [22]:
# 3.2. prepare sample weights and train/validation split

import numpy as np                                      # numerical operations on arrays
import pickle                                           # load saved Python objects
from sklearn.model_selection import train_test_split    # split arrays into train and validation sets

# Read back the {index: combined_weight} dictionary written in step 2.5.
# NOTE(review): pickle.load can execute arbitrary code; only use it on this
# locally produced file, never on an untrusted one.
with open('final_weights.pkl', 'rb') as f:
    final_weights_dict = pickle.load(f)

# One weight per row, in DataFrame row order, looked up through the 'index' column.
sample_weights = (
    data['index']
    .map(final_weights_dict)
    .values
    .astype('float32')
)

print("Sample weights shape:", sample_weights.shape)
print("First 10 sample weights:", sample_weights[:10])

# 80/20 shuffled split; X, Y and the weights are split with the same permutation
# so rows stay aligned across the three arrays.
(X_train, X_val,
 Y_train, Y_val,
 w_train, w_val) = train_test_split(
    X, Y, sample_weights,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

# Shape check for every piece of the split.
print("X_train shape:", X_train.shape)
print("Y_train shape:", Y_train.shape)
print("X_val shape:", X_val.shape)
print("Y_val shape:", Y_val.shape)
print("w_train shape:", w_train.shape)
print("w_val shape:", w_val.shape)


Sample weights shape: (6134,)
First 10 sample weights: [0.552298   0.37586528 0.37586528 0.552298   0.37586528 0.552298
 0.37586528 0.37586528 0.552298   0.37586528]
X_train shape: (4907, 5122)
Y_train shape: (4907, 14)
X_val shape: (1227, 5122)
Y_val shape: (1227, 14)
w_train shape: (4907,)
w_val shape: (1227,)


3.3. Save train/validation arrays and upload them to S3

In [23]:
# 3.3. save train/validation arrays and upload them to S3 for the training job

import os                                           # filesystem and path operations
import numpy as np                                  # saving numpy arrays
import sagemaker                                    # SageMaker SDK
from sagemaker.inputs import TrainingInput          # helper class for defining input channels

# Local staging layout: local_np_data/train and local_np_data/validation.
local_data_root = "local_np_data"
os.makedirs(local_data_root, exist_ok=True)

local_train_dir = os.path.join(local_data_root, "train")
local_val_dir = os.path.join(local_data_root, "validation")
os.makedirs(local_train_dir, exist_ok=True)
os.makedirs(local_val_dir, exist_ok=True)

# Training arrays -> .npy files (file names are what the training script loads).
np.save(os.path.join(local_train_dir, "X_train.npy"), X_train)
np.save(os.path.join(local_train_dir, "Y_train.npy"), Y_train)
np.save(os.path.join(local_train_dir, "w_train.npy"), w_train)

# Validation arrays -> .npy files.
np.save(os.path.join(local_val_dir, "X_val.npy"), X_val)
np.save(os.path.join(local_val_dir, "Y_val.npy"), Y_val)
np.save(os.path.join(local_val_dir, "w_val.npy"), w_val)

# SageMaker session and the IAM role of this notebook environment.
sess = sagemaker.Session()
role = sagemaker.get_execution_role()

# Destination bucket and key prefixes for the two data channels.
bucket = "ai-bmi-predictor"
base_prefix = "trained-models/efficientnet-models/train-val-arrays"
train_prefix = base_prefix + "/train"
val_prefix = base_prefix + "/validation"

# Upload each local folder; the returned S3 path is reused for the channel below.
train_s3_path = sess.upload_data(
    path=local_train_dir, bucket=bucket, key_prefix=train_prefix
)
val_s3_path = sess.upload_data(
    path=local_val_dir, bucket=bucket, key_prefix=val_prefix
)

# One TrainingInput per channel, both declared as numpy content.
train_input = TrainingInput(
    s3_data=train_s3_path, content_type="application/x-npy"
)
validation_input = TrainingInput(
    s3_data=val_s3_path, content_type="application/x-npy"
)


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


3.4. LightGBM training script (multi-target, weighted MSE) for SKLearn container

In [29]:
%%writefile train_eff_gbm.py
# 3.4. LightGBM multi-output regression training script for SageMaker SKLearn container

import os                                           # OS utilities for paths and environment variables
import numpy as np                                  # numerical operations with arrays
import joblib                                       # saving and loading sklearn-style models
import boto3                                        # AWS SDK for uploading the model to S3

from sklearn.multioutput import MultiOutputRegressor        # wrapper for multi-target regression
from sklearn.metrics import mean_squared_error              # function to compute (weighted) MSE

from lightgbm import LGBMRegressor                         # LightGBM regressor (ensure lightgbm is installed)

def load_data():
    """Load the train/validation ``.npy`` arrays from the SageMaker input channels.

    The channel directories are taken from the ``SM_CHANNEL_TRAIN`` and
    ``SM_CHANNEL_VALIDATION`` environment variables. The training directory must
    contain ``X_train.npy``, ``Y_train.npy`` and ``w_train.npy``; the validation
    directory must contain ``X_val.npy``, ``Y_val.npy`` and ``w_val.npy``.

    Returns:
        tuple: ``(X_train, Y_train, w_train, X_val, Y_val, w_val)``.

    Raises:
        RuntimeError: if either channel environment variable is unset or empty
            (previously this surfaced as an opaque ``TypeError`` from
            ``os.path.join(None, ...)``).
    """
    train_dir = os.environ.get("SM_CHANNEL_TRAIN")         # directory path for training channel
    val_dir   = os.environ.get("SM_CHANNEL_VALIDATION")    # directory path for validation channel

    print(f"Train dir: {train_dir}")                       # show training directory
    print(f"Validation dir: {val_dir}")                    # show validation directory

    # Fail early with a message that names the missing variable(s).
    missing = [
        name
        for name, value in (
            ("SM_CHANNEL_TRAIN", train_dir),
            ("SM_CHANNEL_VALIDATION", val_dir),
        )
        if not value
    ]
    if missing:
        raise RuntimeError(
            "Missing SageMaker channel environment variable(s): "
            + ", ".join(missing)
        )

    X_train = np.load(os.path.join(train_dir, "X_train.npy"))  # load training features
    Y_train = np.load(os.path.join(train_dir, "Y_train.npy"))  # load training targets
    w_train = np.load(os.path.join(train_dir, "w_train.npy"))  # load training sample weights

    X_val = np.load(os.path.join(val_dir, "X_val.npy"))        # load validation features
    Y_val = np.load(os.path.join(val_dir, "Y_val.npy"))        # load validation targets
    w_val = np.load(os.path.join(val_dir, "w_val.npy"))        # load validation sample weights

    return X_train, Y_train, w_train, X_val, Y_val, w_val      # return all loaded arrays

def weighted_mse(y_true, y_pred, sample_weight):
    """Return the sample-weighted mean squared error averaged over all targets.

    Thin wrapper around ``sklearn.metrics.mean_squared_error`` called with
    ``multioutput="uniform_average"``, so every target column contributes
    equally to the single returned score.
    """
    score = mean_squared_error(
        y_true,
        y_pred,
        multioutput="uniform_average",   # one scalar: plain mean over target columns
        sample_weight=sample_weight,     # per-row weights
    )
    return score

def main():
    """
    Grid-search LightGBM hyper-parameters, then save and upload the best model.

    Steps:
      1. Load the train/validation arrays via load_data().
      2. For every combination in the hyper-parameter grid, fit a
         MultiOutputRegressor wrapping an LGBMRegressor on the training data
         (with sample weights) and score it on the validation set with
         weighted_mse().
      3. Keep the model with the lowest validation weighted MSE.
      4. Dump that model with joblib into SM_MODEL_DIR (default
         /opt/ml/model) and upload the file to S3.

    Raises:
        RuntimeError: if no combination produced a validation weighted MSE
            below infinity, i.e. there is no model to save.
    """
    import itertools  # local import keeps this function self-contained

    print("Loading data ...")
    X_train, Y_train, w_train, X_val, Y_val, w_val = load_data()

    print("X_train shape:", X_train.shape)
    print("Y_train shape:", Y_train.shape)
    print("X_val shape:", X_val.shape)
    print("Y_val shape:", Y_val.shape)

    # Hyper-parameter grids to search over.
    num_leaves_list      = [35, 100, 200]        # number of leaves for each tree
    boosting_type_list   = ['gbdt', 'dart']      # standard gradient boosting or DART
    learning_rate_list   = [0.1, 0.001, 0.01]    # learning rates to try
    num_boost_round_list = [100, 200, 50]        # boosting iterations (n_estimators)

    # itertools.product varies the last list fastest, which is the same
    # order as four nested for-loops over these lists.
    grid = list(itertools.product(
        num_leaves_list,
        boosting_type_list,
        learning_rate_list,
        num_boost_round_list,
    ))
    total_combinations = len(grid)

    print(f"Total hyper-parameter combinations: {total_combinations}")

    best_val_wmse = np.inf
    best_model = None
    best_params = None

    # Manual grid search over all hyper-parameter combinations.
    for combo_counter, combo in enumerate(grid, start=1):
        num_leaves, boosting_type, learning_rate, num_boost_round = combo

        print("\n======================================")
        print(f"Training combination {combo_counter}/{total_combinations}")
        print(f"num_leaves: {num_leaves}")
        print(f"boosting_type: {boosting_type}")
        print(f"learning_rate: {learning_rate}")
        print(f"num_boost_round: {num_boost_round}")
        print("======================================")

        # Base LightGBM regressor with the current hyper-parameters.
        base_model = LGBMRegressor(
            objective="regression",
            num_leaves=num_leaves,
            boosting_type=boosting_type,
            learning_rate=learning_rate,
            n_estimators=num_boost_round,
            random_state=42,                    # fixed seed for reproducibility
            n_jobs=-1                           # use all available CPU cores
        )

        # Wrap the base model so it is used for each target.
        model = MultiOutputRegressor(
            estimator=base_model
        )

        # Fit with the training sample weights.
        model.fit(
            X_train,
            Y_train,
            sample_weight=w_train
        )

        # Score the fitted model on the validation set.
        Y_val_pred = model.predict(X_val)
        val_wmse = weighted_mse(
            Y_val,
            Y_val_pred,
            w_val
        )

        print(f"Validation weighted MSE: {val_wmse:.6f}")

        # Strict "<" keeps the earliest combination on ties.
        if val_wmse < best_val_wmse:
            print(">>> New best model found! Updating best model ...")
            best_val_wmse = val_wmse
            best_model = model
            best_params = {
                "num_leaves": num_leaves,
                "boosting_type": boosting_type,
                "learning_rate": learning_rate,
                "num_boost_round": num_boost_round
            }

    # Without this guard a pickled None would be written and uploaded to S3
    # over the model key whenever no combination beat the initial np.inf.
    if best_model is None:
        raise RuntimeError(
            "Grid search did not produce any model with a finite validation "
            "weighted MSE; nothing to save."
        )

    print("\n================ BEST MODEL =================")
    print(f"Best validation weighted MSE: {best_val_wmse}")
    print("Best hyper-parameters:")
    for key, value in best_params.items():
        print(f"  {key}: {value}")
    print("=============================================")

    # Model directory inside the container.
    model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
    os.makedirs(model_dir, exist_ok=True)

    # Path where the best model will be saved.
    local_model_path = os.path.join(model_dir, "eff_gbm_v1.pkl")

    print(f"\nSaving best model to {local_model_path} ...")
    joblib.dump(best_model, local_model_path)
    print("Model saved.")

    # Upload the saved model directly to the specified S3 bucket.
    target_bucket = "ai-bmi-predictor"
    target_key = "trained-models/efficientnet-models/eff_gbm_v1.pkl"

    print(f"Uploading best model to s3://{target_bucket}/{target_key} ...")
    s3_client = boto3.client("s3")
    s3_client.upload_file(local_model_path, target_bucket, target_key)
    print("Upload complete.")

# Script entry point: run the training pipeline only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    main()


Overwriting train_eff_gbm.py


3.5. Configure and launch the SageMaker SKLearn training job

In [30]:
# Create requirements.txt so the SKLearn container installs LightGBM.
# The version is pinned to the release that the training-job log in this
# notebook shows being installed (lightgbm 4.6.0), so re-running the job
# resolves the same package instead of whatever is newest at that time.

requirements_text = "lightgbm==4.6.0\n"   # contents of requirements.txt (one pinned package)

# Explicit encoding so the file bytes do not depend on the platform default.
with open("requirements.txt", "w", encoding="utf-8") as f:
    f.write(requirements_text)

print("requirements.txt created with contents:")
print(requirements_text)


requirements.txt created with contents:
lightgbm



In [None]:
# Configure the SageMaker SKLearn estimator for the LightGBM grid search and
# launch the training job.

from sagemaker.sklearn import SKLearn

# Estimator that runs train_eff_gbm.py inside the SKLearn container.
sklearn_estimator = SKLearn(
    # What to run: the training script plus requirements.txt, which is
    # packaged with the script so lightgbm gets installed.
    entry_point="train_eff_gbm.py",
    dependencies=["requirements.txt"],
    # Container: SKLearn framework version and Python version.
    framework_version="1.2-1",
    py_version="py3",
    # Execution context: IAM role and the SageMaker session created earlier.
    role=role,
    sagemaker_session=sess,
    # Compute: a single instance of the chosen type.
    instance_type="ml.c6i.32xlarge",
    instance_count=1,
    # Job naming and the S3 location for model artifacts.
    base_job_name="eff-gbm-v1-training",
    output_path="s3://ai-bmi-predictor/trained-models/efficientnet-models/",
)

# Launch the job with the prepared S3 inputs, mapping each channel name to
# its TrainingInput. Block until the job completes and stream all of its
# logs into the notebook.
sklearn_estimator.fit(
    inputs={
        "train": train_input,
        "validation": validation_input,
    },
    logs="All",
    wait=True,
)


INFO:sagemaker.telemetry.telemetry_logging:SageMaker Python SDK will collect telemetry to help us better understand our user's needs, diagnose issues, and deliver additional features.
To opt out of telemetry, please disable via TelemetryOptOut parameter in SDK defaults config. For more information, refer to https://sagemaker.readthedocs.io/en/stable/overview.html#configuring-and-using-defaults-with-the-sagemaker-python-sdk.
INFO:sagemaker:Creating training-job with name: eff-gbm-v1-training-2025-12-11-07-28-55-278


2025-12-11 07:28:57 Starting - Starting the training job...
2025-12-11 07:29:11 Starting - Preparing the instances for training......
2025-12-11 07:30:16 Downloading - Downloading the training image...
  import pkg_resources[0m
[34m2025-12-11 07:31:10,439 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2025-12-11 07:31:10,441 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2025-12-11 07:31:10,443 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2025-12-11 07:31:10,453 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2025-12-11 07:31:10,651 sagemaker-training-toolkit INFO     Installing dependencies from requirements.txt[0m
[34mCollecting lightgbm (from -r requirements.txt (line 1))
  Downloading lightgbm-4.6.0-py3-none-manylinux_2_28_x86_64.whl.metadata (17 kB)[0m
[34mDownloading lightgbm-4.6.0-py3-none-ma