1. Load the measurements dataset

In [1]:
import pandas as pd
import boto3
from botocore import UNSIGNED
from botocore.config import Config

# Requests are sent unsigned (signature_version=UNSIGNED), so no AWS
# credentials are attached to the call.
anonymous_config = Config(signature_version=UNSIGNED)
s3 = boto3.client("s3", config=anonymous_config)

# Fetch the CSV object and parse its streaming body straight into a DataFrame.
obj = s3.get_object(Key="testB/measurements.csv", Bucket="amazon-bodym")
csv_stream = obj["Body"]
measurements = pd.read_csv(csv_stream)

# Print (rows, columns), then show the first five rows as the cell output.
print(measurements.shape)
measurements.head()

(400, 15)


Unnamed: 0,subject_id,ankle,arm-length,bicep,calf,chest,forearm,height,hip,leg-length,shoulder-breadth,shoulder-to-crotch,thigh,waist,wrist
0,07P1Yi2alvpi0MtJTu7-TmoT5E3x2dABQKiuu_jcnLU,23.424223,47.128738,29.744926,37.697201,98.719429,25.687422,168.497925,101.086189,74.17186,37.112251,64.66703,52.378044,87.121025,16.179781
1,0AQqSm49JJoFaYTzjDcmRHM4gdwcYZyOqXBZ_yvNy0E,23.465136,48.513802,28.611294,33.724255,96.476715,24.859922,173.204254,98.461998,76.29821,35.401356,66.202171,48.386967,85.32827,16.165899
2,0PghUpgAgKJiVgHZpWydh6WqIYKYTlzV-5QR7eSR318,21.773336,45.14246,25.588623,32.983479,85.886726,21.98181,157.488388,89.685318,72.312012,31.207792,55.746342,46.059982,72.437881,14.523148
3,0QW7SNzqmRooRGXql2nRJG5HmfgDtvxaxDcovjh9DRQ,23.121952,47.717579,27.691643,36.397129,85.084686,23.563515,165.294891,100.694473,76.831047,31.613873,59.997318,52.054901,76.603951,15.654763
4,0RRTDLueddykSM_-MbQYeCfx43VR1uf19kN8n6L7sJA,22.359692,46.879379,33.301228,35.578388,114.84845,25.253469,165.55452,106.579338,75.1987,33.318371,63.201122,57.078968,100.175446,15.805896


2. Load the hwg_metadata dataset

In [2]:
import pandas as pd
import boto3
from botocore import UNSIGNED
from botocore.config import Config

# Requests are sent unsigned (signature_version=UNSIGNED), so no AWS
# credentials are attached to the call.
anonymous_config = Config(signature_version=UNSIGNED)
s3 = boto3.client("s3", config=anonymous_config)

# Fetch the metadata CSV object and read its streaming body into a DataFrame.
obj = s3.get_object(Key="testB/hwg_metadata.csv", Bucket="amazon-bodym")
csv_stream = obj["Body"]
hwg = pd.read_csv(csv_stream)

# Print (rows, columns), then show the first five rows as the cell output.
print(hwg.shape)
hwg.head()

(400, 4)


Unnamed: 0,subject_id,gender,height_cm,weight_kg
0,ikt5hpg4u--nWYDSMAfZXOVZhhah-YtX7p6p7q8Hg6I,female,174.0,56.7
1,AcXtVN5DaisfM7J3T6N5F-ZePD30WawSdWJa-8IEPpA,male,186.0,93.6
2,6JWWSl4hJ08_707MzdBolsPOvdgW74O4q1U9WVUG4co,male,179.5,116.8
3,HaE2Jiu7OrRAJrWV2k6EHxguAzayIdgigQFgAO4lK_Y,male,171.5,70.9
4,aHeOj3t0BEox4aP4LnA_-f1WdN2R74ZTzLBTt-i5C8g,male,160.0,74.7


3. Merge the 'measurements' and 'hwg' datasets on 'subject_id' (inner join)

In [3]:
import pandas as pd  

# Normalise the join key on both frames: cast to str so the key dtypes agree,
# then strip surrounding whitespace so stray spaces cannot cause missed matches.
measurements['subject_id'] = measurements['subject_id'].astype(str).str.strip()
hwg['subject_id'] = hwg['subject_id'].astype(str).str.strip()

# Inner join on subject_id: only IDs present in both tables are kept.
# validate='one_to_one' makes pandas raise MergeError if subject_id is
# duplicated on either side, instead of silently multiplying rows.
merged = pd.merge(
    measurements,
    hwg,
    on='subject_id',
    how='inner',
    suffixes=('_meas', '_hwg'),  # applied only to overlapping non-key column names
    validate='one_to_one',
)

# Sanity check on the output size, then show the first rows as the cell output.
print(f"Rows: {merged.shape[0]}, Columns: {merged.shape[1]}")
merged.head()


Rows: 400, Columns: 18


Unnamed: 0,subject_id,ankle,arm-length,bicep,calf,chest,forearm,height,hip,leg-length,shoulder-breadth,shoulder-to-crotch,thigh,waist,wrist,gender,height_cm,weight_kg
0,07P1Yi2alvpi0MtJTu7-TmoT5E3x2dABQKiuu_jcnLU,23.424223,47.128738,29.744926,37.697201,98.719429,25.687422,168.497925,101.086189,74.17186,37.112251,64.66703,52.378044,87.121025,16.179781,male,167.0,70.0
1,0AQqSm49JJoFaYTzjDcmRHM4gdwcYZyOqXBZ_yvNy0E,23.465136,48.513802,28.611294,33.724255,96.476715,24.859922,173.204254,98.461998,76.29821,35.401356,66.202171,48.386967,85.32827,16.165899,male,171.5,63.8
2,0PghUpgAgKJiVgHZpWydh6WqIYKYTlzV-5QR7eSR318,21.773336,45.14246,25.588623,32.983479,85.886726,21.98181,157.488388,89.685318,72.312012,31.207792,55.746342,46.059982,72.437881,14.523148,female,155.5,49.8
3,0QW7SNzqmRooRGXql2nRJG5HmfgDtvxaxDcovjh9DRQ,23.121952,47.717579,27.691643,36.397129,85.084686,23.563515,165.294891,100.694473,76.831047,31.613873,59.997318,52.054901,76.603951,15.654763,female,163.5,59.3
4,0RRTDLueddykSM_-MbQYeCfx43VR1uf19kN8n6L7sJA,22.359692,46.879379,33.301228,35.578388,114.84845,25.253469,165.55452,106.579338,75.1987,33.318371,63.201122,57.078968,100.175446,15.805896,female,164.0,77.3


In [4]:
# Drop the 'height' column from the merged frame.
# NOTE(review): presumably 'height_cm' from hwg is the height to keep - confirm.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it after 'height' is already gone no longer
# raises KeyError.
merged = merged.drop(columns=['height'], errors='ignore')

In [5]:
# Display the column index of `merged` to check which columns remain.
merged.columns

Index(['subject_id', 'ankle', 'arm-length', 'bicep', 'calf', 'chest',
       'forearm', 'hip', 'leg-length', 'shoulder-breadth',
       'shoulder-to-crotch', 'thigh', 'waist', 'wrist', 'gender', 'height_cm',
       'weight_kg'],
      dtype='object')

4. Upload the merged dataset to the S3 bucket

In [6]:
import io                # for in-memory text buffer
import boto3             # AWS SDK for Python (to talk to S3)

# Upload the `merged` DataFrame (created and cleaned in the cells above) to S3
# as a single CSV object.

# Serialise the DataFrame to CSV text in an in-memory buffer; index=False
# leaves the row index out of the file.
csv_buffer = io.StringIO()
merged.to_csv(csv_buffer, index=False)

# Default client: credentials and region are resolved from the configured AWS
# environment. This rebinds `s3`, which the loading cells created as an
# unsigned client.
s3 = boto3.client("s3")

# Destination. The "test-data/" prefix in the key acts as a logical folder
# inside the bucket.
bucket_name = "ai-bmi-predictor"
object_key = "test-data/body_measurements-testB.csv"

# put_object documents Body as bytes or a file-like object, so the CSV text is
# encoded to UTF-8 explicitly; ContentType labels the stored object as CSV.
s3.put_object(
    Bucket=bucket_name,
    Key=object_key,
    Body=csv_buffer.getvalue().encode("utf-8"),
    ContentType="text/csv",
)

# Confirmation message with the full S3 URI of the uploaded object.
print(f"Uploaded 'merged' dataset to s3://{bucket_name}/{object_key}")


Uploaded 'merged' dataset to s3://ai-bmi-predictor/test-data/body_measurements-testB.csv
