1. Load the measurements dataset

In [2]:
import pandas as pd
import boto3
from botocore import UNSIGNED
from botocore.config import Config

# Build an S3 client whose requests are left unsigned (signature_version=UNSIGNED),
# so the download does not depend on locally configured AWS credentials.
unsigned_config = Config(signature_version=UNSIGNED)
s3 = boto3.client("s3", config=unsigned_config)

# Request the CSV object and hand its "Body" entry to pandas for parsing.
obj = s3.get_object(
    Bucket="amazon-bodym",
    Key="testA/measurements.csv",
)
csv_stream = obj["Body"]
measurements = pd.read_csv(csv_stream)

# Report the dimensions, then show the first rows as the cell's output.
print(measurements.shape)
measurements.head()

(87, 15)


Unnamed: 0,subject_id,ankle,arm-length,bicep,calf,chest,forearm,height,hip,leg-length,shoulder-breadth,shoulder-to-crotch,thigh,waist,wrist
0,-JtWlHEvrLPWagPaAhm0eQ_UKMSg8By1yoe-SkdrKSs,22.739975,46.199041,26.398842,35.114396,97.744168,23.393059,161.383472,98.066856,72.135391,34.749683,59.790292,52.042527,80.647595,15.284669
1,1T9SoRhbCpQ0zXf4Le6WjwTOPdfnmEDebvWaECZg_HM,25.477705,48.231339,31.915012,38.683422,107.267212,28.925291,172.555771,103.848091,77.136208,38.38118,68.413773,55.454933,90.152718,17.260105
2,2LAYE-qYtPZMSeH-gUgP-H4o-WRPzC0Cg9Nf5_AkLBE,25.254661,50.602189,29.626611,37.51582,100.673547,27.386687,178.774821,99.342301,82.275874,36.059983,66.440526,53.742692,82.100598,17.086464
3,3KCeTD5hLgY5qDa-Dhg0lcrnsRM_EndCLV4DMSOQfQM,23.995326,51.213376,25.230598,35.231322,92.436885,24.257676,175.332153,95.405576,79.827815,36.394586,67.045784,48.81131,82.786952,17.30264
4,3beF_SfHubarfkfoyCeA6EDR7UqE8j-bgj4bzBFzGFM,24.038458,53.719632,28.889543,35.938774,107.578957,26.352268,179.71786,97.628494,85.352661,37.451279,66.207982,53.045108,89.971087,16.362741


2. Load the hwg_metadata dataset

In [3]:
import pandas as pd
import boto3
from botocore import UNSIGNED
from botocore.config import Config

# Unsigned S3 client (signature_version=UNSIGNED): requests are sent without
# being signed with AWS credentials.
unsigned_config = Config(signature_version=UNSIGNED)
s3 = boto3.client("s3", config=unsigned_config)

# Fetch the metadata CSV (gender / height_cm / weight_kg per subject_id, as the
# displayed head shows) and parse the response body with pandas.
obj = s3.get_object(
    Bucket="amazon-bodym",
    Key="testA/hwg_metadata.csv",
)
csv_stream = obj["Body"]
hwg = pd.read_csv(csv_stream)

# Report the dimensions, then show the first rows as the cell's output.
print(hwg.shape)
hwg.head()

(87, 4)


Unnamed: 0,subject_id,gender,height_cm,weight_kg
0,5a1e03cb9f17b800040cc77c,male,172.5,104.1
1,6JfuBS-duD_BLzV6gxsQjcPc7gv6mgQBTuNzdnp6gfA,female,159.0,87.2
2,6Xz1aGZP_g084xrBEy0UYJjzozZhj3G1wx_cFS9q-UM,female,163.0,81.45
3,hjG7-UJ-SuVAgiJ3WEgKdS2nc7fAwQ7iXJCP5oS9omg,female,169.0,84.2
4,Mrq4MhWptluvgMB-LCjGQl950hLNxVxQhBbzo--Il5E,male,180.0,94.6


3. Merge the 'measurements' and 'hwg' datasets on subject_id (inner join)

In [4]:
import pandas as pd  

# Normalise the join key on both tables: cast to str so the key dtypes agree,
# then trim surrounding whitespace so stray spaces cannot cause missed matches.
# NOTE: astype(str) turns a missing ID into the literal string 'nan'; the
# one-to-one validation below will flag the case where that yields duplicates.
measurements['subject_id'] = measurements['subject_id'].astype(str).str.strip()
hwg['subject_id'] = hwg['subject_id'].astype(str).str.strip()

# Inner join: only subjects present in BOTH tables are kept.
# validate='one_to_one' makes pandas raise MergeError when subject_id is
# duplicated on either side, instead of silently multiplying rows in the result.
merged = pd.merge(
    measurements,
    hwg,
    on='subject_id',
    how='inner',
    suffixes=('_meas', '_hwg'),  # applied only to non-key columns present in both tables
    validate='one_to_one',
)

# An inner join discards unmatched subjects without any message, so report
# them explicitly. Nothing is printed when every subject matched.
dropped_meas = len(measurements) - len(merged)
dropped_hwg = len(hwg) - len(merged)
if dropped_meas or dropped_hwg:
    print(
        f"Dropped by inner join: {dropped_meas} measurements rows, "
        f"{dropped_hwg} hwg rows"
    )

# Sanity check on the output size (rows x columns).
print(f"Rows: {merged.shape[0]}, Columns: {merged.shape[1]}")

# Spot-check the first rows of the joined table.
merged.head()


Rows: 87, Columns: 18


Unnamed: 0,subject_id,ankle,arm-length,bicep,calf,chest,forearm,height,hip,leg-length,shoulder-breadth,shoulder-to-crotch,thigh,waist,wrist,gender,height_cm,weight_kg
0,-JtWlHEvrLPWagPaAhm0eQ_UKMSg8By1yoe-SkdrKSs,22.739975,46.199041,26.398842,35.114396,97.744168,23.393059,161.383472,98.066856,72.135391,34.749683,59.790292,52.042527,80.647595,15.284669,female,160.9,61.95
1,1T9SoRhbCpQ0zXf4Le6WjwTOPdfnmEDebvWaECZg_HM,25.477705,48.231339,31.915012,38.683422,107.267212,28.925291,172.555771,103.848091,77.136208,38.38118,68.413773,55.454933,90.152718,17.260105,male,170.7,80.0
2,2LAYE-qYtPZMSeH-gUgP-H4o-WRPzC0Cg9Nf5_AkLBE,25.254661,50.602189,29.626611,37.51582,100.673547,27.386687,178.774821,99.342301,82.275874,36.059983,66.440526,53.742692,82.100598,17.086464,male,179.7,73.85
3,3KCeTD5hLgY5qDa-Dhg0lcrnsRM_EndCLV4DMSOQfQM,23.995326,51.213376,25.230598,35.231322,92.436885,24.257676,175.332153,95.405576,79.827815,36.394586,67.045784,48.81131,82.786952,17.30264,male,176.1,63.3
4,3beF_SfHubarfkfoyCeA6EDR7UqE8j-bgj4bzBFzGFM,24.038458,53.719632,28.889543,35.938774,107.578957,26.352268,179.71786,97.628494,85.352661,37.451279,66.207982,53.045108,89.971087,16.362741,male,179.7,73.8


In [5]:
# Remove the 'height' column that came from the measurements table; the
# 'height_cm' column from hwg stays in the frame.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second execution no longer raises KeyError because
# 'height' is already gone.
merged = merged.drop(columns=['height'], errors='ignore')

In [6]:
# Display the column labels of `merged` to confirm that 'height' is no longer
# present after the drop in the previous cell.
merged.columns

Index(['subject_id', 'ankle', 'arm-length', 'bicep', 'calf', 'chest',
       'forearm', 'hip', 'leg-length', 'shoulder-breadth',
       'shoulder-to-crotch', 'thigh', 'waist', 'wrist', 'gender', 'height_cm',
       'weight_kg'],
      dtype='object')

4. Upload the merged dataset to an S3 bucket

In [7]:
import io                # for in-memory text buffer
import boto3             # AWS SDK for Python (to talk to S3)

# Requires the `merged` DataFrame produced by the merge and column-drop cells above.

# Upload destination. The "test-data/" key prefix is what appears as a folder
# in the bucket listing; S3 itself only stores the full key.
bucket_name = "ai-bmi-predictor"
object_key = "test-data/body_measurements-testA.csv"

# Serialise the DataFrame to CSV text in memory; index=False leaves the row
# index out of the file.
csv_buffer = io.StringIO()
merged.to_csv(csv_buffer, index=False)

# Default client: unlike the unsigned client used for the downloads, this one
# relies on the AWS credentials/region configured in the environment.
s3 = boto3.client("s3")

# Send explicit UTF-8 bytes rather than relying on implicit str handling, and
# label the object as CSV so it is not stored with a generic content type.
s3.put_object(
    Bucket=bucket_name,
    Key=object_key,
    Body=csv_buffer.getvalue().encode("utf-8"),
    ContentType="text/csv",
)

# Confirmation message with the full S3 URI of the uploaded object.
print(f"Uploaded 'merged' dataset to s3://{bucket_name}/{object_key}")


Uploaded 'merged' dataset to s3://ai-bmi-predictor/test-data/body_measurements-testA.csv
