# Assignment 3.1: Feature Store

**Student name: John Kalaiselvan**

## Setup SageMaker FeatureStore

In [3]:
import boto3
import sagemaker

# Remember the boto3 version that was imported before the upgrade below.
# NOTE(review): this value is not referenced again in the visible cells.
original_boto3_version = boto3.__version__
# Require a boto3 release newer than 1.17.21; per the pip note in the cell
# output, the kernel may need a restart before the new package is used.
%pip install 'boto3>1.17.21'

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
Note: you may need to restart the kernel to use updated packages.


In [4]:
from sagemaker.session import Session

# Region of the default boto3 session; every client below is pinned to it.
region = boto3.Session().region_name
boto_session = boto3.Session(region_name=region)

# SageMaker service client and the Feature Store runtime client.
sagemaker_client = boto_session.client("sagemaker", region_name=region)
featurestore_runtime = boto_session.client(
    "sagemaker-featurestore-runtime", region_name=region
)

# SageMaker SDK session wired to the clients above; the feature group is
# constructed on top of this session later in the notebook.
feature_store_session = Session(
    sagemaker_featurestore_runtime_client=featurestore_runtime,
    sagemaker_client=sagemaker_client,
    boto_session=boto_session,
)

## S3 Bucket Setup For The OfflineStore

In [5]:
# Default bucket of the Feature Store session; it is used as the bucket part of
# the feature group's s3_uri when the group is created.
default_s3_bucket_name = feature_store_session.default_bucket()
# Key prefix placed under that bucket in the feature group's s3_uri.
prefix = "sagemaker-featurestore"

print(default_s3_bucket_name)

sagemaker-us-east-1-823460696669


In [6]:
from sagemaker import get_execution_role

# You can modify the following to use a role of your choosing. See the documentation for how to create this.
# This role ARN is later passed as role_arn when the feature group is created.
role = get_execution_role()
print(role)

arn:aws:iam::823460696669:role/LabRole


## Copy Data From GitHub to the Private S3 Bucket

In [7]:
# Default SageMaker session; its default bucket is the upload target for the raw CSVs.
sess = sagemaker.Session()
bucket = sess.default_bucket()
# NOTE(review): role and region repeat assignments already made in earlier cells.
role = sagemaker.get_execution_role()
region = boto3.Session().region_name
# NOTE(review): account_id and sm are not referenced again in the visible cells.
account_id = boto3.client("sts").get_caller_identity().get("Account")

sm = boto3.Session().client(service_name="sagemaker", region_name=region)

In [8]:
# Local CSV files to upload, and the S3 prefix they are copied to.
dataset0_path, dataset1_path = "housing.csv", "housing_gmaps_data_raw.csv"
s3_dest_path = f"s3://{bucket}/homework-3-1/data/"

In [10]:
!aws s3 cp $dataset0_path $s3_dest_path

upload: ./housing.csv to s3://sagemaker-us-east-1-823460696669/homework-3-1/data/housing.csv


In [11]:
!aws s3 cp $dataset1_path $s3_dest_path

upload: ./housing_gmaps_data_raw.csv to s3://sagemaker-us-east-1-823460696669/homework-3-1/data/housing_gmaps_data_raw.csv


In [9]:
!aws s3 ls $s3_dest_path

2026-01-25 15:01:58    1423529 housing.csv
2026-01-25 15:02:07    2057894 housing_gmaps_data_raw.csv


## Inspect dataset

In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import io

s3_client = boto3.client("s3", region_name=region)

# Read from the same bucket the upload cells wrote to rather than a hard-coded,
# account-specific bucket name, so the notebook keeps working in another
# account or region.
housing_bucket_name = bucket
# Object keys under the "homework-3-1/data/" prefix used as the upload destination.
housing_file_key = "homework-3-1/data/housing.csv"
housing_gmaps_file_key = "homework-3-1/data/housing_gmaps_data_raw.csv"

housing_data_object = s3_client.get_object(
    Bucket=housing_bucket_name, Key=housing_file_key
)
housing_gmaps_data_object = s3_client.get_object(
    Bucket=housing_bucket_name, Key=housing_gmaps_file_key
)

# Each response body is read fully into memory and parsed as CSV.
housing_data = pd.read_csv(io.BytesIO(housing_data_object["Body"].read()))
housing_gmaps_data = pd.read_csv(io.BytesIO(housing_gmaps_data_object["Body"].read()))

In [12]:
housing_data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [13]:
housing_gmaps_data.head()

Unnamed: 0,street_number,route,locality-political,administrative_area_level_2-political,administrative_area_level_1-political,country-political,postal_code,address,longitude,latitude,...,establishment-natural_feature,airport-establishment-point_of_interest,political-sublocality-sublocality_level_1,administrative_area_level_3-political,post_box,establishment-light_rail_station-point_of_interest-transit_station,establishment-point_of_interest,aquarium-establishment-park-point_of_interest-tourist_attraction-zoo,campground-establishment-lodging-park-point_of_interest-rv_park-tourist_attraction,cemetery-establishment-park-point_of_interest
0,3130,Grizzly Peak Boulevard,Berkeley,Alameda County,California,United States,94705.0,"3130 Grizzly Peak Blvd, Berkeley, CA 94705, USA",-122.23,37.88,...,,,,,,,,,,
1,2005,Tunnel Road,Oakland,Alameda County,California,United States,94611.0,"2005 Tunnel Rd, Oakland, CA 94611, USA",-122.22,37.86,...,,,,,,,,,,
2,6886,Chabot Road,Oakland,Alameda County,California,United States,94618.0,"6886 Chabot Rd, Oakland, CA 94618, USA",-122.24,37.85,...,,,,,,,,,,
3,6365,Florio Street,Oakland,Alameda County,California,United States,94618.0,"6365 Florio St, Oakland, CA 94618, USA",-122.25,37.85,...,,,,,,,,,,
4,5407,Bryant Avenue,Oakland,Alameda County,California,United States,94618.0,"5407 Bryant Ave, Oakland, CA 94618, USA",-122.25,37.84,...,,,,,,,,,,


## Feature engineering

### Handle missing values

In [14]:
# Report missing values
housing_data.isna().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [15]:
# Impute missing total_bedrooms values with the column median.
bedrooms_median = housing_data["total_bedrooms"].median()
housing_data["total_bedrooms"] = housing_data["total_bedrooms"].fillna(bedrooms_median)

In [16]:
# Report missing values
housing_gmaps_data.isna().sum()

street_number                                                                          1402
route                                                                                   380
locality-political                                                                      187
administrative_area_level_2-political                                                    47
administrative_area_level_1-political                                                     3
country-political                                                                         0
postal_code                                                                             180
address                                                                                   0
longitude                                                                                 0
latitude                                                                                  0
neighborhood-political                                                          

In [17]:
housing_data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [18]:
housing_gmaps_data.head()

Unnamed: 0,street_number,route,locality-political,administrative_area_level_2-political,administrative_area_level_1-political,country-political,postal_code,address,longitude,latitude,...,establishment-natural_feature,airport-establishment-point_of_interest,political-sublocality-sublocality_level_1,administrative_area_level_3-political,post_box,establishment-light_rail_station-point_of_interest-transit_station,establishment-point_of_interest,aquarium-establishment-park-point_of_interest-tourist_attraction-zoo,campground-establishment-lodging-park-point_of_interest-rv_park-tourist_attraction,cemetery-establishment-park-point_of_interest
0,3130,Grizzly Peak Boulevard,Berkeley,Alameda County,California,United States,94705.0,"3130 Grizzly Peak Blvd, Berkeley, CA 94705, USA",-122.23,37.88,...,,,,,,,,,,
1,2005,Tunnel Road,Oakland,Alameda County,California,United States,94611.0,"2005 Tunnel Rd, Oakland, CA 94611, USA",-122.22,37.86,...,,,,,,,,,,
2,6886,Chabot Road,Oakland,Alameda County,California,United States,94618.0,"6886 Chabot Rd, Oakland, CA 94618, USA",-122.24,37.85,...,,,,,,,,,,
3,6365,Florio Street,Oakland,Alameda County,California,United States,94618.0,"6365 Florio St, Oakland, CA 94618, USA",-122.25,37.85,...,,,,,,,,,,
4,5407,Bryant Avenue,Oakland,Alameda County,California,United States,94618.0,"5407 Bryant Ave, Oakland, CA 94618, USA",-122.25,37.84,...,,,,,,,,,,


### Merge housing and gmaps data

In [19]:
# Join the housing rows to the Google Maps rows on their coordinates. The inner
# join keeps only coordinate pairs that appear in both frames.
# NOTE(review): if a (latitude, longitude) pair repeats in both frames, the merge
# emits every pairing of those rows — confirm the resulting row count is expected.
merge_data = housing_data.merge(
    housing_gmaps_data,
    on=["latitude", "longitude"],
    how="inner"
)

# Keep only rows with a neighborhood
merge_data = merge_data[merge_data["neighborhood-political"].notna()]


### Bedrooms-per-household

In [20]:
merge_data["bedrooms_per_household"] = (
    merge_data["total_bedrooms"] / merge_data["households"]
)

# Impute using postal_code averages.
# Only missing ratios are filled. Rows whose postal_code is itself missing get
# no group mean (groupby drops NaN keys), so going through fillna keeps their
# already-computed ratio instead of overwriting it with NaN.
postal_code_mean = merge_data.groupby("postal_code")[
    "bedrooms_per_household"
].transform("mean")
merge_data["bedrooms_per_household"] = merge_data[
    "bedrooms_per_household"
].fillna(postal_code_mean)

### One-Hot Encode Ocean Proximity

In [21]:
# One indicator column per ocean_proximity category, each prefixed with "ocean".
ocean_dummies = pd.get_dummies(merge_data["ocean_proximity"], prefix="ocean")

# Append the indicator columns next to the existing ones.
merge_data = pd.concat([merge_data, ocean_dummies], axis="columns")


### Neighborhood-Level Aggregations

In [22]:
# Collapse the merged rows to one row per neighborhood.
neighborhood_fg = (
    merge_data.groupby("neighborhood-political")
    .agg(
        # Mean house value within the neighborhood, capped at 500,000.
        median_house_value=(
            "median_house_value",
            lambda x: min(x.mean(), 500_000)
        ),
        # These are means of the per-row values, despite the "median"/"total" names.
        median_house_age=("housing_median_age", "mean"),
        total_households=("households", "mean"),
        bedrooms_per_household=("bedrooms_per_household", "mean"),
        # Mean of each ocean indicator column, i.e. the share of the
        # neighborhood's rows in that ocean_proximity category.
        **{col: (col, "mean") for col in ocean_dummies.columns}
    )
    .reset_index()
)


### Discretize Median House Age

In [23]:
def age_bucket(age, width=10):
    """Return the label of the fixed-width age range that contains ``age``.

    Args:
        age: Age in years (int or float).
        width: Size of each bucket in years. Defaults to 10, which yields
            labels such as "20-29".

    Returns:
        A "<lower>-<upper>" string with inclusive bounds, or None when ``age``
        is missing (NaN/None) so a missing value does not abort the whole
        column transformation.

    Raises:
        ValueError: If ``width`` is not positive.
    """
    if width <= 0:
        raise ValueError(f"width must be positive, got {width}")
    # int() cannot convert NaN, so missing ages are passed through as None.
    if pd.isna(age):
        return None
    lower = int(age // width) * width
    return f"{lower}-{lower + width - 1}"

# Replace each neighborhood's numeric age with its bucket label.
neighborhood_fg["median_house_age"] = neighborhood_fg["median_house_age"].map(age_bucket)


### Round up total households

In [24]:
# Round the averaged household count up to a whole number and store it as an integer.
households_ceiled = np.ceil(neighborhood_fg["total_households"])
neighborhood_fg["total_households"] = households_ceiled.astype(int)


### Add Feature Store Metadata

In [25]:
from datetime import datetime, timezone

# Expose the grouping key under the name used as the record identifier.
neighborhood_fg["neighborhood"] = neighborhood_fg["neighborhood-political"]
# datetime.utcnow() is deprecated and returns a naive datetime; take an
# explicit timezone-aware UTC timestamp instead. Every row shares this one value.
neighborhood_fg["event_time"] = datetime.now(timezone.utc)

neighborhood_fg = neighborhood_fg.drop(
    columns=["neighborhood-political"]
)


  neighborhood_fg["event_time"] = datetime.utcnow()


In [31]:
# Serialize event_time as a "YYYY-MM-DDTHH:MM:SSZ" string (the trailing "Z" is a literal).
event_time_parsed = pd.to_datetime(neighborhood_fg["event_time"])
neighborhood_fg["event_time"] = event_time_parsed.dt.strftime("%Y-%m-%dT%H:%M:%SZ")


In [32]:
neighborhood_fg.head()

Unnamed: 0,median_house_value,median_house_age,total_households,bedrooms_per_household,ocean_<1H OCEAN,ocean_INLAND,ocean_NEAR BAY,ocean_NEAR OCEAN,neighborhood,event_time
0,222200.0,20-29,923,1.017335,1.0,0.0,0.0,0.0,28 Palms,2026-01-25T16:22:58Z
1,81300.0,50-59,147,1.659864,0.0,0.0,1.0,0.0,Acorn Industrial,2026-01-25T16:22:58Z
2,250733.333333,30-39,494,1.034649,1.0,0.0,0.0,0.0,Adams Hill,2026-01-25T16:22:58Z
3,112300.0,10-19,516,1.102713,0.0,1.0,0.0,0.0,Agua Mansa Industrial Corridor,2026-01-25T16:22:58Z
4,109180.0,20-29,249,1.641739,0.0,1.0,0.0,0.0,Al Tahoe,2026-01-25T16:22:58Z


In [36]:
# Normalize column names: lowercase, spaces -> underscores, "<" -> "lt_"
# (e.g. "ocean_<1H OCEAN" becomes "ocean_lt_1h_ocean").
neighborhood_fg.columns = (
    neighborhood_fg.columns
    .str.lower()
    .str.replace(" ", "_")
    .str.replace("<", "lt_", regex=False)
)

In [37]:
# Columns that came from the one-hot encoding (all carry the "ocean_" prefix).
ocean_cols = [c for c in neighborhood_fg.columns if c.startswith("ocean_")]

# The aggregated values are per-neighborhood means; round them to the nearest
# integer to turn them back into integer flags.
# NOTE(review): a neighborhood whose rows span several categories may end up
# with no flag set after rounding — confirm this is acceptable.
neighborhood_fg[ocean_cols] = neighborhood_fg[ocean_cols].round().astype(int)


In [42]:
# Make sure the ocean_island column exists (filled with 0) even when the
# aggregation did not produce it, so the later column selection always finds it.
if "ocean_island" not in neighborhood_fg.columns:
    neighborhood_fg["ocean_island"] = 0


In [47]:
# Final column order: the record identifier and event time first, then the
# ocean flags, then the aggregated features.
ordered_cols = [
    "neighborhood",
    "event_time",
    "ocean_lt_1h_ocean",
    "ocean_inland",
    "ocean_island",
    "ocean_near_bay",
    "ocean_near_ocean",
    "median_house_value",
    "median_house_age",
    "total_households",
    "bedrooms_per_household"
]

# Keep exactly these columns, in this order.
neighborhood_fg = neighborhood_fg[ordered_cols]


In [48]:
neighborhood_fg.head()

Unnamed: 0,neighborhood,event_time,ocean_lt_1h_ocean,ocean_inland,ocean_island,ocean_near_bay,ocean_near_ocean,median_house_value,median_house_age,total_households,bedrooms_per_household
0,28 Palms,2026-01-25T16:22:58Z,1,0,0,0,0,222200.0,20-29,923,1.017335
1,Acorn Industrial,2026-01-25T16:22:58Z,0,0,0,1,0,81300.0,50-59,147,1.659864
2,Adams Hill,2026-01-25T16:22:58Z,1,0,0,0,0,250733.333333,30-39,494,1.034649
3,Agua Mansa Industrial Corridor,2026-01-25T16:22:58Z,0,1,0,0,0,112300.0,10-19,516,1.102713
4,Al Tahoe,2026-01-25T16:22:58Z,0,1,0,0,0,109180.0,20-29,249,1.641739


## Ingest Data into FeatureStore

### Define FeatureStore

In [49]:
neighborhood_feature_group_name = "neighborhood_feature_group"

In [50]:
from sagemaker.feature_store.feature_group import FeatureGroup

# Handle for the feature group, bound to the Feature Store session created earlier.
neighborhood_feature_group = FeatureGroup(
    name=neighborhood_feature_group_name, sagemaker_session=feature_store_session
)


In [51]:
# Derive the feature definitions (name and type per column) from the DataFrame.
neighborhood_feature_group.load_feature_definitions(
    data_frame=neighborhood_fg
)

[FeatureDefinition(feature_name='neighborhood', feature_type=<FeatureTypeEnum.STRING: 'String'>, collection_type=None),
 FeatureDefinition(feature_name='event_time', feature_type=<FeatureTypeEnum.STRING: 'String'>, collection_type=None),
 FeatureDefinition(feature_name='ocean_lt_1h_ocean', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>, collection_type=None),
 FeatureDefinition(feature_name='ocean_inland', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>, collection_type=None),
 FeatureDefinition(feature_name='ocean_island', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>, collection_type=None),
 FeatureDefinition(feature_name='ocean_near_bay', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>, collection_type=None),
 FeatureDefinition(feature_name='ocean_near_ocean', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>, collection_type=None),
 FeatureDefinition(feature_name='median_house_value', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>, collection_type=None

### Create FeatureGroups in SageMaker FeatureStore

In [54]:
import time

def wait_for_feature_group_creation_complete(feature_group, timeout=900, poll_interval=5):
    """Block until ``feature_group`` reports the ``Created`` status.

    Args:
        feature_group: Object exposing ``describe()`` (returning a mapping with
            a ``FeatureGroupStatus`` key) and a ``name`` attribute.
        timeout: Maximum number of seconds to keep waiting while the status is
            ``Creating``. Pass ``None`` to wait indefinitely.
        poll_interval: Seconds to sleep between status checks.

    Raises:
        RuntimeError: If the status is anything other than ``Creating`` or
            ``Created``.
        TimeoutError: If the group is still ``Creating`` once ``timeout``
            seconds have elapsed.
    """
    # A monotonic clock keeps the deadline immune to wall-clock adjustments.
    deadline = None if timeout is None else time.monotonic() + timeout
    while True:
        status = feature_group.describe()["FeatureGroupStatus"]
        if status == "Created":
            print(f"FeatureGroup {feature_group.name} successfully created.")
            return
        if status != "Creating":
            raise RuntimeError(
                f"Failed to create feature group {feature_group.name}. Status: {status}"
            )
        # Still creating: give up instead of polling forever if the deadline passed.
        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError(
                f"Timed out after {timeout}s waiting for feature group "
                f"{feature_group.name} to be created."
            )
        print("Waiting for Feature Group Creation...")
        time.sleep(poll_interval)


In [55]:
# Create the feature group keyed by "neighborhood", with "event_time" as the
# event-time feature. The s3_uri points at the default bucket under the
# configured prefix, and the online store is enabled.
neighborhood_feature_group.create(
    record_identifier_name="neighborhood",
    event_time_feature_name="event_time",
    role_arn=role,
    s3_uri=f"s3://{default_s3_bucket_name}/{prefix}",
    enable_online_store=True,
)

{'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:823460696669:feature-group/neighborhood_feature_group',
 'ResponseMetadata': {'RequestId': 'c18e4226-9006-4dbd-8b85-609bc4229b84',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'c18e4226-9006-4dbd-8b85-609bc4229b84',
   'strict-transport-security': 'max-age=47304000; includeSubDomains',
   'x-frame-options': 'DENY',
   'content-security-policy': "frame-ancestors 'none'",
   'cache-control': 'no-cache, no-store, must-revalidate',
   'x-content-type-options': 'nosniff',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '103',
   'date': 'Sun, 25 Jan 2026 16:52:24 GMT'},
  'RetryAttempts': 0}}

In [56]:
wait_for_feature_group_creation_complete(feature_group=neighborhood_feature_group)

Waiting for Feature Group Creation...
Waiting for Feature Group Creation...
Waiting for Feature Group Creation...
FeatureGroup neighborhood_feature_group successfully created.


In [63]:
neighborhood_feature_group.describe()

{'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:823460696669:feature-group/neighborhood_feature_group',
 'FeatureGroupName': 'neighborhood_feature_group',
 'RecordIdentifierFeatureName': 'neighborhood',
 'EventTimeFeatureName': 'event_time',
 'FeatureDefinitions': [{'FeatureName': 'neighborhood',
   'FeatureType': 'String'},
  {'FeatureName': 'event_time', 'FeatureType': 'String'},
  {'FeatureName': 'ocean_lt_1h_ocean', 'FeatureType': 'Integral'},
  {'FeatureName': 'ocean_inland', 'FeatureType': 'Integral'},
  {'FeatureName': 'ocean_island', 'FeatureType': 'Integral'},
  {'FeatureName': 'ocean_near_bay', 'FeatureType': 'Integral'},
  {'FeatureName': 'ocean_near_ocean', 'FeatureType': 'Integral'},
  {'FeatureName': 'median_house_value', 'FeatureType': 'Fractional'},
  {'FeatureName': 'median_house_age', 'FeatureType': 'String'},
  {'FeatureName': 'total_households', 'FeatureType': 'Integral'},
  {'FeatureName': 'bedrooms_per_household', 'FeatureType': 'Fractional'}],
 'CreationTime': d

In [64]:
sagemaker_client.list_feature_groups()

{'FeatureGroupSummaries': [{'FeatureGroupName': 'neighborhood_feature_group',
   'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:823460696669:feature-group/neighborhood_feature_group',
   'CreationTime': datetime.datetime(2026, 1, 25, 16, 52, 24, 31000, tzinfo=tzlocal()),
   'FeatureGroupStatus': 'Created'}],
 'ResponseMetadata': {'RequestId': '447b3387-75cb-4870-9d7e-0206f139e114',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '447b3387-75cb-4870-9d7e-0206f139e114',
   'strict-transport-security': 'max-age=47304000; includeSubDomains',
   'x-frame-options': 'DENY',
   'content-security-policy': "frame-ancestors 'none'",
   'cache-control': 'no-cache, no-store, must-revalidate',
   'x-content-type-options': 'nosniff',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '242',
   'date': 'Sun, 25 Jan 2026 17:06:45 GMT'},
  'RetryAttempts': 0}}

### PutRecords into FeatureGroup

In [65]:
# Write the rows of neighborhood_fg into the feature group; wait=True makes the
# call block until ingestion has finished.
neighborhood_feature_group.ingest(
    data_frame=neighborhood_fg,
    wait=True
)

IngestionManagerPandas(feature_group_name='neighborhood_feature_group', feature_definitions={'neighborhood': {'FeatureName': 'neighborhood', 'FeatureType': 'String'}, 'event_time': {'FeatureName': 'event_time', 'FeatureType': 'String'}, 'ocean_lt_1h_ocean': {'FeatureName': 'ocean_lt_1h_ocean', 'FeatureType': 'Integral'}, 'ocean_inland': {'FeatureName': 'ocean_inland', 'FeatureType': 'Integral'}, 'ocean_island': {'FeatureName': 'ocean_island', 'FeatureType': 'Integral'}, 'ocean_near_bay': {'FeatureName': 'ocean_near_bay', 'FeatureType': 'Integral'}, 'ocean_near_ocean': {'FeatureName': 'ocean_near_ocean', 'FeatureType': 'Integral'}, 'median_house_value': {'FeatureName': 'median_house_value', 'FeatureType': 'Fractional'}, 'median_house_age': {'FeatureName': 'median_house_age', 'FeatureType': 'String'}, 'total_households': {'FeatureName': 'total_households', 'FeatureType': 'Integral'}, 'bedrooms_per_household': {'FeatureName': 'bedrooms_per_household', 'FeatureType': 'Fractional'}}, sagema

## Query the Feature Values


### Query "Brooktree" from feature store

In [68]:
record_id = "Brooktree"

# Fetch this neighborhood's record from the feature group.
response = neighborhood_feature_group.get_record(record_identifier_value_as_string=record_id)

if response:
    # One "name: value" line per returned feature.
    for feature in response:
        print(f"{feature['FeatureName']}: {feature['ValueAsString']}")
else:
    print(f"No online-store record found for neighborhood: {record_id}")


neighborhood: Brooktree
event_time: 2026-01-25T16:22:58Z
ocean_lt_1h_ocean: 1
ocean_inland: 0
ocean_island: 0
ocean_near_bay: 0
ocean_near_ocean: 0
median_house_value: 257400.0
median_house_age: 0-9
total_households: 1438
bedrooms_per_household: 0.30250347705146036


### Query "Fisherman's Wharf" from feature store

In [72]:
record_id = "Fisherman's Wharf"

# Fetch this neighborhood's record from the feature group.
response = neighborhood_feature_group.get_record(record_identifier_value_as_string=record_id)

if response:
    # One "name: value" line per returned feature.
    for feature in response:
        print(f"{feature['FeatureName']}: {feature['ValueAsString']}")
else:
    print(f"No online-store record found for neighborhood: {record_id}")


neighborhood: Fisherman's Wharf
event_time: 2026-01-25T16:22:58Z
ocean_lt_1h_ocean: 0
ocean_inland: 0
ocean_island: 0
ocean_near_bay: 1
ocean_near_ocean: 0
median_house_value: 500000.0
median_house_age: 50-59
total_households: 250
bedrooms_per_household: 1.268


### Query "Los Osos" from feature store

In [73]:
record_id = "Los Osos"

# Fetch this neighborhood's record from the feature group.
response = neighborhood_feature_group.get_record(record_identifier_value_as_string=record_id)

if response:
    # One "name: value" line per returned feature.
    for feature in response:
        print(f"{feature['FeatureName']}: {feature['ValueAsString']}")
else:
    print(f"No online-store record found for neighborhood: {record_id}")


neighborhood: Los Osos
event_time: 2026-01-25T16:22:58Z
ocean_lt_1h_ocean: 0
ocean_inland: 0
ocean_island: 0
ocean_near_bay: 0
ocean_near_ocean: 1
median_house_value: 221612.5
median_house_age: 10-19
total_households: 612
bedrooms_per_household: 1.0478845404823531
