## Setup SageMaker FeatureStore

In [1]:
import boto3
import sagemaker

# Remember which boto3 version was loaded before the upgrade below runs.
original_boto3_version = boto3.__version__
# Install a boto3 release newer than 1.17.21 into the kernel's environment.
# NOTE(review): the requirement is unpinned, and the boto3 module imported above
# is not swapped out by the install; pip's own output suggests restarting the kernel.
%pip install 'boto3>1.17.21'

Note: you may need to restart the kernel to use updated packages.


In [2]:
from sagemaker.session import Session

# Region configured on the default boto3 session.
region = boto3.Session().region_name

# One boto3 session, pinned to that region, from which both clients below are created.
boto_session = boto3.Session(region_name=region)

# SageMaker client plus the Feature Store runtime client.
sagemaker_client = boto_session.client(service_name="sagemaker", region_name=region)
featurestore_runtime = boto_session.client(
    service_name="sagemaker-featurestore-runtime", region_name=region
)

# SageMaker SDK session wired to the boto3 session and both clients above;
# it is passed to the FeatureGroup later in the notebook.
feature_store_session = Session(
    boto_session=boto_session,
    sagemaker_client=sagemaker_client,
    sagemaker_featurestore_runtime_client=featurestore_runtime,
)

#### S3 Bucket Setup For The OfflineStore

In [3]:
# Look up the session's default S3 bucket and choose the key prefix; together they
# form the s3_uri handed to the feature group when it is created.
default_s3_bucket_name = feature_store_session.default_bucket()
prefix = "sagemaker-assignment-3"

print(default_s3_bucket_name)

sagemaker-us-east-1-339129315232


In [4]:
from sagemaker import get_execution_role

# Execution role ARN of the current environment; passed as role_arn when the
# feature group is created.
role = get_execution_role()
print(role)

arn:aws:iam::339129315232:role/LabRole


## Inspecting/Cleaning Datasets

In [5]:
# Directory that the relative CSV paths used below are resolved against.
# NOTE(review): hard-coded absolute path; adjust when running elsewhere.
new_working_directory = '/home/sagemaker-user/'

In [6]:
# Switch the process working directory to the location chosen above.
import os
os.chdir(new_working_directory)

In [7]:
# Confirm the working directory change took effect.
print("Current Working Directory:", os.getcwd())

Current Working Directory: /home/sagemaker-user


In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# CSV locations, relative to the current working directory.
gmaps_path = 'housing_gmaps_data_raw.csv'
housing_path = 'aai-540-homework/homework-3-1/housing.csv'

# Load both raw datasets into pandas DataFrames.
gmaps_data = pd.read_csv(gmaps_path)
housing_data = pd.read_csv(housing_path)

In [9]:
# Preview the first rows of the raw gmaps data.
gmaps_data.head()

Unnamed: 0,street_number,route,locality-political,administrative_area_level_2-political,administrative_area_level_1-political,country-political,postal_code,address,longitude,latitude,...,establishment-natural_feature,airport-establishment-point_of_interest,political-sublocality-sublocality_level_1,administrative_area_level_3-political,post_box,establishment-light_rail_station-point_of_interest-transit_station,establishment-point_of_interest,aquarium-establishment-park-point_of_interest-tourist_attraction-zoo,campground-establishment-lodging-park-point_of_interest-rv_park-tourist_attraction,cemetery-establishment-park-point_of_interest
0,3130,Grizzly Peak Boulevard,Berkeley,Alameda County,California,United States,94705.0,"3130 Grizzly Peak Blvd, Berkeley, CA 94705, USA",-122.23,37.88,...,,,,,,,,,,
1,2005,Tunnel Road,Oakland,Alameda County,California,United States,94611.0,"2005 Tunnel Rd, Oakland, CA 94611, USA",-122.22,37.86,...,,,,,,,,,,
2,6886,Chabot Road,Oakland,Alameda County,California,United States,94618.0,"6886 Chabot Rd, Oakland, CA 94618, USA",-122.24,37.85,...,,,,,,,,,,
3,6365,Florio Street,Oakland,Alameda County,California,United States,94618.0,"6365 Florio St, Oakland, CA 94618, USA",-122.25,37.85,...,,,,,,,,,,
4,5407,Bryant Avenue,Oakland,Alameda County,California,United States,94618.0,"5407 Bryant Ave, Oakland, CA 94618, USA",-122.25,37.84,...,,,,,,,,,,


In [10]:
# Count null values per column in the raw gmaps data.
null_count = gmaps_data.isnull().sum()
print(null_count)

street_number                                                                          1402
route                                                                                   380
locality-political                                                                      187
administrative_area_level_2-political                                                    47
administrative_area_level_1-political                                                     3
country-political                                                                         0
postal_code                                                                             180
address                                                                                   0
longitude                                                                                 0
latitude                                                                                  0
neighborhood-political                                                          

In [11]:
# gmaps columns to discard before merging.
columns_to_remove = [
    'postal_code_suffix',
    'establishment-point_of_interest-transit_station',
    'establishment-park-point_of_interest',
    'premise',
    'establishment-point_of_interest-subway_station-transit_station',
    'airport-establishment-finance-moving_company-point_of_interest-storage',
    'subpremise',
    'bus_station-establishment-point_of_interest-transit_station',
    'establishment-park-point_of_interest-tourist_attraction',
    'establishment-natural_feature',
    'airport-establishment-point_of_interest',
    'political-sublocality-sublocality_level_1',
    'administrative_area_level_3-political',
    'post_box',
    'establishment-light_rail_station-point_of_interest-transit_station',
    'establishment-point_of_interest',
    'aquarium-establishment-park-point_of_interest-tourist_attraction-zoo',
    'campground-establishment-lodging-park-point_of_interest-rv_park-tourist_attraction',
    'cemetery-establishment-park-point_of_interest'
]

# errors='ignore' means a listed column that is absent from the frame is skipped
# instead of raising; the result is a new frame and gmaps_data is left untouched.
gmaps_data_cleaned = gmaps_data.drop(columns=columns_to_remove, errors='ignore')

In [12]:
# Re-count nulls per column now that the unwanted columns are gone.
null_count = gmaps_data_cleaned.isnull().sum()
print(null_count)

street_number                            1402
route                                     380
locality-political                        187
administrative_area_level_2-political      47
administrative_area_level_1-political       3
country-political                           0
postal_code                               180
address                                     0
longitude                                   0
latitude                                    0
neighborhood-political                   8413
dtype: int64


In [13]:
# Replace every remaining null with 0 and rebind `gmaps_data` to the cleaned, filled frame.
# NOTE(review): the fill also applies to text columns, so missing values in columns such as
# 'neighborhood-political' become the value 0 (visible in the displayed rows). That column
# later becomes the record identifier, so confirm this sentinel is intended.
gmaps_data = gmaps_data_cleaned.fillna(0)
gmaps_data

Unnamed: 0,street_number,route,locality-political,administrative_area_level_2-political,administrative_area_level_1-political,country-political,postal_code,address,longitude,latitude,neighborhood-political
0,3130,Grizzly Peak Boulevard,Berkeley,Alameda County,California,United States,94705.0,"3130 Grizzly Peak Blvd, Berkeley, CA 94705, USA",-122.23,37.88,0
1,2005,Tunnel Road,Oakland,Alameda County,California,United States,94611.0,"2005 Tunnel Rd, Oakland, CA 94611, USA",-122.22,37.86,Merriewood
2,6886,Chabot Road,Oakland,Alameda County,California,United States,94618.0,"6886 Chabot Rd, Oakland, CA 94618, USA",-122.24,37.85,Upper Rockridge
3,6365,Florio Street,Oakland,Alameda County,California,United States,94618.0,"6365 Florio St, Oakland, CA 94618, USA",-122.25,37.85,Rockridge
4,5407,Bryant Avenue,Oakland,Alameda County,California,United States,94618.0,"5407 Bryant Ave, Oakland, CA 94618, USA",-122.25,37.84,Rockridge
...,...,...,...,...,...,...,...,...,...,...,...
12585,0,Garden Valley Road,Camptonville,Yuba County,California,United States,95922.0,"Garden Valley Rd, Camptonville, CA 95922, USA",-121.09,39.48,0
12586,0,La Porte Road,0,Yuba County,California,United States,95919.0,"La Porte Rd, Brownsville, CA 95919, USA",-121.21,39.49,Brownsville
12587,16492,Indiana Ranch Road,Dobbins,Yuba County,California,United States,95935.0,"16492 Indiana Ranch Rd, Dobbins, CA 95935, USA",-121.22,39.43,0
12588,16345,Vierra Road,Rackerby,Yuba County,California,United States,95972.0,"16345 Vierra Rd, Rackerby, CA 95972, USA",-121.32,39.43,0


In [14]:
# Preview the first rows of the raw housing data.
housing_data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [15]:
# Replace nulls in the housing data with 0, rebinding `housing_data`, then display it.
# NOTE(review): a zero-filled count (e.g. total_bedrooms) is indistinguishable from a
# real zero in later ratio features; confirm 0 is the intended placeholder.
housing_data = housing_data.fillna(0)
housing_data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


In [16]:
# Verify that no nulls remain in the housing data after the fill.
null_count = housing_data.isnull().sum()
print(null_count)

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64


## Merging Datasets

In [17]:
# Build a join key of the form "<latitude>_<longitude>" (both cast to str) in each dataset.
housing_data['locality_code'] = housing_data['latitude'].astype(str) + '_' + housing_data['longitude'].astype(str)
gmaps_data['locality_code'] = gmaps_data['latitude'].astype(str) + '_' + gmaps_data['longitude'].astype(str)

# Inner-join the two datasets on the shared locality_code key.
merged_data = pd.merge(housing_data, gmaps_data, on='locality_code', how='inner')

# Drop the gmaps copies of the coordinates (the "_y" columns produced by the merge).
merged_data = merged_data.drop(['longitude_y', 'latitude_y'], axis=1)

# Rename the housing copies (the "_x" columns) back to plain longitude / latitude.
merged_data = merged_data.rename(columns={'longitude_x': 'longitude', 'latitude_x': 'latitude'})

# Show the merged frame.
print(merged_data)

       longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0        -122.23     37.88                41.0        880.0           129.0   
1        -122.22     37.86                21.0       7099.0          1106.0   
2        -122.24     37.85                52.0       1467.0           190.0   
3        -122.24     37.85                37.0        334.0            54.0   
4        -122.24     37.85                52.0       2612.0           365.0   
...          ...       ...                 ...          ...             ...   
20635    -121.09     39.48                25.0       1665.0           374.0   
20636    -121.21     39.49                18.0        697.0           150.0   
20637    -121.22     39.43                17.0       2254.0           485.0   
20638    -121.32     39.43                18.0       1860.0           409.0   
20639    -121.24     39.37                16.0       2785.0           616.0   

       population  households  median_income  media

In [18]:
# Count null values per column in the merged frame.
null_count = merged_data.isnull().sum()
print(null_count)

longitude                                0
latitude                                 0
housing_median_age                       0
total_rooms                              0
total_bedrooms                           0
population                               0
households                               0
median_income                            0
median_house_value                       0
ocean_proximity                          0
locality_code                            0
street_number                            0
route                                    0
locality-political                       0
administrative_area_level_2-political    0
administrative_area_level_1-political    0
country-political                        0
postal_code                              0
address                                  0
neighborhood-political                   0
dtype: int64


## Feature Engineering

In [19]:
# Build the per-record feature frame; a later cell aggregates it per neighborhood.
neighborhood_feature_data = pd.DataFrame()

# 1. Primary key: the neighborhood name taken from the gmaps data.
neighborhood_feature_data['primary_key'] = merged_data['neighborhood-political']

# 2. Event time: a single "now" timestamp shared by every row.
neighborhood_feature_data['event_time'] = pd.to_datetime('now')

# 3-7. One-hot encoding of ocean_proximity, stored as 0/1 integers.
ocean_proximity_columns = pd.get_dummies(merged_data['ocean_proximity'], prefix='', prefix_sep='')
neighborhood_feature_data['<1h_ocean'] = ocean_proximity_columns['<1H OCEAN'].astype(int)
neighborhood_feature_data['inland'] = ocean_proximity_columns['INLAND'].astype(int)
neighborhood_feature_data['island'] = ocean_proximity_columns['ISLAND'].astype(int)
neighborhood_feature_data['near_bay'] = ocean_proximity_columns['NEAR BAY'].astype(int)
neighborhood_feature_data['near_ocean'] = ocean_proximity_columns['NEAR OCEAN'].astype(int)

# 8. Median house value
neighborhood_feature_data['median_house_value'] = merged_data['median_house_value']

# 9. Median house age
neighborhood_feature_data['median_house_age'] = merged_data['housing_median_age']

# 10. Total households
neighborhood_feature_data['total_households'] = merged_data['households']

# 11. Bedrooms per household
neighborhood_feature_data['bedrooms_per_household'] = merged_data['total_bedrooms']/merged_data['households']

# 12. Locality encoded as an integer code, numbered in order of first appearance.
locality_mapping = {loc: i for i, loc in enumerate(merged_data['locality-political'].unique())}
neighborhood_feature_data['encoded_locality'] = merged_data['locality-political'].map(locality_mapping)

# Impute missing bedrooms_per_household with the mean of the same encoded locality.
# The fill is applied to the column itself: DataFrame.fillna(<Series>) matches the
# Series index against *column labels*, so passing the row-indexed group means to the
# whole frame (as this cell previously did) filled nothing.
# NOTE(review): housing nulls were replaced with 0 upstream, so missing bedroom counts
# reach this point as 0 rather than NaN and are not imputed here; confirm that is intended.
locality_mean_bedrooms = neighborhood_feature_data.groupby('encoded_locality')['bedrooms_per_household'].transform('mean')
neighborhood_feature_data['bedrooms_per_household'] = neighborhood_feature_data['bedrooms_per_household'].fillna(locality_mean_bedrooms)

# Display the new DataFrame
neighborhood_feature_data

Unnamed: 0,primary_key,event_time,<1h_ocean,inland,island,near_bay,near_ocean,median_house_value,median_house_age,total_households,bedrooms_per_household,encoded_locality
0,0,2024-01-27 09:00:19.036447,0,0,0,1,0,452600.0,41.0,126.0,1.023810,0
1,Merriewood,2024-01-27 09:00:19.036447,0,0,0,1,0,358500.0,21.0,1138.0,0.971880,1
2,Upper Rockridge,2024-01-27 09:00:19.036447,0,0,0,1,0,352100.0,52.0,177.0,1.073446,1
3,Upper Rockridge,2024-01-27 09:00:19.036447,0,0,0,1,0,335000.0,37.0,47.0,1.148936,1
4,Upper Rockridge,2024-01-27 09:00:19.036447,0,0,0,1,0,391100.0,52.0,367.0,0.994550,1
...,...,...,...,...,...,...,...,...,...,...,...,...
20635,0,2024-01-27 09:00:19.036447,0,1,0,0,0,78100.0,25.0,330.0,1.133333,838
20636,Brownsville,2024-01-27 09:00:19.036447,0,1,0,0,0,77100.0,18.0,114.0,1.315789,76
20637,0,2024-01-27 09:00:19.036447,0,1,0,0,0,92300.0,17.0,433.0,1.120092,987
20638,0,2024-01-27 09:00:19.036447,0,1,0,0,0,84700.0,18.0,349.0,1.171920,988


In [20]:
# Collapse to one row per primary_key by taking the mean of every other column.
# NOTE(review): the mean is also taken over the one-hot flags and encoded_locality,
# which yields fractional values for neighborhoods whose rows disagree; confirm that
# averaging a categorical code is intended.
neighborhood_feature_data = neighborhood_feature_data.groupby('primary_key').mean().reset_index()

# Display the aggregated DataFrame
neighborhood_feature_data


Unnamed: 0,primary_key,event_time,<1h_ocean,inland,island,near_bay,near_ocean,median_house_value,median_house_age,total_households,bedrooms_per_household,encoded_locality
0,0,2024-01-27 09:00:19.036446976,0.385309,0.43445,0.00043,0.075601,0.10421,192184.221306,25.634880,494.447509,1.116721,477.861512
1,28 Palms,2024-01-27 09:00:19.036446976,1.000000,0.00000,0.00000,0.000000,0.00000,222200.000000,25.000000,923.000000,1.017335,12.000000
2,Acorn Industrial,2024-01-27 09:00:19.036446976,0.000000,0.00000,0.00000,1.000000,0.00000,81300.000000,52.000000,147.000000,1.659864,1.000000
3,Adams Hill,2024-01-27 09:00:19.036446976,1.000000,0.00000,0.00000,0.000000,0.00000,250733.333333,39.500000,493.666667,1.034649,260.000000
4,Agua Mansa Industrial Corridor,2024-01-27 09:00:19.036446976,0.000000,1.00000,0.00000,0.000000,0.00000,112300.000000,17.000000,516.000000,1.102713,627.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
1302,Woodside Plaza,2024-01-27 09:00:19.036446976,0.000000,0.00000,0.00000,0.000000,1.00000,346150.000000,32.750000,820.000000,1.026945,758.000000
1303,Wrigley Heights,2024-01-27 09:00:19.036446976,0.000000,0.00000,0.00000,0.000000,1.00000,225300.000000,32.666667,492.000000,1.073543,275.000000
1304,Wyndham,2024-01-27 09:00:19.036446976,0.000000,1.00000,0.00000,0.000000,0.00000,101200.000000,23.000000,420.000000,0.971429,815.000000
1305,Ygnacio Valley,2024-01-27 09:00:19.036446976,0.000000,0.00000,0.00000,1.000000,0.00000,351600.000000,23.333333,548.000000,1.019697,71.000000


In [21]:
# Turn the averaged ocean-proximity flags back into 0/1 integers by rounding.
# A neighborhood with no share rounding up to 1 ends up with all five flags at 0
# (see the first displayed row).
columns_to_round = ['<1h_ocean', 'inland', 'island', 'near_bay', 'near_ocean']
neighborhood_feature_data[columns_to_round] = neighborhood_feature_data[columns_to_round].round().astype(int)

# Display the updated DataFrame
neighborhood_feature_data

Unnamed: 0,primary_key,event_time,<1h_ocean,inland,island,near_bay,near_ocean,median_house_value,median_house_age,total_households,bedrooms_per_household,encoded_locality
0,0,2024-01-27 09:00:19.036446976,0,0,0,0,0,192184.221306,25.634880,494.447509,1.116721,477.861512
1,28 Palms,2024-01-27 09:00:19.036446976,1,0,0,0,0,222200.000000,25.000000,923.000000,1.017335,12.000000
2,Acorn Industrial,2024-01-27 09:00:19.036446976,0,0,0,1,0,81300.000000,52.000000,147.000000,1.659864,1.000000
3,Adams Hill,2024-01-27 09:00:19.036446976,1,0,0,0,0,250733.333333,39.500000,493.666667,1.034649,260.000000
4,Agua Mansa Industrial Corridor,2024-01-27 09:00:19.036446976,0,1,0,0,0,112300.000000,17.000000,516.000000,1.102713,627.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
1302,Woodside Plaza,2024-01-27 09:00:19.036446976,0,0,0,0,1,346150.000000,32.750000,820.000000,1.026945,758.000000
1303,Wrigley Heights,2024-01-27 09:00:19.036446976,0,0,0,0,1,225300.000000,32.666667,492.000000,1.073543,275.000000
1304,Wyndham,2024-01-27 09:00:19.036446976,0,1,0,0,0,101200.000000,23.000000,420.000000,0.971429,815.000000
1305,Ygnacio Valley,2024-01-27 09:00:19.036446976,0,0,0,1,0,351600.000000,23.333333,548.000000,1.019697,71.000000


In [22]:
# Cap 'median_house_value' at 500,000; values above the cap are set to the cap.
neighborhood_feature_data['median_house_value'] = neighborhood_feature_data['median_house_value'].clip(upper=500000)

# Show summary statistics of the updated DataFrame
neighborhood_feature_data.describe()

Unnamed: 0,event_time,<1h_ocean,inland,island,near_bay,near_ocean,median_house_value,median_house_age,total_households,bedrooms_per_household,encoded_locality
count,1307,1307.0,1307.0,1307.0,1307.0,1307.0,1307.0,1307.0,1307.0,1307.0,1307.0
mean,2024-01-27 09:00:19.036447232,0.318286,0.298393,0.0,0.188217,0.185157,226020.077834,28.356936,581.452797,1.055849,491.627135
min,2024-01-27 09:00:19.036446976,0.0,0.0,0.0,0.0,0.0,40000.0,2.0,16.0,0.0,0.0
25%,2024-01-27 09:00:19.036446976,0.0,0.0,0.0,0.0,0.0,134625.0,19.0,365.25,1.015788,257.0
50%,2024-01-27 09:00:19.036446976,0.0,0.0,0.0,0.0,0.0,201300.0,29.0,483.0,1.041511,595.0
75%,2024-01-27 09:00:19.036446976,1.0,1.0,0.0,0.0,0.0,293125.0,36.666667,648.030303,1.072704,713.0
max,2024-01-27 09:00:19.036446976,1.0,1.0,0.0,1.0,1.0,500000.0,52.0,3522.0,3.93617,974.0
std,,0.46599,0.457728,0.0,0.391035,0.388573,117090.433761,12.029533,400.167942,0.187693,276.647959


In [23]:
# Bin edges 0, 10, ..., 90: nine half-open bins [0, 10), [10, 20), ..., [80, 90)
bins = range(0, 100, 10)

# One "low-high" label per bin: "0-9", "10-19", ..., "80-89"
labels = [f"{lower}-{lower + 9}" for lower in bins[:-1]]

# Replace the numeric median_house_age with its bin label (left edge inclusive)
neighborhood_feature_data['median_house_age'] = pd.cut(
    neighborhood_feature_data['median_house_age'],
    bins=bins,
    labels=labels,
    right=False,
)

# Show the binned DataFrame
neighborhood_feature_data

Unnamed: 0,primary_key,event_time,<1h_ocean,inland,island,near_bay,near_ocean,median_house_value,median_house_age,total_households,bedrooms_per_household,encoded_locality
0,0,2024-01-27 09:00:19.036446976,0,0,0,0,0,192184.221306,20-29,494.447509,1.116721,477.861512
1,28 Palms,2024-01-27 09:00:19.036446976,1,0,0,0,0,222200.000000,20-29,923.000000,1.017335,12.000000
2,Acorn Industrial,2024-01-27 09:00:19.036446976,0,0,0,1,0,81300.000000,50-59,147.000000,1.659864,1.000000
3,Adams Hill,2024-01-27 09:00:19.036446976,1,0,0,0,0,250733.333333,30-39,493.666667,1.034649,260.000000
4,Agua Mansa Industrial Corridor,2024-01-27 09:00:19.036446976,0,1,0,0,0,112300.000000,10-19,516.000000,1.102713,627.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
1302,Woodside Plaza,2024-01-27 09:00:19.036446976,0,0,0,0,1,346150.000000,30-39,820.000000,1.026945,758.000000
1303,Wrigley Heights,2024-01-27 09:00:19.036446976,0,0,0,0,1,225300.000000,30-39,492.000000,1.073543,275.000000
1304,Wyndham,2024-01-27 09:00:19.036446976,0,1,0,0,0,101200.000000,20-29,420.000000,0.971429,815.000000
1305,Ygnacio Valley,2024-01-27 09:00:19.036446976,0,0,0,1,0,351600.000000,20-29,548.000000,1.019697,71.000000


In [24]:
# The per-neighborhood mean left 'total_households' fractional; round it to the
# nearest integer and store it as int.
neighborhood_feature_data['total_households'] = neighborhood_feature_data['total_households'].round().astype(int)

# Display the updated DataFrame
neighborhood_feature_data

Unnamed: 0,primary_key,event_time,<1h_ocean,inland,island,near_bay,near_ocean,median_house_value,median_house_age,total_households,bedrooms_per_household,encoded_locality
0,0,2024-01-27 09:00:19.036446976,0,0,0,0,0,192184.221306,20-29,494,1.116721,477.861512
1,28 Palms,2024-01-27 09:00:19.036446976,1,0,0,0,0,222200.000000,20-29,923,1.017335,12.000000
2,Acorn Industrial,2024-01-27 09:00:19.036446976,0,0,0,1,0,81300.000000,50-59,147,1.659864,1.000000
3,Adams Hill,2024-01-27 09:00:19.036446976,1,0,0,0,0,250733.333333,30-39,494,1.034649,260.000000
4,Agua Mansa Industrial Corridor,2024-01-27 09:00:19.036446976,0,1,0,0,0,112300.000000,10-19,516,1.102713,627.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
1302,Woodside Plaza,2024-01-27 09:00:19.036446976,0,0,0,0,1,346150.000000,30-39,820,1.026945,758.000000
1303,Wrigley Heights,2024-01-27 09:00:19.036446976,0,0,0,0,1,225300.000000,30-39,492,1.073543,275.000000
1304,Wyndham,2024-01-27 09:00:19.036446976,0,1,0,0,0,101200.000000,20-29,420,0.971429,815.000000
1305,Ygnacio Valley,2024-01-27 09:00:19.036446976,0,0,0,1,0,351600.000000,20-29,548,1.019697,71.000000


In [25]:
# Count null values per column in the engineered feature frame.
null_count = neighborhood_feature_data.isnull().sum()
print(null_count)

primary_key               0
event_time                0
<1h_ocean                 0
inland                    0
island                    0
near_bay                  0
near_ocean                0
median_house_value        0
median_house_age          0
total_households          0
bedrooms_per_household    0
encoded_locality          0
dtype: int64


In [41]:
# Give the '<1h_ocean' column a name without the '<' symbol
new_column_names = {'<1h_ocean': 'less_than_1h_ocean'}

neighborhood_feature_data = neighborhood_feature_data.rename(columns=new_column_names)

# Show the frame with the renamed column
neighborhood_feature_data

Unnamed: 0,primary_key,event_time,less_than_1h_ocean,inland,island,near_bay,near_ocean,median_house_value,median_house_age,total_households,bedrooms_per_household,encoded_locality
0,0,1.706346e+09,0,0,0,0,0,192184.221306,20,494,1.116721,477.861512
1,28 Palms,1.706346e+09,1,0,0,0,0,222200.000000,20,923,1.017335,12.000000
2,Acorn Industrial,1.706346e+09,0,0,0,1,0,81300.000000,50,147,1.659864,1.000000
3,Adams Hill,1.706346e+09,1,0,0,0,0,250733.333333,30,494,1.034649,260.000000
4,Agua Mansa Industrial Corridor,1.706346e+09,0,1,0,0,0,112300.000000,10,516,1.102713,627.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
1302,Woodside Plaza,1.706346e+09,0,0,0,0,1,346150.000000,30,820,1.026945,758.000000
1303,Wrigley Heights,1.706346e+09,0,0,0,0,1,225300.000000,30,492,1.073543,275.000000
1304,Wyndham,1.706346e+09,0,1,0,0,0,101200.000000,20,420,0.971429,815.000000
1305,Ygnacio Valley,1.706346e+09,0,0,0,1,0,351600.000000,20,548,1.019697,71.000000


In [42]:
# Store 'primary_key' as str so every identifier has the same Python type.
neighborhood_feature_data['primary_key'] = neighborhood_feature_data['primary_key'].astype(str)

# Round 'total_households' to the nearest integer.
# NOTE(review): this repeats the rounding done in an earlier cell; it looks redundant.
neighborhood_feature_data['total_households'] = neighborhood_feature_data['total_households'].round().astype(int)

# Display the updated DataFrame
neighborhood_feature_data

Unnamed: 0,primary_key,event_time,less_than_1h_ocean,inland,island,near_bay,near_ocean,median_house_value,median_house_age,total_households,bedrooms_per_household,encoded_locality
0,0,1.706346e+09,0,0,0,0,0,192184.221306,20,494,1.116721,477.861512
1,28 Palms,1.706346e+09,1,0,0,0,0,222200.000000,20,923,1.017335,12.000000
2,Acorn Industrial,1.706346e+09,0,0,0,1,0,81300.000000,50,147,1.659864,1.000000
3,Adams Hill,1.706346e+09,1,0,0,0,0,250733.333333,30,494,1.034649,260.000000
4,Agua Mansa Industrial Corridor,1.706346e+09,0,1,0,0,0,112300.000000,10,516,1.102713,627.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
1302,Woodside Plaza,1.706346e+09,0,0,0,0,1,346150.000000,30,820,1.026945,758.000000
1303,Wrigley Heights,1.706346e+09,0,0,0,0,1,225300.000000,30,492,1.073543,275.000000
1304,Wyndham,1.706346e+09,0,1,0,0,0,101200.000000,20,420,0.971429,815.000000
1305,Ygnacio Valley,1.706346e+09,0,0,0,1,0,351600.000000,20,548,1.019697,71.000000


In [43]:
# Inspect column dtypes before converting 'median_house_age'.
neighborhood_feature_data.dtypes

primary_key                       object
event_time                       float64
less_than_1h_ocean                 int64
inland                             int64
island                             int64
near_bay                           int64
near_ocean                         int64
median_house_value               float64
median_house_age          string[python]
total_households                   int64
bedrooms_per_household           float64
encoded_locality                 float64
dtype: object

In [44]:
# Keep only the leading number of each age-bin label (e.g. "20-29" -> "20").
# The pattern is a raw string so "\d" reaches the regex engine without relying on an
# invalid string escape, and expand=False makes str.extract return a Series (one
# capture group) instead of a one-column DataFrame.
neighborhood_feature_data['median_house_age'] = (
    neighborhood_feature_data['median_house_age'].str.extract(r'(\d+)', expand=False)
)

# Store the extracted text as plain object dtype.
neighborhood_feature_data['median_house_age'] = neighborhood_feature_data['median_house_age'].astype('object')

In [45]:
# Inspect column dtypes again after converting 'median_house_age'.
neighborhood_feature_data.dtypes

primary_key                object
event_time                float64
less_than_1h_ocean          int64
inland                      int64
island                      int64
near_bay                    int64
near_ocean                  int64
median_house_value        float64
median_house_age           object
total_households            int64
bedrooms_per_household    float64
encoded_locality          float64
dtype: object

In [46]:
# Write the feature frame to CSV in the current working directory, without the index column.
neighborhood_feature_data.to_csv('neighborhood_feature_data.csv', index=False)

## Ingest Data into FeatureStore


In [47]:
# Build the feature group name, suffixed with the current UTC day-hour-minute-second
# so each run of this cell produces a different name.
from time import gmtime, strftime, sleep

neighborhood_feature_group_name = "neighborhood-feature-group-" + strftime("%d-%H-%M-%S", gmtime())

In [48]:
# Instantiate the FeatureGroup object, bound to the name and SageMaker session above.
# The group itself is created in SageMaker later, by the .create() call.
from sagemaker.feature_store.feature_group import FeatureGroup

neighborhood_feature_group = FeatureGroup(
    name=neighborhood_feature_group_name, sagemaker_session=feature_store_session
)

In [49]:
# Explicit feature definitions mirroring the columns of neighborhood_feature_data.
# NOTE(review): in the visible cells this list is not passed to the feature group; the
# definitions actually used are inferred by load_feature_definitions(data_frame=...).
from sagemaker.feature_store.feature_definition import FeatureDefinition, FeatureTypeEnum

feature_definitions = [
    FeatureDefinition(feature_name='primary_key', feature_type=FeatureTypeEnum.STRING),
    FeatureDefinition(feature_name='event_time', feature_type=FeatureTypeEnum.FRACTIONAL),
    FeatureDefinition(feature_name='less_than_1h_ocean', feature_type=FeatureTypeEnum.INTEGRAL),
    FeatureDefinition(feature_name='inland', feature_type=FeatureTypeEnum.INTEGRAL),
    FeatureDefinition(feature_name='island', feature_type=FeatureTypeEnum.INTEGRAL),
    FeatureDefinition(feature_name='near_bay', feature_type=FeatureTypeEnum.INTEGRAL),
    FeatureDefinition(feature_name='near_ocean', feature_type=FeatureTypeEnum.INTEGRAL),
    FeatureDefinition(feature_name='median_house_value', feature_type=FeatureTypeEnum.FRACTIONAL),
    FeatureDefinition(feature_name='median_house_age', feature_type=FeatureTypeEnum.STRING),
    FeatureDefinition(feature_name='total_households', feature_type=FeatureTypeEnum.INTEGRAL),
    FeatureDefinition(feature_name='bedrooms_per_household', feature_type=FeatureTypeEnum.FRACTIONAL),
    # encoded_locality is a float64 column holding per-neighborhood means
    # (e.g. 477.861512), so it is declared fractional rather than integral.
    FeatureDefinition(feature_name='encoded_locality', feature_type=FeatureTypeEnum.FRACTIONAL),
]

In [50]:
# Current Unix time in whole seconds; used below as the event time for every record.
import time

current_time_sec = int(round(time.time()))


def cast_object_to_string(data_frame):
    """Convert every object-dtype column of ``data_frame`` to pandas ``string`` dtype.

    The frame is modified in place: each object column is cast to ``str`` and then
    to ``string``. Columns of any other dtype are left untouched. Returns nothing.
    """
    object_labels = [
        label for label in data_frame.columns if data_frame.dtypes[label] == "object"
    ]
    for label in object_labels:
        as_text = data_frame[label].astype("str")
        data_frame[label] = as_text.astype("string")


# Convert object-dtype columns to pandas string dtype (in place) before the feature
# definitions are inferred from the frame.
cast_object_to_string(neighborhood_feature_data)

# Names of the record-identifier column and the event-time column.
record_identifier_feature_name = "primary_key"
event_time_feature_name = "event_time"

# Overwrite the event_time column with the current Unix time (seconds) as float64.
# NOTE(review): the new Series has a default 0..n-1 index and is aligned on index when
# assigned, so this relies on the frame having that same index.
neighborhood_feature_data[event_time_feature_name] = pd.Series(
    [current_time_sec] * len(neighborhood_feature_data), dtype="float64"
)

# Infer the feature definitions from the DataFrame and attach them to the feature group.
neighborhood_feature_group.load_feature_definitions(data_frame=neighborhood_feature_data)
# As the last expression of the cell, the returned list of definitions is displayed.

[FeatureDefinition(feature_name='primary_key', feature_type=<FeatureTypeEnum.STRING: 'String'>),
 FeatureDefinition(feature_name='event_time', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>),
 FeatureDefinition(feature_name='less_than_1h_ocean', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>),
 FeatureDefinition(feature_name='inland', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>),
 FeatureDefinition(feature_name='island', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>),
 FeatureDefinition(feature_name='near_bay', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>),
 FeatureDefinition(feature_name='near_ocean', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>),
 FeatureDefinition(feature_name='median_house_value', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>),
 FeatureDefinition(feature_name='median_house_age', feature_type=<FeatureTypeEnum.STRING: 'String'>),
 FeatureDefinition(feature_name='total_households', feature_type=<FeatureTypeEnum.INTEG

#### Create FeatureGroups in SageMaker FeatureStore

In [51]:
#Creating the feature Group
def wait_for_feature_group_creation_complete(
    feature_group, timeout_seconds=600, poll_interval_seconds=5
):
    """Block until ``feature_group`` leaves the "Creating" state.

    Args:
        feature_group: Object exposing ``describe()`` (returning a dict with a
            "FeatureGroupStatus" entry) and a ``name`` attribute.
        timeout_seconds: Maximum time to keep polling while the status is
            "Creating" before giving up.
        poll_interval_seconds: Pause between two consecutive ``describe()`` calls.

    Raises:
        RuntimeError: If the status is still "Creating" once the timeout has
            elapsed, or if polling ends with any status other than "Created".
    """
    deadline = time.monotonic() + timeout_seconds
    description = feature_group.describe()
    status = description.get("FeatureGroupStatus")
    while status == "Creating":
        # Bound the wait so a creation that never settles cannot hang the notebook.
        if time.monotonic() >= deadline:
            raise RuntimeError(
                f"Timed out after {timeout_seconds}s waiting for feature group "
                f"{feature_group.name} to be created"
            )
        print("Waiting for Feature Group Creation")
        time.sleep(poll_interval_seconds)
        description = feature_group.describe()
        status = description.get("FeatureGroupStatus")
    if status != "Created":
        # Surface the final status and, when the description carries one, the reason.
        failure_reason = description.get("FailureReason", "no failure reason reported")
        raise RuntimeError(
            f"Failed to create feature group {feature_group.name} "
            f"(status: {status}; reason: {failure_reason})"
        )
    print(f"FeatureGroup {feature_group.name} successfully created.")


# Create the feature group: records are stored under the default bucket and prefix,
# identified by primary_key, timestamped by event_time, with the online store enabled.
neighborhood_feature_group.create(
    s3_uri=f"s3://{default_s3_bucket_name}/{prefix}",
    record_identifier_name=record_identifier_feature_name,
    event_time_feature_name=event_time_feature_name,
    role_arn=role,
    enable_online_store=True,
)

# Poll until the feature group reports "Created"; raises if creation fails.
wait_for_feature_group_creation_complete(feature_group=neighborhood_feature_group)

Waiting for Feature Group Creation
Waiting for Feature Group Creation
Waiting for Feature Group Creation
Waiting for Feature Group Creation
FeatureGroup neighborhood-feature-group-27-09-02-30 successfully created.


In [52]:
# Display the feature group's description (ARN, name, identifier and
# event-time feature names, and the feature definitions).
neighborhood_feature_group.describe()

{'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:339129315232:feature-group/neighborhood-feature-group-27-09-02-30',
 'FeatureGroupName': 'neighborhood-feature-group-27-09-02-30',
 'RecordIdentifierFeatureName': 'primary_key',
 'EventTimeFeatureName': 'event_time',
 'FeatureDefinitions': [{'FeatureName': 'primary_key',
   'FeatureType': 'String'},
  {'FeatureName': 'event_time', 'FeatureType': 'Fractional'},
  {'FeatureName': 'less_than_1h_ocean', 'FeatureType': 'Integral'},
  {'FeatureName': 'inland', 'FeatureType': 'Integral'},
  {'FeatureName': 'island', 'FeatureType': 'Integral'},
  {'FeatureName': 'near_bay', 'FeatureType': 'Integral'},
  {'FeatureName': 'near_ocean', 'FeatureType': 'Integral'},
  {'FeatureName': 'median_house_value', 'FeatureType': 'Fractional'},
  {'FeatureName': 'median_house_age', 'FeatureType': 'String'},
  {'FeatureName': 'total_households', 'FeatureType': 'Integral'},
  {'FeatureName': 'bedrooms_per_household', 'FeatureType': 'Fractional'},
  {'FeatureName'

In [53]:
# List the feature groups visible to this SageMaker client; the result is not
# limited to the group created in this run.
sagemaker_client.list_feature_groups()  # use boto client to list FeatureGroups

{'FeatureGroupSummaries': [{'FeatureGroupName': 'neighborhood-feature-group-27-09-02-30',
   'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:339129315232:feature-group/neighborhood-feature-group-27-09-02-30',
   'CreationTime': datetime.datetime(2024, 1, 27, 9, 2, 36, 197000, tzinfo=tzlocal()),
   'FeatureGroupStatus': 'Created'},
  {'FeatureGroupName': 'neighborhood-feature-group-27-08-14-47',
   'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:339129315232:feature-group/neighborhood-feature-group-27-08-14-47',
   'CreationTime': datetime.datetime(2024, 1, 27, 8, 15, 27, 276000, tzinfo=tzlocal()),
   'FeatureGroupStatus': 'Created',
   'OfflineStoreStatus': {'Status': 'Active'}},
  {'FeatureGroupName': 'neighborhood-feature-group-27-08-07-15',
   'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:339129315232:feature-group/neighborhood-feature-group-27-08-07-15',
   'CreationTime': datetime.datetime(2024, 1, 27, 8, 7, 29, 870000, tzinfo=tzlocal()),
   'FeatureGroupStatus': 'Created',
   '

#### PutRecords into FeatureGroup

After the FeatureGroup has been created, we can put data into it by using the PutRecord API. This API can handle high TPS (transactions per second) and is designed to be called by different streams. The data from all of these Put requests is buffered and written to S3 in chunks. The files will be written to the offline store within a few minutes of ingestion. For this example, to accelerate the ingestion process, we specify multiple workers (`max_workers=3`) to do the job simultaneously. Ingesting the data into the FeatureGroup should take roughly 1 minute.

In [54]:
# Ingest neighborhood_feature_data into the feature group using 3 workers.
# wait=True presumably blocks until ingestion finishes (confirm against the
# SDK docs). The returned ingestion manager is the cell's last expression, so
# its repr -- including the DataFrame -- is displayed as the cell output.
neighborhood_feature_group.ingest(data_frame=neighborhood_feature_data, max_workers=3, wait=True)

IngestionManagerPandas(feature_group_name='neighborhood-feature-group-27-09-02-30', sagemaker_session=<sagemaker.session.Session object at 0x7fdd002941f0>, data_frame=                         primary_key    event_time  less_than_1h_ocean  \
0                                  0  1.706346e+09                   0   
1                           28 Palms  1.706346e+09                   1   
2                   Acorn Industrial  1.706346e+09                   0   
3                         Adams Hill  1.706346e+09                   1   
4     Agua Mansa Industrial Corridor  1.706346e+09                   0   
...                              ...           ...                 ...   
1302                  Woodside Plaza  1.706346e+09                   0   
1303                 Wrigley Heights  1.706346e+09                   0   
1304                         Wyndham  1.706346e+09                   0   
1305                  Ygnacio Valley  1.706346e+09                   0   
1306               

In [55]:
# Smoke test: read a single record back from the online store by its identifier.
record_identifier_value = "28 Palms"

featurestore_runtime.get_record(
    RecordIdentifierValueAsString=record_identifier_value,
    FeatureGroupName=neighborhood_feature_group_name,
)

{'ResponseMetadata': {'RequestId': 'e02441f7-419e-4a2f-95d8-37bf9056e71c',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'e02441f7-419e-4a2f-95d8-37bf9056e71c',
   'content-type': 'application/json',
   'content-length': '1001',
   'date': 'Sat, 27 Jan 2024 09:03:03 GMT'},
  'RetryAttempts': 0},
 'Record': [{'FeatureName': 'primary_key', 'ValueAsString': '28 Palms'},
  {'FeatureName': 'event_time', 'ValueAsString': '1706346152.0'},
  {'FeatureName': 'less_than_1h_ocean', 'ValueAsString': '1'},
  {'FeatureName': 'inland', 'ValueAsString': '0'},
  {'FeatureName': 'island', 'ValueAsString': '0'},
  {'FeatureName': 'near_bay', 'ValueAsString': '0'},
  {'FeatureName': 'near_ocean', 'ValueAsString': '0'},
  {'FeatureName': 'median_house_value', 'ValueAsString': '222200.0'},
  {'FeatureName': 'median_house_age', 'ValueAsString': '20'},
  {'FeatureName': 'total_households', 'ValueAsString': '923'},
  {'FeatureName': 'bedrooms_per_household',
   'ValueAsString': '1.017334777898

In [56]:
# Fetch the "Brooktree" record through the batch API, which takes a list of
# per-feature-group identifier sets.
featurestore_runtime.batch_get_record(
    Identifiers=[
        {
            "FeatureGroupName": neighborhood_feature_group_name,
            "RecordIdentifiersValueAsString": ["Brooktree"],
        }
    ]
)

{'ResponseMetadata': {'RequestId': '63c4d216-4fc1-4e42-8a3d-26cc0dc1f2cb',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '63c4d216-4fc1-4e42-8a3d-26cc0dc1f2cb',
   'content-type': 'application/json',
   'content-length': '1147',
   'date': 'Sat, 27 Jan 2024 09:03:03 GMT'},
  'RetryAttempts': 0},
 'Records': [{'FeatureGroupName': 'neighborhood-feature-group-27-09-02-30',
   'RecordIdentifierValueAsString': 'Brooktree',
   'Record': [{'FeatureName': 'primary_key', 'ValueAsString': 'Brooktree'},
    {'FeatureName': 'event_time', 'ValueAsString': '1706346152.0'},
    {'FeatureName': 'less_than_1h_ocean', 'ValueAsString': '1'},
    {'FeatureName': 'inland', 'ValueAsString': '0'},
    {'FeatureName': 'island', 'ValueAsString': '0'},
    {'FeatureName': 'near_bay', 'ValueAsString': '0'},
    {'FeatureName': 'near_ocean', 'ValueAsString': '0'},
    {'FeatureName': 'median_house_value', 'ValueAsString': '257400.0'},
    {'FeatureName': 'median_house_age', 'ValueAsString': '0'},

In [57]:
# Print the Hive DDL (CREATE EXTERNAL TABLE ...) generated for this feature
# group's offline store.
print(neighborhood_feature_group.as_hive_ddl())

CREATE EXTERNAL TABLE IF NOT EXISTS sagemaker_featurestore.neighborhood-feature-group-27-09-02-30 (
  primary_key STRING
  event_time FLOAT
  less_than_1h_ocean INT
  inland INT
  island INT
  near_bay INT
  near_ocean INT
  median_house_value FLOAT
  median_house_age STRING
  total_households INT
  bedrooms_per_household FLOAT
  encoded_locality FLOAT
  write_time TIMESTAMP
  event_time TIMESTAMP
  is_deleted BOOLEAN
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
  STORED AS
  INPUTFORMAT 'parquet.hive.DeprecatedParquetInputFormat'
  OUTPUTFORMAT 'parquet.hive.DeprecatedParquetOutputFormat'
LOCATION 's3://sagemaker-us-east-1-339129315232/sagemaker-assignment-3/339129315232/sagemaker/us-east-1/offline-store/neighborhood-feature-group-27-09-02-30'


In [58]:
# Account ID of the credentials in use; printed for reference.
account_id = boto3.client("sts").get_caller_identity()["Account"]
print(account_id)

s3_client = boto3.client("s3", region_name=region)

# S3 location the offline store writes to, as reported by describe().
neighborhood_feature_group_resolved_output_s3_uri = (
    neighborhood_feature_group.describe()
    .get("OfflineStoreConfig")
    .get("S3StorageConfig")
    .get("ResolvedOutputS3Uri")
)
# Drop the "s3://<bucket>/" part so only the object-key prefix remains.
neighborhood_feature_group_s3_prefix = neighborhood_feature_group_resolved_output_s3_uri.replace(
    f"s3://{default_s3_bucket_name}/", ""
)

# Poll once a minute, but give up after 30 minutes so that a failed or stalled
# ingestion cannot hang a "Run All" indefinitely.
offline_store_poll_seconds = 60
offline_store_timeout_seconds = 30 * 60
offline_store_deadline = time.monotonic() + offline_store_timeout_seconds

offline_store_contents = None
while offline_store_contents is None:
    # list_objects_v2 is the current S3 listing API; list_objects is legacy.
    objects_in_bucket = s3_client.list_objects_v2(
        Bucket=default_s3_bucket_name, Prefix=neighborhood_feature_group_s3_prefix
    )
    # Data counts as available once more than one object exists under the prefix.
    if len(objects_in_bucket.get("Contents", [])) > 1:
        offline_store_contents = objects_in_bucket["Contents"]
    elif time.monotonic() >= offline_store_deadline:
        raise TimeoutError(
            f"No offline store data under s3://{default_s3_bucket_name}/"
            f"{neighborhood_feature_group_s3_prefix} after "
            f"{offline_store_timeout_seconds} seconds"
        )
    else:
        print("Waiting for data in offline store...\n")
        time.sleep(offline_store_poll_seconds)

print("Data available.")

339129315232
Waiting for data in offline store...

Waiting for data in offline store...

Waiting for data in offline store...

Waiting for data in offline store...

Waiting for data in offline store...

Waiting for data in offline store...

Data available.


## Queries

In [59]:
# Query 1: online-store record for the "Brooktree" neighborhood.
record_identifier_value = "Brooktree"

featurestore_runtime.get_record(
    RecordIdentifierValueAsString=record_identifier_value,
    FeatureGroupName=neighborhood_feature_group_name,
)

{'ResponseMetadata': {'RequestId': '1a431d7f-4918-4470-8024-10528c8007f8',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '1a431d7f-4918-4470-8024-10528c8007f8',
   'content-type': 'application/json',
   'content-length': '989',
   'date': 'Sat, 27 Jan 2024 09:09:04 GMT'},
  'RetryAttempts': 0},
 'Record': [{'FeatureName': 'primary_key', 'ValueAsString': 'Brooktree'},
  {'FeatureName': 'event_time', 'ValueAsString': '1706346152.0'},
  {'FeatureName': 'less_than_1h_ocean', 'ValueAsString': '1'},
  {'FeatureName': 'inland', 'ValueAsString': '0'},
  {'FeatureName': 'island', 'ValueAsString': '0'},
  {'FeatureName': 'near_bay', 'ValueAsString': '0'},
  {'FeatureName': 'near_ocean', 'ValueAsString': '0'},
  {'FeatureName': 'median_house_value', 'ValueAsString': '257400.0'},
  {'FeatureName': 'median_house_age', 'ValueAsString': '0'},
  {'FeatureName': 'total_households', 'ValueAsString': '1438'},
  {'FeatureName': 'bedrooms_per_household', 'ValueAsString': '0.0'},
  {'Featur

In [60]:
# Query 2: online-store record for the "Fisherman's Wharf" neighborhood.
record_identifier_value = "Fisherman's Wharf"

featurestore_runtime.get_record(
    RecordIdentifierValueAsString=record_identifier_value,
    FeatureGroupName=neighborhood_feature_group_name,
)

{'ResponseMetadata': {'RequestId': '3441e31e-76ad-402a-90a4-217c9fea985d',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '3441e31e-76ad-402a-90a4-217c9fea985d',
   'content-type': 'application/json',
   'content-length': '999',
   'date': 'Sat, 27 Jan 2024 09:09:04 GMT'},
  'RetryAttempts': 0},
 'Record': [{'FeatureName': 'primary_key',
   'ValueAsString': "Fisherman's Wharf"},
  {'FeatureName': 'event_time', 'ValueAsString': '1706346152.0'},
  {'FeatureName': 'less_than_1h_ocean', 'ValueAsString': '0'},
  {'FeatureName': 'inland', 'ValueAsString': '0'},
  {'FeatureName': 'island', 'ValueAsString': '0'},
  {'FeatureName': 'near_bay', 'ValueAsString': '1'},
  {'FeatureName': 'near_ocean', 'ValueAsString': '0'},
  {'FeatureName': 'median_house_value', 'ValueAsString': '500000.0'},
  {'FeatureName': 'median_house_age', 'ValueAsString': '50'},
  {'FeatureName': 'total_households', 'ValueAsString': '250'},
  {'FeatureName': 'bedrooms_per_household', 'ValueAsString': '1.268'

In [61]:
# Query 3: online-store record for the "Los Osos" neighborhood.
record_identifier_value = "Los Osos"

featurestore_runtime.get_record(
    RecordIdentifierValueAsString=record_identifier_value,
    FeatureGroupName=neighborhood_feature_group_name,
)

{'ResponseMetadata': {'RequestId': '03a10f13-2704-4644-8950-750aa537e1f5',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '03a10f13-2704-4644-8950-750aa537e1f5',
   'content-type': 'application/json',
   'content-length': '1003',
   'date': 'Sat, 27 Jan 2024 09:09:04 GMT'},
  'RetryAttempts': 0},
 'Record': [{'FeatureName': 'primary_key', 'ValueAsString': 'Los Osos'},
  {'FeatureName': 'event_time', 'ValueAsString': '1706346152.0'},
  {'FeatureName': 'less_than_1h_ocean', 'ValueAsString': '0'},
  {'FeatureName': 'inland', 'ValueAsString': '0'},
  {'FeatureName': 'island', 'ValueAsString': '0'},
  {'FeatureName': 'near_bay', 'ValueAsString': '0'},
  {'FeatureName': 'near_ocean', 'ValueAsString': '1'},
  {'FeatureName': 'median_house_value', 'ValueAsString': '221612.5'},
  {'FeatureName': 'median_house_age', 'ValueAsString': '10'},
  {'FeatureName': 'total_households', 'ValueAsString': '612'},
  {'FeatureName': 'bedrooms_per_household',
   'ValueAsString': '1.047884540482

## Cleanup Resources

In [63]:
%%html

<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>

<script>
// Click the hidden button above, which is bound to the
// "kernelmenu:shutdown" command via its data-commandlinker-command attribute.
try {
    // Block-scoped so the lookup result does not leak into the page's globals.
    const els = document.getElementsByClassName("sm-command-button");
    els[0].click();
}
catch (err) {
    // NoOp: deliberately best-effort; nothing to do if the button is missing
    // or the click fails.
}
</script>