## Setup SageMaker FeatureStore

In [1]:
import boto3
import sagemaker

original_boto3_version = boto3.__version__
%pip install 'boto3>1.17.21'

Note: you may need to restart the kernel to use updated packages.


In [2]:
from sagemaker.session import Session

region = boto3.Session().region_name

boto_session = boto3.Session(region_name=region)

sagemaker_client = boto_session.client(service_name="sagemaker", region_name=region)
featurestore_runtime = boto_session.client(
    service_name="sagemaker-featurestore-runtime", region_name=region
)

feature_store_session = Session(
    boto_session=boto_session,
    sagemaker_client=sagemaker_client,
    sagemaker_featurestore_runtime_client=featurestore_runtime,
)

#### S3 Bucket Setup For The OfflineStore

In [3]:
# You can modify the following to use a bucket of your choosing
default_s3_bucket_name = feature_store_session.default_bucket()
prefix = "sagemaker-assignment-3"

print(default_s3_bucket_name)

sagemaker-us-east-1-339129315232


In [72]:
from sagemaker import get_execution_role

# You can modify the following to use a role of your choosing. See the documentation for how to create this.
role = get_execution_role()
print(role)

arn:aws:iam::339129315232:role/LabRole


## Inspect Dataset

In [5]:
new_working_directory = '/home/sagemaker-user/'

In [6]:
import os
os.chdir(new_working_directory)

In [7]:
print("Current Working Directory:", os.getcwd())

Current Working Directory: /home/sagemaker-user


In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Specify the full path to the CSV files
gmaps_path = 'housing_gmaps_data_raw.csv'
housing_path = 'aai-540-homework/homework-3-1/housing.csv'

# Read data into Pandas DataFrames
gmaps_data = pd.read_csv(gmaps_path)
housing_data = pd.read_csv(housing_path)

In [9]:
gmaps_data.head()

Unnamed: 0,street_number,route,locality-political,administrative_area_level_2-political,administrative_area_level_1-political,country-political,postal_code,address,longitude,latitude,...,establishment-natural_feature,airport-establishment-point_of_interest,political-sublocality-sublocality_level_1,administrative_area_level_3-political,post_box,establishment-light_rail_station-point_of_interest-transit_station,establishment-point_of_interest,aquarium-establishment-park-point_of_interest-tourist_attraction-zoo,campground-establishment-lodging-park-point_of_interest-rv_park-tourist_attraction,cemetery-establishment-park-point_of_interest
0,3130,Grizzly Peak Boulevard,Berkeley,Alameda County,California,United States,94705.0,"3130 Grizzly Peak Blvd, Berkeley, CA 94705, USA",-122.23,37.88,...,,,,,,,,,,
1,2005,Tunnel Road,Oakland,Alameda County,California,United States,94611.0,"2005 Tunnel Rd, Oakland, CA 94611, USA",-122.22,37.86,...,,,,,,,,,,
2,6886,Chabot Road,Oakland,Alameda County,California,United States,94618.0,"6886 Chabot Rd, Oakland, CA 94618, USA",-122.24,37.85,...,,,,,,,,,,
3,6365,Florio Street,Oakland,Alameda County,California,United States,94618.0,"6365 Florio St, Oakland, CA 94618, USA",-122.25,37.85,...,,,,,,,,,,
4,5407,Bryant Avenue,Oakland,Alameda County,California,United States,94618.0,"5407 Bryant Ave, Oakland, CA 94618, USA",-122.25,37.84,...,,,,,,,,,,


In [10]:
# Check for null values and display the count
null_count = gmaps_data.isnull().sum()
print(null_count)

street_number                                                                          1402
route                                                                                   380
locality-political                                                                      187
administrative_area_level_2-political                                                    47
administrative_area_level_1-political                                                     3
country-political                                                                         0
postal_code                                                                             180
address                                                                                   0
longitude                                                                                 0
latitude                                                                                  0
neighborhood-political                                                          

In [12]:
columns_to_remove = [
    'neighborhood-political',
    'postal_code_suffix',
    'establishment-point_of_interest-transit_station',
    'establishment-park-point_of_interest',
    'premise',
    'establishment-point_of_interest-subway_station-transit_station',
    'airport-establishment-finance-moving_company-point_of_interest-storage',
    'subpremise',
    'bus_station-establishment-point_of_interest-transit_station',
    'establishment-park-point_of_interest-tourist_attraction',
    'establishment-natural_feature',
    'airport-establishment-point_of_interest',
    'political-sublocality-sublocality_level_1',
    'administrative_area_level_3-political',
    'post_box',
    'establishment-light_rail_station-point_of_interest-transit_station',
    'establishment-point_of_interest',
    'aquarium-establishment-park-point_of_interest-tourist_attraction-zoo',
    'campground-establishment-lodging-park-point_of_interest-rv_park-tourist_attraction',
    'cemetery-establishment-park-point_of_interest'
]

gmaps_data_cleaned = gmaps_data.drop(columns=columns_to_remove, errors='ignore')

In [13]:
# Check for null values and display the count
null_count = gmaps_data_cleaned.isnull().sum()
print(null_count)

street_number                            1402
route                                     380
locality-political                        187
administrative_area_level_2-political      47
administrative_area_level_1-political       3
country-political                           0
postal_code                               180
address                                     0
longitude                                   0
latitude                                    0
dtype: int64


In [14]:
gmaps_data = gmaps_data_cleaned.dropna()
gmaps_data

Unnamed: 0,street_number,route,locality-political,administrative_area_level_2-political,administrative_area_level_1-political,country-political,postal_code,address,longitude,latitude
0,3130,Grizzly Peak Boulevard,Berkeley,Alameda County,California,United States,94705.0,"3130 Grizzly Peak Blvd, Berkeley, CA 94705, USA",-122.23,37.88
1,2005,Tunnel Road,Oakland,Alameda County,California,United States,94611.0,"2005 Tunnel Rd, Oakland, CA 94611, USA",-122.22,37.86
2,6886,Chabot Road,Oakland,Alameda County,California,United States,94618.0,"6886 Chabot Rd, Oakland, CA 94618, USA",-122.24,37.85
3,6365,Florio Street,Oakland,Alameda County,California,United States,94618.0,"6365 Florio St, Oakland, CA 94618, USA",-122.25,37.85
4,5407,Bryant Avenue,Oakland,Alameda County,California,United States,94618.0,"5407 Bryant Ave, Oakland, CA 94618, USA",-122.25,37.84
...,...,...,...,...,...,...,...,...,...,...
12583,2383,Walnut Avenue,Marysville,Yuba County,California,United States,95901.0,"2383 Walnut Ave, Marysville, CA 95901, USA",-121.53,39.19
12584,1443,Ramirez Road,Marysville,Yuba County,California,United States,95901.0,"1443 Ramirez Rd, Marysville, CA 95901, USA",-121.56,39.27
12587,16492,Indiana Ranch Road,Dobbins,Yuba County,California,United States,95935.0,"16492 Indiana Ranch Rd, Dobbins, CA 95935, USA",-121.22,39.43
12588,16345,Vierra Road,Rackerby,Yuba County,California,United States,95972.0,"16345 Vierra Rd, Rackerby, CA 95972, USA",-121.32,39.43


In [15]:
housing_data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [16]:
housing_data = housing_data.dropna()
housing_data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


In [17]:
# Check for null values and display the count
null_count = housing_data.isnull().sum()
print(null_count)

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64


# Feature Engineering

In [18]:
from datetime import datetime

# Assuming locality-code is available in gmaps_data
gmaps_data['locality-code'] = gmaps_data['locality-political'].astype('category').cat.codes

# Create a 'locality-zone' in housing data based on latitude and longitude
housing_data['locality-zone'] = housing_data.apply(lambda row: (round(row['latitude'], 3), round(row['longitude'], 3)), axis=1)

# Map 'locality-code' from gmaps data to the corresponding data in housing data
housing_data['locality-code'] = housing_data['locality-zone'].map(gmaps_data.set_index(['latitude', 'longitude'])['locality-code'])

# Create a neighborhood feature group
neighborhood_feature_group = pd.DataFrame()

# Feature: primary_key - neighborhood
neighborhood_feature_group['primary_key'] = housing_data['locality-code']

# Feature: event_time (calculated using python)
neighborhood_feature_group['event_time'] = datetime.now()

# Feature: <1h ocean, inland, island, near bay, near ocean
housing_data['<1h ocean'] = pd.get_dummies(housing_data['ocean_proximity'] == '<1H OCEAN', drop_first=True)
housing_data['inland'] = pd.get_dummies(housing_data['ocean_proximity'] == 'INLAND', drop_first=True)
housing_data['island'] = pd.get_dummies(housing_data['ocean_proximity'] == 'ISLAND', drop_first=True)
housing_data['near bay'] = pd.get_dummies(housing_data['ocean_proximity'] == 'NEAR BAY', drop_first=True)
housing_data['near ocean'] = pd.get_dummies(housing_data['ocean_proximity'] == 'NEAR OCEAN', drop_first=True)

# Feature: median house value (Average and Cap)
neighborhood_median_value = housing_data.groupby('locality-code')['median_house_value'].mean().clip(upper=500000)
housing_data['median_house_value'] = housing_data['locality-code'].map(neighborhood_median_value)

# Feature: median house age (Average and Discretization)
neighborhood_median_age = housing_data.groupby('locality-code')['housing_median_age'].mean().apply(lambda x: round(x, -1))
housing_data['median_house_age'] = housing_data['locality-code'].map(neighborhood_median_age)

# Feature: total households (Average and Round)
neighborhood_feature_group['total households'] = housing_data.groupby('locality-code')['households'].mean().apply(lambda x: round(x))

# Feature: bedrooms per household (Derived and Imputed)
median_bedrooms_per_household = housing_data.groupby('locality-code')['total_bedrooms'].median().to_dict()
housing_data['bedrooms_per_household'] = housing_data['locality-code'].map(median_bedrooms_per_household)
neighborhood_feature_group['bedrooms per household'] = housing_data['bedrooms_per_household'].fillna(housing_data['bedrooms_per_household'].mean())

# Display the created neighborhood feature group
neighborhood_feature_group

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gmaps_data['locality-code'] = gmaps_data['locality-political'].astype('category').cat.codes


Unnamed: 0,primary_key,event_time,total households,bedrooms per household
0,60.0,2024-01-26 02:15:47.456304,503.0,383.500000
1,545.0,2024-01-26 02:15:47.456304,717.0,346.000000
2,545.0,2024-01-26 02:15:47.456304,473.0,346.000000
3,545.0,2024-01-26 02:15:47.456304,1880.0,346.000000
4,545.0,2024-01-26 02:15:47.456304,197.0,346.000000
...,...,...,...,...
20635,,2024-01-26 02:15:47.456304,,458.426379
20636,,2024-01-26 02:15:47.456304,,458.426379
20637,207.0,2024-01-26 02:15:47.456304,,485.000000
20638,627.0,2024-01-26 02:15:47.456304,,409.000000


In [19]:
# Assuming locality-code is available in gmaps_data
gmaps_data['locality-code'] = gmaps_data['locality-political'].astype('category').cat.codes

# Create a neighborhood feature group
neighborhood_feature_group = pd.DataFrame()

# Feature: primary_key - neighborhood
neighborhood_feature_group['primary_key'] = gmaps_data['locality-political']

# Feature: event_time (calculated using python)
neighborhood_feature_group['event_time'] = datetime.now()

# Feature: <1h ocean (one hot encoded column derived from ocean_proximity)
neighborhood_feature_group['<1h ocean'] = housing_data['<1h ocean']

# Feature: inland (one hot encoded column derived from ocean_proximity)
neighborhood_feature_group['inland'] = housing_data['inland']

# Feature: island (one hot encoded column derived from ocean_proximity)
neighborhood_feature_group['island'] = housing_data['island']

# Feature: near bay (one hot encoded column derived from ocean_proximity)
neighborhood_feature_group['near bay'] = housing_data['near bay']

# Feature: near ocean (one hot encoded column derived from ocean_proximity)
neighborhood_feature_group['near ocean'] = housing_data['near ocean']

# Feature: median house value (Average and Cap)
neighborhood_median_value = housing_data.groupby('locality-code')['median_house_value'].mean().clip(upper=500000)
neighborhood_feature_group['median house value'] = gmaps_data['locality-code'].map(neighborhood_median_value)

# Feature: median house age (Average and Discretization)
neighborhood_feature_group['median house age'] = housing_data.groupby('locality-code')['housing_median_age'].mean().apply(lambda x: 10 * (x // 10))

# Feature: total households (Average and Integer)
neighborhood_feature_group['total households'] = housing_data.groupby('locality-code')['households'].mean().apply(lambda x: round(x))

# Feature: bedrooms per household (Derived and Imputed)
median_bedrooms_per_household = housing_data.groupby('locality-code')['total_bedrooms'].median().to_dict()
housing_data['bedrooms_per_household'] = housing_data['locality-code'].map(median_bedrooms_per_household)
neighborhood_feature_group['bedrooms per household'] = housing_data['bedrooms_per_household'].fillna(housing_data['bedrooms_per_household'].mean())

# Feature: locality-code (Encoded locality-political value)
neighborhood_feature_group['locality-code'] = gmaps_data['locality-code']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gmaps_data['locality-code'] = gmaps_data['locality-political'].astype('category').cat.codes


In [20]:
neighborhood_feature_group

Unnamed: 0,primary_key,event_time,<1h ocean,inland,island,near bay,near ocean,median house value,median house age,total households,bedrooms per household,locality-code
0,Berkeley,2024-01-26 02:16:04.646783,False,False,False,True,False,253127.696429,20.0,503.0,383.5,60
1,Oakland,2024-01-26 02:16:04.646783,False,False,False,True,False,169325.931624,0.0,717.0,346.0,545
2,Oakland,2024-01-26 02:16:04.646783,False,False,False,True,False,169325.931624,10.0,473.0,346.0,545
3,Oakland,2024-01-26 02:16:04.646783,False,False,False,True,False,169325.931624,10.0,1880.0,346.0,545
4,Oakland,2024-01-26 02:16:04.646783,False,False,False,True,False,169325.931624,0.0,197.0,346.0,545
...,...,...,...,...,...,...,...,...,...,...,...,...
12583,Marysville,2024-01-26 02:16:04.646783,False,True,False,False,False,73204.166667,,,456.0,463
12584,Marysville,2024-01-26 02:16:04.646783,False,True,False,False,False,73204.166667,,,456.0,463
12587,Dobbins,2024-01-26 02:16:04.646783,False,True,False,False,False,92300.000000,,,456.0,207
12588,Rackerby,2024-01-26 02:16:04.646783,False,True,False,False,False,84700.000000,,,456.0,627


In [21]:
neighborhood_data= neighborhood_feature_group

In [22]:
# Check for null values and display the count
null_count = neighborhood_feature_group.isnull().sum()
print(null_count)

primary_key                   0
event_time                    0
<1h ocean                   108
inland                      108
island                      108
near bay                    108
near ocean                  108
median house value            0
median house age          10371
total households          10371
bedrooms per household      108
locality-code                 0
dtype: int64


In [25]:
#drop null
neighborhood_data =neighborhood_data.dropna()


In [26]:
neighborhood_data

Unnamed: 0,primary_key,event_time,<1h ocean,inland,island,near bay,near ocean,median house value,median house age,total households,bedrooms per household,locality-code
0,Berkeley,2024-01-26 02:16:04.646783,False,False,False,True,False,253127.696429,20.0,503.0,383.5,60
1,Oakland,2024-01-26 02:16:04.646783,False,False,False,True,False,169325.931624,0.0,717.0,346.0,545
2,Oakland,2024-01-26 02:16:04.646783,False,False,False,True,False,169325.931624,10.0,473.0,346.0,545
3,Oakland,2024-01-26 02:16:04.646783,False,False,False,True,False,169325.931624,10.0,1880.0,346.0,545
4,Oakland,2024-01-26 02:16:04.646783,False,False,False,True,False,169325.931624,0.0,197.0,346.0,545
...,...,...,...,...,...,...,...,...,...,...,...,...
857,Danville,2024-01-26 02:16:04.646783,False,False,False,True,False,370326.526316,10.0,724.0,400.5,189
858,Danville,2024-01-26 02:16:04.646783,False,False,False,True,False,370326.526316,10.0,435.0,400.5,189
859,San Ramon,2024-01-26 02:16:04.646783,True,False,False,False,False,320795.000000,20.0,602.0,717.0,699
861,San Ramon,2024-01-26 02:16:04.646783,True,False,False,False,False,320795.000000,20.0,649.0,717.0,699


In [27]:
# Encode boolean values
boolean_columns = ['<1h ocean', 'inland', 'island', 'near bay', 'near ocean']
neighborhood_data[boolean_columns] = neighborhood_data[boolean_columns].astype(int)

# Display the created and encoded neighborhood feature group
neighborhood_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  neighborhood_data[boolean_columns] = neighborhood_data[boolean_columns].astype(int)


Unnamed: 0,primary_key,event_time,<1h ocean,inland,island,near bay,near ocean,median house value,median house age,total households,bedrooms per household,locality-code
0,Berkeley,2024-01-26 02:16:04.646783,0,0,0,1,0,253127.696429,20.0,503.0,383.5,60
1,Oakland,2024-01-26 02:16:04.646783,0,0,0,1,0,169325.931624,0.0,717.0,346.0,545
2,Oakland,2024-01-26 02:16:04.646783,0,0,0,1,0,169325.931624,10.0,473.0,346.0,545
3,Oakland,2024-01-26 02:16:04.646783,0,0,0,1,0,169325.931624,10.0,1880.0,346.0,545
4,Oakland,2024-01-26 02:16:04.646783,0,0,0,1,0,169325.931624,0.0,197.0,346.0,545
...,...,...,...,...,...,...,...,...,...,...,...,...
857,Danville,2024-01-26 02:16:04.646783,0,0,0,1,0,370326.526316,10.0,724.0,400.5,189
858,Danville,2024-01-26 02:16:04.646783,0,0,0,1,0,370326.526316,10.0,435.0,400.5,189
859,San Ramon,2024-01-26 02:16:04.646783,1,0,0,0,0,320795.000000,20.0,602.0,717.0,699
861,San Ramon,2024-01-26 02:16:04.646783,1,0,0,0,0,320795.000000,20.0,649.0,717.0,699


In [39]:
new_column_names = {
    '<1h ocean': 'less_than_1h_ocean',
    'near bay': 'near_bay',
    'near ocean': 'near_ocean',
    'median house value': 'median_house_value',
    'median house age': 'median_house_age',
    'total households': 'total_households',
    'bedrooms per household': 'bedrooms_per_household',
}

neighborhood_data.rename(columns=new_column_names, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  neighborhood_data.rename(columns=new_column_names, inplace=True)


In [40]:
neighborhood_data

Unnamed: 0,primary_key,event_time,less_than_1h_ocean,inland,island,near_bay,near_ocean,median_house_value,median_house_age,total_households,bedrooms_per_household,locality-code
0,Berkeley,2024-01-26 02:16:04.646783,0,0,0,1,0,253127.696429,20.0,503.0,383.5,60
1,Oakland,2024-01-26 02:16:04.646783,0,0,0,1,0,169325.931624,0.0,717.0,346.0,545
2,Oakland,2024-01-26 02:16:04.646783,0,0,0,1,0,169325.931624,10.0,473.0,346.0,545
3,Oakland,2024-01-26 02:16:04.646783,0,0,0,1,0,169325.931624,10.0,1880.0,346.0,545
4,Oakland,2024-01-26 02:16:04.646783,0,0,0,1,0,169325.931624,0.0,197.0,346.0,545
...,...,...,...,...,...,...,...,...,...,...,...,...
857,Danville,2024-01-26 02:16:04.646783,0,0,0,1,0,370326.526316,10.0,724.0,400.5,189
858,Danville,2024-01-26 02:16:04.646783,0,0,0,1,0,370326.526316,10.0,435.0,400.5,189
859,San Ramon,2024-01-26 02:16:04.646783,1,0,0,0,0,320795.000000,20.0,602.0,717.0,699
861,San Ramon,2024-01-26 02:16:04.646783,1,0,0,0,0,320795.000000,20.0,649.0,717.0,699


In [38]:
# Assuming your DataFrame is named neighborhood_data
neighborhood_data.to_csv('neighborhood_data.csv', index=False)

In [29]:
# Check for null values and display the count
null_count = neighborhood_data.isnull().sum()
print(null_count)

primary_key               0
event_time                0
<1h ocean                 0
inland                    0
island                    0
near bay                  0
near ocean                0
median house value        0
median house age          0
total households          0
bedrooms per household    0
locality-code             0
dtype: int64


In [68]:
neighborhood_data.dtypes

primary_key                       object
event_time                datetime64[us]
less_than_1h_ocean                 int64
inland                             int64
island                             int64
near_bay                           int64
near_ocean                         int64
median_house_value               float64
median_house_age                 float64
total_households                 float64
bedrooms_per_household           float64
locality-code                      int16
dtype: object

## Ingest Data into FeatureStore


In [69]:
import pandas as pd
from sagemaker.feature_store.feature_definition import FeatureDefinition, FeatureTypeEnum

# Assuming neighborhood_data is your DataFrame
column_names = neighborhood_data.columns
column_types = neighborhood_data.dtypes

# Map pandas types to SageMaker Feature Store types
sagemaker_type_mapping = {
    'object': FeatureTypeEnum.STRING,
    'datetime64[us]': FeatureTypeEnum.FRACTIONAL,
    'int64': FeatureTypeEnum.INTEGRAL,
    'float64': FeatureTypeEnum.FRACTIONAL,
    'int16': FeatureTypeEnum.INTEGRAL,
}

# Define feature types based on the data types in your DataFrame
feature_types = [sagemaker_type_mapping[str(dtype)] for dtype in column_types]

# Create feature definitions
feature_definitions = [
    FeatureDefinition(feature_name=name, feature_type=ftype) for name, ftype in zip(column_names, feature_types)
]

# Print feature definitions for troubleshooting
for feature_def in feature_definitions:
    print(feature_def.to_dict())


{'FeatureName': 'primary_key', 'FeatureType': 'String'}
{'FeatureName': 'event_time', 'FeatureType': 'Fractional'}
{'FeatureName': 'less_than_1h_ocean', 'FeatureType': 'Integral'}
{'FeatureName': 'inland', 'FeatureType': 'Integral'}
{'FeatureName': 'island', 'FeatureType': 'Integral'}
{'FeatureName': 'near_bay', 'FeatureType': 'Integral'}
{'FeatureName': 'near_ocean', 'FeatureType': 'Integral'}
{'FeatureName': 'median_house_value', 'FeatureType': 'Fractional'}
{'FeatureName': 'median_house_age', 'FeatureType': 'Fractional'}
{'FeatureName': 'total_households', 'FeatureType': 'Fractional'}
{'FeatureName': 'bedrooms_per_household', 'FeatureType': 'Fractional'}
{'FeatureName': 'locality-code', 'FeatureType': 'Integral'}


In [81]:
from time import gmtime, strftime, sleep

neighborhood_feature_group_name = "neighborhood-feature-group-" + strftime("%d-%H-%M-%S", gmtime())

In [82]:
from sagemaker.feature_store.feature_group import FeatureGroup

neighborhood_feature_group = FeatureGroup(
    name=neighborhood_feature_group_name, sagemaker_session=feature_store_session
)

In [83]:
import time

current_time_sec = int(round(time.time()))


def cast_object_to_string(data_frame):
    for label in data_frame.columns:
        if data_frame.dtypes[label] == "object":
            data_frame[label] = data_frame[label].astype("str").astype("string")


# cast object dtype to string. The SageMaker FeatureStore Python SDK will then map the string dtype to String feature type.
cast_object_to_string(neighborhood_data)

# record identifier and event time feature names
record_identifier_feature_name = "primary_key"
event_time_feature_name = "event_time"

# append EventTime feature
neighborhood_data[event_time_feature_name] = pd.Series(
    [current_time_sec] * len(neighborhood_data), dtype="float64"
)

# load feature definitions to the feature group. SageMaker FeatureStore Python SDK will auto-detect the data schema based on input data.
neighborhood_feature_group.load_feature_definitions(data_frame=neighborhood_data)
# output is suppressed

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_frame[label] = data_frame[label].astype("str").astype("string")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  neighborhood_data[event_time_feature_name] = pd.Series(


[FeatureDefinition(feature_name='primary_key', feature_type=<FeatureTypeEnum.STRING: 'String'>),
 FeatureDefinition(feature_name='event_time', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>),
 FeatureDefinition(feature_name='less_than_1h_ocean', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>),
 FeatureDefinition(feature_name='inland', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>),
 FeatureDefinition(feature_name='island', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>),
 FeatureDefinition(feature_name='near_bay', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>),
 FeatureDefinition(feature_name='near_ocean', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>),
 FeatureDefinition(feature_name='median_house_value', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>),
 FeatureDefinition(feature_name='median_house_age', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>),
 FeatureDefinition(feature_name='total_households', feature_type=<FeatureTypeEn

## Cleanup Resources

In [26]:
predictor.delete_endpoint()

NameError: name 'predictor' is not defined

In [27]:
identity_feature_group.delete()
transaction_feature_group.delete()

NameError: name 'identity_feature_group' is not defined

In [28]:
# restore original boto3 version
%pip install 'boto3=={}'.format(original_boto3_version)

/bin/bash: -c: line 1: syntax error near unexpected token `('
/bin/bash: -c: line 1: `/opt/conda/bin/python -m pip install 'boto3=={}'.format(original_boto3_version)'
Note: you may need to restart the kernel to use updated packages.


In [29]:
%%html

<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>
        
<script>
try {
    els = document.getElementsByClassName("sm-command-button");
    els[0].click();
}
catch(err) {
    // NoOp
}    
</script>