## Setup SageMaker FeatureStore

In [1]:
import boto3
import sagemaker

original_boto3_version = boto3.__version__
%pip install 'boto3>1.17.21'

Note: you may need to restart the kernel to use updated packages.


In [2]:
from sagemaker.session import Session

region = boto3.Session().region_name

boto_session = boto3.Session(region_name=region)

sagemaker_client = boto_session.client(service_name="sagemaker", region_name=region)
featurestore_runtime = boto_session.client(
    service_name="sagemaker-featurestore-runtime", region_name=region
)

feature_store_session = Session(
    boto_session=boto_session,
    sagemaker_client=sagemaker_client,
    sagemaker_featurestore_runtime_client=featurestore_runtime,
)

#### S3 Bucket Setup For The OfflineStore

In [3]:
# You can modify the following to use a bucket of your choosing
default_s3_bucket_name = feature_store_session.default_bucket()
prefix = "sagemaker-assignment-3"

print(default_s3_bucket_name)

sagemaker-us-east-1-339129315232


In [159]:
from sagemaker import get_execution_role

# You can modify the following to use a role of your choosing. See the documentation for how to create this.
role = get_execution_role()
print(role)

arn:aws:iam::339129315232:role/LabRole


## Inspect Dataset

In [5]:
new_working_directory = '/home/sagemaker-user/'

In [6]:
import os
os.chdir(new_working_directory)

In [7]:
print("Current Working Directory:", os.getcwd())

Current Working Directory: /home/sagemaker-user


In [103]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Specify the full path to the CSV files
gmaps_path = 'housing_gmaps_data_raw.csv'
housing_path = 'aai-540-homework/homework-3-1/housing.csv'

# Read data into Pandas DataFrames
gmaps_data = pd.read_csv(gmaps_path)
housing_data = pd.read_csv(housing_path)

In [104]:
gmaps_data.head()

Unnamed: 0,street_number,route,locality-political,administrative_area_level_2-political,administrative_area_level_1-political,country-political,postal_code,address,longitude,latitude,...,establishment-natural_feature,airport-establishment-point_of_interest,political-sublocality-sublocality_level_1,administrative_area_level_3-political,post_box,establishment-light_rail_station-point_of_interest-transit_station,establishment-point_of_interest,aquarium-establishment-park-point_of_interest-tourist_attraction-zoo,campground-establishment-lodging-park-point_of_interest-rv_park-tourist_attraction,cemetery-establishment-park-point_of_interest
0,3130,Grizzly Peak Boulevard,Berkeley,Alameda County,California,United States,94705.0,"3130 Grizzly Peak Blvd, Berkeley, CA 94705, USA",-122.23,37.88,...,,,,,,,,,,
1,2005,Tunnel Road,Oakland,Alameda County,California,United States,94611.0,"2005 Tunnel Rd, Oakland, CA 94611, USA",-122.22,37.86,...,,,,,,,,,,
2,6886,Chabot Road,Oakland,Alameda County,California,United States,94618.0,"6886 Chabot Rd, Oakland, CA 94618, USA",-122.24,37.85,...,,,,,,,,,,
3,6365,Florio Street,Oakland,Alameda County,California,United States,94618.0,"6365 Florio St, Oakland, CA 94618, USA",-122.25,37.85,...,,,,,,,,,,
4,5407,Bryant Avenue,Oakland,Alameda County,California,United States,94618.0,"5407 Bryant Ave, Oakland, CA 94618, USA",-122.25,37.84,...,,,,,,,,,,


In [105]:
# Check for null values and display the count
null_count = gmaps_data.isnull().sum()
print(null_count)

street_number                                                                          1402
route                                                                                   380
locality-political                                                                      187
administrative_area_level_2-political                                                    47
administrative_area_level_1-political                                                     3
country-political                                                                         0
postal_code                                                                             180
address                                                                                   0
longitude                                                                                 0
latitude                                                                                  0
neighborhood-political                                                          

In [106]:
columns_to_remove = [
    'neighborhood-political',
    'postal_code_suffix',
    'establishment-point_of_interest-transit_station',
    'establishment-park-point_of_interest',
    'premise',
    'establishment-point_of_interest-subway_station-transit_station',
    'airport-establishment-finance-moving_company-point_of_interest-storage',
    'subpremise',
    'bus_station-establishment-point_of_interest-transit_station',
    'establishment-park-point_of_interest-tourist_attraction',
    'establishment-natural_feature',
    'airport-establishment-point_of_interest',
    'political-sublocality-sublocality_level_1',
    'administrative_area_level_3-political',
    'post_box',
    'establishment-light_rail_station-point_of_interest-transit_station',
    'establishment-point_of_interest',
    'aquarium-establishment-park-point_of_interest-tourist_attraction-zoo',
    'campground-establishment-lodging-park-point_of_interest-rv_park-tourist_attraction',
    'cemetery-establishment-park-point_of_interest'
]

gmaps_data_cleaned = gmaps_data.drop(columns=columns_to_remove, errors='ignore')

In [95]:
# Check for null values and display the count
null_count = gmaps_data_cleaned.isnull().sum()
print(null_count)

street_number                            1402
route                                     380
locality-political                        187
administrative_area_level_2-political      47
administrative_area_level_1-political       3
country-political                           0
postal_code                               180
address                                     0
longitude                                   0
latitude                                    0
dtype: int64


In [108]:
gmaps_data = gmaps_data_cleaned.dropna()
gmaps_data

Unnamed: 0,street_number,route,locality-political,administrative_area_level_2-political,administrative_area_level_1-political,country-political,postal_code,address,longitude,latitude
0,3130,Grizzly Peak Boulevard,Berkeley,Alameda County,California,United States,94705.0,"3130 Grizzly Peak Blvd, Berkeley, CA 94705, USA",-122.23,37.88
1,2005,Tunnel Road,Oakland,Alameda County,California,United States,94611.0,"2005 Tunnel Rd, Oakland, CA 94611, USA",-122.22,37.86
2,6886,Chabot Road,Oakland,Alameda County,California,United States,94618.0,"6886 Chabot Rd, Oakland, CA 94618, USA",-122.24,37.85
3,6365,Florio Street,Oakland,Alameda County,California,United States,94618.0,"6365 Florio St, Oakland, CA 94618, USA",-122.25,37.85
4,5407,Bryant Avenue,Oakland,Alameda County,California,United States,94618.0,"5407 Bryant Ave, Oakland, CA 94618, USA",-122.25,37.84
...,...,...,...,...,...,...,...,...,...,...
12583,2383,Walnut Avenue,Marysville,Yuba County,California,United States,95901.0,"2383 Walnut Ave, Marysville, CA 95901, USA",-121.53,39.19
12584,1443,Ramirez Road,Marysville,Yuba County,California,United States,95901.0,"1443 Ramirez Rd, Marysville, CA 95901, USA",-121.56,39.27
12587,16492,Indiana Ranch Road,Dobbins,Yuba County,California,United States,95935.0,"16492 Indiana Ranch Rd, Dobbins, CA 95935, USA",-121.22,39.43
12588,16345,Vierra Road,Rackerby,Yuba County,California,United States,95972.0,"16345 Vierra Rd, Rackerby, CA 95972, USA",-121.32,39.43


In [109]:
housing_data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [110]:
housing_data = housing_data.dropna()
housing_data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


In [111]:
# Check for null values and display the count
null_count = housing_data.isnull().sum()
print(null_count)

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64


In [112]:
# Create a unique key based on latitude and longitude
housing_data['locality_code'] = housing_data['latitude'].astype(str) + '_' + housing_data['longitude'].astype(str)
gmaps_data['locality_code'] = gmaps_data['latitude'].astype(str) + '_' + gmaps_data['longitude'].astype(str)

# Merge datasets based on the new location_key
merged_data = pd.merge(housing_data, gmaps_data, on='locality_code', how='inner')

# Drop the duplicate columns, like longitude_x, latitude_x, longitude_y, latitude_y
merged_data = merged_data.drop(['longitude_y', 'latitude_y'], axis=1)

# Rename the columns if necessary
merged_data = merged_data.rename(columns={'longitude_x': 'longitude', 'latitude_x': 'latitude'})

# Print the merged dataset
print(merged_data)

       longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0        -122.23     37.88                41.0        880.0           129.0   
1        -122.22     37.86                21.0       7099.0          1106.0   
2        -122.24     37.85                52.0       1467.0           190.0   
3        -122.24     37.85                37.0        334.0            54.0   
4        -122.24     37.85                52.0       2612.0           365.0   
...          ...       ...                 ...          ...             ...   
18760    -121.53     39.19                27.0       2080.0           412.0   
18761    -121.56     39.27                28.0       2332.0           395.0   
18762    -121.22     39.43                17.0       2254.0           485.0   
18763    -121.32     39.43                18.0       1860.0           409.0   
18764    -121.24     39.37                16.0       2785.0           616.0   

       population  households  median_income  media

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gmaps_data['locality_code'] = gmaps_data['latitude'].astype(str) + '_' + gmaps_data['longitude'].astype(str)


In [113]:
# Check for null values and display the count
null_count = merged_data.isnull().sum()
print(null_count)

longitude                                0
latitude                                 0
housing_median_age                       0
total_rooms                              0
total_bedrooms                           0
population                               0
households                               0
median_income                            0
median_house_value                       0
ocean_proximity                          0
locality_code                            0
street_number                            0
route                                    0
locality-political                       0
administrative_area_level_2-political    0
administrative_area_level_1-political    0
country-political                        0
postal_code                              0
address                                  0
dtype: int64


# Feature Engineering

In [137]:
import pandas as pd

# Assuming 'merged_data' is the merged DataFrame

# Create a new DataFrame
neighborhood_feature_data = pd.DataFrame()

# 1. Primary Key
neighborhood_feature_data['primary_key'] = merged_data['locality-political']

# 2. Event Time (assuming it's the current timestamp)
neighborhood_feature_data['event_time'] = pd.to_datetime('now')

# 3-7. One-hot encoding for ocean_proximity
ocean_proximity_columns = pd.get_dummies(merged_data['ocean_proximity'], prefix='', prefix_sep='')
neighborhood_feature_data['<1h_ocean'] = ocean_proximity_columns['<1H OCEAN'].astype(int)
neighborhood_feature_data['inland'] = ocean_proximity_columns['INLAND'].astype(int)
neighborhood_feature_data['island'] = ocean_proximity_columns['ISLAND'].astype(int)
neighborhood_feature_data['near_bay'] = ocean_proximity_columns['NEAR BAY'].astype(int)
neighborhood_feature_data['near_ocean'] = ocean_proximity_columns['NEAR OCEAN'].astype(int)

# 8. Median House Value
neighborhood_feature_data['median_house_value'] = merged_data['median_house_value']

# 9. Median House Age
neighborhood_feature_data['median_house_age'] = merged_data['housing_median_age']

# 10. Total Households
neighborhood_feature_data['total_households'] = merged_data['households']

# 11. Bedrooms per Household
neighborhood_feature_data['bedrooms_per_household'] = merged_data['total_bedrooms']/merged_data['households']

# 12. Encoded Locality-Political
locality_mapping = {loc: i for i, loc in enumerate(merged_data['locality-political'].unique())}
neighborhood_feature_data['encoded_locality'] = merged_data['locality-political'].map(locality_mapping)

# Impute missing values by getting the average for locality-code
neighborhood_feature_data = neighborhood_feature_data.fillna(neighborhood_feature_data.groupby('encoded_locality')['bedrooms_per_household'].transform('mean'))

# Display the new DataFrame
print(neighborhood_feature_data)

        primary_key                 event_time  <1h_ocean  inland  island  \
0          Berkeley 2024-01-27 06:12:52.490105          0       0       0   
1           Oakland 2024-01-27 06:12:52.490105          0       0       0   
2           Oakland 2024-01-27 06:12:52.490105          0       0       0   
3           Oakland 2024-01-27 06:12:52.490105          0       0       0   
4           Oakland 2024-01-27 06:12:52.490105          0       0       0   
...             ...                        ...        ...     ...     ...   
18760    Marysville 2024-01-27 06:12:52.490105          0       1       0   
18761    Marysville 2024-01-27 06:12:52.490105          0       1       0   
18762       Dobbins 2024-01-27 06:12:52.490105          0       1       0   
18763      Rackerby 2024-01-27 06:12:52.490105          0       1       0   
18764  Oregon House 2024-01-27 06:12:52.490105          0       1       0   

       near_bay  near_ocean  median_house_value  median_house_age  \
0     

In [138]:
neighborhood_feature_data

Unnamed: 0,primary_key,event_time,<1h_ocean,inland,island,near_bay,near_ocean,median_house_value,median_house_age,total_households,bedrooms_per_household,encoded_locality
0,Berkeley,2024-01-27 06:12:52.490105,0,0,0,1,0,452600.0,41.0,126.0,1.023810,0
1,Oakland,2024-01-27 06:12:52.490105,0,0,0,1,0,358500.0,21.0,1138.0,0.971880,1
2,Oakland,2024-01-27 06:12:52.490105,0,0,0,1,0,352100.0,52.0,177.0,1.073446,1
3,Oakland,2024-01-27 06:12:52.490105,0,0,0,1,0,335000.0,37.0,47.0,1.148936,1
4,Oakland,2024-01-27 06:12:52.490105,0,0,0,1,0,391100.0,52.0,367.0,0.994550,1
...,...,...,...,...,...,...,...,...,...,...,...,...
18760,Marysville,2024-01-27 06:12:52.490105,0,1,0,0,0,98300.0,27.0,382.0,1.078534,855
18761,Marysville,2024-01-27 06:12:52.490105,0,1,0,0,0,116800.0,28.0,344.0,1.148256,855
18762,Dobbins,2024-01-27 06:12:52.490105,0,1,0,0,0,92300.0,17.0,433.0,1.120092,861
18763,Rackerby,2024-01-27 06:12:52.490105,0,1,0,0,0,84700.0,18.0,349.0,1.171920,862


In [139]:
# Group by primary_key and calculate the mean for each group
neighborhood_feature_data = neighborhood_feature_data.groupby('primary_key').mean().reset_index()

# Display the condensed DataFrame
neighborhood_feature_data


Unnamed: 0,primary_key,event_time,<1h_ocean,inland,island,near_bay,near_ocean,median_house_value,median_house_age,total_households,bedrooms_per_household,encoded_locality
0,Acampo,2024-01-27 06:12:52.490105088,0.000000,1.000000,0.0,0.0,0.0,154433.333333,22.000000,502.666667,1.031443,624.0
1,Acton,2024-01-27 06:12:52.490105088,0.000000,1.000000,0.0,0.0,0.0,311600.000000,7.000000,717.000000,1.025105,320.0
2,Adelanto,2024-01-27 06:12:52.490105088,0.000000,1.000000,0.0,0.0,0.0,70080.000000,14.000000,473.000000,1.197837,565.0
3,Agoura Hills,2024-01-27 06:12:52.490105088,0.000000,0.000000,0.0,0.0,1.0,419000.400000,13.400000,1880.000000,1.110333,309.0
4,Aguanga,2024-01-27 06:12:52.490105088,0.333333,0.666667,0.0,0.0,0.0,126633.333333,7.333333,196.666667,1.384497,498.0
...,...,...,...,...,...,...,...,...,...,...,...,...
859,Yreka,2024-01-27 06:12:52.490105088,0.000000,1.000000,0.0,0.0,0.0,65780.000000,29.800000,602.200000,1.044472,738.0
860,Yuba City,2024-01-27 06:12:52.490105088,0.000000,1.000000,0.0,0.0,0.0,100242.105263,23.210526,455.500000,1.032625,786.0
861,Yucaipa,2024-01-27 06:12:52.490105088,0.000000,1.000000,0.0,0.0,0.0,144145.050000,23.550000,649.200000,1.081528,560.0
862,Yucca Valley,2024-01-27 06:12:52.490105088,0.000000,1.000000,0.0,0.0,0.0,85612.500000,14.250000,839.625000,1.137924,576.0


In [140]:
# Round specific columns to the nearest integer
columns_to_round = ['<1h_ocean', 'inland', 'island', 'near_bay', 'near_ocean']
neighborhood_feature_data[columns_to_round] = neighborhood_feature_data[columns_to_round].round().astype(int)

# Display the updated DataFrame
neighborhood_feature_data

Unnamed: 0,primary_key,event_time,<1h_ocean,inland,island,near_bay,near_ocean,median_house_value,median_house_age,total_households,bedrooms_per_household,encoded_locality
0,Acampo,2024-01-27 06:12:52.490105088,0,1,0,0,0,154433.333333,22.000000,502.666667,1.031443,624.0
1,Acton,2024-01-27 06:12:52.490105088,0,1,0,0,0,311600.000000,7.000000,717.000000,1.025105,320.0
2,Adelanto,2024-01-27 06:12:52.490105088,0,1,0,0,0,70080.000000,14.000000,473.000000,1.197837,565.0
3,Agoura Hills,2024-01-27 06:12:52.490105088,0,0,0,0,1,419000.400000,13.400000,1880.000000,1.110333,309.0
4,Aguanga,2024-01-27 06:12:52.490105088,0,1,0,0,0,126633.333333,7.333333,196.666667,1.384497,498.0
...,...,...,...,...,...,...,...,...,...,...,...,...
859,Yreka,2024-01-27 06:12:52.490105088,0,1,0,0,0,65780.000000,29.800000,602.200000,1.044472,738.0
860,Yuba City,2024-01-27 06:12:52.490105088,0,1,0,0,0,100242.105263,23.210526,455.500000,1.032625,786.0
861,Yucaipa,2024-01-27 06:12:52.490105088,0,1,0,0,0,144145.050000,23.550000,649.200000,1.081528,560.0
862,Yucca Valley,2024-01-27 06:12:52.490105088,0,1,0,0,0,85612.500000,14.250000,839.625000,1.137924,576.0


In [141]:
# Cap the values in the 'median_house_value' column at 500,000
neighborhood_feature_data['median_house_value'] = neighborhood_feature_data['median_house_value'].clip(upper=500000)

# Display the updated DataFrame
neighborhood_feature_data.describe()

Unnamed: 0,event_time,<1h_ocean,inland,island,near_bay,near_ocean,median_house_value,median_house_age,total_households,bedrooms_per_household,encoded_locality
count,864,864.0,864.0,864.0,864.0,864.0,864.0,864.0,864.0,864.0,864.0
mean,2024-01-27 06:12:52.490105088,0.262731,0.545139,0.001157,0.063657,0.121528,181653.777957,24.204351,486.558777,1.210784,431.5
min,2024-01-27 06:12:52.490104832,0.0,0.0,0.0,0.0,0.0,30000.0,3.0,27.0,0.913043,0.0
25%,2024-01-27 06:12:52.490105088,0.0,0.0,0.0,0.0,0.0,94875.0,18.0,343.807692,1.037294,215.75
50%,2024-01-27 06:12:52.490105088,0.0,1.0,0.0,0.0,0.0,153298.924731,23.6,449.75,1.064221,431.5
75%,2024-01-27 06:12:52.490105088,1.0,1.0,0.0,0.0,0.0,238842.592593,30.0,578.05649,1.148702,647.25
max,2024-01-27 06:12:52.490105344,1.0,1.0,1.0,1.0,1.0,500000.0,52.0,2118.0,9.703704,863.0
std,,0.440373,0.498247,0.034021,0.244283,0.326929,109800.237612,8.98096,255.78398,0.590851,249.559612


In [142]:
import pandas as pd

# Assuming 'neighborhood_feature_data' is the DataFrame with the desired features

# Define the bins for discretization
bins = range(0, 100, 10)  # 0-9, 10-19, 20-29, ..., 90-99

# Define the labels for each bin
labels = [f"{i}-{i+9}" for i in range(0, 90, 10)]

# Discretize the 'median_house_age' column
neighborhood_feature_data['median_house_age'] = pd.cut(neighborhood_feature_data['median_house_age'], bins=bins, labels=labels, right=False)

# Display the updated DataFrame
neighborhood_feature_data

Unnamed: 0,primary_key,event_time,<1h_ocean,inland,island,near_bay,near_ocean,median_house_value,median_house_age,total_households,bedrooms_per_household,encoded_locality
0,Acampo,2024-01-27 06:12:52.490105088,0,1,0,0,0,154433.333333,20-29,502.666667,1.031443,624.0
1,Acton,2024-01-27 06:12:52.490105088,0,1,0,0,0,311600.000000,0-9,717.000000,1.025105,320.0
2,Adelanto,2024-01-27 06:12:52.490105088,0,1,0,0,0,70080.000000,10-19,473.000000,1.197837,565.0
3,Agoura Hills,2024-01-27 06:12:52.490105088,0,0,0,0,1,419000.400000,10-19,1880.000000,1.110333,309.0
4,Aguanga,2024-01-27 06:12:52.490105088,0,1,0,0,0,126633.333333,0-9,196.666667,1.384497,498.0
...,...,...,...,...,...,...,...,...,...,...,...,...
859,Yreka,2024-01-27 06:12:52.490105088,0,1,0,0,0,65780.000000,20-29,602.200000,1.044472,738.0
860,Yuba City,2024-01-27 06:12:52.490105088,0,1,0,0,0,100242.105263,20-29,455.500000,1.032625,786.0
861,Yucaipa,2024-01-27 06:12:52.490105088,0,1,0,0,0,144145.050000,20-29,649.200000,1.081528,560.0
862,Yucca Valley,2024-01-27 06:12:52.490105088,0,1,0,0,0,85612.500000,10-19,839.625000,1.137924,576.0


In [143]:
# Round the values in the 'total_households' column to the nearest integer
neighborhood_feature_data['total_households'] = neighborhood_feature_data['total_households'].round().astype(int)

# Display the updated DataFrame
neighborhood_feature_data

Unnamed: 0,primary_key,event_time,<1h_ocean,inland,island,near_bay,near_ocean,median_house_value,median_house_age,total_households,bedrooms_per_household,encoded_locality
0,Acampo,2024-01-27 06:12:52.490105088,0,1,0,0,0,154433.333333,20-29,503,1.031443,624.0
1,Acton,2024-01-27 06:12:52.490105088,0,1,0,0,0,311600.000000,0-9,717,1.025105,320.0
2,Adelanto,2024-01-27 06:12:52.490105088,0,1,0,0,0,70080.000000,10-19,473,1.197837,565.0
3,Agoura Hills,2024-01-27 06:12:52.490105088,0,0,0,0,1,419000.400000,10-19,1880,1.110333,309.0
4,Aguanga,2024-01-27 06:12:52.490105088,0,1,0,0,0,126633.333333,0-9,197,1.384497,498.0
...,...,...,...,...,...,...,...,...,...,...,...,...
859,Yreka,2024-01-27 06:12:52.490105088,0,1,0,0,0,65780.000000,20-29,602,1.044472,738.0
860,Yuba City,2024-01-27 06:12:52.490105088,0,1,0,0,0,100242.105263,20-29,456,1.032625,786.0
861,Yucaipa,2024-01-27 06:12:52.490105088,0,1,0,0,0,144145.050000,20-29,649,1.081528,560.0
862,Yucca Valley,2024-01-27 06:12:52.490105088,0,1,0,0,0,85612.500000,10-19,840,1.137924,576.0


In [144]:
# Check for null values and display the count
null_count = neighborhood_feature_data.isnull().sum()
print(null_count)

primary_key               0
event_time                0
<1h_ocean                 0
inland                    0
island                    0
near_bay                  0
near_ocean                0
median_house_value        0
median_house_age          0
total_households          0
bedrooms_per_household    0
encoded_locality          0
dtype: int64


In [148]:
new_column_names = {
    '<1h ocean': 'less_than_1h_ocean',
}

neighborhood_feature_data.rename(columns=new_column_names, inplace=True)

In [149]:
neighborhood_feature_data

Unnamed: 0,primary_key,event_time,<1h_ocean,inland,island,near_bay,near_ocean,median_house_value,median_house_age,total_households,bedrooms_per_household,encoded_locality
0,Acampo,2024-01-27 06:12:52.490105088,0,1,0,0,0,154433.333333,20-29,503,1.031443,624.0
1,Acton,2024-01-27 06:12:52.490105088,0,1,0,0,0,311600.000000,0-9,717,1.025105,320.0
2,Adelanto,2024-01-27 06:12:52.490105088,0,1,0,0,0,70080.000000,10-19,473,1.197837,565.0
3,Agoura Hills,2024-01-27 06:12:52.490105088,0,0,0,0,1,419000.400000,10-19,1880,1.110333,309.0
4,Aguanga,2024-01-27 06:12:52.490105088,0,1,0,0,0,126633.333333,0-9,197,1.384497,498.0
...,...,...,...,...,...,...,...,...,...,...,...,...
859,Yreka,2024-01-27 06:12:52.490105088,0,1,0,0,0,65780.000000,20-29,602,1.044472,738.0
860,Yuba City,2024-01-27 06:12:52.490105088,0,1,0,0,0,100242.105263,20-29,456,1.032625,786.0
861,Yucaipa,2024-01-27 06:12:52.490105088,0,1,0,0,0,144145.050000,20-29,649,1.081528,560.0
862,Yucca Valley,2024-01-27 06:12:52.490105088,0,1,0,0,0,85612.500000,10-19,840,1.137924,576.0


In [165]:
# Explicitly set the feature type for 'primary_key' to be a string
neighborhood_feature_data['primary_key'] = neighborhood_feature_data['primary_key'].astype(str)

# Round the values in the 'total_households' column to the nearest integer
neighborhood_feature_data['total_households'] = neighborhood_feature_data['total_households'].round().astype(int)

# Assuming 'neighborhood_feature_data' is your DataFrame
neighborhood_feature_data.rename(columns={'<1h_ocean': 'less_than_1h_ocean'}, inplace=True)

# Display the updated DataFrame
neighborhood_feature_data

Unnamed: 0,primary_key,event_time,less_than_1h_ocean,inland,island,near_bay,near_ocean,median_house_value,median_house_age,total_households,bedrooms_per_household,encoded_locality
0,Acampo,2024-01-27 06:12:52.490105088,0,1,0,0,0,154433.333333,20-29,503,1.031443,624.0
1,Acton,2024-01-27 06:12:52.490105088,0,1,0,0,0,311600.000000,0-9,717,1.025105,320.0
2,Adelanto,2024-01-27 06:12:52.490105088,0,1,0,0,0,70080.000000,10-19,473,1.197837,565.0
3,Agoura Hills,2024-01-27 06:12:52.490105088,0,0,0,0,1,419000.400000,10-19,1880,1.110333,309.0
4,Aguanga,2024-01-27 06:12:52.490105088,0,1,0,0,0,126633.333333,0-9,197,1.384497,498.0
...,...,...,...,...,...,...,...,...,...,...,...,...
859,Yreka,2024-01-27 06:12:52.490105088,0,1,0,0,0,65780.000000,20-29,602,1.044472,738.0
860,Yuba City,2024-01-27 06:12:52.490105088,0,1,0,0,0,100242.105263,20-29,456,1.032625,786.0
861,Yucaipa,2024-01-27 06:12:52.490105088,0,1,0,0,0,144145.050000,20-29,649,1.081528,560.0
862,Yucca Valley,2024-01-27 06:12:52.490105088,0,1,0,0,0,85612.500000,10-19,840,1.137924,576.0


In [150]:
# Assuming your DataFrame is named neighborhood_data
neighborhood_feature_data.to_csv('neighborhood_feature_data.csv', index=False)

## Ingest Data into FeatureStore


In [151]:
from time import gmtime, strftime, sleep

neighborhood_feature_group_name = "neighborhood-feature-group-" + strftime("%d-%H-%M-%S", gmtime())

In [152]:
from sagemaker.feature_store.feature_group import FeatureGroup

neighborhood_feature_group = FeatureGroup(
    name=neighborhood_feature_group_name, sagemaker_session=feature_store_session
)

In [173]:
import pandas as pd
from sagemaker.feature_store.feature_definition import FeatureTypeEnum
from sagemaker import Session

# Assuming 'neighborhood_feature_data' is the DataFrame with the desired features

# Explicitly set the feature type for 'primary_key' to be a string
neighborhood_feature_data['primary_key'] = neighborhood_feature_data['primary_key'].astype(str)

# Round the values in the 'total_households' column to the nearest integer
neighborhood_feature_data['total_households'] = neighborhood_feature_data['total_households'].round().astype(int)

# Create a list of dictionaries for feature definitions
feature_definitions = [
    {'FeatureName': 'primary_key', 'FeatureType': 'String'},
    {'FeatureName': 'event_time', 'FeatureType': 'Fractional'},
    # Add other feature definitions here
    {'FeatureName': 'less_than_1h_ocean', 'FeatureType': 'Integral'},
    {'FeatureName': 'inland', 'FeatureType': 'Integral'},
    {'FeatureName': 'island', 'FeatureType': 'Integral'},
    {'FeatureName': 'near_bay', 'FeatureType': 'Integral'},
    {'FeatureName': 'near_ocean', 'FeatureType': 'Integral'},
    {'FeatureName': 'median_house_value', 'FeatureType': 'Fractional'},
    {'FeatureName': 'median_house_age', 'FeatureType': 'Fractional'},
    {'FeatureName': 'total_households', 'FeatureType': 'Integral'},
    {'FeatureName': 'bedrooms_per_household', 'FeatureType': 'Fractional'},
    {'FeatureName': 'encoded_locality', 'FeatureType': 'Integral'},
]

# Define your Feature Group name
feature_group_name = 'neighborhood_feature_group_name'

# Create a SageMaker Feature Store client
sagemaker_client = Session().boto_session.client('sagemaker')

# Create a Feature Group with specified feature definitions
neighborhood_feature_group = sagemaker_client.create_feature_group(
    FeatureGroupName=feature_group_name,
    FeatureDefinitions=feature_definitions,
    RecordIdentifierFeatureName='primary_key',  # Corrected parameter name
    EventTimeFeatureName='event_time',
    RoleArn=role,  # Replace with your IAM role ARN
    OfflineStoreConfig={
        'S3StorageConfig': {
            'S3Uri': f"s3://{default_s3_bucket_name}/{prefix}"  # Replace with your S3 bucket and path
        }
    }
)

# Load the data into the Feature Group
neighborhood_feature_group.load(data_frame=neighborhood_feature_data)

AttributeError: 'dict' object has no attribute 'load'

In [176]:
# Assuming you have created the feature group using create_feature_group and obtained feature_group_info
feature_group_info = sagemaker_client.create_feature_group(
    FeatureGroupName=neighborhood_feature_group_name,
    FeatureDefinitions=feature_definitions,
    RecordIdentifierFeatureName='primary_key',  # Corrected parameter name
    EventTimeFeatureName='event_time',
    RoleArn=role,  # Replace with your IAM role ARN
    OfflineStoreConfig={
        'S3StorageConfig': {
            'S3Uri': f"s3://{default_s3_bucket_name}/{prefix}" 
        }
    }
)

# Extract the OfflineStoreConfig from the feature group info
offline_store_s3_uri = feature_group_info['OfflineStoreConfig']['S3StorageConfig']['S3Uri']

# Use the sagemaker_featurestore_runtime client to load the data
sagemaker_featurestore_runtime = boto3.client('sagemaker-featurestore-runtime')

# Specify the feature group name and the OfflineStoreConfig S3 URI
feature_group_name = feature_group_info['FeatureGroupName']

# Load the data into the feature group
sagemaker_featurestore_runtime.batch_put_record(
    FeatureGroupName=feature_group_name,
    Records=neighborhood_feature_data.to_dict(orient='records'),
    DisableGlueTableCreation=True,  # Disable Glue table creation for offline store
    S3OutputPath=offline_store_s3_uri
)


KeyError: 'OfflineStoreConfig'

In [155]:
import time

current_time_sec = int(round(time.time()))

def cast_object_to_string(data_frame):
    for label in data_frame.columns:
        if data_frame.dtypes[label] == "object":
            data_frame.loc[:, label] = data_frame[label].astype("str").astype("string").copy()

# Cast object dtype to string
cast_object_to_string(neighborhood_feature_data)

# Record identifier and event time feature names
record_identifier_feature_name = "primary_key"
event_time_feature_name = "event_time"

# Append EventTime feature
neighborhood_data[event_time_feature_name] = pd.Series([current_time_sec] * len(neighborhood_feature_data), dtype="float64")

# Load feature definitions to the feature group
neighborhood_feature_group.load_feature_definitions(data_frame=neighborhood_feature_data)


ValueError: Failed to infer Feature type based on dtype object for column primary_key.

In [77]:
import pandas as pd
import time
from urllib.parse import urlparse
from sagemaker.feature_store.feature_group import AthenaQuery

# Create an Athena query instance
neighborhood_query = neighborhood_feature_group.athena_query()
neighborhood_table = neighborhood_query.table_name

# Construct the query string
query_string = f'SELECT * FROM "{neighborhood_table}" LIMIT 10'  # Adjust the query as needed
print("Running query:", query_string)

# Run the Athena query
try:
    neighborhood_query.run(query_string=query_string, output_location=f"s3://{default_s3_bucket_name}/{prefix}/query_results/")
    neighborhood_query.wait()
    dataset = neighborhood_query.as_dataframe()
    print("Query executed successfully.")
    print(dataset.head())
except RuntimeError as e:
    print("Failed to execute query:", e)

# Optional: Check Athena query execution details
query_execution_id = neighborhood_query._current_query_execution_id
query_execution_details = neighborhood_query.sagemaker_session.athena_client.get_query_execution(QueryExecutionId=query_execution_id)
print("Query execution details:", query_execution_details)


Running query: SELECT * FROM "neighborhood_feature_group_27_03_09_53_1706325287" LIMIT 10
Query executed successfully.
  primary_key    event_time  less_than_1h_ocean  inland  island  near_bay  \
0       Chico  1.706326e+09                   0       0       0         1   
1       Chico  1.706326e+09                   0       0       0         1   
2       Chico  1.706326e+09                   0       0       0         1   
3       Chico  1.706326e+09                   0       0       0         1   
4       Chico  1.706326e+09                   0       0       0         1   

   near_ocean  median_house_value  median_house_age  total_households  \
0           0        105921.95122              30.0             444.0   
1           0        105921.95122              20.0             852.0   
2           0        105921.95122              20.0             535.0   
3           0        105921.95122              40.0             148.0   
4           0        105921.95122              30.0  

AttributeError: 'Session' object has no attribute 'athena_client'

#### Create FeatureGroups in SageMaker FeatureStore

In [96]:
def wait_for_feature_group_creation_complete(feature_group):
    status = feature_group.describe().get("FeatureGroupStatus")
    while status == "Creating":
        print("Waiting for Feature Group Creation")
        time.sleep(5)
        status = feature_group.describe().get("FeatureGroupStatus")
    if status != "Created":
        raise RuntimeError(f"Failed to create feature group {feature_group.name}")
    print(f"FeatureGroup {feature_group.name} successfully created.")


neighborhood_feature_group.create(
    s3_uri=f"s3://{default_s3_bucket_name}/{prefix}",
    record_identifier_name=record_identifier_feature_name,
    event_time_feature_name=event_time_feature_name,
    role_arn=role,
    enable_online_store=True,
)

wait_for_feature_group_creation_complete(feature_group=neighborhood_feature_group)

ResourceInUse: An error occurred (ResourceInUse) when calling the CreateFeatureGroup operation: Resource Already Exists: FeatureGroup with name neighborhood-feature-group-27-03-09-53 already exists. Choose a different name.

In [100]:
neighborhood_feature_group.describe()

{'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:339129315232:feature-group/neighborhood-feature-group-27-03-09-53',
 'FeatureGroupName': 'neighborhood-feature-group-27-03-09-53',
 'RecordIdentifierFeatureName': 'primary_key',
 'EventTimeFeatureName': 'event_time',
 'FeatureDefinitions': [{'FeatureName': 'primary_key',
   'FeatureType': 'String'},
  {'FeatureName': 'event_time', 'FeatureType': 'Fractional'},
  {'FeatureName': 'less_than_1h_ocean', 'FeatureType': 'Integral'},
  {'FeatureName': 'inland', 'FeatureType': 'Integral'},
  {'FeatureName': 'island', 'FeatureType': 'Integral'},
  {'FeatureName': 'near_bay', 'FeatureType': 'Integral'},
  {'FeatureName': 'near_ocean', 'FeatureType': 'Integral'},
  {'FeatureName': 'median_house_value', 'FeatureType': 'Fractional'},
  {'FeatureName': 'median_house_age', 'FeatureType': 'Fractional'},
  {'FeatureName': 'total_households', 'FeatureType': 'Fractional'},
  {'FeatureName': 'bedrooms_per_household', 'FeatureType': 'Fractional'},
  {'Featur

In [101]:
sagemaker_client.list_feature_groups()  # use boto client to list FeatureGroups

{'FeatureGroupSummaries': [{'FeatureGroupName': 'neighborhood-feature-group-27-03-09-53',
   'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:339129315232:feature-group/neighborhood-feature-group-27-03-09-53',
   'CreationTime': datetime.datetime(2024, 1, 27, 3, 14, 47, 478000, tzinfo=tzlocal()),
   'FeatureGroupStatus': 'Created',
   'OfflineStoreStatus': {'Status': 'Active'}}],
 'ResponseMetadata': {'RequestId': '9d3e9b1f-54ec-466c-8f8e-a87195eca8c8',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '9d3e9b1f-54ec-466c-8f8e-a87195eca8c8',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '307',
   'date': 'Sat, 27 Jan 2024 04:02:05 GMT'},
  'RetryAttempts': 0}}

#### PutRecords into FeatureGroup

After the FeatureGroups have been created, we can put data into the FeatureGroups by using the PutRecord API. This API can handle high TPS and is designed to be called by different streams. The data from all of these Put requests is buffered and written to S3 in chunks. The files will be written to the offline store within a few minutes of ingestion. For this example, to accelerate the ingestion process, we are specifying multiple workers to do the job simultaneously. It will take ~1min to ingest data to the 2 FeatureGroups, respectively.

In [126]:
neighborhood_feature_group.ingest(data_frame=neighborhood_data, max_workers=3, wait=True)

Failed to ingest row 203 to 406: An error occurred (ValidationError) when calling the PutRecord operation: Validation Error: Attempted to parse the feature value for the feature named [event_time] into a FeatureValue of type Fractional. The provided value must be within the range of a double precision floating point number defined by the IEEE 754 standard. The input format can be in either decimal form or scientific notation.
Failed to ingest row 0 to 203: An error occurred (ValidationError) when calling the PutRecord operation: Validation Error: Attempted to parse the feature value for the feature named [event_time] into a FeatureValue of type Fractional. The provided value must be within the range of a double precision floating point number defined by the IEEE 754 standard. The input format can be in either decimal form or scientific notation.
Failed to ingest row 406 to 608: An error occurred (ValidationError) when calling the PutRecord operation: Validation Error: Attempted to pars

RuntimeError: Failed to ingest some data into FeatureGroup neighborhood-feature-group-27-03-09-53

In [111]:
record_identifier_value = str(2990130)

featurestore_runtime.get_record(
    FeatureGroupName=neighborhood_feature_group_name,
    RecordIdentifierValueAsString=record_identifier_value,
)

{'ResponseMetadata': {'RequestId': 'f82706b7-59e0-44b2-b6fd-0da3f235b071',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'f82706b7-59e0-44b2-b6fd-0da3f235b071',
   'content-type': 'application/json',
   'content-length': '32',
   'date': 'Sat, 27 Jan 2024 04:09:47 GMT'},
  'RetryAttempts': 0}}

In [112]:
featurestore_runtime.batch_get_record(
    Identifiers=[
        {
            "FeatureGroupName": neighborhood_feature_group_name,
            "RecordIdentifiersValueAsString": ["2990130"],
        },
    ]
)

{'ResponseMetadata': {'RequestId': '4f74f9b0-64ab-4d00-8276-9ca16e6e9eed',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '4f74f9b0-64ab-4d00-8276-9ca16e6e9eed',
   'content-type': 'application/json',
   'content-length': '54',
   'date': 'Sat, 27 Jan 2024 04:09:55 GMT'},
  'RetryAttempts': 0},
 'Records': [],
 'Errors': [],
 'UnprocessedIdentifiers': []}

In [114]:
# Check data types
print(neighborhood_data.dtypes)

# Convert 'event_time' to datetime64 if needed
neighborhood_data['event_time'] = pd.to_datetime(neighborhood_data['event_time'])

primary_key               string[python]
event_time                       float64
less_than_1h_ocean                 int64
inland                             int64
island                             int64
near_bay                           int64
near_ocean                         int64
median_house_value               float64
median_house_age                 float64
total_households                 float64
bedrooms_per_household           float64
locality-code                      int16
dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  neighborhood_data['event_time'] = pd.to_datetime(neighborhood_data['event_time'])


In [69]:
print(neighborhood_feature_group.as_hive_ddl())

CREATE EXTERNAL TABLE IF NOT EXISTS sagemaker_featurestore.neighborhood-feature-group-27-03-09-53 (
  primary_key STRING
  event_time FLOAT
  less_than_1h_ocean INT
  inland INT
  island INT
  near_bay INT
  near_ocean INT
  median_house_value FLOAT
  median_house_age FLOAT
  total_households FLOAT
  bedrooms_per_household FLOAT
  locality-code INT
  write_time TIMESTAMP
  event_time TIMESTAMP
  is_deleted BOOLEAN
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
  STORED AS
  INPUTFORMAT 'parquet.hive.DeprecatedParquetInputFormat'
  OUTPUTFORMAT 'parquet.hive.DeprecatedParquetOutputFormat'
LOCATION 's3://sagemaker-us-east-1-339129315232/sagemaker-assignment-3/339129315232/sagemaker/us-east-1/offline-store/neighborhood-feature-group-27-03-09-53'


In [74]:
account_id = boto3.client("sts").get_caller_identity()["Account"]
print(account_id)

s3_client = boto3.client("s3", region_name=region)

neighborhood_feature_group_resolved_output_s3_uri = (
    neighborhood_feature_group.describe()
    .get("OfflineStoreConfig")
    .get("S3StorageConfig")
    .get("ResolvedOutputS3Uri")
)
neighborhood_feature_group_s3_prefix = neighborhood_feature_group_resolved_output_s3_uri.replace(
    f"s3://{default_s3_bucket_name}/", ""
)

offline_store_contents = None
while offline_store_contents is None:
    objects_in_bucket = s3_client.list_objects(
        Bucket=default_s3_bucket_name, Prefix=neighborhood_feature_group_s3_prefix
    )
    if "Contents" in objects_in_bucket and len(objects_in_bucket["Contents"]) > 1:
        offline_store_contents = objects_in_bucket["Contents"]
    else:
        print("Waiting for data in offline store...\n")
        sleep(60)

print("Data available.")

339129315232
Data available.


## Queries

In [81]:
# Assume you have a feature store and a feature group named "my_feature_group"
feature_group_name = "neighborhood_feature_group"
sagemaker_session = sagemaker.Session()

# Create an Athena query object for the feature group
feature_group = FeatureGroup(name=feature_group_name, sagemaker_session=sagemaker_session)
query = feature_group.athena_query()

# Define and run a SQL query with a WHERE clause
keyword_value = "Brooktree"
query_string = f"SELECT * FROM {feature_group.table_name} WHERE keyword_column = '{keyword_value}'"
query.run(query_string=query_string, output_location="s3://your-output-location/")

# Wait for the query to complete
query.wait()

# Retrieve the results as a Pandas DataFrame
query_results = query.as_dataframe()
print(query_results)


ResourceNotFound: An error occurred (ResourceNotFound) when calling the DescribeFeatureGroup operation: Resource Not Found: Amazon SageMaker can't find a FeatureGroup with name neighborhood_feature_group

In [84]:
import sagemaker
from sagemaker.feature_store.feature_group import FeatureGroup

feature_groups = FeatureGroup.list()
print(feature_groups)


AttributeError: type object 'FeatureGroup' has no attribute 'list'

## Cleanup Resources

In [26]:
predictor.delete_endpoint()

NameError: name 'predictor' is not defined

In [27]:
identity_feature_group.delete()
transaction_feature_group.delete()

NameError: name 'identity_feature_group' is not defined

In [28]:
# restore original boto3 version
%pip install 'boto3=={}'.format(original_boto3_version)

/bin/bash: -c: line 1: syntax error near unexpected token `('
/bin/bash: -c: line 1: `/opt/conda/bin/python -m pip install 'boto3=={}'.format(original_boto3_version)'
Note: you may need to restart the kernel to use updated packages.


In [29]:
%%html

<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>
        
<script>
try {
    els = document.getElementsByClassName("sm-command-button");
    els[0].click();
}
catch(err) {
    // NoOp
}    
</script>