# Query Values from Databases

In [1]:
import boto3
import sagemaker
from pyathena import connect
import pandas as pd
import numpy as np

# Reinitialize SageMaker session
session = boto3.session.Session()
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()  # Get bucket dynamically

# Retrieve stored variables
%store -r region
%store -r role
%store -r s3_staging_dir
%store -r database_name
%store -r dev_table_name
%store -r prod_table_name

# Set up Athena connection
conn = connect(s3_staging_dir=s3_staging_dir, region_name=region)

# Query development data
query = f"SELECT * FROM {database_name}.{dev_table_name};"
dev_data_df = pd.read_sql(query, conn)

# Display the first few rows of development data
print("Development Data:")
print(dev_data_df.head())

# Query production data
query = f"SELECT * FROM {database_name}.{prod_table_name};"
prod_data_df = pd.read_sql(query, conn)

# Display the first few rows of production data
print("Production Data:")
print(prod_data_df.head())




sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


  dev_data_df = pd.read_sql(query, conn)


Development Data:
   year  month carrier airport  arr_flights  arr_del15  carrier_ct  \
0  2004      1      DL     PBI          650        126          21   
1  2004      1      DL     PDX          314         61          14   
2  2004      1      DL     PHL          513         97          27   
3  2004      1      DL     PHX          334         78          20   
4  2004      1      DL     PIT          217         47           8   

   weather_ct  nas_ct  security_ct  ...  arr_cancelled  arr_diverted  \
0           6      51            1  ...              4             0   
1           2      34            0  ...             30             3   
2           0      51            0  ...             15             0   
3           2      39            0  ...              3             1   
4           0      21            0  ...              4             1   

   arr_delay  carrier_delay  weather_delay  nas_delay  security_delay  \
0       5425            881            397       2016  

  prod_data_df = pd.read_sql(query, conn)


Production Data:
   year  month carrier airport  arr_flights  arr_del15  carrier_ct  \
0  2016      1      AA     DFW        11956       1534         507   
1  2016      1      AA     DTW          588         98          34   
2  2016      1      AA     SEA          607         92          35   
3  2016      1      AA     JFK         1595        335         117   
4  2016      1      AA     SJC          327         59          23   

   weather_ct  nas_ct  security_ct  ...  arr_cancelled  arr_diverted  \
0          39     452            4  ...            201             9   
1           4      19            0  ...             13             2   
2           5      23            0  ...             12             2   
3          10     117            0  ...            137             6   
4           1      14            0  ...              0             0   

   arr_delay  carrier_delay  weather_delay  nas_delay  security_delay  \
0     106950          50027           2842      13913   

# Preprocess Data and Cast/Convert

In [2]:
import time

# ✅ Convert categorical features to `string`
def cast_object_to_string(data_frame):
    """Convert every object-dtype column of ``data_frame`` to pandas ``string`` dtype.

    Each matching column is passed through ``astype("str")`` and then
    ``astype("string")``. The frame is modified in place and nothing is returned.
    """
    object_columns = [
        name for name in data_frame.columns if data_frame.dtypes[name] == "object"
    ]
    for name in object_columns:
        as_text = data_frame[name].astype("str")
        data_frame[name] = as_text.astype("string")

# Convert object columns of both frames; the function mutates its argument and returns nothing.
cast_object_to_string(dev_data_df)
cast_object_to_string(prod_data_df)

# ✅ Convert all numeric columns to `int64` (except `event_time`)
def convert_numeric_to_int(data_frame):
    """Cast the int/float columns of ``data_frame`` to ``int64`` in place.

    Columns are picked with ``select_dtypes(include=['int', 'float'])``; a column
    named ``event_time`` is left untouched. Nothing is returned.
    """
    numeric_columns = data_frame.select_dtypes(include=['int', 'float']).columns
    for name in numeric_columns:
        if name == "event_time":
            continue
        data_frame[name] = data_frame[name].astype("int64")

# Cast numeric columns of both frames to int64 in place (event_time is skipped by the function).
convert_numeric_to_int(dev_data_df)
convert_numeric_to_int(prod_data_df)

# ✅ Feature Engineering: Binary 'on_time' column
def is_on_time(row):
    """Return 1 when the row has ``arr_del15 == 0`` and ``arr_cancelled == 0``, else 0."""
    if row['arr_del15'] == 0 and row['arr_cancelled'] == 0:
        return 1
    return 0

# Add the binary `on_time` label to both datasets.
dev_data_df['on_time'] = dev_data_df.apply(is_on_time, axis=1)
prod_data_df['on_time'] = prod_data_df.apply(is_on_time, axis=1)

# ✅ Ensure `event_time` is a FLOAT UNIX timestamp
# One whole-second timestamp is shared by both datasets.
current_time_sec = int(round(time.time()))

# Assign the scalar directly: it is broadcast to every row as float64 whatever the
# frame's index is. Building a separate pd.Series (default 0..n-1 index) and assigning
# it aligns on index labels and yields NaN for rows whose label is outside that range.
dev_data_df['event_time'] = float(current_time_sec)
prod_data_df['event_time'] = float(current_time_sec)

# ✅ Ensure `record_id` is a unique string identifier
# The id is the frame's index rendered as a string, so it is only unique if the index is.
dev_data_df['record_id'] = dev_data_df.index.astype("string")
prod_data_df['record_id'] = prod_data_df.index.astype("string")

# Fail fast on colliding ids rather than discovering it after ingestion.
assert dev_data_df['record_id'].is_unique, "dev_data_df has duplicate record_id values"
assert prod_data_df['record_id'].is_unique, "prod_data_df has duplicate record_id values"

print("✅ Data Preprocessing Complete. Ready for Feature Store Upload!")
print("Development Data Types:")
print(dev_data_df.dtypes)
print("\nProduction Data Types:")
print(prod_data_df.dtypes)


✅ Data Preprocessing Complete. Ready for Feature Store Upload!
Development Data Types:
year                            int64
month                           int64
carrier                string[python]
airport                string[python]
arr_flights                     int64
arr_del15                       int64
carrier_ct                      int64
weather_ct                      int64
nas_ct                          int64
security_ct                     int64
late_aircraft_ct                int64
arr_cancelled                   int64
arr_diverted                    int64
arr_delay                       int64
carrier_delay                   int64
weather_delay                   int64
nas_delay                       int64
security_delay                  int64
late_aircraft_delay             int64
delay_rate                      int64
on_time                         int64
event_time                    float64
record_id              string[python]
dtype: object

Production Data Types:
y

# Create Feature Group

In [3]:
import boto3
import time

# AWS region: reuse the `region` restored with `%store -r region` in the first cell,
# which is also the region the Athena connection uses. It is deliberately NOT
# reassigned here; a hard-coded value would silently override the stored one and
# could point this client at a different region than the rest of the notebook.

# Feature Group names for Development and Production
dev_feature_group_name = "airline_delay_features_dev"
prod_feature_group_name = "airline_delay_features_prod"

# Initialize SageMaker client
sagemaker_client = boto3.client('sagemaker', region_name=region)

def delete_feature_group(feature_group_name, poll_interval=5, timeout=600):
    """Delete a feature group if it exists and wait until it is gone.

    Existence is checked with ``describe_feature_group`` rather than by scanning
    one page of ``list_feature_groups``: the list call is paginated, so a group
    beyond the first page would be reported as missing and never deleted.

    Args:
        feature_group_name: Name of the feature group to delete.
        poll_interval: Seconds to sleep between status checks while waiting.
        timeout: Maximum seconds to wait for the deletion to complete.

    Returns:
        None. Progress and errors are reported with ``print``; exceptions raised
        by the client are caught and printed, not re-raised.
    """
    def _current_status():
        """Return the group's FeatureGroupStatus, or None if it does not exist."""
        try:
            response = sagemaker_client.describe_feature_group(
                FeatureGroupName=feature_group_name
            )
        except sagemaker_client.exceptions.ResourceNotFound:
            return None
        return response["FeatureGroupStatus"]

    try:
        print(f"🔍 Checking if Feature Group '{feature_group_name}' exists...")

        if _current_status() is None:
            print(f"✅ Feature Group '{feature_group_name}' does not exist. No deletion needed.")
            return

        print(f"🚀 Feature Group '{feature_group_name}' found. Deleting...")
        sagemaker_client.delete_feature_group(FeatureGroupName=feature_group_name)

        # Poll until the group disappears, deletion fails, or we run out of time.
        deadline = time.monotonic() + timeout
        while True:
            status = _current_status()
            if status is None:
                print(f"✅ Feature Group '{feature_group_name}' deleted successfully.")
                break
            if status == "DeleteFailed":
                print(f"❌ Deletion of Feature Group '{feature_group_name}' failed (status: {status}).")
                break
            if time.monotonic() >= deadline:
                print(f"❌ Timed out after {timeout}s waiting for Feature Group '{feature_group_name}' to be deleted.")
                break

            print("⏳ Waiting for Feature Group deletion...")
            time.sleep(poll_interval)

    except Exception as e:
        print(f"❌ Error deleting Feature Group '{feature_group_name}': {e}")

# ✅ Delete any existing Development and Production feature groups.
# This cell only deletes; it does not create anything.
delete_feature_group(dev_feature_group_name)
delete_feature_group(prod_feature_group_name)


🔍 Checking if Feature Group 'airline_delay_features_dev' exists...
✅ Feature Group 'airline_delay_features_dev' does not exist. No deletion needed.
🔍 Checking if Feature Group 'airline_delay_features_prod' exists...
✅ Feature Group 'airline_delay_features_prod' does not exist. No deletion needed.


In [4]:
import boto3
import sagemaker
import time

# Define the Feature Group names (same values as assigned in the previous cell)
dev_feature_group_name = "airline_delay_features_dev"
prod_feature_group_name = "airline_delay_features_prod"

# Initialize SageMaker client.
# NOTE(review): unlike the client built in the previous cell, no region_name is passed
# here, so boto3 picks the region itself — confirm both clients target the same region.
sagemaker_client = boto3.client("sagemaker")

# Function to create a feature group
def create_feature_group(feature_group_name, data_df, s3_uri,
                         poll_interval=5, timeout=900, settle_seconds=60):
    """Create a feature group if it does not exist, then wait until it is ready.

    The schema is derived from ``data_df``: ``event_time`` is declared
    ``Fractional``; every other column is ``String`` when its dtype is pandas
    ``string``, ``Fractional`` when it is a float dtype, and ``Integral``
    otherwise. ``record_id`` is the record identifier and ``event_time`` the
    event-time feature. The IAM role comes from the notebook-level ``role``.

    Existence is checked with ``describe_feature_group`` because
    ``list_feature_groups`` is paginated and a single page can miss the group.

    Args:
        feature_group_name: Name of the feature group.
        data_df: DataFrame whose columns define the features.
        s3_uri: S3 URI for the offline store.
        poll_interval: Seconds to sleep between status checks.
        timeout: Maximum seconds to wait for status ``Created``.
        settle_seconds: Extra seconds to sleep once the group is ``Created``.

    Returns:
        None. Progress and errors are reported with ``print``. The function
        returns early if creation raises, the group reaches ``CreateFailed``,
        or the wait times out.
    """
    def _feature_type(dtype):
        """Map a pandas dtype to a Feature Store FeatureType string."""
        if dtype == "string":
            return "String"
        if pd.api.types.is_float_dtype(dtype):
            return "Fractional"
        return "Integral"

    # ✅ Step 1: Check if Feature Group Exists
    try:
        sagemaker_client.describe_feature_group(FeatureGroupName=feature_group_name)
        already_exists = True
    except sagemaker_client.exceptions.ResourceNotFound:
        already_exists = False

    if already_exists:
        print(f"✅ Feature Group '{feature_group_name}' already exists.")
    else:
        print(f"🚀 Feature Group '{feature_group_name}' does NOT exist! Creating it now.")

        # ✅ Step 2: Define Feature Group Schema
        feature_group_definition = {
            "FeatureGroupName": feature_group_name,
            "RecordIdentifierFeatureName": "record_id",
            "EventTimeFeatureName": "event_time",
            "FeatureDefinitions": [
                {"FeatureName": "event_time", "FeatureType": "Fractional"}
            ] + [
                {"FeatureName": col, "FeatureType": _feature_type(data_df[col].dtype)}
                for col in data_df.columns if col != "event_time"
            ],
            "OnlineStoreConfig": {"EnableOnlineStore": True},
            "OfflineStoreConfig": {
                "S3StorageConfig": {"S3Uri": s3_uri},
                "DisableGlueTableCreation": False,
            },
            "RoleArn": role,
        }

        # ✅ Step 3: Create Feature Group
        try:
            sagemaker_client.create_feature_group(**feature_group_definition)
            print(f"✅ Feature Group '{feature_group_name}' created successfully.")
        except Exception as e:
            print(f"❌ Error creating Feature Group '{feature_group_name}': {e}")
            return

    # ✅ Step 4: Wait Until Feature Group is Ready
    print(f"⏳ Waiting for Feature Group '{feature_group_name}' to become active...")
    deadline = time.monotonic() + timeout
    while True:
        try:
            status_response = sagemaker_client.describe_feature_group(FeatureGroupName=feature_group_name)
            status = status_response["FeatureGroupStatus"]
            print(f"⏳ Current Feature Group '{feature_group_name}' status: {status}")

            if status == "Created":
                print(f"✅ Feature Group '{feature_group_name}' is now fully ready!")
                break

            # A failed creation never turns into "Created"; stop instead of polling forever.
            if status == "CreateFailed":
                reason = status_response.get("FailureReason", "unknown reason")
                print(f"❌ Feature Group '{feature_group_name}' creation failed: {reason}")
                return
        except Exception as e:
            print(f"❌ Error checking Feature Group '{feature_group_name}' status: {e}")

        if time.monotonic() >= deadline:
            print(f"❌ Timed out after {timeout}s waiting for Feature Group '{feature_group_name}' to become active.")
            return

        time.sleep(poll_interval)

    print(f"⏳ Waiting an additional {settle_seconds} seconds for stability...")
    time.sleep(settle_seconds)

# Define S3 locations for the offline stores, under the bucket resolved in the first cell
dev_s3_uri = f"s3://{bucket}/feature-store/dev/"
prod_s3_uri = f"s3://{bucket}/feature-store/prod/"

# ✅ Create Feature Groups for Dev & Prod (each call blocks until its group is ready)
create_feature_group(dev_feature_group_name, dev_data_df, dev_s3_uri)
create_feature_group(prod_feature_group_name, prod_data_df, prod_s3_uri)

# Persist the names and S3 URIs with %store
%store dev_feature_group_name prod_feature_group_name
%store dev_s3_uri prod_s3_uri


🚀 Feature Group 'airline_delay_features_dev' does NOT exist! Creating it now.
✅ Feature Group 'airline_delay_features_dev' created successfully.
⏳ Waiting for Feature Group 'airline_delay_features_dev' to become active...
⏳ Current Feature Group 'airline_delay_features_dev' status: Creating
⏳ Current Feature Group 'airline_delay_features_dev' status: Creating
⏳ Current Feature Group 'airline_delay_features_dev' status: Creating
⏳ Current Feature Group 'airline_delay_features_dev' status: Created
✅ Feature Group 'airline_delay_features_dev' is now fully ready!
⏳ Waiting an additional 60 seconds for stability...
🚀 Feature Group 'airline_delay_features_prod' does NOT exist! Creating it now.
✅ Feature Group 'airline_delay_features_prod' created successfully.
⏳ Waiting for Feature Group 'airline_delay_features_prod' to become active...
⏳ Current Feature Group 'airline_delay_features_prod' status: Creating
⏳ Current Feature Group 'airline_delay_features_prod' status: Creating
⏳ Current Featu

# Record insertion

In [5]:
import boto3
import time

# Initialize SageMaker client (rebinds the `sagemaker_client` name used by the functions below)
sagemaker_client = boto3.client("sagemaker")

# ✅ Function to Get the Correct Offline Table Name
def get_feature_store_table_name(feature_group_name, poll_interval=5, timeout=900):
    """Wait for a feature group to be ``Created`` and return its Glue table name.

    Args:
        feature_group_name: Name of the feature group to look up.
        poll_interval: Seconds to sleep between status checks.
        timeout: Maximum seconds to wait for status ``Created``.

    Returns:
        The value of ``OfflineStoreConfig.DataCatalogConfig.TableName`` from the
        describe response, or ``None`` if that key is missing, the group is in a
        status that cannot become ``Created``, or the wait times out.

    Exceptions raised by ``describe_feature_group`` are not caught here.
    """
    print(f"⏳ Waiting for the Feature Group '{feature_group_name}' to be available in Glue...")

    # Wait for Feature Group to be created
    deadline = time.monotonic() + timeout
    while True:
        response = sagemaker_client.describe_feature_group(FeatureGroupName=feature_group_name)
        status = response["FeatureGroupStatus"]

        if status == "Created":
            print(f"✅ Feature Group '{feature_group_name}' is now active!")
            break

        # These statuses never lead to "Created"; polling further would hang forever.
        if status in ("CreateFailed", "Deleting", "DeleteFailed"):
            print(f"❌ Feature Group '{feature_group_name}' is in status '{status}' and will not become active.")
            return None

        if time.monotonic() >= deadline:
            print(f"❌ Timed out after {timeout}s waiting for Feature Group '{feature_group_name}' (last status: {status}).")
            return None

        print(f"⏳ Current status: {status}, retrying in {poll_interval} seconds...")
        time.sleep(poll_interval)

    # Retrieve Glue Table Name from Offline Store Config
    try:
        table_name = response["OfflineStoreConfig"]["DataCatalogConfig"]["TableName"]
        print(f"✅ Feature Store table registered in Glue for '{feature_group_name}': {table_name}")
        return table_name
    except KeyError:
        print(f"❌ Error: Offline Store is not properly configured for '{feature_group_name}'.")
        return None

# ✅ Get Feature Store Table Names for Dev and Prod (either may be None if the lookup failed)
dev_feature_store_table = get_feature_store_table_name(dev_feature_group_name)
prod_feature_store_table = get_feature_store_table_name(prod_feature_group_name)

# ✅ Store for use in other notebooks
%store dev_feature_store_table prod_feature_store_table
print()

print(f"✅ Stored feature store table names for development and production: {dev_feature_store_table} {prod_feature_store_table}")

print()

# With no arguments, %store lists every stored variable and its value.
%store


⏳ Waiting for the Feature Group 'airline_delay_features_dev' to be available in Glue...
✅ Feature Group 'airline_delay_features_dev' is now active!
✅ Feature Store table registered in Glue for 'airline_delay_features_dev': airline_delay_features_dev_1739939876
⏳ Waiting for the Feature Group 'airline_delay_features_prod' to be available in Glue...
✅ Feature Group 'airline_delay_features_prod' is now active!
✅ Feature Store table registered in Glue for 'airline_delay_features_prod': airline_delay_features_prod_1739939953
Stored 'dev_feature_store_table' (str)
Stored 'prod_feature_store_table' (str)

✅ Stored feature store table names for development and production: airline_delay_features_dev_1739939876 airline_delay_features_prod_1739939953

Stored variables and their in-db values:
account_id                               -> '607916531205'
create_base_csv_athena_db                -> True
create_base_csv_athena_table             -> True
database_name                            -> 'db_air

In [6]:
from datetime import datetime
import time
import boto3
from sagemaker.feature_store.feature_group import FeatureGroup

# Initialize Feature Store Runtime client in the notebook's `region`, so it targets
# the same region as the SageMaker client instead of a separately hard-coded one.
featurestore_runtime = boto3.client('sagemaker-featurestore-runtime', region_name=region)

# ✅ Function to Insert a Single Row into a Feature Group
def insert_single_record(feature_group_name, data_df):
    """Insert the first row of ``data_df`` into the given feature group.

    The row is sent with ``put_record``; every value is serialized with
    ``str``. The record's ``event_time`` is set to the current UNIX time.
    Only the outgoing record is stamped: ``data_df`` itself is not modified
    (overwriting the caller's whole ``event_time`` column was a hidden side
    effect). Features whose value is missing (NaN / NA / None) are left out of
    the record instead of being sent as the text "nan" or "<NA>".

    Args:
        feature_group_name: Target feature group name.
        data_df: DataFrame whose first row is inserted.

    Returns:
        None. Success and errors are reported with ``print``; an empty
        ``data_df`` is reported as an error and nothing is sent.
    """
    if len(data_df) == 0:
        print(f"❌ Error inserting record into '{feature_group_name}': DataFrame is empty.")
        return

    # ✅ Select one row and stamp it with the current UNIX timestamp (float)
    single_record = data_df.iloc[0].to_dict()
    single_record["event_time"] = time.time()

    # ✅ Convert to Feature Store format, skipping missing values
    record = {
        "FeatureGroupName": feature_group_name,
        "Record": [
            {"FeatureName": key, "ValueAsString": str(value)}
            for key, value in single_record.items()
            if not pd.isna(value)
        ]
    }

    # ✅ Insert single record into Feature Store
    try:
        featurestore_runtime.put_record(**record)
        print(f"✅ Successfully inserted one record into Feature Store '{feature_group_name}'!")
    except Exception as e:
        print(f"❌ Error inserting record into '{feature_group_name}': {e}")

# ✅ Insert the first development row into the Development Feature Store
insert_single_record(dev_feature_group_name, dev_data_df)

# ✅ Insert the first production row into the Production Feature Store
insert_single_record(prod_feature_group_name, prod_data_df)


✅ Successfully inserted one record into Feature Store 'airline_delay_features_dev'!
✅ Successfully inserted one record into Feature Store 'airline_delay_features_prod'!


# Inserting All Records

In [7]:
# ✅ Initialize Feature Group objects for both Dev & Prod.
# They are bound to the `sagemaker_session` created in the first cell.
dev_feature_group = FeatureGroup(name=dev_feature_group_name, sagemaker_session=sagemaker_session)
prod_feature_group = FeatureGroup(name=prod_feature_group_name, sagemaker_session=sagemaker_session)

# ✅ Function to perform bulk ingestion into a Feature Store
def bulk_ingest(feature_group, data_df):
    """Bulk-ingest every row of ``data_df`` except the first into ``feature_group``.

    Row 0 is skipped because it was already written by the single-record insert.
    Ingestion uses 5 workers and blocks until it finishes. Any exception is
    caught and printed; nothing is returned.
    """
    try:
        ingest_kwargs = dict(
            data_frame=data_df[1:],  # everything after the already-inserted first row
            max_workers=5,           # parallel workers
            wait=True,               # block until ingestion is done
        )
        feature_group.ingest(**ingest_kwargs)
        print(f"🚀 All records successfully ingested into Feature Store '{feature_group.name}'!")
    except Exception as ingest_error:
        print(f"❌ Error during bulk ingestion into '{feature_group.name}': {ingest_error}")

# ✅ Bulk Ingest Development Data (all rows after the first)
bulk_ingest(dev_feature_group, dev_data_df)

# ✅ Bulk Ingest Production Data (all rows after the first)
bulk_ingest(prod_feature_group, prod_data_df)


🚀 All records successfully ingested into Feature Store 'airline_delay_features_dev'!
🚀 All records successfully ingested into Feature Store 'airline_delay_features_prod'!


In [10]:
# ✅ Function to check record count in Feature Store
def check_feature_store_count(feature_store_table, data_df, label):
    """Print the offline-store row count for a table next to ``len(data_df)``.

    Runs a ``COUNT(*)`` against ``sagemaker_featurestore.<feature_store_table>``
    over the notebook's ``conn``. ``label`` only appears in the printed text.
    Any exception is caught and printed; nothing is returned.
    """

    query = f"""
    SELECT COUNT(*) FROM "sagemaker_featurestore"."{feature_store_table}";
    """

    try:
        # The count is the single cell of the single-row result.
        stored_count = pd.read_sql(query, conn).iloc[0, 0]
        expected_count = len(data_df)

        print(f"🔍 {label} - Records in Feature Store: {stored_count}, Total Expected: {expected_count}")
        print("Note: If numbers don't match, uploads are likely still completing. Wait for a minute before running this cell again.\n")

    except Exception as query_error:
        print(f"❌ Error querying {label} Feature Store: {query_error}")

# ✅ Check Dev Feature Store Record Count against the size of dev_data_df
check_feature_store_count(dev_feature_store_table, dev_data_df, "Development")

# ✅ Check Prod Feature Store Record Count against the size of prod_data_df
check_feature_store_count(prod_feature_store_table, prod_data_df, "Production")


  record_count_df = pd.read_sql(query, conn)


🔍 Development - Records in Feature Store: 130507, Total Expected: 130507
Note: If numbers don't match, uploads are likely still completing. Wait for a minute before running this cell again.

🔍 Production - Records in Feature Store: 102182, Total Expected: 102182
Note: If numbers don't match, uploads are likely still completing. Wait for a minute before running this cell again.



***Note: After a single record is inserted, it sometimes appears to get stuck and is slow to show up in the offline store. Running the bulk ingestion afterwards seems to push it through into the feature store. For that reason, the cell that reads back the single record has been moved to the end of the notebook, even though it was originally designed to read the record right after it was inserted and before the bulk ingestion ran.***

In [11]:
# ✅ Take the `record_id` of the first row of each frame, i.e. the row sent by the single-record insert
dev_record_id = dev_data_df.iloc[0]["record_id"]
prod_record_id = prod_data_df.iloc[0]["record_id"]

# ✅ Function to query a record from Offline Store (Athena)
def query_feature_store_record(feature_store_table, record_id, label):
    """Look up one record by ``record_id`` in the offline store and print it.

    Queries ``sagemaker_featurestore.<feature_store_table>`` over the notebook's
    ``conn`` with ``LIMIT 1``. Prints the matching row, or a not-found message
    when the result is empty. ``label`` only appears in the printed text.
    Any exception is caught and printed; nothing is returned.
    """

    query = f"""
    SELECT * FROM "sagemaker_featurestore"."{feature_store_table}"
    WHERE record_id = '{record_id}'
    LIMIT 1;
    """

    try:
        matches = pd.read_sql(query, conn)

        # Handle the empty result first; otherwise show the row.
        if matches.empty:
            print(f"❌ {label} - Record not found in Offline Store!")
            return

        print(f"✅ {label} - Record Found in Feature Store (Offline Store via Athena):")
        print(matches)

    except Exception as query_error:
        print(f"❌ Error querying {label} Feature Store: {query_error}")

# ✅ Query the first-row record in the Dev Feature Store offline table
query_feature_store_record(dev_feature_store_table, dev_record_id, "Development")

# ✅ Query the first-row record in the Prod Feature Store offline table
query_feature_store_record(prod_feature_store_table, prod_record_id, "Production")


  offline_record_df = pd.read_sql(query, conn)


✅ Development - Record Found in Feature Store (Offline Store via Athena):
     event_time  year  month carrier airport  arr_flights  arr_del15  \
0  1.739940e+09  2004      1      DL     PBI          650        126   

   carrier_ct  weather_ct  nas_ct  ...  weather_delay  nas_delay  \
0          21           6      51  ...            397       2016   

   security_delay  late_aircraft_delay  delay_rate  on_time  record_id  \
0              15                 2116          19        0          0   

               write_time  api_invocation_time  is_deleted  
0 2025-02-19 04:45:48.072  2025-02-19 04:40:56       False  

[1 rows x 26 columns]


  offline_record_df = pd.read_sql(query, conn)


✅ Production - Record Found in Feature Store (Offline Store via Athena):
     event_time  year  month carrier airport  arr_flights  arr_del15  \
0  1.739940e+09  2016      1      AA     DFW        11956       1534   

   carrier_ct  weather_ct  nas_ct  ...  weather_delay  nas_delay  \
0         507          39     452  ...           2842      13913   

   security_delay  late_aircraft_delay  delay_rate  on_time  record_id  \
0             167                40001          12        0          0   

               write_time  api_invocation_time  is_deleted  
0 2025-02-19 04:45:47.571  2025-02-19 04:40:56       False  

[1 rows x 26 columns]


# Release Resources

In [12]:
%%html

<!-- Visible notice plus a hidden button bound to the "kernelmenu:shutdown" command. -->
<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>
        
<script>
// Click the first element with class "sm-command-button" (the hidden button above).
// Any error, e.g. no such element on the page, is swallowed by the empty catch.
try {
    els = document.getElementsByClassName("sm-command-button");
    els[0].click();
}
catch(err) {
    // NoOp
}    
</script>

In [None]:
%%javascript

// Save a checkpoint, then delete the notebook session via the `Jupyter` global.
// Any error is swallowed by the empty catch — presumably so the cell is a no-op
// in front ends where `Jupyter` is not defined (TODO confirm).
try {
    Jupyter.notebook.save_checkpoint();
    Jupyter.notebook.session.delete();
}
catch(err) {
    // NoOp
}