# Query values from development database

In [29]:
import boto3
import sagemaker
from pyathena import connect
import pandas as pd
import numpy as np

# Initialize AWS Session
session = boto3.session.Session()
region = session.region_name
sagemaker_session = sagemaker.Session()
role_arn = sagemaker.get_execution_role()

# Use SageMaker's default bucket
bucket = sagemaker_session.default_bucket()

# Set up Athena connection
s3_staging_dir = f's3://{bucket}/athena-query-results/'
conn = connect(s3_staging_dir=s3_staging_dir, region_name=region)

# Query development data
query = "SELECT * FROM db_airline_delay_cause.development_data;"  # Adjust limit easily
dev_data_df = pd.read_sql(query, conn)

# Display the first few rows
print(dev_data_df.head())


  dev_data_df = pd.read_sql(query, conn)


   year  month carrier airport  arr_flights  arr_del15  carrier_ct  \
0  2004      1      DL     PBI          650        126          21   
1  2004      1      DL     PDX          314         61          14   
2  2004      1      DL     PHL          513         97          27   
3  2004      1      DL     PHX          334         78          20   
4  2004      1      DL     PIT          217         47           8   

   weather_ct  nas_ct  security_ct  ...  arr_cancelled  arr_diverted  \
0           6      51            1  ...              4             0   
1           2      34            0  ...             30             3   
2           0      51            0  ...             15             0   
3           2      39            0  ...              3             1   
4           0      21            0  ...              4             1   

   arr_delay  carrier_delay  weather_delay  nas_delay  security_delay  \
0       5425            881            397       2016              15   


# Preprocess Data and Cast/Convert

In [23]:
import time

# ✅ Convert categorical features to `string`
def cast_object_to_string(data_frame):
    for label in data_frame.columns:
        if data_frame.dtypes[label] == "object":
            data_frame[label] = data_frame[label].astype("str").astype("string")

cast_object_to_string(dev_data_df)

# ✅ Convert all numeric columns to `int64` (except `event_time`)
for col in dev_data_df.select_dtypes(include=['int', 'float']).columns:
    if col not in ["event_time"]:
        dev_data_df[col] = dev_data_df[col].astype("int64")

# ✅ Feature Engineering: Binary 'on_time' column
def is_on_time(row):
    return 1 if row['arr_del15'] == 0 and row['arr_cancelled'] == 0 else 0

dev_data_df['on_time'] = dev_data_df.apply(is_on_time, axis=1)

# ✅ Ensure `event_time` is a FLOAT UNIX timestamp
current_time_sec = int(round(time.time()))
dev_data_df['event_time'] = pd.Series([current_time_sec] * len(dev_data_df), dtype="float64")

# ✅ Ensure `record_id` is a unique string identifier
dev_data_df['record_id'] = dev_data_df.index.astype("string")

print("✅ Data Preprocessing Complete. Ready for Feature Store Upload!")
print(dev_data_df.dtypes)


✅ Data Preprocessing Complete. Ready for Feature Store Upload!
year                            int64
month                           int64
carrier                string[python]
airport                string[python]
arr_flights                     int64
arr_del15                       int64
carrier_ct                      int64
weather_ct                      int64
nas_ct                          int64
security_ct                     int64
late_aircraft_ct                int64
arr_cancelled                   int64
arr_diverted                    int64
arr_delay                       int64
carrier_delay                   int64
weather_delay                   int64
nas_delay                       int64
security_delay                  int64
late_aircraft_delay             int64
delay_rate                      int64
on_time                         int64
event_time                    float64
record_id              string[python]
dtype: object


# Create Feature Group

In [9]:
import boto3
import time

# Set AWS region and feature group name
region = "us-east-1"  # Update if needed
feature_group_name = "airline_delay_features"

# Initialize SageMaker client
sagemaker_client = boto3.client('sagemaker', region_name=region)

# Delete the Feature Group if it exists
try:
    print(f"🔍 Checking if Feature Group '{feature_group_name}' exists...")

    # Check if the feature group exists
    existing_groups = sagemaker_client.list_feature_groups()['FeatureGroupSummaries']
    existing_group_names = [fg['FeatureGroupName'] for fg in existing_groups]

    if feature_group_name in existing_group_names:
        print(f"🚀 Feature Group '{feature_group_name}' found. Deleting...")

        # Delete the feature group
        sagemaker_client.delete_feature_group(FeatureGroupName=feature_group_name)

        # Wait for deletion to complete
        while True:
            existing_groups = sagemaker_client.list_feature_groups()['FeatureGroupSummaries']
            existing_group_names = [fg['FeatureGroupName'] for fg in existing_groups]

            if feature_group_name not in existing_group_names:
                print(f"✅ Feature Group '{feature_group_name}' deleted successfully.")
                break

            print("⏳ Waiting for Feature Group deletion...")
            time.sleep(5)

    else:
        print(f"✅ Feature Group '{feature_group_name}' does not exist. No deletion needed.")

except Exception as e:
    print(f"❌ Error deleting Feature Group: {e}")


🔍 Checking if Feature Group 'airline_delay_features' exists...
✅ Feature Group 'airline_delay_features' does not exist. No deletion needed.


In [10]:
role_arn

'arn:aws:iam::607916531205:role/LabRole'

In [12]:
import boto3
import sagemaker

# Define the Feature Group name
feature_group_name = "airline_delay_features"

# Initialize SageMaker client
sagemaker_client = boto3.client("sagemaker")

# ✅ Step 1: Check if Feature Group Exists
existing_groups = sagemaker_client.list_feature_groups()['FeatureGroupSummaries']
existing_group_names = [fg['FeatureGroupName'] for fg in existing_groups]

if feature_group_name in existing_group_names:
    print(f"✅ Feature Group '{feature_group_name}' already exists.")
else:
    print(f"🚀 Feature Group '{feature_group_name}' does NOT exist! Creating it now.")

    # ✅ Step 2: Define Feature Group Schema
    s3_uri = f"s3://{bucket}/feature-store/"

    feature_group_definition = {
        "FeatureGroupName": feature_group_name,
        "RecordIdentifierFeatureName": "record_id",
        "EventTimeFeatureName": "event_time",
        "FeatureDefinitions": [
            {"FeatureName": "event_time", "FeatureType": "Fractional"}
        ] + [
            {
                "FeatureName": col,
                "FeatureType": "String" if dev_data_df[col].dtype == "string" else "Integral"
            }
            for col in dev_data_df.columns if col != "event_time"
        ],
        "OnlineStoreConfig": {"EnableOnlineStore": True},
        "OfflineStoreConfig": {
            "S3StorageConfig": {"S3Uri": s3_uri},
            "DisableGlueTableCreation": False,
        },
        "RoleArn": role_arn,
    }

    # ✅ Step 3: Create Feature Group
    try:
        sagemaker_client.create_feature_group(**feature_group_definition)
        print(f"✅ Feature Group '{feature_group_name}' created successfully.")

    except Exception as e:
        print(f"❌ Error creating Feature Group: {e}")
        exit()

# ✅ Step 4: Wait Until Feature Group is Ready
print("⏳ Waiting for Feature Group to become active...")
while True:
    try:
        status_response = sagemaker_client.describe_feature_group(FeatureGroupName=feature_group_name)
        status = status_response["FeatureGroupStatus"]
        print(f"⏳ Current Feature Group status: {status}")

        if status == "Created":
            print("✅ Feature Group is now fully ready!")
            break

    except Exception as e:
        print(f"❌ Error checking Feature Group status: {e}")

    time.sleep(5)

print("⏳ Waiting an additional 10 seconds for stability...")
time.sleep(10)


🚀 Feature Group 'airline_delay_features' does NOT exist! Creating it now.
✅ Feature Group 'airline_delay_features' created successfully.
⏳ Waiting for Feature Group to become active...
⏳ Current Feature Group status: Creating
⏳ Current Feature Group status: Creating
⏳ Current Feature Group status: Creating
⏳ Current Feature Group status: Creating
⏳ Current Feature Group status: Created
✅ Feature Group is now fully ready!
⏳ Waiting an additional 10 seconds for stability...


# Record insertion

In [13]:
from datetime import datetime
import time
import boto3

# Initialize Feature Store Runtime client
featurestore_runtime = boto3.client("sagemaker-featurestore-runtime")

# ✅ Convert event_time to UNIX timestamp (float64)
dev_data_df["event_time"] = time.time()

# ✅ Select one row and convert to Feature Store format
single_record = dev_data_df.iloc[0].to_dict()

# ✅ Ensure event_time and record_id are included
record = {
    "FeatureGroupName": feature_group_name,
    "Record": [
        {"FeatureName": key, "ValueAsString": str(value)} for key, value in single_record.items()
    ]
}

# ✅ Insert single record into Feature Store
featurestore_runtime.put_record(**record)

print("✅ Successfully inserted one record into Feature Store!")


✅ Successfully inserted one record into Feature Store!


In [16]:
import boto3
import time

# Initialize SageMaker client
sagemaker_client = boto3.client("sagemaker")

# ✅ Describe Feature Group to Get the Table Name
def get_feature_store_table_name(feature_group_name):
    print("⏳ Waiting for the Feature Group to be available in Glue...")
    
    # Wait for Feature Group to be created
    while True:
        response = sagemaker_client.describe_feature_group(FeatureGroupName=feature_group_name)
        status = response["FeatureGroupStatus"]
        if status == "Created":
            print("✅ Feature Group is now active!")
            break
        print(f"⏳ Current status: {status}, retrying in 5 seconds...")
        time.sleep(5)
    
    # Retrieve Glue Table Name
    table_name = response["OfflineStoreConfig"]["DataCatalogConfig"]["TableName"]
    print(f"✅ Feature Store table registered in Glue: {table_name}")
    
    return table_name

# Call this function **after creating the Feature Store**
correct_feature_store_table = get_feature_store_table_name(feature_group_name)


⏳ Waiting for the Feature Group to be available in Glue...
✅ Feature Group is now active!
✅ Feature Store table registered in Glue: airline_delay_features_1739141540


In [17]:
# Query to retrieve record from offline store (Athena)
query = f"""
SELECT * FROM "sagemaker_featurestore"."{correct_feature_store_table}"
WHERE record_id = '{single_record["record_id"]}'
LIMIT 1;
"""

# Execute the query using Pandas
offline_record_df = pd.read_sql(query, conn)

# ✅ Check if record exists
if not offline_record_df.empty:
    print("✅ Record Found in Feature Store (Offline Store via Athena):")
    print(offline_record_df)
else:
    print("❌ Record not found in Offline Store!")


  offline_record_df = pd.read_sql(query, conn)


✅ Record Found in Feature Store (Offline Store via Athena):
     event_time  year  month carrier airport  arr_flights  arr_del15  \
0  1.739142e+09  2004      1      DL     PBI          650        126   

   carrier_ct  weather_ct  nas_ct  ...  weather_delay  nas_delay  \
0          21           6      51  ...            397       2016   

   security_delay  late_aircraft_delay  delay_rate  on_time  record_id  \
0              15                 2116          19        0          0   

               write_time  api_invocation_time  is_deleted  
0 2025-02-09 22:59:43.943  2025-02-09 22:54:44       False  

[1 rows x 26 columns]


# Inserting All Records

In [24]:
# ✅ Bulk ingestion using the simpler `ingest()` method
feature_group.ingest(
    data_frame=dev_data_df[1:],  # Use the full dataset except first entry
    max_workers=5,  # Controls parallel processing
    wait=True  # Waits for the ingestion to complete
)

print("🚀 All records successfully ingested into Feature Store!")


🚀 All records successfully ingested into Feature Store!


In [28]:
# Run this to confirm all records are in feature store
query = f"""
SELECT COUNT(*) FROM "sagemaker_featurestore"."{correct_feature_store_table}";
"""

# ✅ Execute Athena query using Pandas
record_count_df = pd.read_sql(query, conn)

# ✅ Print the current number of records uploaded
print(f"🔍 Current records in Feature Store: {record_count_df.iloc[0, 0]}, Total Records: {len(dev_data_df)}")
print(f"Note: If numbers don't match, uploads are likely still completing. Wait for a minute before running this cell again.")

  record_count_df = pd.read_sql(query, conn)


🔍 Current records in Feature Store: 198830, Total Records: 198829
Note: If numbers don't match, uploads are likely still completing. Wait for a minute before running this cell again
