In [None]:
# 1: Install required libraries

# Purpose: Install necessary Python libraries to interact with Google Cloud, process data, and develop machine learning models.
# Libraries:
# - google-cloud-aiplatform: For interacting with Google Cloud Vertex AI services.
# - pandas: For data manipulation and analysis.
# - scikit-learn: For machine learning model development and evaluation.

!pip install google-cloud-aiplatform pandas scikit-learn


In [None]:
# 2: Initialize Vertex AI SDK

# Purpose: Point the Vertex AI SDK at a project and region before any other SDK call.
# What it does: Calls `aiplatform.init` with a project ID and a location; replace the
#               placeholder project ID with your own.
# Outcome: Later `aiplatform` calls in this notebook use these defaults.

from google.cloud import aiplatform

aiplatform.init(
    project="your-project-id",
    location="us-central1",
)


In [None]:
# 3: Load dataset from Google Cloud Storage (GCS)

# Purpose: Bring the user-activity CSV into a Pandas DataFrame for inspection.
# What it does: Reads the CSV at `dataset_path` (replace the bucket placeholder) and
#               prints the first rows.
# Outcome: `df` holds the raw dataset; the printed head shows its columns.

import pandas as pd

dataset_path = "gs://<your-bucket-name>/user_activity.csv"
df = pd.read_csv(filepath_or_buffer=dataset_path)

print(df.head(n=5))



In [None]:
# 4: Create a new feature (average session duration)

# Purpose: Add 'avg_session_duration', the time spent per session for each row.
# What it does: Divides 'time_spent' by 'session_count'. Rows whose 'session_count' is 0
#               get NaN instead of inf, because a plain division by zero would produce
#               infinite values that would then flow into the feature store.
# Outcome: `df` gains an 'avg_session_duration' column (NaN where it is undefined).

df["avg_session_duration"] = df["time_spent"] / df["session_count"].where(
    df["session_count"] != 0  # zero denominators become NaN
)


In [None]:
# 5: Apply one-hot encoding on the 'activity_type' column
# Purpose: Turn the categorical 'activity_type' column into one indicator column per value.
# What it does: `pd.get_dummies` replaces 'activity_type' with 'activity_type_<value>' columns.
#               `dtype=int` forces integer 0/1 indicators; recent pandas versions default to
#               bool, which would not match the INT64 features declared later in this notebook.
# Outcome: `df` no longer has 'activity_type'; it has integer indicator columns instead.

df = pd.get_dummies(df, columns=["activity_type"], dtype=int)



In [None]:
# 6: Print the first rows of the DataFrame after one-hot encoding

print(df.head(n=5))

In [None]:
# 7: Rename 'user_id' column to 'entity_id' for Vertex AI Feature Store
# Purpose: Use 'entity_id' as the identifier column name; the ingestion cell later passes
#          this name as `entity_id_field`.
# What it does: Renames the 'user_id' column to 'entity_id'.
# Outcome: `df` has an 'entity_id' column in place of 'user_id'.

df = df.rename({"user_id": "entity_id"}, axis="columns")


In [None]:
# 8: Convert 'entity_id' to string data type
# Purpose: Store entity identifiers as strings (the read-back cell at the end looks
#          entities up by string IDs).
# What it does: Casts the 'entity_id' column to `str`.
# Outcome: 'entity_id' holds string values.

df = df.assign(entity_id=df["entity_id"].astype(str))


In [None]:
# 9: Convert timestamp to datetime format
# Purpose: Make sure 'timestamp' is a real datetime column rather than raw text.
# What it does: Parses 'timestamp' with `pd.to_datetime`; `errors="coerce"` turns values
#               that cannot be parsed into NaT instead of raising.
# Outcome: 'timestamp' holds parsed datetimes, with NaT marking unparseable entries.

df = df.assign(timestamp=pd.to_datetime(df["timestamp"], errors="coerce"))


In [None]:
# 10: Validate data types to ensure proper format
# Purpose: Eyeball the column dtypes before going further.
# What it does: Prints the dtype of every column in `df`.
# Outcome: Any unexpected dtype (e.g. for 'timestamp' or 'entity_id') is visible here.

print(df.dtypes)


In [None]:
# 11: Drop rows with missing timestamps
# Purpose: Remove rows that have no usable timestamp.
# What it does: Keeps only the rows whose 'timestamp' is not missing (NaT/NaN).
# Outcome: Every remaining row in `df` has a timestamp value.

df = df[df["timestamp"].notna()]


In [None]:
# 12: Create Vertex AI Feature Store with online serving capabilities
# Purpose: Create the feature store that will hold the engineered features.
# What it does: Calls `aiplatform.Featurestore.create` with the ID 'user_features_store'
#               in 'us-central1' and a fixed online-store node count of 1.
# Outcome: `featurestore` refers to the newly created feature store.

featurestore = aiplatform.Featurestore.create(
    featurestore_id="user_features_store",
    online_store_fixed_node_count=1,  # one fixed node for the online store
    location="us-central1",
)


In [None]:
# 13: Create 'users' entity type in the feature store
# Purpose: Give user-level features a home inside the feature store.
# What it does: Creates an entity type with ID 'users' on `featurestore`.
# Outcome: `entity_type` refers to the new 'users' entity type.

entity_type = featurestore.create_entity_type(
    description="Features related to user activity",
    entity_type_id="users",
)


In [None]:
# 14: Add features
# Purpose: Register every engineered feature on the 'users' entity type.
# What it does: Walks a table of (feature_id, value_type, description) rows and calls
#               `entity_type.create_feature` once per row, in the listed order.
# Outcome: Six features exist on `entity_type`: one DOUBLE and five INT64.

FEATURE_SPECS = [
    ("avg_session_duration", "DOUBLE", "Average session duration per user"),
    ("session_count", "INT64", "Number of sessions per user"),
    ("activity_type_browsing", "INT64", "Indicator for browsing activity"),
    ("activity_type_cart", "INT64", "Indicator for cart activity"),
    ("activity_type_purchase", "INT64", "Indicator for purchase activity"),
    ("activity_type_wishlist", "INT64", "Indicator for wishlist activity"),
]

for feature_id, value_type, description in FEATURE_SPECS:
    entity_type.create_feature(
        feature_id=feature_id,
        value_type=value_type,
        description=description,
    )


In [None]:
# 15: Convert timestamp column to ISO 8601 format for Vertex AI compatibility
# Purpose: Render 'timestamp' as ISO 8601 text with a trailing 'Z' (UTC designator).
# What it does: Parses 'timestamp' as UTC, then formats it as a string.
# Outcome: 'timestamp' holds strings such as '2024-01-01T12:00:00.000000Z'; rows whose
#          timestamp could not be parsed hold a missing value instead.

import numpy as np

# Parse with utc=True so the literal 'Z' appended below is truthful: timezone-naive values
# are treated as UTC, and timezone-aware values are converted to UTC before formatting.
df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce", utc=True)

# Convert to an ISO 8601 string with microseconds and a 'Z' suffix
df["timestamp"] = df["timestamp"].dt.strftime("%Y-%m-%dT%H:%M:%S.%fZ")

# Show a sample of the formatted values, then the dtypes of all columns
print(df["timestamp"].head())

print(df.dtypes)




In [None]:
# 16: Check and drop rows with invalid timestamps

# Collect the rows whose 'timestamp' is missing and report how many there are
timestamp_missing = df["timestamp"].isna()
invalid_timestamps = df[timestamp_missing]
print(f"Invalid rows: {len(invalid_timestamps)}")

# Keep only the rows that do have a timestamp
df = df[~timestamp_missing]


In [None]:
# 17: Ensure 'timestamp' column is in native datetime format
# What it does: Parses 'timestamp' back into a pandas datetime column. `utc=True` makes the
#               result timezone-aware UTC regardless of how the values are currently stored
#               ('...Z' strings already parse as UTC, so this only makes that explicit).

df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce", utc=True)

# Validate the column type
print(df["timestamp"].dtypes)  # Expect a timezone-aware dtype, e.g. 'datetime64[ns, UTC]'



In [None]:
# 18: Remove rows with missing or invalid timestamp values (if any)

# Report how many 'timestamp' values are missing
timestamp_is_missing = df["timestamp"].isna()
print(f"Invalid timestamps: {timestamp_is_missing.sum()}")

# Keep only the rows that have a timestamp
df = df[~timestamp_is_missing]


In [None]:
# 19: Ingest (upload) the feature-engineered data to the Feature Store
# What it does: Sends the listed feature columns of `df` to the 'users' entity type,
#               using 'timestamp' as the feature time and 'entity_id' as the entity key.

entity_type.ingest_from_df(
    df_source=df,                 # the processed DataFrame
    entity_id_field="entity_id",  # column holding the entity ID
    feature_time="timestamp",     # column holding the feature time
    feature_ids=[
        "avg_session_duration",
        "session_count",
        "activity_type_browsing",
        "activity_type_cart",
        "activity_type_purchase",
        "activity_type_wishlist",
    ],
)


In [None]:
# 20: Query the Feature Store to confirm the ingestion was successful
# What it does: Reads the six features for two sample entity IDs and prints the result.

features = entity_type.read(
    feature_ids=[
        "avg_session_duration",
        "session_count",
        "activity_type_browsing",
        "activity_type_cart",
        "activity_type_purchase",
        "activity_type_wishlist",
    ],
    entity_ids=["1", "2"],  # replace with entity IDs that exist in your data
)
print(features)
