In [None]:
# 1: Summary of the imported libraries

import joblib
import pandas as pd
from google.cloud import aiplatform
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

# Purpose: The libraries imported above support data manipulation, machine learning, model persistence, and Google Cloud AI integration.
# What it does:
#    - 'pandas' provides data structures (like DataFrame) to manipulate, analyze, and preprocess large datasets.
#    - 'aiplatform' enables interaction with Google Cloud's Vertex AI for managing datasets, training, and deploying machine learning models.
#    - 'train_test_split' is used to split the dataset into training and test subsets for proper model evaluation.
#    - 'RandomForestClassifier' is a machine learning algorithm from scikit-learn that builds an ensemble of decision trees to classify data.
#    - 'accuracy_score', 'confusion_matrix', and 'classification_report' are evaluation metrics to assess the performance of the model, including accuracy, precision, recall, and F1-score.
#    - 'joblib' saves the trained model to disk and loads it back for later predictions.
# Outcome: These libraries together provide all necessary tools to build, evaluate, persist, and deploy machine learning models efficiently.


In [None]:
# 2: Actions for Google Cloud Vertex AI integration

# Connection settings, kept together so the placeholders are easy to replace.
PROJECT_ID = "<project-id>"
LOCATION = "us-central1"
FEATURESTORE_ID = "<Feature Store Name>"
ENTITY_TYPE_ID = "users"

aiplatform.init(project=PROJECT_ID, location=LOCATION)

featurestore = aiplatform.Featurestore(FEATURESTORE_ID)
entity_type = featurestore.get_entity_type(ENTITY_TYPE_ID)

# Purpose: The commands above configure the Vertex AI SDK, obtain a handle to a Feature Store, and look up one of its entity types.
# What it does:
#    - 'aiplatform.init' configures the SDK with the project ID and location (here 'us-central1') used by the calls that follow.
#    - 'aiplatform.Featurestore' builds a handle to the Feature Store identified by FEATURESTORE_ID; it refers to an existing store rather than creating a new one.
#    - 'get_entity_type' retrieves the 'users' entity type from that Feature Store; the next cell reads feature values through it.
# Outcome: 'featurestore' and 'entity_type' are available for the feature-retrieval steps later in the notebook.


In [None]:
# 3: Actions for fetching features from the Vertex AI Feature Store

# Which users to look up, and which of their features to pull.
user_ids = ["1", "2", "3", "4"]  # Example entity IDs (user IDs)
requested_features = [
    "avg_session_duration",
    "session_count",
    "activity_type_browsing",
    "activity_type_cart",
    "activity_type_purchase",
]

# Fetch features for a set of users
features = entity_type.read(entity_ids=user_ids, feature_ids=requested_features)

# Convert the features to a pandas DataFrame
df = pd.DataFrame(features)
print(df.head())

# Purpose: The commands above retrieve selected features for a set of users from the Vertex AI Feature Store and hold them in a DataFrame.
# What it does:
#    - 'entity_type.read' fetches feature data for the given entity IDs (user IDs); 'feature_ids' names the features to retrieve for each user.
#    - 'pd.DataFrame(features)' wraps the result in a pandas DataFrame named 'df', which the later cells use.
#    - 'print(df.head())' prints the first few rows as a quick check of what was fetched.
# Outcome: 'df' holds the fetched features in tabular form, ready for feature selection and model training.


In [None]:
# 4: Actions for selecting input features (X) and target labels (Y)

# Select input features (X) and target labels (Y)
x = df[["avg_session_duration", "session_count", "activity_type_browsing", "activity_type_cart", "activity_type_purchase"]]
y = df["activity_type_browsing"]  # We'll predict browsing activity. You can swap this for other target columns.

# Purpose: The following commands define the input features (X) and target labels (Y) for training a machine learning model.
# What it does:
#    - 'x' selects the columns representing the features (independent variables) from the DataFrame, which will be used to predict the target label.
#    - 'y' selects the target column (dependent variable), which in this case is the 'activity_type_browsing'. You can choose different columns for other prediction tasks.
# Outcome: This code prepares the features and target labels to be used for training a model, with X representing input data and Y the predicted outcome.


In [None]:
# 5: Actions for splitting the data into training and test sets

# Split the data into training and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Purpose: The following command splits the dataset into training and test sets for model evaluation.
# What it does:
#    - 'train_test_split' randomly splits the data into training and test subsets.
#    - 'test_size=0.2' specifies that 20% of the data will be used for testing, and the remaining 80% will be used for training.
#    - 'random_state=42' ensures reproducibility by fixing the random seed.
# Outcome: This code prepares the data for model training and evaluation by splitting it into training and test sets.


In [None]:
# 6: Actions for training the machine learning model

# Train the model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(x_train, y_train)

# Purpose: The following commands train a machine learning model using the RandomForestClassifier.
# What it does:
#    - 'RandomForestClassifier(n_estimators=100, random_state=42)' initializes a random forest classifier with 100 trees, and a fixed random seed for reproducibility.
#    - 'model.fit(x_train, y_train)' trains the model using the training data (X_train) and the target labels (Y_train).
# Outcome: The model is trained using the provided training data and is ready to make predictions or be evaluated.


In [None]:
# 7: Evaluate the model

y_pred = model.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)

print(f"Model Accuracy: {accuracy}")

In [None]:
# 8: Actions for generating detailed evaluation metrics

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

# Classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Purpose: The following commands generate and print evaluation metrics to assess the performance of the trained model.
# What it does:
#    - 'confusion_matrix(y_test, y_pred)' computes the confusion matrix, which shows the number of correct and incorrect predictions broken down by class.
#    - 'print(cm)' outputs the confusion matrix for visual inspection.
#    - 'classification_report(y_test, y_pred)' generates a report that includes precision, recall, F1-score, and support for each class.
# Outcome: These metrics provide detailed insights into the model’s performance, helping you evaluate its effectiveness and identify areas for improvement.


In [None]:
# 9: Actions for saving the trained model

import joblib

# Save the trained model
joblib.dump(model, 'user_activity_model.pkl')

# Purpose: The following commands save the trained machine learning model to a file for later use.
# What it does:
#    - 'joblib.dump(model, 'user_activity_model.pkl')' saves the trained model (RandomForestClassifier) as a `.pkl` file.
#    - The model is saved to the current working directory with the name 'user_activity_model.pkl'.
# Outcome: This code enables model persistence, allowing you to reload the model later without retraining it.


In [None]:
# 10: Actions for loading the trained model and making predictions

# Load the model
loaded_model = joblib.load('user_activity_model.pkl')

# Predict activity type for a new user
new_user_features = pd.DataFrame([[65.0, 2, 1, 0, 0]], columns=["avg_session_duration", "session_count", "activity_type_browsing", "activity_type_cart", "activity_type_purchase"])
prediction = loaded_model.predict(new_user_features)

print(f"Predicted Activity Type: {prediction}")

# Purpose: The following commands load a previously saved model and use it to predict the activity type for a new user.
# What it does:
#    - 'joblib.load('user_activity_model.pkl')' loads the saved model from the specified file ('user_activity_model.pkl').
#    - A new user’s feature data is created as a pandas DataFrame, with values representing their activity and session characteristics.
#    - 'loaded_model.predict(new_user_features)' uses the loaded model to predict the activity type for the new user based on the provided features.
# Outcome: This code enables the prediction of activity types for new users by utilizing the trained and saved model.
