# Prediction of conversion within 7 days with neural network model

## Part 0: Data initial load

In [1]:
from google.cloud import bigquery
import pandas as pd

# Initialize BigQuery Client
client = bigquery.Client()

# Fetch one row per GA4 event for November 2020, flattening the nested
# ecommerce / device / geo / traffic_source records into plain columns.
# The table wildcard already restricts the scan to the 202011 shards; the
# event_date filter states the same window explicitly. November has 30 days,
# so the upper bound is 20201130 (the previous bound, 20201131, is not a
# calendar date).
query = """
SELECT
  event_date,
  event_timestamp,
  user_pseudo_id,
  event_name,

  ecommerce.total_item_quantity AS total_item_quantity,
  ecommerce.purchase_revenue_in_usd AS purchase_revenue_in_usd,
  ecommerce.purchase_revenue AS purchase_revenue,
  ecommerce.unique_items AS unique_items,
  ecommerce.transaction_id AS transaction_id,

  device.category AS device_category,
  device.mobile_brand_name AS device_brand,
  device.mobile_model_name AS device_model,
  device.operating_system AS operating_system,
  device.language AS device_language,

  geo.country AS country,
  geo.city AS city,

  -- Flatten nested 'traffic_source' fields
  traffic_source.medium AS traffic_medium,
  traffic_source.source AS traffic_source

FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_202011*`
WHERE event_date BETWEEN '20201101' AND '20201130'
"""
query_job = client.query(query)

# Load data into a pandas DataFrame
df = query_job.to_dataframe()

# event_date arrives as a 'YYYYMMDD' string; parse it with an explicit format
df['event_date'] = pd.to_datetime(df['event_date'], format='%Y%m%d')

## Part 1.1: Feature Engineering: Create derived features

In [2]:
def add_conversion_features(df):
    """Flag purchase events and events that follow a purchase within 7 days.

    Two boolean columns are added:

    * ``is_conversion`` - the event has a positive ``purchase_revenue``.
    * ``conversion_within_7_days`` - the same user's most recent conversion
      happened 0 to 7 days (inclusive) before this event's ``event_date``.

    Args:
        df: Event-level frame with ``user_pseudo_id``, ``event_date`` and
            ``purchase_revenue``; ``event_timestamp`` is used as a tie-breaker
            when present.

    Returns:
        A new frame sorted by user and time with a fresh RangeIndex. The
        caller's frame is left untouched.
    """
    # Order events chronologically per user. Without the timestamp tie-breaker
    # the order inside one day is whatever order the rows arrived in, so events
    # that happened *before* a purchase on the same day could inherit it.
    sort_keys = ['user_pseudo_id', 'event_date']
    if 'event_timestamp' in df.columns:
        sort_keys.append('event_timestamp')

    # sort_values returns a new frame, so the columns added below never leak
    # into the frame the caller passed in.
    df = df.sort_values(by=sort_keys).reset_index(drop=True)
    df['event_date'] = pd.to_datetime(df['event_date'])

    # Missing or non-numeric revenue counts as "no conversion".
    revenue = pd.to_numeric(df['purchase_revenue'], errors='coerce')
    df['is_conversion'] = revenue.gt(0).fillna(False).astype(bool)

    # Date of the latest conversion seen so far for each user (NaT before the
    # first one): keep the date on conversion rows, then forward-fill per user.
    last_conversion_date = (
        df['event_date']
        .where(df['is_conversion'])
        .groupby(df['user_pseudo_id'])
        .ffill()
    )

    # NaN (no earlier conversion) fails the range check and yields False.
    days_since_conversion = (df['event_date'] - last_conversion_date).dt.days
    df['conversion_within_7_days'] = days_since_conversion.between(0, 7)

    return df

# Rebind df: the function returns a re-sorted frame with the conversion flag columns added.
df = add_conversion_features(df)

# Part 1.2: Event-level features

In [3]:
def add_event_features(df):
    """Cast two columns to categorical and build one-hot event indicators.

    ``device_category`` and ``traffic_medium`` are converted to pandas
    ``category`` dtype on the frame that was passed in. The one-hot encoding
    of ``event_name`` (columns prefixed ``event_``) is returned as a separate
    frame and is not attached to ``df`` here.

    Returns:
        Tuple ``(df, event_name_dummies)``.
    """
    for column in ('device_category', 'traffic_medium'):
        df[column] = df[column].astype('category')

    # One indicator column per distinct event name
    one_hot_events = pd.get_dummies(df['event_name'], prefix='event')

    return df, one_hot_events

# Keep the one-hot event columns in their own frame; df itself only receives the two category casts.
df,  event_name_dummies = add_event_features(df)

# Part 1.3: Session based features

In [4]:
def add_session_features(df):
    """Derive sessions from event timestamps and summarise them.

    A new session starts whenever a user's consecutive events are more than
    30 minutes apart.

    Args:
        df: Event-level frame with ``user_pseudo_id``, ``event_timestamp``
            and ``event_name``. ``event_timestamp`` may be the raw GA4 integer
            (microseconds since the Unix epoch) or already a datetime.

    Returns:
        Tuple ``(df, session_features, user_session_features)``:

        * ``df`` - a copy sorted by user and timestamp, with
          ``event_timestamp`` as datetime and a ``session_id`` column.
        * ``session_features`` - one row per (user, session) with
          ``session_duration`` in seconds and ``session_event_count``.
        * ``user_session_features`` - one row per user with
          ``total_sessions``, ``average_session_duration``,
          ``total_session_events`` and ``average_session_events``.
    """
    # Sorting returns a new frame, so the conversion below does not touch the
    # caller's copy. Integer and datetime timestamps sort identically.
    df = df.sort_values(['user_pseudo_id', 'event_timestamp'])

    # The GA4 BigQuery export stores event_timestamp in MICROseconds; parsing
    # it as milliseconds puts every value ~1000x too far in the future.
    if not pd.api.types.is_datetime64_any_dtype(df['event_timestamp']):
        df['event_timestamp'] = pd.to_datetime(df['event_timestamp'], unit='us')

    # A gap of more than 30 minutes to the user's previous event opens a new
    # session. The running count is global, so ids are only meaningful together
    # with user_pseudo_id (which is how they are grouped below).
    inactivity_gap = df.groupby('user_pseudo_id')['event_timestamp'].diff()
    df['session_id'] = inactivity_gap.gt(pd.Timedelta(minutes=30)).cumsum()

    # Per-session duration and event count. min/max run vectorised; a session
    # with a single event gets max - min == 0 seconds.
    per_session = (
        df.groupby(['user_pseudo_id', 'session_id'])
        .agg(
            session_start=('event_timestamp', 'min'),
            session_end=('event_timestamp', 'max'),
            session_event_count=('event_name', 'count'),
        )
        .reset_index()
    )
    per_session['session_duration'] = (
        per_session['session_end'] - per_session['session_start']
    ).dt.total_seconds()
    session_features = per_session[
        ['user_pseudo_id', 'session_id', 'session_duration', 'session_event_count']
    ]

    # Aggregate session-level data into user-level features
    user_session_features = (
        session_features.groupby('user_pseudo_id')
        .agg(
            total_sessions=('session_id', 'count'),
            average_session_duration=('session_duration', 'mean'),
            total_session_events=('session_event_count', 'sum'),
            average_session_events=('session_event_count', 'mean'),
        )
        .reset_index()
    )

    return df, session_features, user_session_features

# Apply the add_session_features function and reassign df
# (the returned df is the time-sorted frame that carries session_id).
df, session_features, user_session_features = add_session_features(df)

# Sanity check: list the columns of the event-level and session-level frames.
print("Columns in df:")
print(df.columns.tolist())

print("Columns in session_features:")
print(session_features.columns.tolist())


Columns in df:
['event_date', 'event_timestamp', 'user_pseudo_id', 'event_name', 'total_item_quantity', 'purchase_revenue_in_usd', 'purchase_revenue', 'unique_items', 'transaction_id', 'device_category', 'device_brand', 'device_model', 'operating_system', 'device_language', 'country', 'city', 'traffic_medium', 'traffic_source', 'is_conversion', 'conversion_within_7_days', 'session_id']
Columns in session_features:
['user_pseudo_id', 'session_id', 'session_duration', 'session_event_count']


# Part 1-4: Aggregated features

In [5]:
def add_user_aggregates(df):
    """Compute per-user revenue and activity aggregates.

    Args:
        df: Event-level frame with ``user_pseudo_id``, ``purchase_revenue``,
            ``event_name`` and ``event_date``.

    Returns:
        Tuple ``(df, user_aggregates)``. ``df`` is returned unchanged;
        ``user_aggregates`` has one row per user with ``total_revenue``,
        ``average_revenue``, ``total_events``, ``unique_event_types`` and
        ``active_days``.
    """
    user_aggregates = df.groupby('user_pseudo_id').agg(
        total_revenue=('purchase_revenue', 'sum'),
        average_revenue=('purchase_revenue', 'mean'),
        total_events=('event_name', 'count'),
        unique_event_types=('event_name', 'nunique'),
        active_days=('event_date', 'nunique'),
    ).reset_index()

    return df, user_aggregates


# A later cell defines another function called add_behavioral_features, which
# replaces this name. The aggregate logic therefore lives under its own name;
# the alias keeps the call that directly follows this cell working.
add_behavioral_features = add_user_aggregates

# df comes back unchanged; the per-user aggregates are kept in user_aggregates.
df, user_aggregates = add_behavioral_features(df)

# Sanity check: list the columns of the event-level and session-level frames.
print("Columns in df:")
print(df.columns.tolist())

print("Columns in session_features:")
print(session_features.columns.tolist())


Columns in df:
['event_date', 'event_timestamp', 'user_pseudo_id', 'event_name', 'total_item_quantity', 'purchase_revenue_in_usd', 'purchase_revenue', 'unique_items', 'transaction_id', 'device_category', 'device_brand', 'device_model', 'operating_system', 'device_language', 'country', 'city', 'traffic_medium', 'traffic_source', 'is_conversion', 'conversion_within_7_days', 'session_id']
Columns in session_features:
['user_pseudo_id', 'session_id', 'session_duration', 'session_event_count']


# Part 1-5: Behavioral features

In [6]:
def add_behavioral_features(df, session_features):
    """Add cart-abandonment and bounce-rate features.

    * ``abandoned_cart`` (0/1) is written onto ``df``: 1 for ``add_to_cart``
      events of users that have no ``purchase`` event anywhere in ``df``.
    * ``bounce_rate`` is the share of a user's sessions that contain exactly
      one event; it is left-merged onto every event row of that user.

    Returns:
        Tuple ``(df, bounce_rate, session_features, session_features_copy)``
        where ``session_features`` is the untouched input and
        ``session_features_copy`` additionally carries ``is_bounce``.
    """
    # Users with at least one purchase event
    purchasers = df.loc[df['event_name'] == 'purchase', 'user_pseudo_id']
    is_cart_add = df['event_name'] == 'add_to_cart'
    never_purchased = ~df['user_pseudo_id'].isin(purchasers)
    df['abandoned_cart'] = (is_cart_add & never_purchased).astype(int)

    # assign() works on a copy, so the caller's session_features stays as is
    single_event_session = (session_features['session_event_count'] == 1).astype(int)
    session_features_copy = session_features.assign(is_bounce=single_event_session)

    # Mean of the 0/1 flag == fraction of bounced sessions per user
    bounce_rate = (
        session_features_copy.groupby('user_pseudo_id')['is_bounce']
        .mean()
        .rename('bounce_rate')
        .reset_index()
    )

    df = df.merge(bounce_rate, on='user_pseudo_id', how='left')

    return df, bounce_rate, session_features, session_features_copy

# df is rebound to the merged frame (it now carries abandoned_cart and bounce_rate).
df, bounce_rate, session_features, session_features_copy = add_behavioral_features(df, session_features)

# session_features itself should be unchanged; is_bounce only exists on the copy.
print("Columns in session_features:")
print(session_features.columns.tolist())


Columns in session_features:
['user_pseudo_id', 'session_id', 'session_duration', 'session_event_count']


# Part 1-6: Temporal features

In [7]:
def add_temporal_features(df):
    """Add day-based recency features to ``df`` (modified in place).

    * ``days_since_first_event`` - whole days between the event and the
      user's earliest ``event_date``.
    * ``days_between_events`` - whole days since the user's previous row
      (row order as given); 0 for each user's first row.

    Returns:
        The same frame that was passed in.
    """
    dates_by_user = df.groupby('user_pseudo_id')['event_date']

    first_seen = dates_by_user.transform('min')
    gap_to_previous = dates_by_user.diff()

    df['days_since_first_event'] = (df['event_date'] - first_seen).dt.days
    # The first row of every user has no predecessor -> NaN -> 0
    df['days_between_events'] = gap_to_previous.dt.days.fillna(0)

    return df

# Adds days_since_first_event and days_between_events to df.
df = add_temporal_features(df)

# Part 1-7: Feature integration

In [8]:
# Column inventory before the user-level aggregates are merged in.
print (df.columns.tolist())

['event_date', 'event_timestamp', 'user_pseudo_id', 'event_name', 'total_item_quantity', 'purchase_revenue_in_usd', 'purchase_revenue', 'unique_items', 'transaction_id', 'device_category', 'device_brand', 'device_model', 'operating_system', 'device_language', 'country', 'city', 'traffic_medium', 'traffic_source', 'is_conversion', 'conversion_within_7_days', 'session_id', 'abandoned_cart', 'bounce_rate', 'days_since_first_event', 'days_between_events']


In [9]:
def feature_integration(df, user_aggregates=None, user_session_features=None):
    """Left-merge user-level aggregates and session summaries onto the events.

    Args:
        df: Event-level frame with a ``user_pseudo_id`` column.
        user_aggregates: One row per user (revenue / activity aggregates).
            When omitted, the module-level ``user_aggregates`` is used, which
            is what the original one-argument call relied on.
        user_session_features: One row per user (session summaries). When
            omitted, the module-level ``user_session_features`` is used.

    Returns:
        A new frame with the columns of both user-level frames attached to
        every event row of the matching user.

    Raises:
        NameError: If an argument is omitted and no module-level variable of
            that name exists.
    """
    def _from_notebook_scope(name):
        # Fall back to the variable created by an earlier notebook cell.
        try:
            return globals()[name]
        except KeyError:
            raise NameError(
                f"{name} was not passed and is not defined at module level"
            ) from None

    # Passing the frames explicitly lets callers (e.g. prediction-time code)
    # merge aggregates computed from *their* data rather than whatever the
    # training cells left behind.
    if user_aggregates is None:
        user_aggregates = _from_notebook_scope('user_aggregates')
    if user_session_features is None:
        user_session_features = _from_notebook_scope('user_session_features')

    df = df.merge(user_aggregates, on='user_pseudo_id', how='left')
    df = df.merge(user_session_features, on='user_pseudo_id', how='left')
    return df


# Attach the user-level aggregate and session summary columns to every event row.
df = feature_integration(df)
print (df.columns.tolist())

['event_date', 'event_timestamp', 'user_pseudo_id', 'event_name', 'total_item_quantity', 'purchase_revenue_in_usd', 'purchase_revenue', 'unique_items', 'transaction_id', 'device_category', 'device_brand', 'device_model', 'operating_system', 'device_language', 'country', 'city', 'traffic_medium', 'traffic_source', 'is_conversion', 'conversion_within_7_days', 'session_id', 'abandoned_cart', 'bounce_rate', 'days_since_first_event', 'days_between_events', 'total_revenue', 'average_revenue', 'total_events', 'unique_event_types', 'active_days', 'total_sessions', 'average_session_duration', 'total_session_events', 'average_session_events']


# Part 1-8: Advanced aggregates

In [10]:
def advanced_aggregates(df):
    """Add the per-user conversion rate and the one-hot event columns.

    Args:
        df: Event-level frame with ``user_pseudo_id``, ``event_name``,
            ``is_conversion`` and ``total_events``.

    Returns:
        ``df`` with a ``conversion_rate`` column (written onto the frame that
        was passed in) plus one ``event_<name>`` indicator column per distinct
        event name.
    """
    # Conversion rate = number of conversions of the user / total events of
    # the user. Dividing the row's own is_conversion flag by total_events only
    # yields 0 or 1/total_events, which is not a rate.
    conversions_per_user = df.groupby('user_pseudo_id')['is_conversion'].transform('sum')
    df['conversion_rate'] = conversions_per_user / df['total_events']

    # Build the indicators from this frame's own event_name column. A dummy
    # frame created before the intermediate sort/merge steps carries a
    # different row order and index, so concatenating it would pair event
    # indicators with the wrong rows.
    event_indicators = pd.get_dummies(df['event_name'], prefix='event')
    df = pd.concat([df, event_indicators], axis=1)

    return df

# Rebind df: the function returns a new, wider frame (conversion_rate + event_* columns).
df = advanced_aggregates(df)

In [11]:
# Final column inventory of the engineered event-level frame.
print (df.columns.tolist())

['event_date', 'event_timestamp', 'user_pseudo_id', 'event_name', 'total_item_quantity', 'purchase_revenue_in_usd', 'purchase_revenue', 'unique_items', 'transaction_id', 'device_category', 'device_brand', 'device_model', 'operating_system', 'device_language', 'country', 'city', 'traffic_medium', 'traffic_source', 'is_conversion', 'conversion_within_7_days', 'session_id', 'abandoned_cart', 'bounce_rate', 'days_since_first_event', 'days_between_events', 'total_revenue', 'average_revenue', 'total_events', 'unique_event_types', 'active_days', 'total_sessions', 'average_session_duration', 'total_session_events', 'average_session_events', 'conversion_rate', 'event_add_payment_info', 'event_add_shipping_info', 'event_add_to_cart', 'event_begin_checkout', 'event_click', 'event_first_visit', 'event_page_view', 'event_purchase', 'event_scroll', 'event_select_item', 'event_select_promotion', 'event_session_start', 'event_user_engagement', 'event_view_item', 'event_view_item_list', 'event_view_pro

# Part 2: Data splitting

In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Step 1: Define Target Variable and Features
# Target variable: conversion_within_7_days
target = 'conversion_within_7_days'

# Feature columns: Exclude non-useful columns and target
# (identifiers, raw timestamps, the target itself and the raw revenue/basket columns).
# NOTE(review): is_conversion, total_revenue, conversion_rate and the event_purchase
# dummy remain in the feature set although the target is derived from the same
# purchase events — this looks like target leakage; confirm it is intended.
# NOTE(review): session_id and the raw event_name column also stay in; verify.
exclude_columns = [
    'event_date', 'event_timestamp', 'user_pseudo_id', 'transaction_id', 
    'conversion_within_7_days', 'purchase_revenue_in_usd', 'purchase_revenue', 
    'average_revenue', 'total_item_quantity', 'unique_items'
]
features = [col for col in df.columns if col not in exclude_columns]

# Separate features (X) and target (y)
X = df[features]
y = df[target]

# Step 2: Data Splitting
# Split the data into training, validation, and test sets (70% train, 15% validation, 15% test).
# stratify keeps the class distribution of the target the same in every split.
# NOTE(review): rows are single events, so events of one user can end up in several
# splits while sharing the same user-level aggregate values — confirm this is acceptable.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
# Halve the 30% hold-out into validation and test.
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

# Step 3: Data Preprocessing
# Define numerical and categorical columns, scaling for numerical columns and encoding for categorical ones
# NOTE(review): a column whose dtype matches neither selector (e.g. uint8 or int32) is
# passed to neither transformer; presumably ColumnTransformer then drops it — verify.
numerical_columns = X.select_dtypes(include=['int64', 'float64', 'Int64']).columns
categorical_columns = X.select_dtypes(include=['category', 'bool', 'object']).columns

# Preprocessor for numerical features: Impute missing values and scale
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # Replace missing values with the column mean
    ('scaler', StandardScaler())                 # Standardize to zero mean and unit variance
])

# Preprocessor for categorical features: Impute and one-hot encode
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Fill missing values with the most frequent category
    ('onehot', OneHotEncoder(handle_unknown='ignore'))     # Binary indicator columns; categories unseen during fit are ignored
])

# Combine preprocessors in a column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_columns),
        ('cat', categorical_transformer, categorical_columns)
    ]
)

# Step 4: Final Pipeline
# Wrap the preprocessor in a pipeline (it is the only step; no estimator is attached here)
modeling_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)  # Preprocessing step (scaling, encoding, etc.)
])

# Step 5: Fit Preprocessing Pipeline
# Fit the preprocessing pipeline only on training data
X_train_prepared = modeling_pipeline.fit_transform(X_train)

# Apply the already-fitted pipeline to validation and test data (transform only, no refit)
X_val_prepared = modeling_pipeline.transform(X_val)
X_test_prepared = modeling_pipeline.transform(X_test)

# Step 6: Inspect Prepared Data
# Check the shape of the transformed data
print(f"Training data shape: {X_train_prepared.shape}")
print(f"Validation data shape: {X_val_prepared.shape}")
print(f"Test data shape: {X_test_prepared.shape}")

# Save prepared data for modeling (one tuple holding all six train/val/test objects)
import joblib
joblib.dump((X_train_prepared, y_train, X_val_prepared, y_val, X_test_prepared, y_test), 'prepared_data.pkl')

Training data shape: (1030898, 856)
Validation data shape: (220907, 856)
Test data shape: (220907, 856)


['prepared_data.pkl']

# Part 3: Neural network
- This is a feed-forward neural network: data flows in one direction, from the input layer to the output layer.
- Why this architecture was chosen:
- Tabular data: Feed-forward neural networks are commonly used for structured datasets, especially after preprocessing and feature engineering.
- Binary classification: The sigmoid activation in the output layer ensures the model produces a probability for the positive class.
- Overfitting prevention: Dropout layers and early stopping help mitigate overfitting.

In [13]:
# 1. import libraries

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# 2. neural network architecture

# Build the Neural Network Model layer by layer
nn_model = Sequential()
# First layer: 64 ReLU units; its input width is the number of prepared feature columns
nn_model.add(Dense(64, activation='relu', input_shape=X_train_prepared.shape[1:]))
# Dropout for regularization: randomly deactivates 20% of the units during training
nn_model.add(Dropout(0.2))
# Hidden layer: 32 units; ReLU adds the non-linearity needed to learn complex patterns
nn_model.add(Dense(32, activation='relu'))
nn_model.add(Dropout(0.2))
# Output layer: a single sigmoid unit, i.e. a probability between 0 and 1 for binary classification
nn_model.add(Dense(1, activation='sigmoid'))

# Compile the Model
# - binary cross-entropy compares the predicted probability with the 0/1 label
# - Adam is an adaptive optimizer that copes well with many input features
# - accuracy is reported during training
nn_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Display the model architecture
nn_model.summary()

2025-01-20 12:32:12.228644: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-01-20 12:32:16.551354: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/nccl2/lib:/usr/local/cuda/extras/CUPTI/lib64:/usr/lib/x86_64-linux-gnu/:/opt/conda/lib
2025-01-20 12:32:16.551795: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 64)                54848     
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 32)                2080      
                                                                 
 dropout_1 (Dropout)         (None, 32)                0         
                                                                 
 dense_2 (Dense)             (None, 1)                 33        
                                                                 
Total params: 56,961
Trainable params: 56,961
Non-trainable params: 0
_________________________________________________________________


In [14]:
# X_train_prepared and X_val_prepared are sparse matrices (<class 'scipy.sparse._csr.csr_matrix'>).
# Convert them to dense NumPy arrays with .toarray() before they are handed to Keras for training.
X_train_prepared = X_train_prepared.toarray()

In [15]:
# Same sparse -> dense conversion for the validation features.
X_val_prepared = X_val_prepared.toarray()

In [16]:
# Convert y_train from a pandas.Series to a NumPy array
y_train = y_train.values

In [17]:
# Convert y_val from a pandas.Series to a NumPy array
y_val = y_val.values


In [18]:
# 3. train neural network

# Early stopping to prevent overfitting:
# watch the validation loss, give up after 5 epochs without improvement and
# roll the model back to the weights of the best epoch.
early_stopping = EarlyStopping(
    restore_best_weights=True,
    patience=5,
    monitor='val_loss',
)

In [None]:
# Train the Neural Network
# batch_size=1 would mean one weight update per row (about a million steps
# per epoch on this training set), and with only 5 epochs a patience of 5 can
# never trigger early stopping. Use the same mini-batch size and epoch budget
# as the tuner search below; early stopping then decides when to stop.
history = nn_model.fit(
    X_train_prepared, y_train,  # Training data
    validation_data=(X_val_prepared, y_val),  # Validation data
    epochs=50,  # Upper bound; early stopping usually ends training sooner
    batch_size=256,  # Number of samples per gradient update
    callbacks=[early_stopping],  # Early stopping callback
    verbose=1  # Print training progress
)

In [None]:
# 4. evaluate neural network

# Evaluate on Validation Data
y_pred_prob_nn = nn_model.predict(X_val_prepared).flatten()  # Predicted probabilities as a 1-D array
y_pred_nn = (y_pred_prob_nn > 0.5).astype(int)  # Hard 0/1 labels at a 0.5 threshold

# Classification Report (based on the thresholded labels)
print("Classification Report (Neural Network):")
print(classification_report(y_val, y_pred_nn))

# AUC-ROC (computed from the probabilities, so it does not depend on the threshold)
auc_roc_nn = roc_auc_score(y_val, y_pred_prob_nn)
print("AUC-ROC (Neural Network):", auc_roc_nn)

# Confusion Matrix (rows = actual class, columns = predicted class)
conf_matrix_nn = confusion_matrix(y_val, y_pred_nn)
plt.figure(figsize=(6, 5))
sns.heatmap(conf_matrix_nn, annot=True, fmt='d', cmap='Blues', xticklabels=['No Conversion', 'Conversion'], yticklabels=['No Conversion', 'Conversion'])
plt.title('Confusion Matrix: Neural Network')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Plot ROC Curve
from sklearn.metrics import roc_curve

fpr, tpr, thresholds = roc_curve(y_val, y_pred_prob_nn)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f"AUC-ROC = {auc_roc_nn:.2f}", color='blue')
plt.plot([0, 1], [0, 1], linestyle='--', color='gray')  # Reference line for random guess
plt.title('ROC Curve: Neural Network')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.show()

# 5. hyperparameter optimization for neural networks

import keras_tuner as kt

# Define the model-building function
def build_nn_model(hp):
    """Build and compile one candidate network for the tuner.

    The architecture mirrors the baseline model (Dense -> Dropout -> Dense ->
    Dropout -> sigmoid output). The tuner chooses:

    * ``units_1``: 32-128 in steps of 32
    * ``dropout_1``: 0.1-0.5 in steps of 0.1
    * ``units_2``: 16-64 in steps of 16
    * ``dropout_2``: 0.1-0.5 in steps of 0.1
    * ``optimizer``: ``adam`` or ``sgd``

    Args:
        hp: Hyperparameter object supplied by the tuner.

    Returns:
        The compiled model.
    """
    # Hyperparameters are requested in the same order as the layers are added.
    model = Sequential()
    model.add(Dense(
        hp.Int('units_1', min_value=32, max_value=128, step=32),
        activation='relu',
        input_shape=(X_train_prepared.shape[1],),
    ))
    model.add(Dropout(hp.Float('dropout_1', min_value=0.1, max_value=0.5, step=0.1)))
    model.add(Dense(
        hp.Int('units_2', min_value=16, max_value=64, step=16),
        activation='relu',
    ))
    model.add(Dropout(hp.Float('dropout_2', min_value=0.1, max_value=0.5, step=0.1)))
    model.add(Dense(1, activation='sigmoid'))

    chosen_optimizer = hp.Choice('optimizer', ['adam', 'sgd'])
    model.compile(
        optimizer=chosen_optimizer,
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return model

# Initialize Keras Tuner
# Random search over the space defined in build_nn_model, ranked by validation loss.
tuner = kt.RandomSearch(
    build_nn_model,
    objective='val_loss',
    max_trials=10,  # Number of different hyperparameter combinations to try
    executions_per_trial=1,  # Number of times to evaluate each combination
    directory='my_dir',
    project_name='nn_tuning'
)

# Search for the best hyperparameters
tuner.search(X_train_prepared, y_train, validation_data=(X_val_prepared, y_val), epochs=50, batch_size=256, callbacks=[early_stopping])

# Get the best hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
print(f"Best Hyperparameters: {best_hps.values}")

# Build and train the best model
# A fresh model is built from the winning hyperparameters and trained with the same settings as the search.
# NOTE(review): the evaluation and deployment cells further down use nn_model, not
# best_nn_model — confirm which model is meant to be evaluated and deployed.
best_nn_model = tuner.hypermodel.build(best_hps)
best_nn_model.fit(X_train_prepared, y_train, validation_data=(X_val_prepared, y_val), epochs=50, batch_size=256, callbacks=[early_stopping])

# Part 4: Neural network evaluation

In [None]:
# 1. Generate Predictions
# Train and validation features were converted to dense arrays before
# training, but the test features are still the sparse matrix produced by the
# preprocessing pipeline. Densify them the same way so the model sees the
# same input type (the check keeps the cell safe to re-run).
X_test_dense = X_test_prepared.toarray() if hasattr(X_test_prepared, 'toarray') else X_test_prepared

# Predict probabilities and binary labels
y_pred_prob_nn = nn_model.predict(X_test_dense).flatten()  # Probabilities
y_pred_nn = (y_pred_prob_nn > 0.5).astype(int)  # Binary predictions with a threshold of 0.5

# 2. Classification Report
from sklearn.metrics import classification_report
print("Classification Report (Neural Network):")
print(classification_report(y_test, y_pred_nn))

# 3. AUC-ROC Score
from sklearn.metrics import roc_auc_score
auc_roc_nn = roc_auc_score(y_test, y_pred_prob_nn)
print(f"AUC-ROC Score (Neural Network): {auc_roc_nn:.2f}")

# 4. Confusion Matrix (rows = actual class, columns = predicted class)
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

conf_matrix_nn = confusion_matrix(y_test, y_pred_nn)
plt.figure(figsize=(6, 5))
sns.heatmap(conf_matrix_nn, annot=True, fmt='d', cmap='Blues', xticklabels=['No Conversion', 'Conversion'], yticklabels=['No Conversion', 'Conversion'])
plt.title('Confusion Matrix: Neural Network')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# 5. ROC Curve
from sklearn.metrics import roc_curve
fpr_nn, tpr_nn, thresholds_nn = roc_curve(y_test, y_pred_prob_nn)

plt.figure(figsize=(8, 6))
plt.plot(fpr_nn, tpr_nn, label=f"AUC-ROC = {auc_roc_nn:.2f}", color='blue')
plt.plot([0, 1], [0, 1], linestyle='--', color='gray')
plt.title('ROC Curve: Neural Network')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.show()

# 6. Precision-Recall Curve
# The thresholds get their own name so the ROC thresholds above are not overwritten.
from sklearn.metrics import precision_recall_curve
precision_nn, recall_nn, pr_thresholds_nn = precision_recall_curve(y_test, y_pred_prob_nn)

plt.figure(figsize=(8, 6))
plt.plot(recall_nn, precision_nn, color='blue')
plt.title('Precision-Recall Curve: Neural Network')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.show()

# Part 5: Deployment preparation

In [None]:
import os
import joblib
from google.cloud import aiplatform
from tensorflow.keras.models import load_model

# gsutil is invoked through subprocess so that a failed upload raises
# instead of being silently ignored.
import subprocess

# Step 1: Save the model and preprocessing pipeline locally
print("Saving model and preprocessing pipeline...")
nn_model.save('neural_network_model.h5')  # Save the trained neural network model
joblib.dump(modeling_pipeline, 'preprocessing_pipeline.pkl')  # Save the preprocessing pipeline
print("Model and preprocessing pipeline saved successfully.")

# Step 2: Export the model to TensorFlow SavedModel format
print("Exporting model to TensorFlow SavedModel format...")
exported_model = load_model('neural_network_model.h5')
exported_model.save('saved_model_dir')  # Create SavedModel directory
print("Model exported successfully.")

# Step 3: Upload artifacts to Google Cloud Storage (GCS)
# check=True raises CalledProcessError when gsutil exits with a non-zero
# status, so the success message below is only printed after both copies
# really succeeded. Arguments are passed as a list; no shell is involved.
print("Uploading artifacts to GCS...")
model_gcs_path = 'gs://my-project-2025-447122-eu-notebooks/model/'
subprocess.run(["gsutil", "cp", "-r", "saved_model_dir", model_gcs_path], check=True)
subprocess.run(["gsutil", "cp", "preprocessing_pipeline.pkl", model_gcs_path], check=True)
print("Artifacts uploaded to GCS successfully.")

# Step 4: Initialize the Vertex AI environment
print("Initializing Vertex AI environment...")
aiplatform.init(
    project='my-project-2025-447122',  # Replace with your GCP project ID
    location='europe-west4'            # Replace with your GCP region
)

# Step 5: Upload the model to Vertex AI
# NOTE(review): the serving image is pulled from the us-docker registry while
# the model lives in europe-west4 — confirm whether a regional image is wanted.
print("Uploading model to Vertex AI...")
model = aiplatform.Model.upload(
    display_name='neural-network-model',  # Display name for the model in Vertex AI
    artifact_uri=f"{model_gcs_path}saved_model_dir",  # Path to the SavedModel in GCS
    serving_container_image_uri='us-docker.pkg.dev/vertex-ai/prediction/tf2-cpu.2-8:latest',  # TensorFlow serving container
)
print(f"Model uploaded successfully: {model.resource_name}")

# Step 6: Deploy the model to an endpoint
print("Deploying model to an endpoint...")
endpoint = model.deploy(
    deployed_model_display_name='nn_model_endpoint',  # Name for the deployed model
    machine_type='n1-standard-4',                     # Machine type for hosting
)
print(f"Model deployed successfully: {endpoint.resource_name}")

print("Deployment preparation completed.")


# Part 6: Send prediction request

In [None]:
import pandas as pd
import joblib
from google.cloud import aiplatform

# Initialize Vertex AI environment
aiplatform.init(project="my-project-2025-447122", location="europe-west4")

# Define endpoint and pipeline path
# The endpoint is addressed by its full resource name; the id is hard-coded here
# (presumably the endpoint created by the deployment cell — verify before running).
endpoint = aiplatform.Endpoint("projects/190053636941/locations/europe-west4/endpoints/4825398093518209024")
pipeline_path = "preprocessing_pipeline.pkl"  # Path to the preprocessing pipeline

# Step 1: Fetch and Prepare Data

def normalize_event_date(date):
    """Return *date* as the 'YYYYMMDD' string used by the GA4 ``event_date`` column.

    Accepts 'YYYYMMDD', 'YYYY-MM-DD' or any object with ``strftime``
    (``datetime.date``, ``datetime.datetime``, ``pandas.Timestamp``).

    Raises:
        ValueError: If the value is not a valid calendar date in one of the
            accepted formats.
    """
    from datetime import datetime

    if hasattr(date, 'strftime'):
        return date.strftime('%Y%m%d')

    text = str(date).strip()
    for fmt in ('%Y%m%d', '%Y-%m-%d'):
        try:
            return datetime.strptime(text, fmt).strftime('%Y%m%d')
        except ValueError:
            continue
    raise ValueError(f"date must be 'YYYYMMDD' or 'YYYY-MM-DD', got {date!r}")


def fetch_and_prepare_data(date):
    """Load all GA4 events of one day from BigQuery and engineer features.

    Args:
        date: Day to load; 'YYYY-MM-DD', 'YYYYMMDD' or a date-like object.

    Returns:
        The event-level frame returned by ``feature_engineering``.

    Raises:
        ValueError: If ``date`` cannot be interpreted as a calendar date.
    """
    from google.cloud import bigquery

    # event_date is stored as 'YYYYMMDD'; a dashed date would match no rows.
    event_date = normalize_event_date(date)

    client = bigquery.Client()

    # The date is bound as a query parameter instead of being formatted into
    # the SQL text. Filtering on _TABLE_SUFFIX limits the scan to that day's
    # shard rather than every events_* table.
    query = """
    SELECT
      event_date,
      event_timestamp,
      user_pseudo_id,
      event_name,
      ecommerce.total_item_quantity AS total_item_quantity,
      ecommerce.purchase_revenue_in_usd AS purchase_revenue_in_usd,
      ecommerce.purchase_revenue AS purchase_revenue,
      ecommerce.unique_items AS unique_items,
      ecommerce.transaction_id AS transaction_id,
      device.category AS device_category,
      device.mobile_brand_name AS device_brand,
      device.mobile_model_name AS device_model,
      device.operating_system AS operating_system,
      device.language AS device_language,
      geo.country AS country,
      geo.city AS city,
      traffic_source.medium AS traffic_medium,
      traffic_source.source AS traffic_source
    FROM `bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
    WHERE _TABLE_SUFFIX = @event_date
      AND event_date = @event_date
    """
    job_config = bigquery.QueryJobConfig(
        query_parameters=[
            bigquery.ScalarQueryParameter("event_date", "STRING", event_date),
        ]
    )

    query_job = client.query(query, job_config=job_config)
    df = query_job.to_dataframe()

    # Same parsing as in the training data load, so the date arithmetic in the
    # feature functions works on real datetimes.
    df['event_date'] = pd.to_datetime(df['event_date'], format='%Y%m%d')

    # Apply feature engineering
    df = feature_engineering(df)
    return df

# Reuse your feature engineering functions
def feature_engineering(df):
    """Run the full feature-engineering chain on a freshly loaded event frame.

    The steps are the same functions that were used for the training data,
    applied in the same order.

    Side effect: the module-level names ``event_name_dummies``,
    ``user_session_features`` and ``user_aggregates`` are rebound to the
    values computed from ``df``. ``feature_integration`` and
    ``advanced_aggregates`` take only ``df`` and read these names from module
    scope; without rebinding them, the aggregates of the training data would
    be merged into the frame being prepared here.

    Args:
        df: Raw event-level frame as returned by the BigQuery load.

    Returns:
        The event-level frame with all engineered feature columns.
    """
    global event_name_dummies, user_session_features, user_aggregates

    df = add_conversion_features(df)  # Returns only df
    df, event_name_dummies = add_event_features(df)
    df, session_features, user_session_features = add_session_features(df)

    # Per-user revenue/activity aggregates for *this* frame. They are computed
    # inline because the one-argument aggregate function shares its name with
    # the two-argument add_behavioral_features defined after it and is
    # therefore no longer reachable under that name.
    user_aggregates = df.groupby('user_pseudo_id').agg(
        total_revenue=('purchase_revenue', 'sum'),
        average_revenue=('purchase_revenue', 'mean'),
        total_events=('event_name', 'count'),
        unique_event_types=('event_name', 'nunique'),
        active_days=('event_date', 'nunique'),
    ).reset_index()

    # Only the merged frame is needed from the four return values.
    df, _bounce_rate, session_features, _session_features_copy = add_behavioral_features(df, session_features)

    df = add_temporal_features(df)  # Returns only df
    df = feature_integration(df)  # Picks up the aggregates rebound above
    df = advanced_aggregates(df)  # Returns only df
    return df


# Step 2: Preprocess the Data

def preprocess_data(df, pipeline_path):
    """Transform ``df`` with the preprocessing pipeline stored on disk.

    Args:
        df: Engineered feature frame.
        pipeline_path: Path of the joblib file holding the fitted pipeline.

    Returns:
        Whatever the pipeline's ``transform`` returns for ``df``.
    """
    print("Loading preprocessing pipeline...")
    fitted_pipeline = joblib.load(pipeline_path)
    print("Preprocessing pipeline loaded successfully.")

    # transform only: the pipeline is used exactly as it was saved
    print("Transforming data...")
    transformed = fitted_pipeline.transform(df)
    print("Data transformation completed.")
    return transformed

# Step 3: Send Prediction Request

def send_prediction_request(prepared_data, endpoint, batch_size=100):
    """Send the prepared rows to the Vertex AI endpoint and collect predictions.

    Args:
        prepared_data: 2-D feature matrix; a SciPy sparse matrix, a NumPy
            array or a list of rows.
        endpoint: Object with a ``predict(instances=...)`` method whose result
            exposes ``.predictions``.
        batch_size: Maximum number of rows per request. Rows are sent in
            chunks so that a large frame does not end up in a single
            oversized request.

    Returns:
        List with the predictions of all rows, in input order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    print("Sending data to the endpoint for prediction...")

    # Sparse matrices do not support len(); use the row count from .shape.
    if hasattr(prepared_data, "shape"):
        n_rows = prepared_data.shape[0]
    else:
        n_rows = len(prepared_data)

    predictions = []
    for start in range(0, n_rows, batch_size):
        chunk = prepared_data[start:start + batch_size]
        # The preprocessing pipeline returns a sparse matrix, which has no
        # .tolist(); densify one chunk at a time to keep memory bounded.
        if hasattr(chunk, "toarray"):
            chunk = chunk.toarray()
        if hasattr(chunk, "tolist"):
            chunk = chunk.tolist()  # Convert to a JSON-ready format
        response = endpoint.predict(instances=chunk)
        predictions.extend(response.predictions)

    print("Prediction completed.")
    return predictions

# Main Function

def main():
    """Run the prediction flow for one day: load and engineer, preprocess, call the endpoint, print."""
    # Define the prediction date
    prediction_date = "2021-01-31"

    # fetch_and_prepare_data already applies feature engineering, so the frame
    # reported as "raw data" below is the engineered one.
    print(f"Fetching and preparing data for {prediction_date}...")
    raw_data = fetch_and_prepare_data(prediction_date)
    print(f"Raw data loaded with shape: {raw_data.shape}")

    # Apply the saved preprocessing pipeline (module-level pipeline_path)
    print("Preprocessing data...")
    prepared_data = preprocess_data(raw_data, pipeline_path)
    print(f"Prepared data shape: {prepared_data.shape}")

    # Score the prepared rows against the module-level endpoint
    print("Sending prediction request...")
    predictions = send_prediction_request(prepared_data, endpoint)

    print("Predictions:")
    print(predictions)

# Run the main function
if __name__ == "__main__":
    main()