In [2]:
import datetime
import os
from pathlib import Path
from typing import Dict, Optional, Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from dotenv import load_dotenv
from scipy.sparse import coo_matrix
from scipy.stats import zscore

# Read the variables defined in the .env file into the process environment.
load_dotenv()

# Base directory of the raw data, taken from the environment.
# os.getenv returns the fallback value instead of raising when the variable is unset.
base_data_dir = os.getenv('ECOMMERCE_RAW_DATA_DIR', 'default_path_if_not_set/data/ecommerce')

# Raw events file plus the pickle caches that load_dataset writes and reads.
RAW_DATA_PATH = Path(base_data_dir).joinpath('events.csv')
SEQUENCES_PICKLE = 'df_sequences.pkl'
PRODUCTS_PICKLE = 'product_df_encoded.pkl'

# Column subsets used by the preprocessing helpers.
EVENT_COLS = ['event_time', 'user_id', 'user_session', 'product_id', 'event_type']
PRODUCT_COLS = ['product_id', 'category_code', 'brand', 'price']
SESSION_COLS = ['event_time', 'user_id', 'user_session', 'product_id']
CATEGORY_ENCODE_COLS = [f"category_level_{level}" for level in range(1, 5)]
CATEGORICAL_FEATURES = ['brand', *CATEGORY_ENCODE_COLS]

# Reserved ids for the start-of-sequence and end-of-sequence tokens.
SOS_ID, EOS_ID = 0, 1

In [2]:

def create_sequences(session_df, sos_id=0, eos_id=1):
    """
    Build (input_1, input_2, target) training triples from per-session product views.

    Each session's product ids are wrapped in a start and an end token, and a
    sliding window of length 3 is moved over the result. Rows are taken in the
    order they appear inside each group, so the caller is responsible for
    sorting events chronologically beforehand.

    Args:
        session_df (pd.DataFrame): Frame with at least the columns
            'user_session' and 'product_id'.
        sos_id (int): Id used for the start-of-sequence token (default 0).
        eos_id (int): Id used for the end-of-sequence token (default 1).

    Returns:
        pd.DataFrame: One row per window with columns 'input_1', 'input_2'
        and 'target'. The columns are present even when no window could be
        built, so downstream column access does not fail on empty input.
    """
    columns = ['input_1', 'input_2', 'target']
    records = []

    # Groups are visited in sorted 'user_session' order (pandas groupby default).
    for _, group in session_df.groupby('user_session'):
        product_ids = group['product_id'].tolist()

        # An empty group (e.g. an unobserved category) would only yield
        # [SOS, EOS], which is too short for a window of three.
        if not product_ids:
            continue

        augmented_ids = [sos_id, *product_ids, eos_id]

        # zip over three staggered views == sliding window of size 3.
        records.extend(
            {'input_1': first, 'input_2': second, 'target': third}
            for first, second, third in zip(augmented_ids, augmented_ids[1:], augmented_ids[2:])
        )

    return pd.DataFrame(records, columns=columns)

In [3]:
# --- Helper functies
def _load_raw_data(path: Path) -> pd.DataFrame:
    """Laadt de ruwe dataset en filtert op 'view' events."""
    df = pd.read_csv(path)
    return df.loc[df.event_type == 'view', :].copy()

def _process_session_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Clean the session-related columns and turn them into training sequences.

    Steps: select SESSION_COLS, parse 'event_time', cast the id columns to
    categoricals, drop incomplete rows, derive an 'event_week' key
    (year + week-of-year, '%Y%U'), sort chronologically within each session
    and finally build the (input_1, input_2, target) triples.

    Args:
        df (pd.DataFrame): Raw events containing at least SESSION_COLS.

    Returns:
        pd.DataFrame: The output of create_sequences, i.e. the columns
        'input_1', 'input_2' and 'target' that the graph-preparation code
        reads. Previously the cleaned session frame itself was returned as a
        placeholder, which left those columns missing after a refresh.
    """
    session_df = df[SESSION_COLS].copy()
    session_df['event_time'] = pd.to_datetime(session_df['event_time'])
    session_df['user_id'] = session_df['user_id'].astype('category')
    session_df['user_session'] = session_df['user_session'].astype('category')

    session_df = (
        session_df
        .dropna()
        .assign(event_week=lambda frame: frame['event_time'].dt.strftime("%Y%U"))
        .sort_values(['event_week', 'user_session', 'event_time'])
        .reset_index(drop=True)
    )

    # NOTE(review): create_sequences regroups by 'user_session', so only the
    # within-session event order from the sort above carries over; the
    # week-level ordering of sessions is not necessarily preserved.
    return create_sequences(session_df)

def _process_product_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Build the one-hot encoded product feature table, indexed by product_id.

    Steps: keep one row per product, fill missing brand/category with
    'UNKNOWN', replace the price by its standardised log1p value, split
    'category_code' on '.' into category_level_N columns and one-hot encode
    the columns listed in CATEGORICAL_FEATURES (sparse output).

    Args:
        df (pd.DataFrame): Raw events containing at least PRODUCT_COLS.

    Returns:
        pd.DataFrame: Encoded product features with a unique product_id index.
    """
    # De-duplicate on product_id only. De-duplicating on all columns keeps
    # several rows for a product whose price or brand differs between events,
    # which yields a non-unique index and breaks the id -> row mapping built
    # from this frame. The first occurrence of each product is kept.
    product_df = (
        df[PRODUCT_COLS]
        .drop_duplicates(subset='product_id', keep='first')
        .set_index('product_id')
    )

    product_df['brand'] = product_df['brand'].fillna('UNKNOWN').astype('category')

    category_code = product_df['category_code'].astype('string').fillna('UNKNOWN')

    # Log-transform the price, then standardise it.
    # NOTE(review): zscore propagates NaN by default; a missing price would
    # turn the whole column into NaN. Confirm prices are always present.
    price_log = np.log1p(product_df['price'])
    product_df['price_log_std'] = zscore(price_log)
    product_df = product_df.drop(columns=['price', 'category_code'])

    # expand=True already yields one column per level of the deepest category.
    split_categories = category_code.str.split('.', expand=True)
    split_categories.columns = [
        f'category_level_{level + 1}' for level in range(split_categories.shape[1])
    ]
    product_df = pd.concat([product_df, split_categories], axis=1)

    for col in CATEGORY_ENCODE_COLS:
        if col in product_df.columns:
            product_df[col] = product_df[col].fillna('UNKNOWN').astype('category')
        else:
            # Add the level as a constant column so the encoded schema is stable.
            product_df[col] = 'UNKNOWN'
            product_df[col] = product_df[col].astype('category')

    # NOTE(review): levels deeper than those in CATEGORICAL_FEATURES are left
    # un-encoded here; confirm the data never has more levels than listed.
    encode_cols = [col for col in CATEGORICAL_FEATURES if col in product_df.columns]
    product_df_encoded = pd.get_dummies(
        product_df,
        columns=encode_cols,
        prefix=encode_cols,
        sparse=True
    )
    return product_df_encoded

# --- Hoofdfunctie
def load_dataset(path: Path = RAW_DATA_PATH, refresh: bool = False) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Load the sequences and product features from cache, or rebuild them.

    Args:
        path (Path): Path to the raw events CSV.
        refresh (bool): When True, always reprocess the raw data. When False,
            read the pickle caches and fall back to reprocessing only if a
            cache file is missing.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: (df_sequences, product_df_encoded).
    """
    if not refresh:
        print("Laden van dataset uit pickle bestanden...")
        try:
            # NOTE(review): unpickling can execute arbitrary code; only load
            # cache files that this notebook produced itself.
            df_sequences = pd.read_pickle(SEQUENCES_PICKLE)
            product_df_encoded = pd.read_pickle(PRODUCTS_PICKLE)
        except FileNotFoundError:
            # The rebuild below happens automatically, so the message says so
            # instead of asking the user to pass refresh=True.
            print("Pickle bestanden niet gevonden; de dataset wordt opnieuw opgebouwd uit de ruwe data.")
        else:
            print("Dataset succesvol geladen.")
            return df_sequences, product_df_encoded

    print("Vernieuwen van dataset...")
    df_raw = _load_raw_data(path)

    # Both helpers work on their own column subset of df_raw, so passing an
    # extra full-frame copy to each of them is unnecessary.
    df_sequences = _process_session_data(df_raw)
    product_df_encoded = _process_product_data(df_raw)

    # Persist the results for the next run.
    df_sequences.to_pickle(SEQUENCES_PICKLE)
    product_df_encoded.to_pickle(PRODUCTS_PICKLE)
    print("Dataset vernieuwd en opgeslagen.")

    return df_sequences, product_df_encoded

# Load the sequences and encoded product features with the default arguments
# (pickle caches are read; load_dataset rebuilds them when a file is missing).
df_sequences, product_df_encoded = load_dataset()

Laden van dataset uit pickle bestanden...
Dataset succesvol geladen.


In [None]:
# Display the encoded product feature frame for a quick visual check.
product_df_encoded

In [4]:
def train_test_split_commerce(df, validation_size=0.15, test_size=0.1):
    """
    Split a frame by row position into train, validation and test parts.

    The first rows become the training set, the following block the
    validation set and the last block the test set; rows are not shuffled.

    Args:
        df (pd.DataFrame): Frame to split.
        validation_size (float): Fraction of rows for the validation set.
        test_size (float): Fraction of rows for the test set.

    Returns:
        tuple: (train_set, validation_set, test_set), pairwise disjoint and
        together covering every row of df exactly once.

    Raises:
        ValueError: If a fraction is negative or the two fractions sum to more than 1.
    """
    if validation_size < 0 or test_size < 0 or validation_size + test_size > 1:
        raise ValueError(
            "validation_size and test_size must be non-negative and sum to at most 1, "
            f"got validation_size={validation_size}, test_size={test_size}"
        )

    num_rows = df.shape[0]
    test_index = int((1 - test_size) * num_rows)
    validation_index = int((1 - test_size - validation_size) * num_rows)

    # Positional, end-exclusive slicing. Label-based .loc slicing includes the
    # end label, which put each boundary row into two splits, and it does not
    # work at all on a non-integer index.
    train_set = df.iloc[:validation_index]
    validation_set = df.iloc[validation_index:test_index]
    test_set = df.iloc[test_index:]
    return (train_set, validation_set, test_set)

# Split df_sequences into train, validation and test parts using the default split fractions.
train_set, validation_set, test_set = train_test_split_commerce(df_sequences)

In [7]:
def create_id_to_index_mapping(
    product_df_encoded: pd.DataFrame, 
    sos_id: int, 
    eos_id: int
) -> Tuple[Dict[int, int], int]:
    """
    Create the mapping from product ids and special tokens to matrix indices.

    The start token maps to index 0, the end token to index 1 and the
    products, in index order of product_df_encoded, to 2, 3, ...

    Args:
        product_df_encoded (pd.DataFrame): Product features, indexed by product_id.
        sos_id (int): Id of the start-of-sequence token.
        eos_id (int): Id of the end-of-sequence token.

    Returns:
        Tuple[Dict[int, int], int]: The mapping (id_to_index) and the total
        number of nodes (n_nodes).

    Raises:
        ValueError: If the two token ids are equal, the product index contains
            duplicates, or a product id equals one of the token ids. In each
            case a dict entry would be silently overwritten and n_nodes would
            no longer match the number of mapped ids.
    """
    if sos_id == eos_id:
        raise ValueError(f"sos_id and eos_id must differ, both are {sos_id!r}")
    if not product_df_encoded.index.is_unique:
        raise ValueError("product_df_encoded must have a unique product_id index")

    unique_product_ids = product_df_encoded.index.tolist()
    clashing = {sos_id, eos_id}.intersection(unique_product_ids)
    if clashing:
        raise ValueError(f"Product ids collide with special token ids: {sorted(clashing)}")

    n_nodes = len(unique_product_ids) + 2  # +2 for the SOS and EOS tokens

    id_to_index = {sos_id: 0, eos_id: 1}
    # Real product ids take the indices from 2 upwards.
    id_to_index.update(
        (pid, position) for position, pid in enumerate(unique_product_ids, start=2)
    )

    print(f"Mapping gecreëerd: {len(unique_product_ids)} producten + 2 speciale tokens = {n_nodes} nodes.")
    return id_to_index, n_nodes

def create_node_feature_matrix(
    product_df_encoded: pd.DataFrame, 
    id_to_index: Dict[int, int], 
    n_nodes: int
) -> np.ndarray:
    """
    Assemble the node feature matrix X with one row per graph node.

    Args:
        product_df_encoded (pd.DataFrame): Product features, indexed by product_id.
        id_to_index (Dict[int, int]): Mapping from ids to matrix row indices.
        n_nodes (int): Total number of nodes in the graph.

    Returns:
        np.ndarray: Array of shape (n_nodes, number of feature columns). Rows
        0 and 1 stay zero; the remaining rows hold the product features.
    """
    n_features = product_df_encoded.shape[1]
    node_matrix = np.zeros((n_nodes, n_features))

    # Order the ids by their assigned row, then keep only those placed at
    # row 2 or later (rows 0 and 1 are not filled from the product frame).
    ids_by_row = sorted(id_to_index, key=id_to_index.get)
    product_ids = [pid for pid in ids_by_row if id_to_index[pid] >= 2]

    if product_df_encoded.shape[0] != len(product_ids):
        print("Waarschuwing: Aantal echte product-ID's in mapping komt niet overeen met product_df_encoded rijen.")

    # Selecting by id reorders the frame so row k lines up with index k + 2.
    node_matrix[2:] = product_df_encoded.loc[product_ids].to_numpy()

    print(f"Node feature matrix X shape: {node_matrix.shape}")
    return node_matrix

def create_adjacency_matrix(
    df_sequences: pd.DataFrame,
    id_to_index: Dict[int, int],
    n_nodes: int
) -> coo_matrix:
    """
    Creëert de gewogen adjacentie matrix (A) op basis van sessie-sequenties.

    De functie is geoptimaliseerd door pandas operaties te combineren en side-effects
    zoals 'print' te vervangen door logging.

    Args:
        df_sequences (pd.DataFrame): DataFrame met kolommen 'input_1' en 'input_2'.
        id_to_index (Dict[int, int]): Mapping van product-ID's naar matrix-indices.
        n_nodes (int): Totaal aantal nodes in de graaf.

    Returns:
        coo_matrix: De gewogen adjacentie matrix in COO-formaat.
    """
    # 1. Map, filter en cast de indices in één chained operatie.
    #    Dit is efficiënter en beter leesbaar dan losse stappen.
    edge_counts = (
        df_sequences[['input_1', 'input_2']]
        .assign(
            from_idx=lambda df: df['input_1'].map(id_to_index),
            to_idx=lambda df: df['input_2'].map(id_to_index)
        )
        .dropna()
        .astype({'from_idx': 'int32', 'to_idx': 'int32'})
        .groupby(['from_idx', 'to_idx'])
        .size()
        .reset_index(name='weight')
    )

    # 2. Transformeer de gewichten met log1p voor numerieke stabiliteit.
    log_weights = np.log1p(edge_counts['weight'].to_numpy())

    # 3. Creëer de sparse matrix direct vanuit de DataFrame kolommen.
    a_weighted = coo_matrix(
        (log_weights, (edge_counts['from_idx'].to_numpy(), edge_counts['to_idx'].to_numpy())),
        shape=(n_nodes, n_nodes)
    )
    
    return a_weighted

def prepare_graph_components(
    df_sequences: pd.DataFrame, 
    product_df_encoded: pd.DataFrame, 
    sos_id: int, 
    eos_id: int
) -> Tuple[np.ndarray, coo_matrix, np.ndarray, Dict[int, int], int]:
    """
    Prepare all graph components: node features (X), adjacency matrix (A) and labels.

    Args:
        df_sequences (pd.DataFrame): Session sequences with the columns
            'input_1', 'input_2' and 'target'.
        product_df_encoded (pd.DataFrame): Encoded product features, indexed by product_id.
        sos_id (int): Id of the start-of-sequence token.
        eos_id (int): Id of the end-of-sequence token.

    Returns:
        Tuple[np.ndarray, coo_matrix, np.ndarray, Dict[int, int], int]:
        (x_features, a_weighted, y_labels, id_to_index, n_nodes), where
        y_labels holds the matrix index of every row's target.

    Raises:
        ValueError: If a target id has no entry in the id mapping.
    """
    print("Start voorbereiding graafcomponenten...")

    id_to_index, n_nodes = create_id_to_index_mapping(product_df_encoded, sos_id, eos_id)
    x_features = create_node_feature_matrix(product_df_encoded, id_to_index, n_nodes)
    a_weighted = create_adjacency_matrix(df_sequences, id_to_index, n_nodes)

    # Unmapped targets come back from .map() as NaN. Casting NaN to int does
    # not fail in NumPy but produces a meaningless integer, so stop here
    # rather than hand silently corrupted labels to training.
    target_indices = df_sequences['target'].map(id_to_index)
    n_unmapped = int(target_indices.isna().sum())
    if n_unmapped:
        raise ValueError(
            f"{n_unmapped} target id(s) in df_sequences are missing from the id mapping"
        )
    y_labels = target_indices.to_numpy().astype(int)

    print("✅ Graafcomponenten succesvol gecreëerd.")
    print(f"Totaal nodes in graaf: {n_nodes}")
    return x_features, a_weighted, y_labels, id_to_index, n_nodes


# Build the graph inputs from the training split only.
x_features, a_weighted, y_labels, id_to_index, n_nodes = prepare_graph_components(
    df_sequences=train_set,
    product_df_encoded=product_df_encoded,
    sos_id=SOS_ID,
    eos_id=EOS_ID,
)

Start voorbereiding graafcomponenten...
Mapping gecreëerd: 53452 producten + 2 speciale tokens = 53454 nodes.


  x_features[2:] = product_df_encoded.loc[real_product_ids_in_order].to_numpy()


Node feature matrix X shape: (53454, 1139)
✅ Graafcomponenten succesvol gecreëerd.
Totaal nodes in graaf: 53454


In [12]:
import matplotlib.pyplot as plt
import pandas as pd
import os
import datetime
import tensorflow as tf

# Create the default output directories up front; existing ones are left untouched.
for output_dir in ("saved_models", "saved_plots", "logs/fit"):
    os.makedirs(output_dir, exist_ok=True)

def plot_training_curve(history_dict: dict, plot_save_path: str, run_name: str) -> None:
    """
    Draw the loss and accuracy curves side by side and write them to an image file.

    Args:
        history_dict (dict): Training history; the keys 'loss', 'accuracy',
            'val_loss' and 'val_accuracy' are plotted when present.
        plot_save_path (str): Full path of the image file to write.
        run_name (str): Run name shown in both panel titles.
    """
    print("Visualizing training curve...")

    fig, (loss_ax, accuracy_ax) = plt.subplots(1, 2, figsize=(12, 5))

    # Left panel: loss, right panel: accuracy. Both are drawn the same way.
    panels = (
        (loss_ax, 'loss', 'Loss'),
        (accuracy_ax, 'accuracy', 'Accuracy'),
    )
    for ax, metric, label in panels:
        ax.plot(history_dict.get(metric, []), label=f'Training {label}')
        validation_key = f'val_{metric}'
        if validation_key in history_dict:
            ax.plot(history_dict[validation_key], label=f'Validation {label}')
        ax.set_title(f'{run_name} - {label} over Epochs')
        ax.set_xlabel('Epochs')
        ax.set_ylabel(label)
        ax.legend()
        ax.grid(True)

    fig.tight_layout()  # keep the two panels from overlapping
    fig.savefig(plot_save_path)
    plt.close(fig)  # release the figure's memory
    print(f"Training curve saved to: {plot_save_path}")

def _last_history_value(history_dict: dict, key: str):
    """Return the final recorded value for `key`, or None if it is absent or empty."""
    values = history_dict.get(key)
    if values is None or len(values) == 0:
        return None
    return values[-1]


def save_training_results_to_csv(history_dict: dict, run_name: str, run_timestamp: str,
                                 batch_size: int, epochs: int, learning_rate: float,
                                 results_csv_path: str = 'training_results.csv') -> None:
    """
    Append one summary row for a training run to a CSV file and print the file.

    Metrics missing from history_dict (or with an empty history) are written
    as empty cells instead of raising, matching how plot_training_curve
    tolerates missing keys.

    Args:
        history_dict (dict): Training history, keyed by metric name.
        run_name (str): Name of the training run.
        run_timestamp (str): Timestamp of the training run.
        batch_size (int): Batch size used for training.
        epochs (int): Value recorded in the 'epochs' column.
        learning_rate (float): Learning rate used for the optimizer.
        results_csv_path (str): CSV file the row is appended to; it is created
            with a header row when it does not exist yet or is empty.
    """
    print(f"Logging results to {results_csv_path}...")
    results = {
        'run_name': run_name,
        'timestamp': run_timestamp,
        'batch_size': batch_size,
        'epochs': epochs,
        'learning_rate': learning_rate,
        'final_train_loss': _last_history_value(history_dict, 'loss'),
        'final_train_accuracy': _last_history_value(history_dict, 'accuracy'),
        'final_val_loss': _last_history_value(history_dict, 'val_loss'),
        'final_val_accuracy': _last_history_value(history_dict, 'val_accuracy'),
    }

    # A missing or zero-byte file still needs the header row; otherwise the
    # first data row would be read back as column names.
    needs_header = (
        not os.path.exists(results_csv_path)
        or os.path.getsize(results_csv_path) == 0
    )
    pd.DataFrame([results]).to_csv(
        results_csv_path, mode='a', header=needs_header, index=False
    )
    print(f"Results added to: {results_csv_path}")

    print("\nAll training results:")
    all_results_df = pd.read_csv(results_csv_path)
    print(all_results_df.to_string())

def create_run_name(model_name: str, arch_id: str, remark: str,
                    batch_size: int, epochs: int, learning_rate: float) -> str:
    """
    Compose the standard run name used for logs, model files and plots.

    Args:
        model_name (str): Name of the model (e.g. 'GCN', 'GAT').
        arch_id (str): Identifier of the architecture variant.
        remark (str): Free-form remark for the run.
        batch_size (int): Batch size, rendered as 'b<batch_size>'.
        epochs (int): Number of epochs, rendered as 'e<epochs>'.
        learning_rate (float): Learning rate, rendered as 'lr<value>' with six decimals.

    Returns:
        str: The parts above joined with underscores.
    """
    # Fixed six-decimal formatting keeps names comparable across runs.
    lr_text = format(learning_rate, ".6f")
    parts = (
        f"{model_name}",
        f"{arch_id}",
        f"{remark}",
        f"b{batch_size}",
        f"e{epochs}",
        f"lr{lr_text}",
    )
    return "_".join(parts)

def train_and_log_spektral_model(
    model: tf.keras.Model,
    model_name: str,
    arch_id: str,
    remark: str,
    train_X,
    train_y,
    val_X,
    val_y,
    batch_size: int = 32,
    epochs: int = 50,
    learning_rate: float = 0.001,
    base_log_dir: str = "logs/fit",
    base_model_save_dir: str = "saved_models",
    base_plot_save_dir: str = "saved_plots",
    results_csv_path: str = "training_results.csv"
) -> Optional[dict]:
    """
    Compile and train a Keras/Spektral model, then save the model, a plot and a CSV row.

    Args:
        model (tf.keras.Model): The model to train.
        model_name (str): Name of the model.
        arch_id (str): Architecture identifier.
        remark (str): Additional remark for the run.
        train_X: Training input data.
        train_y: Training target data.
        val_X: Validation input data.
        val_y: Validation target data.
        batch_size (int): Batch size for training.
        epochs (int): Maximum number of training epochs (early stopping may end sooner).
        learning_rate (float): Learning rate for the Adam optimizer.
        base_log_dir (str): Base directory for TensorBoard logs.
        base_model_save_dir (str): Directory the trained model is saved to.
        base_plot_save_dir (str): Directory the training plot is saved to.
        results_csv_path (str): CSV file that receives the run summary.

    Returns:
        Optional[dict]: The Keras history dictionary, or None when model.fit raised.
    """
    run_name = create_run_name(model_name, arch_id, remark, batch_size, epochs, learning_rate)
    run_timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

    # Create the output directories here as well, so that non-default
    # locations passed by the caller also exist before anything is written.
    for output_dir in (base_log_dir, base_model_save_dir, base_plot_save_dir):
        os.makedirs(output_dir, exist_ok=True)

    # Callbacks are built per run so every run gets its own TensorBoard directory.
    callbacks = [
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            # On a plateau the learning rate is multiplied by this factor,
            # i.e. cut to 1% of its value. NOTE(review): the earlier comment
            # mentions a previous factor of 0.33; confirm 0.01 is intended.
            factor=0.01,
            patience=5,
            min_lr=1e-7,
            verbose=1
        ),
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=10,
            restore_best_weights=True,
            verbose=1
        ),
        tf.keras.callbacks.TensorBoard(
            log_dir=os.path.join(base_log_dir, f"{run_name}_{run_timestamp}"),
            histogram_freq=1
        )
    ]

    print("Compiling model...")
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        # NOTE(review): 'categorical_crossentropy' expects one-hot targets;
        # verify train_y/val_y are encoded that way by the caller.
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )

    print(f"[{run_timestamp}] Starting training for run: {run_name}...")
    try:
        history = model.fit(
            x=train_X,
            y=train_y,
            validation_data=(val_X, val_y),
            epochs=epochs,
            batch_size=batch_size,
            verbose=1,
            callbacks=callbacks
        )
        print("Training completed.")
    except Exception as e:
        # Deliberate best-effort: report the failure and signal it with None
        # so a sweep over several configurations can continue.
        print(f"Error during training for run '{run_name}': {e}")
        return None

    history_dict = history.history

    # Save model
    final_model_save_path = os.path.join(base_model_save_dir, f"{run_name}_best.keras")
    model.save(final_model_save_path)
    print(f"Model saved to: {final_model_save_path}")

    # Plot training curves
    plot_save_path = os.path.join(base_plot_save_dir, f"{run_name}_training_curve.png")
    plot_training_curve(history_dict, plot_save_path, run_name)

    # Save results to CSV
    save_training_results_to_csv(history_dict, run_name, run_timestamp,
                                 batch_size, epochs, learning_rate, results_csv_path)

    print(f"[{datetime.datetime.now().strftime('%Y%m%d-%H%M%S')}] Finished training: {run_name}. "
          f"Model saved in {final_model_save_path}, graph saved in {plot_save_path}")

    return history_dict

In [None]:
N = None  # Number of nodes (may vary within a batch)

# NOTE(review): these two names were not defined anywhere in the visible
# notebook. They are derived here from the prepared graph components: one
# input feature per column of x_features, and one output class per node,
# because y_labels holds node indices. Confirm this matches the intended model.
node_feature_dim = x_features.shape[1]
num_classes = n_nodes
F = node_feature_dim  # Dimension of the node features

X_in = tf.keras.Input(shape=(F,), name='X_in')
# Spektral additionally needs an adjacency-matrix input:
# A_in = Input(shape=(N,), sparse=True, name='A_in')

# x = GCNConv(32, activation='relu')([X_in, A_in])
# x = GlobalAvgPool()(x) # For a graph-level output
# While the graph layers above are commented out, feed the raw node features
# straight into the classifier; `x` was otherwise undefined.
x = X_in
output = tf.keras.layers.Dense(num_classes, activation='softmax')(x)

# For a real Spektral model: model = Model(inputs=[X_in, A_in], outputs=output)
model = tf.keras.Model(inputs=X_in, outputs=output)  # Placeholder dense model

model.summary()