In [1]:
import polars as pl
import numpy as np
import pandas as pd
import random
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model

# Set seed for reproducibility
np.random.seed(42)
random.seed(42)

# Generate synthetic user_data
customer_ids = [f'C{i}' for i in range(1, 1001)]
age_groups = ['20-30', '30-40', '40-50', '50-60', '60-70']
segments = ['Standard', 'Premium']
incomes = np.random.randint(30000, 100000, size=1000)
aums = np.random.randint(50000, 150000, size=1000)

user_data = pl.DataFrame({
    'customerid': customer_ids,
    'age_group': np.random.choice(age_groups, size=1000),
    'segment': np.random.choice(segments, size=1000),
    'income': incomes,
    'AUM': aums
})

# Generate synthetic interaction_data
product_codes = [f'P{i}' for i in range(1, 21)]
interaction_data = []
num_interactions = 5000

for _ in range(num_interactions):
    customer_id = random.choice(customer_ids)
    product_code = random.choice(product_codes)
    interaction_data.append([customer_id, product_code])

interaction_data_df = pl.DataFrame(interaction_data, schema=['customerid', 'product_code'])

# Generate synthetic product_data
product_names = [f'Product {i}' for i in range(1, 21)]
product_types = ['Bond', 'Mutual Fund']
performances = np.random.uniform(5.0, 10.0, size=20)

product_data = pl.DataFrame({
    'product_code': product_codes,
    'product_name': product_names,
    'product_type': np.random.choice(product_types, size=20),
    'performance': performances
})

# Encode categorical columns
def encode_data(df, categorical_columns):
    df_encoded = df.to_pandas()
    for col in categorical_columns:
        le = LabelEncoder()
        df_encoded[col] = le.fit_transform(df_encoded[col])
    return pl.from_pandas(df_encoded)

user_data_encoded = encode_data(user_data, ['age_group', 'segment'])
product_data_encoded = encode_data(product_data, ['product_type', 'product_name'])

# Generate negative samples
def generate_negative_samples(interaction_data, product_data, num_negatives=1):
    all_products = set(product_data['product_code'].to_list())
    negative_samples = []

    for row in interaction_data.iter_rows():
        user = row[0]
        positive_product = row[1]

        # Generate negative samples
        for _ in range(num_negatives):
            negative_product = random.choice(list(all_products - set([positive_product])))
            negative_samples.append([user, negative_product, 0])  # Label 0 for negative

    return pl.DataFrame(negative_samples, schema=['customerid', 'product_code', 'label'])

interaction_data_df = interaction_data_df.with_columns(pl.lit(1).alias('label'))
negative_samples = generate_negative_samples(interaction_data_df, product_data, num_negatives=2)

# Ensure consistent types
interaction_data_df = interaction_data_df.with_columns([
    pl.col('customerid').cast(pl.Utf8),
    pl.col('product_code').cast(pl.Utf8),
    pl.col('label').cast(pl.Int32)
])

negative_samples = negative_samples.with_columns([
    pl.col('customerid').cast(pl.Utf8),
    pl.col('product_code').cast(pl.Utf8),
    pl.col('label').cast(pl.Int32)
])

# Combine positive and negative samples
train_data = pl.concat([interaction_data_df, negative_samples], how='vertical')

# Merge train_data with user_data and product_data
train_data = train_data.join(user_data_encoded, on='customerid', how='left')
train_data = train_data.join(product_data_encoded, on='product_code', how='left')

# Shuffle the data
train_data = train_data.sample(n=train_data.shape[0], shuffle=True, seed=42)

# Split data into train, validation, and test sets
train_data_pandas, test_data_pandas = train_test_split(train_data.to_pandas(), test_size=0.2, random_state=42)
train_data_pandas, val_data_pandas = train_test_split(train_data_pandas, test_size=0.125, random_state=42)  # 10% of original data

# Convert back to Polars DataFrame
train_data = pl.from_pandas(train_data_pandas)
val_data = pl.from_pandas(val_data_pandas)
test_data = pl.from_pandas(test_data_pandas)

# Prepare the inputs for training
def prepare_inputs(data, user_data_encoded, product_data_encoded):
    user_data_dict = {row[0]: np.array(row[1:], dtype='float32') for row in user_data_encoded.iter_rows()}
    product_data_dict = {row[0]: np.array(row[1:], dtype='float32') for row in product_data_encoded.iter_rows()}

    user_inputs = []
    product_inputs = []
    labels = []

    for row in data.iter_rows():
        user_id = row[0]
        product_code = row[1]

        if user_id in user_data_dict and product_code in product_data_dict:
            user_inputs.append(user_data_dict[user_id])
            product_inputs.append(product_data_dict[product_code])
            labels.append(row[2])
    
    return np.array(user_inputs, dtype='float32'), np.array(product_inputs, dtype='float32'), np.array(labels, dtype='float32')

train_user_inputs, train_product_inputs, train_labels = prepare_inputs(train_data, user_data_encoded, product_data_encoded)
val_user_inputs, val_product_inputs, val_labels = prepare_inputs(val_data, user_data_encoded, product_data_encoded)
test_user_inputs, test_product_inputs, test_labels = prepare_inputs(test_data, user_data_encoded, product_data_encoded)

# Build the Two-Tower model
def build_tower(input_shape, name):
    inputs = Input(shape=(input_shape,), name=f'{name}_input')
    x = Dense(64, activation='relu')(inputs)
    x = Dense(32, activation='relu')(x)
    embedding = Dense(8)(x)
    return Model(inputs, embedding, name=f'{name}_tower')

user_input_shape = user_data_encoded.shape[1] - 1  # Exclude 'customerid'
product_input_shape = product_data_encoded.shape[1] - 1  # Exclude 'product_code'

user_tower = build_tower(user_input_shape, 'user')
product_tower = build_tower(product_input_shape, 'product')

user_input = Input(shape=(user_input_shape,), name='user_input')
product_input = Input(shape=(product_input_shape,), name='product_input')

user_embedding = user_tower(user_input)
product_embedding = product_tower(product_input)

dot_product = tf.reduce_sum(user_embedding * product_embedding, axis=-1)
recommendation_model = Model([user_input, product_input], dot_product)

recommendation_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model with validation data
recommendation_model.fit(
    [train_user_inputs, train_product_inputs], train_labels, 
    validation_data=([val_user_inputs, val_product_inputs], val_labels),
    epochs=5
)

# Evaluate on the test set
test_loss, test_accuracy = recommendation_model.evaluate([test_user_inputs, test_product_inputs], test_labels)

print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")

# Print model summary
recommendation_model.summary()


2024-09-11 22:58:40.588982: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-09-11 22:58:40.664726: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-11 22:58:41.105958: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-11 22:58:41.106026: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-11 22:58:41.175590: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test Loss: 5.121081829071045
Test Accuracy: 0.03191489353775978
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 user_input (InputLayer)     [(None, 4)]                  0         []                            
                                                                                                  
 product_input (InputLayer)  [(None, 3)]                  0         []                            
                                                                                                  
 user_tower (Functional)     (None, 8)                    2664      ['user_input[0][0]']          
                                                                                                  
 product_tower (Functional)  (None, 8)                    2600      ['product_i

In [4]:
import polars as pl
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict

# Generate synthetic user_data
num_users = 1500
num_products = 40

user_ids = [f'C{i}' for i in range(1, num_users + 1)]
product_codes = [f'P{i}' for i in range(1, num_products + 1)]

user_data = pl.DataFrame({
    'customerid': user_ids,
    'age_group': np.random.choice(['20-30', '30-40', '40-50', '50-60', '60-70'], size=num_users),
    'segment': np.random.choice(['Standard', 'Premium'], size=num_users),
    'income': np.random.randint(30000, 100000, size=num_users),
    'AUM': np.random.randint(50000, 150000, size=num_users)
})

product_data = pl.DataFrame({
    'product_code': product_codes,
    'product_name': [f'Product {i}' for i in range(1, num_products + 1)],
    'product_type': np.random.choice(['Bond', 'Mutual Fund'], size=num_products),
    'performance': np.random.uniform(5.0, 10.0, size=num_products)
})

# Encode categorical columns
def encode_data(df, categorical_columns):
    df_encoded = df.to_pandas()
    for col in categorical_columns:
        le = LabelEncoder()
        df_encoded[col] = le.fit_transform(df_encoded[col])
    return pl.from_pandas(df_encoded)

user_data_encoded = encode_data(user_data, ['age_group', 'segment'])
product_data_encoded = encode_data(product_data, ['product_type', 'product_name'])

# Prepare inputs for prediction
def prepare_inputs(user_data_encoded, product_data_encoded):
    user_data_dict = {row[0]: np.array(row[1:], dtype='float32') for row in user_data_encoded.iter_rows()}
    product_data_dict = {row[0]: np.array(row[1:], dtype='float32') for row in product_data_encoded.iter_rows()}

    for user_id in user_data_dict.keys():
        for product_code in product_data_dict.keys():
            yield user_data_dict[user_id], product_data_dict[product_code], user_id, product_code

# Define chunk size
chunk_size = 100000  # Adjust chunk size based on memory capacity

# Function to predict in chunks and filter top 5 per product_type
def predict_and_filter(user_data_encoded, product_data_encoded, model, chunk_size):
    product_type_map = dict(zip(product_data_encoded['product_code'].to_pandas(), product_data_encoded['product_type'].to_pandas()))
    
    predictions_per_customer = defaultdict(lambda: defaultdict(list))
    
    inputs_gen = prepare_inputs(user_data_encoded, product_data_encoded)
    chunk_user_inputs = []
    chunk_product_inputs = []
    chunk_combinations = []

    for i, (user_input, product_input, user_id, product_code) in enumerate(inputs_gen):
        chunk_user_inputs.append(user_input)
        chunk_product_inputs.append(product_input)
        chunk_combinations.append((user_id, product_code))
        
        if (i + 1) % chunk_size == 0:
            user_inputs_batch = np.array(chunk_user_inputs, dtype='float32')
            product_inputs_batch = np.array(chunk_product_inputs, dtype='float32')
            chunk_predictions = model.predict([user_inputs_batch, product_inputs_batch])
            
            for (user_id, product_code), score in zip(chunk_combinations, chunk_predictions.flatten()):
                product_type = product_type_map[product_code]
                predictions_per_customer[user_id][product_type].append((product_code, score))
            
            chunk_user_inputs = []
            chunk_product_inputs = []
            chunk_combinations = []

    # Process any remaining data
    if chunk_user_inputs:
        user_inputs_batch = np.array(chunk_user_inputs, dtype='float32')
        product_inputs_batch = np.array(chunk_product_inputs, dtype='float32')
        chunk_predictions = model.predict([user_inputs_batch, product_inputs_batch])
        
        for (user_id, product_code), score in zip(chunk_combinations, chunk_predictions.flatten()):
            product_type = product_type_map[product_code]
            predictions_per_customer[user_id][product_type].append((product_code, score))
    
    # Filter top 5 recommendations per customer and product_type
    top_recommendations = []
    
    for user_id, product_types in predictions_per_customer.items():
        for product_type, products_scores in product_types.items():
            # Sort by score in descending order and select top 5
            top_5 = sorted(products_scores, key=lambda x: x[1], reverse=True)[:5]
            for product_code, score in top_5:
                top_recommendations.append((user_id, product_code, product_type, score))
    
    return top_recommendations

# Predict scores and filter top 5 recommendations
top_recommendations = predict_and_filter(user_data_encoded, product_data_encoded, recommendation_model, chunk_size)

# Convert results to DataFrame
top_recommendations_df = pd.DataFrame(top_recommendations, columns=['customerid', 'product_code', 'product_type', 'probability'])

# Convert to Polars DataFrame if needed
top_recommendations_pl_df = pl.from_pandas(top_recommendations_df)

# Print the first few rows of the DataFrame with top recommendations
print(top_recommendations_pl_df)

# Optionally, save to CSV if needed
# top_recommendations_pl_df.write_csv('top_recommendations.csv')


shape: (15_000, 4)
┌────────────┬──────────────┬──────────────┬────────────────┐
│ customerid ┆ product_code ┆ product_type ┆ probability    │
│ ---        ┆ ---          ┆ ---          ┆ ---            │
│ str        ┆ str          ┆ i64          ┆ f32            │
╞════════════╪══════════════╪══════════════╪════════════════╡
│ C1         ┆ P1           ┆ 1            ┆ -55976.804688  │
│ C1         ┆ P13          ┆ 1            ┆ -88600.90625   │
│ C1         ┆ P14          ┆ 1            ┆ -111576.453125 │
│ C1         ┆ P15          ┆ 1            ┆ -145280.40625  │
│ C1         ┆ P17          ┆ 1            ┆ -163554.75     │
│ …          ┆ …            ┆ …            ┆ …              │
│ C1500      ┆ P11          ┆ 0            ┆ -96899.28125   │
│ C1500      ┆ P12          ┆ 0            ┆ -130541.289062 │
│ C1500      ┆ P10          ┆ 0            ┆ -136010.34375  │
│ C1500      ┆ P16          ┆ 0            ┆ -247593.234375 │
│ C1500      ┆ P22          ┆ 0            ┆ -33960