In [34]:
import string

import numpy as np
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import Model
from tensorflow.keras.layers import Embedding, Dense, GlobalAveragePooling1D, Softmax

# Toy intent-classification corpus: (utterance, intent label) pairs,
# two training utterances per intent.
train_data = [
    ("hello there", "greeting"),
    ("hi", "greeting"),
    ("goodbye", "goodbye"),
    ("see you later", "goodbye"),
    ("I want to order pizza", "order_food"),
    ("can I get a burger", "order_food"),
    ("what is the weather", "ask_weather"),
    ("is it raining", "ask_weather"),
]

# Held-out utterances, one per intent, each phrased differently from the
# training utterances of the same intent.
test_data = [
    ("hey", "greeting"),
    ("bye bye", "goodbye"),
    ("order a sandwich", "order_food"),
    ("will it rain today", "ask_weather"),
]

# Vocabulary over BOTH splits (lower-cased, whitespace-tokenised).
# NOTE(review): because test sentences are included, words that occur only in
# test_data (e.g. "hey") receive an index even though no training utterance
# contains them.
texts = [utterance for utterance, _ in train_data + test_data]
all_words = {token for utterance in texts for token in utterance.lower().split()}
# Indices are assigned in sorted word order starting at 1; 0 is kept for padding.
word2idx = {token: idx for idx, token in enumerate(sorted(all_words), start=1)}
vocab_size = len(word2idx) + 1  # number of words plus the padding slot

# Function to convert sentences to sequences of indices (padded)
max_len = 6
def text_to_seq(text, vocab=None, length=None):
    """Convert a sentence into a fixed-length list of vocabulary indices.

    The text is lower-cased and split on whitespace. Leading and trailing
    punctuation is stripped from every token so that "you?" or "honey," match
    their plain vocabulary entries instead of being silently discarded.
    Tokens that are still unknown after stripping are dropped.

    Args:
        text: Input sentence.
        vocab: Mapping from token to integer index. Defaults to the
            module-level ``word2idx``.
        length: Length of the returned sequence. Defaults to the module-level
            ``max_len``.

    Returns:
        A list of exactly ``length`` ints: the indices of the known tokens in
        order, truncated to ``length`` and right-padded with 0.

    NOTE(review): a text with no known tokens yields an all-zero row. With a
    ``mask_zero`` embedding followed by masked average pooling this presumably
    leads to a division by zero downstream -- confirm before relying on
    predictions for such inputs.
    """
    # Resolve defaults at call time so the notebook-level globals are only
    # required when the caller does not supply explicit values.
    if vocab is None:
        vocab = word2idx
    if length is None:
        length = max_len

    tokens = (raw.strip(string.punctuation) for raw in text.lower().split())
    seq = [vocab[token] for token in tokens if token in vocab][:length]
    # Right-pad with the padding index 0 (no-op when already `length` long).
    return seq + [0] * (length - len(seq))

# Vectorise both splits: one row of token ids per utterance for the inputs,
# and the raw intent strings for the targets.
X_train = np.array([text_to_seq(utterance) for utterance, _ in train_data])
y_train_text = np.array([intent for _, intent in train_data])
X_test = np.array([text_to_seq(utterance) for utterance, _ in test_data])
y_test_text = np.array([intent for _, intent in test_data])

# Map intent strings to integer class ids. The encoder is fitted on the
# training labels only; test labels are transformed with that same mapping.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train_text)
y_test = label_encoder.transform(y_test_text)
num_classes = len(label_encoder.classes_)

# Expert sub-models
class Expert(Model):
    """Single expert: a 32-unit ReLU layer followed by a linear layer.

    The final layer has ``num_classes`` units and no activation, so the
    expert returns unnormalised class scores.
    """

    def __init__(self, d_model, num_classes):
        # d_model is accepted for signature symmetry but is not used here.
        super().__init__()
        self.dense1 = Dense(32, activation='relu')
        self.dense2 = Dense(num_classes)

    def call(self, x):
        """Return the expert's class scores for the input features ``x``."""
        hidden = self.dense1(x)
        scores = self.dense2(hidden)
        return scores

# Gating network
class GatingNetwork(Model):
    """Produces one weight per expert: a linear layer followed by softmax.

    The softmax is taken over the last axis, so the weights for each input
    sum to 1.
    """

    def __init__(self, d_model, num_experts):
        # d_model is accepted for signature symmetry but is not used here.
        super().__init__()
        self.dense = Dense(num_experts)
        self.softmax = Softmax(axis=-1)

    def call(self, x):
        """Return softmax-normalised expert weights for the features ``x``."""
        return self.softmax(self.dense(x))

# Mixture-of-experts intent classifier (can also return the gating weights)
class MoEIntentClassifier(Model):
    """Intent classifier that mixes several experts with a learned gate.

    Token ids are embedded and average-pooled into one vector per example.
    That vector is fed both to every expert and to the gating network; the
    final class scores are the gate-weighted sum of the expert scores.

    The embedding uses ``mask_zero=True``, i.e. id 0 is treated as padding.
    NOTE(review): the pooling layer presumably receives that mask through
    Keras mask propagation so padded steps are excluded from the average --
    confirm for the Keras version in use.
    """

    def __init__(self, vocab_size, d_model, num_experts, num_classes):
        super().__init__()
        self.embedding = Embedding(vocab_size, d_model, mask_zero=True)
        self.pooling = GlobalAveragePooling1D()
        self.num_experts = num_experts
        expert_pool = []
        for _ in range(num_experts):
            expert_pool.append(Expert(d_model, num_classes))
        self.experts = expert_pool
        self.gating_network = GatingNetwork(d_model, num_experts)

    def call(self, x, return_gating=False):
        """Compute class scores, optionally also returning the gate weights.

        Args:
            x: Batch of padded token-id sequences.
            return_gating: When True, return ``(scores, gating_probs)``
                instead of just ``scores``.

        Returns:
            ``scores`` of shape (batch_size, num_classes), or the tuple
            ``(scores, gating_probs)`` with ``gating_probs`` of shape
            (batch_size, num_experts).
        """
        # (batch_size, seq_len) -> (batch_size, seq_len, d_model) -> (batch_size, d_model)
        pooled = self.pooling(self.embedding(x))

        gating_probs = self.gating_network(pooled)  # (batch_size, num_experts)
        per_expert = [expert(pooled) for expert in self.experts]
        expert_outputs = tf.stack(per_expert, axis=1)  # (batch_size, num_experts, num_classes)

        # Add a trailing axis to the weights so they broadcast over classes,
        # then sum out the expert axis.
        weights = tf.expand_dims(gating_probs, axis=2)
        gated_output = tf.reduce_sum(weights * expert_outputs, axis=1)

        if not return_gating:
            return gated_output
        return gated_output, gating_probs

# Hyperparameters
SEED = 42
d_model = 16
num_experts = 3
epochs = 150
batch_size = 4

# Seed the Python, NumPy and TensorFlow random generators before any dataset
# or layer is created, so weight initialisation and batch shuffling (and hence
# the printed losses and accuracy) are repeatable across kernel restarts.
tf.keras.utils.set_random_seed(SEED)

# Prepare datasets
# The shuffle buffer covers the whole training set, so every epoch sees a
# full random permutation; the test set is batched in its original order.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((X_train, y_train))
    .shuffle(buffer_size=len(X_train), seed=SEED)
    .batch(batch_size)
)
test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(batch_size)

# Instantiate model, loss, optimizer
model = MoEIntentClassifier(vocab_size, d_model, num_experts, num_classes)
# from_logits=True: the model's output is a weighted sum of the experts'
# unnormalised scores, not probabilities.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam()

# Training loop
def _train_step(x_batch, y_batch):
    """Run one optimiser update on a single batch and return the batch loss tensor."""
    with tf.GradientTape() as tape:
        batch_loss = loss_fn(y_batch, model(x_batch))
    # Read the variable list only after the forward pass has run.
    variables = model.trainable_variables
    optimizer.apply_gradients(zip(tape.gradient(batch_loss, variables), variables))
    return batch_loss


num_batches = len(train_dataset)
for epoch in range(epochs):
    # Sum of the per-batch mean losses for this epoch.
    total_loss = sum(
        _train_step(x_batch, y_batch).numpy() for x_batch, y_batch in train_dataset
    )
    # Report on the first epoch and then every tenth epoch.
    if epoch == 0 or (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss/num_batches:.4f}")

# Evaluation on test set
# Count exact matches between the arg-max class and the true class id,
# batch by batch, then report the fraction of correct predictions.
correct = 0
total = 0
for x_batch, y_batch in test_dataset:
    preds = tf.argmax(model(x_batch), axis=1)
    matches = preds.numpy() == y_batch.numpy()
    correct += int(matches.sum())
    total += x_batch.shape[0]
print(f"Test accuracy: {correct/total:.4f}")

Epoch 1/150, Loss: 1.3847
Epoch 10/150, Loss: 1.3532
Epoch 20/150, Loss: 1.3023
Epoch 30/150, Loss: 1.2131
Epoch 40/150, Loss: 1.0724
Epoch 50/150, Loss: 0.8731
Epoch 60/150, Loss: 0.6358
Epoch 70/150, Loss: 0.4090
Epoch 80/150, Loss: 0.2402
Epoch 90/150, Loss: 0.1375
Epoch 100/150, Loss: 0.0809
Epoch 110/150, Loss: 0.0512
Epoch 120/150, Loss: 0.0349
Epoch 130/150, Loss: 0.0252
Epoch 140/150, Loss: 0.0190
Epoch 150/150, Loss: 0.0148
Test accuracy: 0.7500


In [36]:
# Usage for inference with gating info
def predict_intent_and_expert(text):
    """Classify one sentence and report which expert the gate favoured.

    Uses the notebook-level ``model``, ``label_encoder`` and ``text_to_seq``.

    Args:
        text: Sentence to classify.

    Returns:
        A tuple ``(predicted_intent, top_expert, gating_str)``: the decoded
        intent label, the index of the expert with the largest gating
        probability, and a comma-separated string listing every expert's
        gating probability with three decimals.
    """
    # The model expects a batch, so wrap the single sequence in a batch of one.
    batch = np.array([text_to_seq(text)])
    logits, gating_probs = model(batch, return_gating=True)

    gate_row = gating_probs.numpy()[0]
    best_class = tf.argmax(logits, axis=1).numpy()[0]
    intent_label = label_encoder.inverse_transform([best_class])[0]

    parts = []
    for expert_id, prob in enumerate(gate_row):
        parts.append(f"Expert {expert_id}: {prob:.3f}")

    return intent_label, np.argmax(gate_row), ", ".join(parts)

# Probe sentences that do not appear verbatim in the training data
test_phrases = [
    "hello",
    "hi honey, how are you?",
    "bye",
    "see you",
    "can I order food",
    "will it snow tomorrow",
    "is it sunny today",
    "how cloudy is it",
    "how are you",
    "I want a pizza",
]

# Print the predicted intent and the most heavily weighted expert per phrase.
# The third value returned by predict_intent_and_expert is a formatted string
# of all gating probabilities; print it as well to inspect the full routing.
for phrase in test_phrases:
    intent, expert_used = predict_intent_and_expert(phrase)[:2]
    print(f'"{phrase}" -> Predicted intent: {intent}, Top Expert used: {expert_used}')

"hello" -> Predicted intent: greeting, Top Expert used: 2
"hi honey, how are you?" -> Predicted intent: greeting, Top Expert used: 2
"bye" -> Predicted intent: greeting, Top Expert used: 1
"see you" -> Predicted intent: goodbye, Top Expert used: 1
"can I order food" -> Predicted intent: order_food, Top Expert used: 1
"will it snow tomorrow" -> Predicted intent: ask_weather, Top Expert used: 0
"is it sunny today" -> Predicted intent: ask_weather, Top Expert used: 0
"how cloudy is it" -> Predicted intent: ask_weather, Top Expert used: 0
"how are you" -> Predicted intent: goodbye, Top Expert used: 1
"I want a pizza" -> Predicted intent: order_food, Top Expert used: 1
