<a href="https://colab.research.google.com/github/khanmhmdi/Moe-llm-edge-computing/blob/main/Untitled54.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -U bitsandbytes

In [1]:
import networkx as nx
import torch
import torch.nn as nn
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, BitsAndBytesConfig

# ─────────────────────────────────────────────
# 1. Network Topology for Device Simulation
# ─────────────────────────────────────────────
def create_network_topology():
    """
    Create a simple network graph G with multiple devices.
    Each node has a 'capacity' that represents how many 'experts' can be loaded.
    Each edge has a 'latency' or 'cost'.
    """
    G = nx.Graph()

    # Node 0: server (large capacity to host the base LLM)
    G.add_node(0, name="Server", capacity=100)

    # Additional devices
    G.add_node(1, name="EdgeDevice1", capacity=8)
    G.add_node(2, name="EdgeDevice2", capacity=8)

    # Create edges with latencies
    G.add_edge(0, 1, latency=2)
    G.add_edge(0, 2, latency=5)
    G.add_edge(1, 2, latency=1)

    return G

# A small helper that attempts to find a device in G for a given expert size
def place_expert_on_device(G, expert_id, expert_size):
    """
    Looks for a device with enough capacity to host 'expert_size'.
    Tries the device(s) with the smallest latency to the server first.
    Returns the chosen device id, or raises an error if none found.
    """
    dist_to_server = nx.single_source_dijkstra_path_length(G, 0, weight='latency')
    # Sort all nodes by ascending distance to server
    device_order = sorted(list(G.nodes()), key=lambda x: dist_to_server[x])

    for dev in device_order:
        capacity = G.nodes[dev]['capacity']
        if capacity >= expert_size:
            G.nodes[dev]['capacity'] -= expert_size
            return dev

    raise RuntimeError(f"Unable to place Expert {expert_id} due to insufficient capacity.")

def unload_expert_from_device(G, device_id, expert_size):
    """
    Frees up capacity after the layer completes.
    """
    G.nodes[device_id]['capacity'] += expert_size

In [2]:

# ─────────────────────────────────────────────
# 2. Define a Simple Expert Submodule
# ─────────────────────────────────────────────
class ExpertModule(nn.Module):
    """
    A small neural module that acts as an 'expert' within a layer.
    For demonstration, we keep it trivial: a linear layer + activation.
    In real systems, each expert might be a more complex feed-forward block.
    """
    def __init__(self, hidden_size, expert_id):
        super().__init__()
        self.expert_id = expert_id
        self.fc = nn.Linear(hidden_size, hidden_size)
        self.activation = nn.ReLU()

        # Simulated memory usage (arbitrary). 3 means it needs capacity=3 to load
        self.memory_footprint = 3

    def forward(self, hidden_states):
        return self.activation(self.fc(hidden_states))



In [3]:
# ─────────────────────────────────────────────
# 3. GateLayer: decides which expert to dispatch to
# ─────────────────────────────────────────────
class GateLayer(nn.Module):
    """
    This small gating module is added to each block. It outputs a probability
    distribution over the available experts, effectively deciding which
    single expert (or multiple if you extend it) is selected.
    """
    def __init__(self, hidden_size, num_experts=2):
        super().__init__()
        self.num_experts = num_experts

        # A simple linear gating network that transforms some summary of hidden states
        # into logits over experts. We might do something more advanced in real life.
        self.gate_fc = nn.Linear(hidden_size, num_experts)

    def forward(self, hidden_states):
        """
        We do the gating decision on the first token only (for demonstration),
        or you could pool the hidden states.
        """
        # Suppose we look at the mean of the hidden states across sequence dimension
        # as an input to the gate.
        # hidden_states shape: [batch_size, seq_len, hidden_size]
        gate_input = hidden_states.mean(dim=1)  # [batch_size, hidden_size]
        logits = self.gate_fc(gate_input)       # [batch_size, num_experts]

        # For simplicity, pick the top-1 expert for each item in the batch.
        # In a real MoE approach, you might use top-k with dispatch logic.
        # We'll pick one expert index across the entire batch (just for demonstration).
        # Let’s do a simple argmax over the *first example* in the batch.
        # shape: (num_experts,)
        with torch.no_grad():
            chosen_expert_idx = torch.argmax(logits[0], dim=0).item()

        return chosen_expert_idx



In [12]:
# ─────────────────────────────────────────────
# 4. A custom wrapper around the base LLM
# ─────────────────────────────────────────────
class MoEWrapperModel(nn.Module):
    """
    Wraps the Hugging Face model and modifies each transformer block by adding:
      - A GateLayer that picks an expert.
      - A set of experts within that block.
      - A device placement simulation for the chosen expert.

    This is purely conceptual to show how gating might live inside the forward pass.
    """
    def __init__(self, base_model_name, network_graph, layers_to_modify=[0,1,2], experts_per_layer=2):
        """
        :param base_model_name: e.g. 'microsoft/Phi-3-mini-4k-instruct'
        :param network_graph: The network topology for simulation
        :param hidden_size: Model hidden size (for demonstration).
                           Must match or be consistent with the actual base model.
        :param layers_to_modify: Which layer indexes we insert gating + experts on.
        :param experts_per_layer: number of experts in each modified layer.
        """
        super().__init__()
        self.network_graph = network_graph

        # 4.1) Load the HF model config, then the actual model
        self.config = AutoConfig.from_pretrained(base_model_name)
        # Check if hidden_size matches the model. If not, we adapt or just proceed for demonstration.
        # For Phi-3-mini-4k-instruct, hidden size might be 1280 or something else.
        # We'll do a smaller demonstration ignoring mismatch issues that may arise in real usage.
        from transformers import AutoModelForCausalLM, BitsAndBytesConfig

        quantization_config = BitsAndBytesConfig(load_in_8bit=True)

        self.base_model = AutoModelForCausalLM.from_pretrained(
            base_model_name,
            quantization_config=quantization_config,
            trust_remote_code=True
            # device_map="auto",  # If you'd like automatic device placement
        )

        # Get the actual hidden size from the model config
        hidden_size = self.base_model.config.hidden_size

        # 4.2) For each “transformer block” we want to augment, we create:
        #     - A GateLayer
        #     - Some experts
        # Because the internal structure of a HF model can vary, we won’t dive too deep into the official layer definitions.
        # We'll store the gating modules + experts in a dictionary keyed by layer index.
        self.layers_to_modify = layers_to_modify
        self.experts_per_layer = experts_per_layer

        self.gates = nn.ModuleDict()
        self.experts = nn.ModuleDict()

        for layer_idx in layers_to_modify:
            self.gates[str(layer_idx)] = GateLayer(hidden_size, num_experts=experts_per_layer)
            self.gates[str(layer_idx)].to("cuda")  # Move to the desired device

            # Create N experts for that layer
            layer_experts = nn.ModuleList()
            for e_idx in range(experts_per_layer):
                layer_experts.append(ExpertModule(hidden_size, expert_id=e_idx))
                layer_experts[e_idx].to("cuda")  # Move to the desired device
            self.experts[str(layer_idx)] = layer_experts
    def forward(self, input_ids, attention_mask=None):
        """
        Forward pass:
        1) Pass tokens through each layer of the base model.
        2) If the layer is in 'layers_to_modify', then:
            (a) run gating
            (b) pick the expert
            (c) simulate loading/unloading that expert on a device
            (d) forward hidden states through that expert
        3) Continue to next layer.
        4) Return final logits from the base model's output head.
        """
        # (a) We first do an embedding + partial forward inside the Hugging Face model
        #     up to each block, but this is quite complicated to do “manually.”
        #
        # A simpler demonstration: we use the HF model’s forward once to get hidden states,
        # then “pretend” each layer is hooking into these hidden states.
        # A real approach would carefully modify each block in the model’s forward pass.
        #
        # For demonstration, we do something approximate: get the embeddings and do
        # a dummy multi-layer forward ourselves.

        # Step 1: embed
        # The base model usually has model.transformer.wte or similar for embeddings.
        # Instead of picking it out precisely, we can do a partial forward pass up
        # to the hidden states. But the “microsoft/Phi-3-mini-4k-instruct” might have
        # a textual `forward()` that we can’t easily wrap. So let's do a simpler approach:
        input_ids = input_ids.to(self.base_model.device)
        embeddings = self.base_model.get_input_embeddings()(input_ids)

        hidden_states = embeddings  # shape: [batch, seq_len, hidden_size]
        # A placeholder attention mask
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)

        # We'll simulate N layers. In practice, the base model might have 12 or more.
        # For each layer, if it’s in layers_to_modify, do gating + expert forward.
        for layer_idx in range(self.config.num_hidden_layers):
            # If not in layers_to_modify, just do a “dummy feed-forward” so that
            # we have some representation of continuing the forward pass
            if layer_idx in self.layers_to_modify:
                gate_module = self.gates[str(layer_idx)]
                experts = self.experts[str(layer_idx)]

                # 2a) gating network picks an expert index
                chosen_expert_idx = gate_module(hidden_states.float())

                # 2b) simulate device placement
                chosen_expert = experts[chosen_expert_idx]
                # The ID might be unique for each layer+expert, but let's demonstrate:
                expert_unique_name = f"layer{layer_idx}_expert{chosen_expert.expert_id}"
                # place expert
                device_id = place_expert_on_device(
                    self.network_graph,
                    expert_unique_name,
                    chosen_expert.memory_footprint
                )

                # 2c) forward pass through chosen expert
                hidden_states = chosen_expert(hidden_states)

                # 2d) unload from device to free capacity
                unload_expert_from_device(
                    self.network_graph,
                    device_id,
                    chosen_expert.memory_footprint
                )
            else:
                # a minimal dummy layer forward
                # in reality, we’d do the actual base model block
                hidden_states = hidden_states + 0.01  # or some small transformation

        # final LM head
        # In real usage, we’d pass the final hidden states to base_model.lm_head
        logits = self.base_model.lm_head(hidden_states)
        return logits



In [13]:
# ─────────────────────────────────────────────
# 5. Putting It All Together
# ─────────────────────────────────────────────
def main():
    # 5.1) Create the network topology
    G = create_network_topology()

    # 5.2) Build the MoE wrapper model that uses microsoft/Phi-3-mini-4k-instruct
    #     and modifies layers 0, 1, 2 to each have 2 experts.
    #     Note: The real Phi-3-mini-4k-instruct might differ in hidden size, etc.
    #           This is a demonstration for the gating + device simulation concept.
    model = MoEWrapperModel(
        base_model_name="microsoft/Phi-3-mini-4k-instruct",
        network_graph=G,
        hidden_size=1024,       # might need to match the actual model or be adapted
        layers_to_modify=[0,1,2],
        experts_per_layer=2
    )

    # 5.3) Tokenize an input
    tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
    input_text = "Hello, can you explain how to use bananas and dragonfruits together?"
    inputs = tokenizer(input_text, return_tensors="pt")

    with torch.no_grad():
        # 5.4) Forward pass
        # This will trigger the gating in layers 0..2, each time picking an expert
        # and simulating device load/unload from G.
        logits = model(**inputs)

        # 5.5) Turn logits into text, just to show the flow.
        # We'll pick the top token from logits for demonstration.
        next_token_id = torch.argmax(logits[0, -1, :]).unsqueeze(0).unsqueeze(0)
        generated_text = tokenizer.decode(next_token_id[0])
        print("Next token predicted:", generated_text)

    # 5.6) Inspect the final device capacities to see how experts were loaded/unloaded
    for node in G.nodes():
        dev_name = G.nodes[node]['name']
        curr_cap = G.nodes[node]['capacity']
        print(f"Device {node} ({dev_name}) final capacity: {curr_cap}")


if __name__ == "__main__":
    main()

TypeError: MoEWrapperModel.__init__() got an unexpected keyword argument 'hidden_size'