### Select GPU runtime type before running

In [1]:
!pip install torch transformers -q

import os
import torch
import torch.functional as F
import shutil
import json
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModel
import copy

device = 'cuda'
cache_dir = None

### Login to Hub (required for private models and uploading)

In [2]:
! huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|
    
    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) Y
Token is valid.
Your token has been saved in your configured git cred

### Add/remove Pythia-1b experts

In [3]:
model_names = [
    "Multi-Domain-Expert-Layers/expert-pubmed_central",
    "Multi-Domain-Expert-Layers/expert-freelaw",
    "Multi-Domain-Expert-Layers/expert-github",
    "Multi-Domain-Expert-Layers/expert-arxiv"
]

### Optionally Connect to Google Drive to cache large model files

In [4]:
from google.colab import drive
drive.mount('/content/drive')
cache_dir = '/content/drive/MyDrive/HuggingfaceCache'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Merge the models

In [5]:
class HFModel:
  def __init__(self, model_name):
    self.name=model_name

  def __enter__(self):
    print(f"Loading Model {self.name}")
    with torch.no_grad():
      self.model = AutoModelForCausalLM.from_pretrained(self.name, cache_dir=cache_dir) #,torch_dtype=torch.float16
      self.model = self.model.to(device)
      self.model.eval()
      return self.model

  def __exit__(self, type, value, traceback):
    print (f"Unloading Model {self.name}")
    del self.model

In [6]:
def blend_n_models(model_names):
  with torch.no_grad():
    with HFModel(model_names[0]) as blended_model:

      # zero out blended models params
      for p in blended_model.parameters():
        p.data *= 0

      for mn in model_names:
        with HFModel(mn) as temp_model:
          for p1, p2 in zip(blended_model.parameters(), temp_model.parameters()):
            p1.data += p2.data * (1/len(model_names))
        del temp_model
      return blended_model

blended = blend_n_models(model_names)

Loading Model Multi-Domain-Expert-Layers/expert-github
Loading Model Multi-Domain-Expert-Layers/expert-github
Unloading Model Multi-Domain-Expert-Layers/expert-github
Loading Model Multi-Domain-Expert-Layers/expert-arxiv
Unloading Model Multi-Domain-Expert-Layers/expert-arxiv
Unloading Model Multi-Domain-Expert-Layers/expert-github


In [7]:
test_prompt="One day,"
test_max_length=32

tokenizer = AutoTokenizer.from_pretrained(model_names[0])
# Tokenize the prompt
inputs = tokenizer.encode(test_prompt, return_tensors="pt").to(device)

Downloading (…)okenizer_config.json:   0%|          | 0.00/264 [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

In [8]:
print("Generating for blended model...")
outputs = blended.generate(inputs, max_length=test_max_length, temperature=0.5, do_sample = False)
result = tokenizer.batch_decode(outputs, skip_special_tokens=True)
print(result) # Print the generated text

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Generating for blended model...
['One day, I was sitting in my car, listening to the radio. I was listening to the news, and I heard a story about a man who had']


### Upload to hub
Change `model_name` to upload to the hub

In [10]:
from huggingface_hub import HfApi

model_name = "<hf-username>/<model-name>"

sources = "\n".join(map(lambda mn: f"- [{mn}](https://huggingface.co/{mn})", model_names))

model_card_yaml = f"""
---
tags:
- MDEL
---

# Model Name
{model_name}

# Model Description
This model was generated by averaging the weights of the following models
{sources}

"""

print(model_card_yaml)

if model_name != "<hf-username>/<model-name>":
  blended.push_to_hub(model_name, model_card=model_card_yaml)

  # Upload model card
  with open("./README.md", "w") as f:
    f.write(model_card_yaml)
  api = HfApi()
  api.upload_file(
      path_or_fileobj="./README.md",
      path_in_repo="README.md",
      repo_id=model_name,
      repo_type="model",
  )


---
tags:
- MDEL
---

# Model Name
stillerman/MDEL-github-arxiv

# Model Description
This model was generated by averaging the weights of the following models
- [Multi-Domain-Expert-Layers/expert-github](https://huggingface.co/Multi-Domain-Expert-Layers/expert-github)
- [Multi-Domain-Expert-Layers/expert-arxiv](https://huggingface.co/Multi-Domain-Expert-Layers/expert-arxiv)




Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

pytorch_model.bin:   0%|          | 0.00/4.11G [00:00<?, ?B/s]