In [1]:
import torch
from tqdm import tqdm

In [2]:
# get the data
from datasets import load_dataset
data = load_dataset('ola13/small-the_pile-dedup')

In [3]:
len(data['train']['text'])

100000

In [4]:
import os
os.environ['TRANSFORMERS_CACHE'] = '/workspace/cache/' # change or comment out as desired 
from transformers import AutoModelForCausalLM, AutoTokenizer

def load_model(model_name_or_path, revision, device):
    model = AutoModelForCausalLM.from_pretrained(
        model_name_or_path, device_map=device, revision=revision, trust_remote_code=False)
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True, padding_side="right") # PADDING RIGHT!
    tokenizer.pad_token_id = 0
    return model, tokenizer

device = 'cuda:0'
model_name_or_path = "TheBloke/Mistral-7B-v0.1-GPTQ"
# model_name_or_path = 'TheBloke/Mistral-7B-Instruct-v0.1-GPTQ'
revision = 'gptq-4bit-32g-actorder_True'

model, tokenizer = load_model(model_name_or_path, revision, device)



In [28]:
# from transformers import GPTNeoXForCausalLM, AutoTokenizer

# model = GPTNeoXForCausalLM.from_pretrained("EleutherAI/pythia-70m-deduped").to(device)

# tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-70m-deduped")
# tokenizer.pad_token_id = 0

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
bs = 16
layer = 12
context_len = 128
all_acts = []
for i in tqdm(range(0, 1024, bs)):
    text = data['train']['text'][:bs]
    inputs = tokenizer(text, return_tensors="pt", padding=True).to(device)
    inputs_mod = {'input_ids': inputs['input_ids'][:, :context_len], 'attention_mask': inputs['attention_mask'][:, :context_len]}
    output = model(**inputs_mod, output_hidden_states=True)
    batch_acts = output['hidden_states'][layer+1].detach()
    batch_acts_trunc = []
    for i in range(len(batch_acts)):
        acts = batch_acts[i]
        mask = inputs_mod['attention_mask'][i]
        last1 = (mask == 1).nonzero().max().item()
        trunc_acts = acts[:last1+1, :]
        batch_acts_trunc.append(trunc_acts)
    all_acts.extend(batch_acts_trunc)
all_acts = torch.cat(all_acts)

100%|██████████| 64/64 [01:44<00:00,  1.63s/it]


In [6]:
all_acts.shape

torch.Size([124928, 4096])

In [7]:
torch.save(all_acts, './acts/batch.pt')

In [8]:
128*1024

131072

In [9]:
all_acts

tensor([[-0.1282, -0.2644, -0.1174,  ...,  0.0014,  0.0388, -0.0694],
        [ 0.0552,  0.0592,  0.0342,  ...,  0.0229, -0.0098,  0.0276],
        [ 0.0381, -0.0181,  0.0177,  ...,  0.0038,  0.0770,  0.0256],
        ...,
        [ 0.0000, -0.0522, -0.0371,  ..., -0.0695, -0.0317, -0.0046],
        [ 0.0526, -0.0416, -0.0486,  ..., -0.0297, -0.0438, -0.0024],
        [ 0.0080,  0.0275, -0.0435,  ..., -0.0300, -0.0602,  0.0519]],
       device='cuda:0', dtype=torch.float16)