In [1]:
import gc
import torch
import json
from tqdm import tqdm
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset, concatenate_datasets
from transformers import LlamaForCausalLM, LlamaConfig, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

DEVICE = 'cuda'
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
tokenizer.pad_token = tokenizer.unk_token
tokenizer.pad_token_id = tokenizer.unk_token_id
load_config = BitsAndBytesConfig(load_in_8bit=True)
# model0 = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", quantization_config=load_config, torch_dtype=torch.bfloat16)

In [2]:
model = LlamaForCausalLM.from_pretrained('/root/llama_distillation/experiments/models/baseline').to(DEVICE)
model1 = LlamaForCausalLM.from_pretrained('/root/llama_distillation/experiments/models/baseline__distill_only').to(DEVICE)
model2 = LlamaForCausalLM.from_pretrained('/root/llama_distillation/experiments/models/baseline_no_distill').to(DEVICE)

In [3]:
tokenizer_1b = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
model_1b = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0").to(DEVICE)

In [4]:
model3 = LlamaForCausalLM.from_pretrained('/root/llama_distillation/experiments/models/tiny_llama').to(DEVICE)

In [3]:
gc.collect()
torch.cuda.empty_cache()

In [4]:
from peft import LoraConfig, get_peft_model
import bitsandbytes as bnb

def batch_to_device(batch, device='cuda'):
    return {k: torch.tensor(v).to(device) for k, v in batch.items()}

def lora_model(model):
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, torch.nn.modules.linear.Linear):
            lora_module_names.add(name.split('.')[-1])
    config = LoraConfig(
        r=16,  #attention heads
        lora_alpha=8,  #alpha scaling
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj","gate_proj", "up_proj"],  #gonna train all
        lora_dropout=0.1,  # dropout probability for layers
        bias="none",
        task_type="CAUSAL_LM", #for Decoder models like GPT Seq2Seq for Encoder-Decoder models like T5
    )
    return get_peft_model(model, config)

class BenchDataset(Dataset):
    def __init__(self, dataset: Dataset, prompt, examples_cnt, options, metric=None):
        super(BenchDataset).__init__()
        examples = dataset.take(examples_cnt)
        p_ex = prompt + "{0[label]}\n"
        if len(examples) > 0:
            if 'ref' not in dataset[0]:
                prompt = "Examples:\n" + ''.join(list(map(lambda x: p_ex.format(x|{'label':options[x['label']]}), examples))) + "\nTask:\n" + prompt
            else:
                prompt = "Examples:\n" + ''.join(list(map(lambda x: p_ex.format(x|{'label':x['ref']}), examples))) + "\nTask:\n" + prompt
        else:
            prompt = "Task:\n" + prompt
        print(prompt)
        self.dataset = dataset.skip(examples_cnt).map(lambda row: {"prompt": prompt.format(row)})
        self.options = options
        self.labels = self.dataset['label']
        tokens = tokenizer(self.dataset['prompt'], return_tensors='pt', padding=True)
        self.input_ids = tokens['input_ids']
        self.attention_mask = tokens['attention_mask']
        self.metric = metric

    def __getitem__(self, index):
        return {'input_ids': self.input_ids[index], 'attention_mask': self.attention_mask[index], 'label': self.labels[index]}

    def __len__(self):
        return self.input_ids.size(0)
    
    def fine_tune(self, model, tokenizer, epochs=10, batch_size=32, lr=1e-3):
        model = lora_model(model)
        model.train()

        tokens = tokenizer(self.dataset['prompt'], return_tensors='pt', padding=True)
        self.input_ids = tokens['input_ids']
        self.attention_mask = tokens['attention_mask']
        options = tokenizer(self.options, return_tensors='pt')['input_ids'][:, 1]
        

        criterion = torch.nn.CrossEntropyLoss()
        optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, epochs, lr/5)
        dataloader = torch.utils.data.DataLoader(self, batch_size=batch_size)
        for epoch in range(epochs):
            total_loss = 0
            for batch in tqdm(dataloader):
                optimizer.zero_grad()
                batch = batch_to_device(batch, DEVICE)
                logits = model(
                    input_ids = batch['input_ids'],
                    attention_mask = batch['attention_mask'],
                ).logits
                ids = (batch['attention_mask'].sum(dim=1) - 1).unsqueeze(1).unsqueeze(2).repeat(1, 1, logits.size(2))
                logits = logits.gather(1, ids)[:, 0, :]
                loss = criterion(logits[:, options], batch['label'])
                total_loss += loss.item()
                loss.backward()
                optimizer.step()
                gc.collect()
                torch.cuda.empty_cache()
            scheduler.step()
            print(f"Epoch {epoch}. Loss {total_loss / len(dataloader)}")
        self.evaluate(model, batch_size)
        return model
        
    
    def evaluate(self, model, tokenizer, batch_size=1):
        model.eval()

        tokens = tokenizer(self.dataset['prompt'], return_tensors='pt', padding=True)
        self.input_ids = tokens['input_ids']
        self.attention_mask = tokens['attention_mask']

        preds = []
        dataloader = torch.utils.data.DataLoader(self, batch_size=batch_size)
        for i, batch in enumerate(tqdm(dataloader)):
            batch = batch_to_device(batch, DEVICE)
            logits = model(
                input_ids = batch['input_ids'],
                attention_mask = batch['attention_mask'],
            ).logits
            ids = []
            for x in batch['input_ids']:
                zeros = (x == 0).nonzero(as_tuple=True)[0]
                if zeros.size(0) == 0:
                    ids.append(x.size(0))
                else:
                    ids.append(zeros[0])
            ids = (torch.tensor(ids) - 1).unsqueeze(1).unsqueeze(2).repeat(1, 1, logits.size(2)).to(DEVICE)
            logits = logits.gather(1, ids)[:, 0, :].detach().cpu()
            options = tokenizer(self.options, return_tensors='pt')['input_ids'][:, 1]
            if 'ref' in self.dataset[0]:
                options = tokenizer([self.dataset[i]['good'], self.dataset[i]['bad']])['input_ids']
                options = torch.tensor([options[0][1], options[1][1]])
            preds.extend(logits[:, options].argmax(dim=1))
            batch = None
            logits = None
            gc.collect()
            torch.cuda.empty_cache()
        print(torch.sum(torch.tensor(self.labels) == torch.tensor(preds))/len(self))
        return self.labels, preds

In [5]:
import random
import tensorflow_datasets as tfds
import datasets


def axb_dataset(examples_cnt=0):
    dataset = load_dataset("super_glue", "axb", split='test').shuffle(seed=0)
    prompt = """Question: Do sentences '{0[sentence1]}' and '{0[sentence2]}' mean the same thing? Answer:"""
    return BenchDataset(
        dataset,
        prompt,
        examples_cnt,
        ['Yes', 'No']
    )

def sst2_dataset(examples_cnt=0, split='validation'):
    dataset = load_dataset("stanfordnlp/sst2", split=split).shuffle(seed=0)
    prompt = """{0[sentence]} Is the review positive? Answer:"""
    return BenchDataset(
        dataset,
        prompt,
        examples_cnt,
        ['No', 'Yes']
    )

def boolq_dataset(examples_cnt=0, split='validation'):
    dataset = load_dataset("super_glue", "boolq", split=split).shuffle(seed=0).map(lambda x: {'passage':x['passage'][:400]})
    if split == 'validation':
        dataset = dataset.take(1000)
    prompt = """{0[passage]}\nQuestion: {0[question]}?\nAnswer:"""
    return BenchDataset(
        dataset,
        prompt,
        examples_cnt,
        ['No', 'Yes']
    )

def qnli_dataset(examples_cnt=0, split='validation'):
    dataset = tfds.load('glue/qnli', split=split)
    labels = [data['label'] for data in dataset]
    questions = [data['question'].numpy().decode('utf-8') for data in dataset]
    sentences = [data['sentence'].numpy().decode('utf-8') for data in dataset]
    dataset = datasets.Dataset.from_dict({
        'label': labels,
        'question': questions,
        'sentence': sentences,
    }).shuffle(seed=0).map(lambda x: {'sentence':x['sentence'][:200]})
    if split == 'validation':
        dataset = dataset.take(1000)
    prompt = """Request: {0[question]}\nResponse: {0[sentence]}\nQuestion: Is it suitable Response for the Request? Answer:"""
    return BenchDataset(
        dataset,
        prompt,
        examples_cnt,
        ['Yes', 'No']
    )

def irregular_forms_dataset(examples_cnt=0):
    dataset = concatenate_datasets([
        load_dataset("nyu-mll/blimp", name='irregular_past_participle_adjectives', split='train'),
        load_dataset("nyu-mll/blimp", name='irregular_past_participle_verbs', split='train'),
        load_dataset("nyu-mll/blimp", name='irregular_plural_subject_verb_agreement_1', split='train'),
        load_dataset("nyu-mll/blimp", name='determiner_noun_agreement_1', split='train'),
        load_dataset("nyu-mll/blimp", name='determiner_noun_agreement_irregular_1', split='train'),
        load_dataset("nyu-mll/blimp", name='determiner_noun_agreement_with_adjective_1', split='train'),
        load_dataset("nyu-mll/blimp", name='anaphor_gender_agreement', split='train'),
        load_dataset("nyu-mll/blimp", name='anaphor_number_agreement', split='train'),
    ])
    dataset = dataset.shuffle(seed=0)
    
    def mapper(x):
        good_split = x['sentence_good'].split(' ')
        bad_split = x['sentence_bad'].split(' ')
        vals = []
        sent = []
        for i in range(len(good_split)):
            if good_split[i] == bad_split[i]:
                sent.append(good_split[i])
            else:
                vals.append(good_split[i])
                vals.append(bad_split[i])
                sent.append('<unk>')
        ref = vals[0]
        label = random.randint(0, 1)
        if label == 1:
            vals[0], vals[1] = vals[1], vals[0]
        return {
            'sent': ' '.join(sent),
            'good': vals[0],
            'bad': vals[1],
            'label': label,
            'ref': ref,
        }

    dataset = dataset.map(mapper)
    prompt = """{0[sent]}"""

    return BenchDataset(
        dataset,
        prompt,
        examples_cnt,
        ['A', 'B']
    )
    

In [6]:
random.seed(0)
dataset = qnli_dataset()
# dataset1 = qnli_dataset(1)
# dataset5 = qnli_dataset(5)
irr_dataset = irregular_forms_dataset()
b_dataset = boolq_dataset()
s_dataset = sst2_dataset()
# train_dataset = irregular_forms_dataset(0, 'train')

2024-06-16 02:49:03.799047: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-06-16 02:49:04.712689: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-06-16 02:49:04.717884: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-06-16 02:49:0

Map:   0%|          | 0/5463 [00:00<?, ? examples/s]

Task:
Request: {0[question]}
Response: {0[sentence]}
Question: Is it suitable Response for the Request? Answer:


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Task:
{0[sent]}


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Task:
{0[passage]}
Question: {0[question]}?
Answer:
Task:
{0[sentence]} Is the review positive? Answer:


In [7]:
s_dataset.evaluate(model1, tokenizer)
b_dataset.evaluate(model1, tokenizer)
irr_dataset.evaluate(model1, tokenizer)
_ = dataset.evaluate(model1, tokenizer)

  0%|          | 0/872 [00:00<?, ?it/s]

  return {k: torch.tensor(v).to(device) for k, v in batch.items()}
100%|██████████| 872/872 [02:25<00:00,  5.99it/s]


tensor(0.5654)


100%|██████████| 1000/1000 [02:56<00:00,  5.68it/s]


tensor(0.6350)


100%|██████████| 8000/8000 [21:54<00:00,  6.08it/s]


tensor(0.7925)


100%|██████████| 1000/1000 [02:50<00:00,  5.86it/s]

tensor(0.5090)





In [10]:
# labels, preds = dataset1.evaluate(model0, tokenizer, batch_size=1)
# print(torch.sum(torch.tensor(labels) == torch.tensor(preds))/len(labels))

  return {k: torch.tensor(v).to(device) for k, v in batch.items()}
  0%|          | 0/999 [00:00<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 

In [68]:
# model = LlamaForCausalLM.from_pretrained('/root/llama_distillation/experiments/models/baseline').to(DEVICE)
# model = train_dataset.fine_tune(model, epochs=5, lr=1e-2)

In [9]:
labels, preds = dataset.evaluate(model, tokenizer)

  return {k: torch.tensor(v).to(device) for k, v in batch.items()}
100%|██████████| 1000/1000 [02:55<00:00,  5.69it/s]

tensor(0.5090)





In [10]:
labels, preds = dataset.evaluate(model2, tokenizer)

  0%|          | 0/1000 [00:00<?, ?it/s]

  return {k: torch.tensor(v).to(device) for k, v in batch.items()}
100%|██████████| 1000/1000 [02:56<00:00,  5.66it/s]

tensor(0.5070)





In [11]:
labels, preds = dataset.evaluate(model_1b, tokenizer_1b)

  0%|          | 0/1000 [00:00<?, ?it/s]

  return {k: torch.tensor(v).to(device) for k, v in batch.items()}
100%|██████████| 1000/1000 [03:56<00:00,  4.23it/s]

tensor(0.4980)





In [12]:
labels, preds = dataset.evaluate(model3, tokenizer_1b)

  0%|          | 0/1000 [00:00<?, ?it/s]

  return {k: torch.tensor(v).to(device) for k, v in batch.items()}
100%|██████████| 1000/1000 [03:48<00:00,  4.37it/s]

tensor(0.5050)





In [7]:
for i in range(100):
    tokens = batch_to_device(tokenizer(dataset.dataset[i]['prompt'], return_tensors='pt'))
    l = len(tokens['input_ids'][0])
    output = model.generate(
        **tokens,
        max_length=10 + l
    )
    print(tokenizer.batch_decode(output[:,l:]), dataset[i]['label'], l)

  return {k: torch.tensor(v).to(device) for k, v in batch.items()}


['\n\nA) The Roman period\nB)'] 1 62
['\nYes, the answer is suitable. According to'] 0 95
['\nNo, it is not a suitable answer.'] 0 89
['\n\nHistoric Heart of Newcastle upon'] 1 90
['\n\nNo, it is not a suitable answer'] 1 57
['\n\nNo, it is not suitable answer.'] 0 48
['\n\nYes, that is a suitable answer.'] 1 61
['\nA: Ayman al-Zawah'] 0 67
['\nYes, that is correct! Tesla'] 0 64
['\n\n\nAnswer:\nYes, UMC'] 0 38
['\n\nA) Glyceraldehy'] 1 68
['\n\nThe answer is: P&O Cru'] 1 95
['No, the Victoria and Albert Museum was founded in'] 1 35


KeyboardInterrupt: 

In [32]:
sum(preds), sum(labels)

(0, 497)

In [19]:
import torch

class LogitsDataset(torch.utils.data.Dataset):
    def __init__(self, dir_path):
        super(LogitsDataset).__init__()
        self.ids = torch.load(f"{dir_path}/input_ids.pt")
        self.masks = torch.load(f"{dir_path}/attention_masks.pt")
        self.indices = torch.load(f"{dir_path}/indices.pt")#[:, :, :10]
        self.values = torch.load(f"{dir_path}/values.pt")#[:, :, :10]

    def __len__(self):
        return len(self.ids)

    def __getitem__(self, idx):
        return {
            'input_ids': self.ids[idx],
            'attention_mask': self.masks[idx],
            'indices': self.indices[idx],
            'values': self.values[idx],
        }

In [20]:
lds = LogitsDataset('/root/logits100/chunk0')
tokenizer.batch_decode(lds[:100]['input_ids'])

['<s> a0x0040550atestb_al_al[label="0x0040550a\\ntestb %al, %al"];\n a0x0040550cje_0x00405513[label="0x0040550c\\nje 0x00405513"];\n a0x00405513call_0x00402ef4[label="0x00405513\\ncall 0x00402ef4"];\n a0x00402ef4fninit_[label="0x00402ef4\\nfninit "];\n a0x00402ef6fwait_[label="0x00402ef6\\nfwait "];</s><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk>',
 '<s> * router: added optional :ref:`upstream logs <envoy_api_field_config.filter.http.router.v2.Router.upstream_log>`.\n * router: added complete :ref:`custom append/override/remove support\n   <config_http_conn_man_headers_custom_request_headers>` of request/response headers.\n * router: added support to :ref:`specify response code during redirect\n   <envoy_api_field_route.RedirectAction.response_code>`.\n * router: added :ref:`configuration <env

In [2]:
from datasets import load_from_disk

dataset = load_from_disk('/root/dolma_dataset_500k')

In [5]:
set(dataset['source'])

{'c4', 'common-crawl', 'gutenberg', 'reddit', 's2', 'stack-dedup', 'wikipedia'}

In [23]:
a = torch.rand(3, 4).unsqueeze((2, 3))
a

TypeError: unsqueeze(): argument 'dim' (position 1) must be int, not tuple

In [21]:
a.repeat(1, 1, 3).shape

torch.Size([3, 4, 3])

In [79]:
a = torch.tensor([[1, 2, 2], [2, 4, 2]])
ans = []
for x in a:
    ans.append((x == 2).nonzero(as_tuple=True)[0][0])
torch.tensor(ans)

tensor([1, 0])