## Setup

In [1]:
!pip install rouge_score bert_score datasets peft bitsandbytes accelerate

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft
  Downloading peft-0.10.0-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.1/199.1 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bitsandbytes
  Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl (119.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m37.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading a

In [18]:
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.tokenization_utils_base import BatchEncoding
import torch
import logging
import string
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from nltk.translate.bleu_score import SmoothingFunction
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
from bert_score import score
from datasets import load_dataset, load_metric
from tqdm.auto import tqdm
import json
from peft import AutoPeftModelForCausalLM

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
logging.basicConfig(level=logging.INFO)

## Load Model


In [19]:
model_name = 'javijer/llama2_alpaca_7b'
model = AutoModelForCausalLM.from_pretrained(
    model_name, # YOUR MODEL YOU USED FOR TRAINING
    load_in_4bit = False,
    output_hidden_states=True,
)

tokenizer = AutoTokenizer.from_pretrained(model_name, token='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')

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

## Load Dataset

In [6]:
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

from datasets import load_dataset
dataset = load_dataset("yahma/alpaca-cleaned", split = "train")
dataset

Downloading readme:   0%|          | 0.00/11.6k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/44.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/51760 [00:00<?, ? examples/s]

Dataset({
    features: ['output', 'input', 'instruction'],
    num_rows: 51760
})

In [20]:
import random

random.seed(1)

test_indexes = random.sample(range(len(dataset)), 20)
test_indexes

[8805,
 37303,
 50054,
 4135,
 16716,
 7727,
 32468,
 49870,
 29457,
 30949,
 42702,
 24878,
 51689,
 13759,
 6151,
 31972,
 1857,
 25546,
 28361,
 39809]

In [21]:
EOS_TOKEN = tokenizer.eos_token
def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    inputs       = examples["input"]
    outputs      = examples["output"]
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }
pass

In [22]:
test_dataset = dataset.select(test_indexes)
test_dataset = test_dataset.map(formatting_prompts_func, batched = True,)

In [27]:
def generate_text(
    model: AutoModelForCausalLM,
    tokenizer: AutoTokenizer,
    prompt: str,
    max_num_tokens: int = 25,
    top_k: int = 5,
    layer: int = 8,
    temperature: float = 1.0,
    stop_token_ids: list = [],
    stop_words: list = [],
    eos_weight: float = 2.0,
    enable_logging: bool = False
) -> str:
    """
    Generate text using a language model.

    Args:
        model (model): The language model.
        tokenizer (model): The tokenizer corresponding to the model.
        prompt (str): The initial text to start generation from.
        num_tokens (int, optional): The number of tokens to generate. Defaults to 5.
        top_k (int, optional): The number of top tokens to consider for sampling. Defaults to 5.
        layer (int, optional): The layer of the model to use for generation. Defaults to 8.
        temperature (float, optional): The temperature for softmax. Defaults to 1.0.
        stop_token_ids (list, optional): List of token ids that will end generation if sampled. Defaults to [].
        stop_words (list, optional): List of words that will end generation if sampled. Defaults to [].
        eos_weight (float, optional): The weight to assign to the EOS token. Defaults to 2.0.
        enable_logging (bool, optional): Enable logging for debugging. Defaults to False.

    Returns:
        str: The generated text.
    """
    # Move model to GPU if available
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    print(layer)
    # Tokenize the prompt
    inputs = tokenizer(prompt, return_tensors="pt")
    inputs.to(device)

    # Get the EOS token ID
    eos_token_id = tokenizer.eos_token_id
    stop_token_ids.append(eos_token_id)
    output_tokens = []

    # Generate num_tokens tokens
    for _ in range(max_num_tokens):
        # Forward pass through the model
        outputs = model(**inputs)

        # Get the output of the specified layer
        layer_output = outputs.hidden_states[layer]

        # Pass the output through the final linear layer
        logits = model.lm_head(layer_output)

        # Apply softmax to get probabilities
        probabilities = F.softmax(logits / temperature, dim=-1)

        # Increase the weight of the EOS token
        probabilities[0, -1, eos_token_id] *= eos_weight

        # Get the probabilities of the top k tokens
        top_k_probabilities, top_k_indices = torch.topk(probabilities[0, -1], top_k)

        # Normalize the top k probabilities
        top_k_probabilities /= torch.sum(top_k_probabilities)

        # Sample from the top k probability distribution
        sampled_token_id = top_k_indices[torch.multinomial(top_k_probabilities, 1)].item()
        output_tokens.append(sampled_token_id)

        # Decode the token id back into text
        sampled_token_text = tokenizer.decode([sampled_token_id])
        sampled_token = {'input_ids': torch.tensor([[1, sampled_token_id]]), 'attention_mask': torch.tensor([[1, 1]])}

        # If the sampled token is a stop token or stop word, return the generated text
        if sampled_token_id in stop_token_ids or sampled_token_text in stop_words:
            generated_text = tokenizer.decode(inputs['input_ids'][0])
            pure_output = tokenizer.decode(output_tokens)
            return {'text':generated_text,'output':pure_output,'prompt':prompt}

        combined_input_ids = torch.cat((inputs['input_ids'].detach().cpu(), sampled_token['input_ids'][:,1:]), dim=-1)
        combined_attention_mask = torch.cat((inputs['attention_mask'].detach().cpu(), sampled_token['attention_mask'][:,1:]), dim=-1)

        inputs = BatchEncoding({'input_ids': combined_input_ids, 'attention_mask': combined_attention_mask}).to(device)

        # Log the token and top k tokens if logging is enabled
        if enable_logging:
            print(f'Token: {sampled_token_text}')
            print('      -- Top tokens --')
            for i in range(top_k):
                token = tokenizer.decode([top_k_indices[i].item()])
                probability = top_k_probabilities[i].item()
                print(f'   {token}: {probability}')

            print()
    generated_text = tokenizer.decode(inputs['input_ids'][0])
    pure_output = tokenizer.decode(output_tokens)
    return {'text':generated_text,'output':pure_output,'prompt':prompt}

In [None]:
responses = []
for layer in range(32):
  responses.append([])
  i=0
  for instruction, input in zip(test_dataset['instruction'], test_dataset['input']):
    if i >= 10: break
    i+=1
    prompt = alpaca_prompt.format(
      instruction,
      input,
      "",
    )

    output = generate_text(model, tokenizer, prompt, top_k = 5, temperature = 0.1, max_num_tokens=256, layer = layer, enable_logging=True)
    responses[layer].append(output)
  break

In [5]:
prompt = "We are"
# print(get_top_n_tokens(model, tokenizer, prompt, n=10, layer=-1))
output = generate_text(model, tokenizer, prompt, max_num_tokens = 20, layer = 32, temperature = 0.2, stop_words=['\n'], enable_logging=True)

32




Token: excited
      -- Top tokens --
   a: 0.5791015625
   excited: 0.39794921875
   pleased: 0.0224609375
   committed: 0.0004112720489501953
   thr: 0.0003628730773925781

Token: to
      -- Top tokens --
   to: 1.0
   </s>: 0.0
   <unk>: 0.0
    : 0.0
   <s>: 0.0

Token: announ
      -- Top tokens --
   announ: 1.0
   share: 4.5418739318847656e-05
   introduce: 2.980232238769531e-07
   <s>: 0.0
   <unk>: 0.0

Token: ce
      -- Top tokens --
   ce: 1.0
   </s>: 0.0
   <unk>: 0.0
    : 0.0
   <s>: 0.0

Token: that
      -- Top tokens --
   that: 0.9990234375
   the: 0.00075531005859375
   our: 2.384185791015625e-07
   <s>: 0.0
   <unk>: 0.0

Token: we
      -- Top tokens --
   we: 0.4296875
   the: 0.32421875
   our: 0.244873046875
   registration: 0.0011701583862304688
   this: 1.1920928955078125e-06

Token: will
      -- Top tokens --
   will: 0.95751953125
   have: 0.042083740234375
   are: 0.00036406517028808594
   <s>: 0.0
   <unk>: 0.0

Token: be
      -- Top tokens --
   be: 