In [2]:
import itertools
import jsonlines
from pprint import pprint

from datasets import load_dataset

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
instruction_tuned_dataset = load_dataset("tatsu-lab/alpaca", split="train", streaming=True)


In [4]:
num_samples = 20
print("Datasets Samples")
top_m = list(itertools.islice(instruction_tuned_dataset, num_samples))
for sample in top_m:
    print(sample)

Datasets Samples
{'instruction': 'Give three tips for staying healthy.', 'input': '', 'output': '1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule.', 'text': 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nGive three tips for staying healthy.\n\n### Response:\n1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule.'}
{'instruction': 'What are the three primary colors?', 'input': '', 'output': 'The three primary colors are red, blue, and yellow.', 'text': 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nWhat are the three primary col

In [5]:
prompt_template_with_input = """""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction: 
{instruction}

### Input:
{input}

### Response:"""


prompt_template_without_input = """""Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
{instruction}

### Response:"""

In [6]:
processed_data = []
for sample in top_m:
    if sample["input"]:
        prompt = prompt_template_with_input.format(instruction=sample["instruction"], input=sample["input"])
    else:
        prompt = prompt_template_without_input.format(instruction=sample["instruction"])

    processed_data.append({"prompt": prompt, "output": sample["output"]})

In [7]:
pprint(processed_data[9])

{'output': 'He finished his meal and left the restaurant.',
 'prompt': '""Below is an instruction that describes a task, paired with an '
           'input that provides further context. Write a response that '
           'appropriately completes the request.\n'
           '\n'
           '### Instruction: \n'
           'Evaluate this sentence for spelling and grammar mistakes\n'
           '\n'
           '### Input:\n'
           'He finnished his meal and left the resturant\n'
           '\n'
           '### Response:'}


In [8]:
with jsonlines.open("alpaca_instruction_tuned.jsonl", "w") as writer:
    writer.write_all(processed_data)

In [9]:
### lamini/alpaca
dataset_path_hf = "lamini/alpaca"
dataset_hf = load_dataset(dataset_path_hf)
print(dataset_hf)

DatasetDict({
    train: Dataset({
        features: ['input', 'output'],
        num_rows: 52002
    })
})


In [10]:
top_n = itertools.islice(dataset_hf["train"], num_samples)
pprint(top_n.__next__())

{'input': 'Below is an instruction that describes a task. Write a response '
          'that appropriately completes the request.\n'
          '\n'
          '### Instruction:\n'
          'Give three tips for staying healthy.\n'
          '\n'
          '### Response:',
 'output': '1.Eat a balanced diet and make sure to include plenty of fruits '
           'and vegetables. \n'
           '2. Exercise regularly to keep your body active and strong. \n'
           '3. Get enough sleep and maintain a consistent sleep schedule.'}


In [11]:
from transformers import pipeline
import torch

In [12]:
device = torch.device("mps" if torch.mps.is_available() else "cpu")
input = "Tell me how to teach my cat to high five."
device

device(type='mps')

In [13]:
# non_tuned_model = pipeline("text-generation", model="mistralai/Mistral-7B-v0.1")
# non_tuned_model.to(device)


In [14]:
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-70m")
model = AutoModelForCausalLM.from_pretrained("EleutherAI/pythia-70m")
model.to(device)

The `GPTNeoXSdpaAttention` class is deprecated in favor of simply modifying the `config._attn_implementation`attribute of the `GPTNeoXAttention` class! It will be removed in v4.48


GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 512)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-5): 6 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXSdpaAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=512, out_features=1536, bias=True)
          (dense): Linear(in_features=512, out_features=512, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=512, out_features=2048, bias=True)
          (dense_4h_to_h): Linear(in_features=2048, out_features=512, bias=True)
        

In [15]:
def inference(text, model, tokenizer, max_input_length=1024, max_output_length=1000):
    inputs = tokenizer(text, return_tensors="pt", max_length=max_input_length, truncation=True).to(device)
    outputs = model.generate(**inputs.to(device), max_length=max_output_length)
    response_with_prompt = tokenizer.decode(outputs[0], skip_special_tokens=True)
    response = response_with_prompt[0][len(text):]
    return response_with_prompt, response

In [None]:
finetuning_dataset_path = "lamini/lamini_docs"
finetuning_dataset = load_dataset(finetuning_dataset_path)
print(finetuning_dataset)

In [None]:
test_sample = finetuning_dataset["test"][0]
pprint(test_sample)

a,b = inference(test_sample["question"], model, tokenizer)
print('\n\n')

pprint(a)
print('\n\n')
pprint(b)

In [16]:
torch.mps.empty_cache()

In [17]:
instruction_model = AutoModelForCausalLM.from_pretrained("lamini/lamini_docs_finetuned")
instruction_model.to(device)

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 512)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-5): 6 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXSdpaAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=512, out_features=1536, bias=True)
          (dense): Linear(in_features=512, out_features=512, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=512, out_features=2048, bias=True)
          (dense_4h_to_h): Linear(in_features=2048, out_features=512, bias=True)
        

In [None]:
a, b = inference(test_sample["question"], instruction_model, tokenizer)
print('\n\n')
pprint(a)
print('\n\n')
pprint

In [18]:
torch.mps.empty_cache()

#############

In [20]:
list_texts = ['Hi, how are good?','Yes, indeed.', 'I am good']
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-70m")
encoded_texts = tokenizer(list_texts)
print(encoded_texts["input_ids"])

[[12764, 13, 849, 403, 1175, 32], [4374, 13, 6296, 15], [42, 717, 1175]]


In [21]:
tokenizer.pad_token = tokenizer.eos_token
encoded_pad_longest = tokenizer(list_texts, padding=True)
print(encoded_pad_longest["input_ids"])

[[12764, 13, 849, 403, 1175, 32], [4374, 13, 6296, 15, 0, 0], [42, 717, 1175, 0, 0, 0]]


In [24]:
encoded_truncation = tokenizer(list_texts, max_length=5, truncation=True)
print(encoded_truncation["input_ids"])

[[12764, 13, 849, 403, 1175], [4374, 13, 6296, 15], [42, 717, 1175]]


In [None]:
encoded = tokenizer(list_texts, padding=True, max_length=10, truncation=True)

In [25]:
# import pandas as pd

# filename = "lamini_docs.jsonl"
# instruction_dataset_df = pd.read_json(filename, lines=True)
# examples = instruction_dataset_df.to_dict()

# if "question" in examples and "answer" in examples:
#   text = examples["question"][0] + examples["answer"][0]
# elif "instruction" in examples and "response" in examples:
#   text = examples["instruction"][0] + examples["response"][0]
# elif "input" in examples and "output" in examples:
#   text = examples["input"][0] + examples["output"][0]
# else:
#   text = examples["text"][0]

# prompt_template = """### Question:
# {question}

# ### Answer:"""

# num_examples = len(examples["question"])
# finetuning_dataset = []
# for i in range(num_examples):
#   question = examples["question"][i]
#   answer = examples["answer"][i]
#   text_with_prompt_template = prompt_template.format(question=question)
#   finetuning_dataset.append({"question": text_with_prompt_template, "answer": answer})

# from pprint import pprint
# print("One datapoint in the finetuning dataset:")
# pprint(finetuning_dataset[0])

  instruction_dataset_df = pd.read_json(filename, lines=True)


ValueError: Expected object or value

In [28]:
finetuning_dataset_name = "lamini/lamini_docs"
finetuning_dataset = load_dataset(finetuning_dataset_name, split="train")
pprint(finetuning_dataset[1])

{'answer': 'Yes, the code includes methods for submitting jobs, checking job '
           'status, and retrieving job results. It also includes a method for '
           'canceling jobs. Additionally, there is a method for sampling '
           'multiple outputs from a model, which could be useful for '
           'long-running tasks.',
 'attention_mask': [1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                 

In [30]:
text = finetuning_dataset[0]["question"] + finetuning_dataset[0]["answer"]
print(text)
tokenized_inputs = tokenizer(
    text,
    return_tensors="np",
    padding=True
)
print(tokenized_inputs["input_ids"])

How can I evaluate the performance and quality of the generated text from Lamini models?There are several metrics that can be used to evaluate the performance and quality of generated text from Lamini models, including perplexity, BLEU score, and human evaluation. Perplexity measures how well the model predicts the next word in a sequence, while BLEU score measures the similarity between the generated text and a reference text. Human evaluation involves having human judges rate the quality of the generated text based on factors such as coherence, fluency, and relevance. It is recommended to use a combination of these metrics for a comprehensive evaluation of the model's performance.
[[ 2347   476   309  7472   253  3045   285  3290   273   253  4561  2505
    432   418  4988    74  3210    32  2512   403  2067 17082   326   476
    320   908   281  7472   253  3045   285  3290   273  4561  2505   432
    418  4988    74  3210    13  1690 44229   414    13   378  1843    54
   4868    1

In [31]:
max_length = 2048
max_length = min(
    tokenized_inputs["input_ids"].shape[1],
    max_length,
)
print(max_length)

133


In [32]:
tokenized_inputs = tokenizer(
    text,
    return_tensors="np",
    truncation=True,
    max_length=max_length
)
print(tokenized_inputs["input_ids"])

[[ 2347   476   309  7472   253  3045   285  3290   273   253  4561  2505
    432   418  4988    74  3210    32  2512   403  2067 17082   326   476
    320   908   281  7472   253  3045   285  3290   273  4561  2505   432
    418  4988    74  3210    13  1690 44229   414    13   378  1843    54
   4868    13   285  1966  7103    15  3545 12813   414  5593   849   973
    253  1566 26295   253  1735  3159   275   247  3425    13  1223   378
   1843    54  4868  5593   253 14259   875   253  4561  2505   285   247
   3806  2505    15  8801  7103  8687  1907  1966 16006  2281   253  3290
    273   253  4561  2505  1754   327  2616   824   347 25253    13  2938
   1371    13   285 17200    15   733   310  8521   281   897   247  5019
    273   841 17082   323   247 11088  7103   273   253  1566   434  3045
     15]]


In [33]:
def tokenize_function(examples):
    if "question" in examples and "answer" in examples:
      text = examples["question"][0] + examples["answer"][0]
    elif "input" in examples and "output" in examples:
      text = examples["input"][0] + examples["output"][0]
    else:
      text = examples["text"][0]

    tokenizer.pad_token = tokenizer.eos_token
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        padding=True,
    )

    max_length = min(
        tokenized_inputs["input_ids"].shape[1],
        2048
    )
    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        truncation=True,
        max_length=max_length
    )

    return tokenized_inputs

In [41]:
tokenized_dataset = finetuning_dataset.map(
    tokenize_function,
    batched=True,
    batch_size=1,
    drop_last_batch=True,
)
print(tokenized_dataset)

Dataset({
    features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1260
})


In [42]:
# tokenized_dataset = tokenized_dataset.add_column("labels", tokenized_dataset["input_ids"])
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, shuffle=True, seed=123)
print(split_dataset)

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1134
    })
    test: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 126
    })
})
