In [2]:
from transformers import AutoTokenizer, TrainingArguments, Trainer
from transformers import TrainingArguments
from transformers import AutoModelForCausalLM
import datasets
import random
import logging
import torch

In [None]:
logger = logging.getLogger(__name__)

dataset_name = "mini_dataset.jsonl"
# dataset_path = f".\\{dataset_name}"
dataset_path = 'dataset.jsonl'
use_hf = False
model_name = "EleutherAI/pythia-70m"

training_config = {
    "model": {
        "pretrained_name": model_name,
        "max_length" : 2048
    },
    "datasets": {
        "use_hf": use_hf,
        "path": dataset_path
    },
    "verbose": True
}

def tokenize_and_split_data(training_config, tokenizer):
  dataset_path = training_config["datasets"]["path"]
  use_hf = training_config["datasets"]["use_hf"]
  print("tokenize", use_hf, dataset_path)
  if use_hf:
    dataset = datasets.load_dataset(dataset_path)
  else:
    dataset = load_dataset(dataset_path, tokenizer)
  train_dataset = dataset["train"]
  test_dataset = dataset["test"]
  return train_dataset, test_dataset

# Tokenize and split data
def load_dataset(dataset_path, tokenizer):
    random.seed(42)
    finetuning_dataset_loaded = datasets.load_dataset("json", data_files=dataset_path, split="train")
    tokenizer.pad_token = tokenizer.eos_token
    max_length = training_config["model"]["max_length"]
    tokenized_dataset = finetuning_dataset_loaded.map(
        get_tokenize_function(tokenizer, max_length), # returns tokenize_function
        batched=True,
        batch_size=1,
        drop_last_batch=True
    )
    tokenized_dataset = tokenized_dataset.with_format("torch")
    split_dataset = tokenized_dataset.train_test_split(test_size=0.1, shuffle=True, seed=123)
    return split_dataset

# Get function for tokenization, based on config parameters
def get_tokenize_function(tokenizer, _max_length):

  def tokenize_function(examples):
    max_length = _max_length
    tokenizer.pad_token = tokenizer.eos_token

    if "question" in examples and "answer" in examples:
      text = examples["question"][0] + examples["answer"][0]
    elif "input" in examples and "output" in examples:
      text = examples["input"][0] + examples["output"][0]
    else:
      text = examples["text"][0]

    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        padding=True,
    )

    max_length = min(
        tokenized_inputs["input_ids"].shape[1],
        max_length
    )

    if tokenized_inputs["input_ids"].shape[1] > max_length:
        logger.warn(
            f"Truncating input from {tokenized_inputs['input_ids'].shape[1]} to {max_length}"
        )

    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        truncation=True,
    )
    tokenized_inputs["labels"] = tokenized_inputs["input_ids"]
    return tokenized_inputs
  
  return tokenize_function

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
train_dataset, test_dataset = tokenize_and_split_data(training_config, tokenizer)

print(train_dataset)
print(test_dataset)

tokenize False C:\Users\hasee\Desktop\Chabot\finetuning\course\mini_dataset.jsonl
Dataset({
    features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 45018
})
Dataset({
    features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 5002
})


In [19]:
def inference(text, model, tokenizer, max_input_tokens=1000, max_output_tokens=100):
  # Tokenize
  input_ids = tokenizer.encode(
          text,
          return_tensors="pt",
          truncation=True,
          max_length=max_input_tokens
  )

  # Generate
  device = model.device
  generated_tokens_with_prompt = model.generate(
    input_ids=input_ids.to(device),
    max_length=max_output_tokens
  )

  # Decode
  generated_text_with_prompt = tokenizer.batch_decode(generated_tokens_with_prompt, skip_special_tokens=True)

  # Strip the prompt
  generated_text_answer = generated_text_with_prompt[0][len(text):]

  return generated_text_answer

In [20]:
device_count = torch.cuda.device_count()
if device_count > 0:
    logger.debug("Select GPU device")
    device = torch.device("cuda")
else:
    logger.debug("Select CPU device")
    device = torch.device("cpu")

In [21]:
base_model = AutoModelForCausalLM.from_pretrained(model_name)
base_model.to(device)

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 512)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-5): 6 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXSdpaAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=512, out_features=1536, bias=True)
          (dense): Linear(in_features=512, out_features=512, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=512, out_features=2048, bias=True)
          (dense_4h_to_h): Linear(in_features=2048, out_features=512, bias=True)
        

In [22]:
test_text = test_dataset[0]['question']
print("Question input (test):", test_text)
print(f"Correct answer from Lamini docs: {test_dataset[0]['answer']}")
print("Model's answer: ")
print(inference(test_text, base_model, tokenizer))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Question input (test): What advancements in learning algorithms are inspired by the principles underlying DBNs?
Correct answer from Lamini docs: The principles of unsupervised pre-training and layer-wise learning in DBNs inspire the development of novel learning algorithms that aim to capture hierarchical representations in deep learning networks more effectively. ;;

Model's answer: 


The first step in the development of DBNs is to learn the basic concepts of the DBNs. The basic concepts are:

1.  The basic concepts are:

1.  The basic concepts are:

1.  The basic concepts are:

1.  The basic concepts are:

1.  The basic concepts are:

1.  The basic concepts


In [23]:
finetuned_slightly_model = AutoModelForCausalLM.from_pretrained("C:\\Users\\hasee\\Desktop\\Chabot\\finetuning\\lamini_docs_3_steps\\final", local_files_only=True)
finetuned_slightly_model.to(device) 


GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 512)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-5): 6 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXSdpaAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=512, out_features=1536, bias=True)
          (dense): Linear(in_features=512, out_features=512, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=512, out_features=2048, bias=True)
          (dense_4h_to_h): Linear(in_features=2048, out_features=512, bias=True)
        

In [24]:
test_question = test_dataset[0]['question']
print("Question input (test):", test_question)

print("Finetuned slightly model's answer: ")
print(inference(test_question, finetuned_slightly_model, tokenizer))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Question input (test): What advancements in learning algorithms are inspired by the principles underlying DBNs?
Finetuned slightly model's answer: 


The first step in learning algorithms is to learn the DBNs. The DBNs are then used to learn the DBNs. The DBNs are then used to learn the DBNs. The DBNs are then used to learn the DBNs. The DBNs are then used to learn the DBNs. The DBNs are then used to learn the DBNs. The DBNs are then used to learn the DB


In [25]:
test_answer = test_dataset[0]['answer']
print("Target answer output (test):", test_answer)

Target answer output (test): The principles of unsupervised pre-training and layer-wise learning in DBNs inspire the development of novel learning algorithms that aim to capture hierarchical representations in deep learning networks more effectively. ;;



In [None]:
finetuned_model = AutoModelForCausalLM.from_pretrained("/scratch/wej36how/RAG2/scratch/wej36how/RAG2/lamini_docs_8_epochs/final", local_files_only=True)
#finetuned_model.to(device) 

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 512)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-5): 6 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXSdpaAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=512, out_features=1536, bias=True)
          (dense): Linear(in_features=512, out_features=512, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=512, out_features=2048, bias=True)
          (dense_4h_to_h): Linear(in_features=2048, out_features=512, bias=True)
        

In [29]:
test_question = test_dataset[10]['question']
print("Question input (test):", test_question)

print("Finetuned full model's answer: ")
print(inference(test_question, finetuned_model, tokenizer))

test_answer = test_dataset[10]['answer']
print("Target answer output (test):", test_answer)

Question input (test): "What role does socioeconomic status play in influencing the recommendations presented to users?
Finetuned full model's answer: 


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


 ""Sla. Socioeconomic status  leading to higher recommendations as users are more likely to purchase specific products  resulting in higher recommendations as recommendations fill the gap between user preferences and purchasing decisions  increasing user satisfaction and engagement with the platform and driving sales trends in real-world applications of this technology  thereby influencing their purchasing habits and driving sales growth and revenue growth by shaping user habits and influencing their purchasing habits accordingly  
Target answer output (test): ""Socioeconomic status provides valuable context about the user's purchasing power  lifestyle  and preferences  which the recommendation system considers when making recommendations  ensuring that the suggestions are relevant  accessible  and aligned with the user's financial capabilities and lifestyle choices."" ";;

