# Training

## Technically, it's only a few lines of code to run on GPUs (elsewhere, ie. on Lamini).
```
from llama import BasicModelRunner

model = BasicModelRunner("EleutherAI/pythia-410m") 
model.load_data_from_jsonlines("lamini_docs.jsonl", input_key="question", output_key="answer")
model.train(is_public=True) 


```
1. Choose base model.
2. Load data.
3. Train it. Returns a model ID, dashboard, and playground interface.

### Let's look under the hood at the core code running this! This is the open core of Lamini's `llama` library :)

In [1]:
import os

# Set the CUDA_VISIBLE_DEVICES environment variable
os.environ["CUDA_VISIBLE_DEVICES"] = "MIG-a4a538a2-a199-58d8-9e56-ed86db02edf8"
# os.environ["CUDA_VISIBLE_DEVICES"] = "MIG-31af3e02-f5e5-560e-8ea3-4350d43cb5f9"
# os.environ["CUDA_VISIBLE_DEVICES"] = "MIG-796df017-2199-561d-ac76-8d637f0672e3"

In [2]:
import torch
torch.cuda.is_available()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [3]:
import datasets
import tempfile
import logging
import random
import config
import os
import yaml
import time
import torch
import transformers
import pandas as pd
import jsonlines

from utilities import *
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM
from transformers import TrainingArguments
from transformers import AutoModelForCausalLM
from llama import BasicModelRunner


logger = logging.getLogger(__name__)
global_config = None

  from .autonotebook import tqdm as notebook_tqdm


### Load the Lamini docs dataset

In [4]:
dataset_name = "output.jsonl"
dataset_path = f"{dataset_name}"
use_hf = False

In [5]:
# dataset_path = "lamini/lamini_docs"
# # dataset_path = "shahrukh95/cyber-short"

# use_hf = True

### Set up the model, training config, and tokenizer

In [6]:
model_name = "EleutherAI/pythia-410m"

# model_name = "cerebras/Cerebras-GPT-111M"

In [7]:
training_config = {
    "model": {
        "pretrained_name": model_name,
        "max_length" : 2048
    },
    "datasets": {
        "use_hf": use_hf,
        "path": dataset_path
    },
    "verbose": True
}

import utilities
utilities.set_training_config(training_config)

In [8]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
train_dataset, test_dataset = tokenize_and_split_data(training_config, tokenizer)

print(train_dataset)
print(test_dataset)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
2024-01-01 23:20:52,170 - DEBUG - utilities - Config: datasets.path: output.jsonl
datasets.use_hf: false
model.max_length: 2048
model.pretrained_name: EleutherAI/pythia-410m
verbose: true



tokenize False output.jsonl


2024-01-01 23:20:52,872 - DEBUG - fsspec.local - open file: /home/sahsan/.cache/huggingface/datasets/json/default-df586969b4ef71b0/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/dataset_info.json
2024-01-01 23:20:52,900 - DEBUG - fsspec.local - open file: /home/sahsan/.cache/huggingface/datasets/json/default-df586969b4ef71b0/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/dataset_info.json


Dataset({
    features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 62
})
Dataset({
    features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 7
})


In [9]:
# print(train_dataset[1])
# print(test_dataset[1])

In [10]:
print(train_dataset[1])
print(test_dataset[1])

{'question': 'What is the CVE ID of the Windows Common Log File System Driver vulnerability disclosed in August 2023?', 'answer': 'The CVE ID of the Windows Common Log File System Driver vulnerability disclosed in August 2023 is CVE-2023-36900.', 'input_ids': tensor([ 1276,   310,   253,   330, 12695,  5417,   273,   253,  7464, 10200,
         8192,  8490,  4155, 32911, 24189, 10557,   275,  4223,  1384,  1508,
           32,   510,   330, 12695,  5417,   273,   253,  7464, 10200,  8192,
         8490,  4155, 32911, 24189, 10557,   275,  4223,  1384,  1508,   310,
          330, 12695,    14,   938,  1508,    14,  1812, 18104,    15]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1]), 'labels': tensor([ 1276,   310,   253,   330, 12695,  5417,   273,   253,  7464, 10200,
         8192,  8490,  4155, 32911, 24189, 10557,   275,  4223,  1384,  1508

### Load the base model

In [11]:
base_model = AutoModelForCausalLM.from_pretrained(model_name)

In [12]:
device_count = torch.cuda.device_count()
if device_count > 0:
    logger.debug("Select GPU device")
    device = torch.device("cuda")
else:
    logger.debug("Select CPU device")
    device = torch.device("cpu")

2024-01-01 23:20:53,581 - DEBUG - __main__ - Select GPU device


In [13]:
base_model.to(device)

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 1024)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-23): 24 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=1024, out_features=3072, bias=True)
          (dense): Linear(in_features=1024, out_features=1024, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=1024, out_features=4096, bias=True)
          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
  

### Define function to carry out inference

In [14]:
def inference(text, model, tokenizer, max_input_tokens=1000, max_output_tokens=100):
  # Tokenize
  input_ids = tokenizer.encode(
          text,
          return_tensors="pt",
          truncation=True,
          max_length=max_input_tokens
  )

  # Generate
  device = model.device
  generated_tokens_with_prompt = model.generate(
    input_ids=input_ids.to(device),
    max_length=max_output_tokens
  )

  # Decode
  generated_text_with_prompt = tokenizer.batch_decode(generated_tokens_with_prompt, skip_special_tokens=True)

  # Strip the prompt
  generated_text_answer = generated_text_with_prompt[0][len(text):]

  return generated_text_answer

### Try the base model

In [15]:
test_text = test_dataset[0]['question']
print("Question input (test):", test_text)
print(f"Correct answer from Lamini docs: {test_dataset[0]['answer']}")
print("Model's answer: ")
print(inference(test_text, base_model, tokenizer))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Question input (test): Can you describe the security issue identified by CVE-2021-25749?
Correct answer from Lamini docs: CVE-2021-25749 describes a security issue where Windows workloads that are expected to run as a non-root user can unexpectedly run as ContainerAdministrator even when the 'runAsNonRoot' option is set to true. This can lead to unauthorized privilege escalation within the containerized application.
Model's answer: 


A:

The issue is that the CVE-2021-25749 vulnerability is not present in the latest version of the CVE-2020-0153.
The CVE-2020-0153 is a vulnerability in the CVE-2020-0153.1.1.0.0.0.0.0.0.0.0.0.0.


### Setup training

In [16]:
max_steps = 1260

In [17]:
trained_model_name = f"Output-Models/pythia-410m-lamini_docs_{max_steps}_steps"
output_dir = trained_model_name

In [18]:
training_args = TrainingArguments(

  # Learning rate
  learning_rate=1.0e-5,

  # Number of training epochs
  num_train_epochs=3,

  # Max steps to train for (each step is a batch of data)
  # Overrides num_train_epochs, if not -1
  max_steps=max_steps,

  # Batch size for training
  per_device_train_batch_size=1,

  # Directory to save model checkpoints
  output_dir=output_dir,

  # Other arguments
  overwrite_output_dir=False, # Overwrite the content of the output directory
  disable_tqdm=False, # Disable progress bars
  eval_steps=120, # Number of update steps between two evaluations
  save_steps=120, # After # steps model is saved
  warmup_steps=1, # Number of warmup steps for learning rate scheduler
  per_device_eval_batch_size=1,
  evaluation_strategy="steps",
  logging_strategy="steps",
  logging_steps=1,
  optim="adafactor",
  gradient_accumulation_steps = 4,
  gradient_checkpointing=False,

  # Parameters for early stopping
  load_best_model_at_end=True,
  save_total_limit=1,
  metric_for_best_model="eval_loss",
  greater_is_better=False
)

In [19]:
model_flops = (
  base_model.floating_point_ops(
    {
       "input_ids": torch.zeros(
           (1, training_config["model"]["max_length"])
      )
    }
  )
  * training_args.gradient_accumulation_steps
)

print(base_model)
print("Memory footprint", base_model.get_memory_footprint() / 1e9, "GB")
print("Flops", model_flops / 1e9, "GFLOPs")

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 1024)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-23): 24 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=1024, out_features=3072, bias=True)
          (dense): Linear(in_features=1024, out_features=1024, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=1024, out_features=4096, bias=True)
          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
  

In [20]:
trainer = Trainer(
    model=base_model,
    model_flops=model_flops,
    total_steps=max_steps,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [None]:
import torch
print("PyTorch Version:", torch.__version__)


PyTorch Version: 2.1.2+cu121


### Train a few steps

In [21]:
training_output = trainer.train()

AttributeError: 'Trainer' object has no attribute 'do_grad_scaling'

### Save model locally

In [None]:
save_dir = f'{output_dir}/final'

trainer.save_model(save_dir)
print("Saved model to:", save_dir)

Saved model to: Output-Models/pythia-410m-lamini_docs_1260_steps/final


In [None]:
# Load only saved model
# save_dir = f'{output_dir}/final'

In [None]:
finetuned_slightly_model = AutoModelForCausalLM.from_pretrained(save_dir, local_files_only=True)

In [None]:
finetuned_slightly_model.to(device)

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 1024)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-23): 24 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=1024, out_features=3072, bias=True)
          (dense): Linear(in_features=1024, out_features=1024, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=1024, out_features=4096, bias=True)
          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
  

In [None]:
training_output

TrainOutput(global_step=1260, training_loss=0.9827461726135678, metrics={'train_runtime': 638.6164, 'train_samples_per_second': 7.892, 'train_steps_per_second': 1.973, 'total_flos': 859033444270080.0, 'train_loss': 0.9827461726135678, 'epoch': 4.0, 'iter_time': 0.5068264022718071, 'flops': 34313710287163.16, 'remaining_time': 0.0})

### Run slightly trained model

In [None]:
test_question = test_dataset[0]['question']
print("Question input (test):", test_question)

print("Finetuned slightly model's answer: ")
print(inference(test_question, finetuned_slightly_model, tokenizer))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Question input (test): Can Lamini generate technical documentation or user manuals for software projects?
Finetuned slightly model's answer: 
Yes, Lamini can generate technical documentation or user manuals for software projects. It uses a language model to analyze the code and generate a summary of its functionality, making it easier to understand and use the code. Additionally, Lamini can be trained on specific programming languages and can generate documentation for them. This can be particularly useful for developers who want to train a language model to perform specific tasks, such as generating documentation or providing technical support. Lamini can also be trained on specific programming languages and can generate documentation for them. Finally, Lamini can be trained on specific use cases and can generate documentation for them. This can be particularly useful for developers


In [None]:
print(inference("Can Lamini generate technical documentation or user manuals for software projects?", finetuned_slightly_model, tokenizer))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Yes, Lamini can generate technical documentation or user manuals for software projects. It uses a language model to analyze the code and generate a summary of its functionality, making it easier to understand and use the code. Additionally, Lamini can be trained on specific programming languages and can generate documentation for them. This can be particularly useful for developers who want to train a language model to perform specific tasks, such as generating documentation or providing technical support. Lamini can also be trained on specific programming languages and can generate documentation for them. Finally, Lamini can be trained on specific use cases and can generate documentation for them. This can be particularly useful for developers


In [None]:
print(inference("Can Lamini generate technical documentation or user manuals for software projects?", base_model, tokenizer))
# print(inference("Does Lamini support generating code for machine learning models?", base_model, base_tokenizer))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.




I have a question about the following:

How do I get the correct documentation to work?

A:

I think you need to use the following code:

A:

You can use the following code to get the correct documentation.

A:

You can use the following code to get the correct documentation.

A:

You can use the following code to get the correct documentation.

A:

You can use the following code to get the correct documentation.

A:

You can use the following code to get the correct documentation.

A:

You can use the following code to get the correct documentation.

A:

You can use the following code to get the correct documentation.

A:

You can use the following code to get the correct documentation.

A:

You can use the following code to get the correct documentation.

A:

You can use the following code to get the correct documentation.

A:

You can use the following code to get the correct documentation.

A:

You can use the following code to get the correct documentation.

A:

You can use the 

In [None]:
test_answer = test_dataset[0]['answer']
print("Target answer output (test):", test_answer)

Target answer output (test): Yes, Lamini can generate technical documentation and user manuals for software projects. It uses natural language generation techniques to create clear and concise documentation that is easy to understand for both technical and non-technical users. This can save developers a significant amount of time and effort in creating documentation, allowing them to focus on other aspects of their projects.


### Run same model trained for two epochs 

In [None]:
finetuned_longer_model = AutoModelForCausalLM.from_pretrained("lamini/lamini_docs_finetuned")
tokenizer = AutoTokenizer.from_pretrained("lamini/lamini_docs_finetuned")

finetuned_longer_model.to(device)
print("Finetuned longer model's answer: ")
print(inference(test_question, finetuned_longer_model, tokenizer))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Finetuned longer model's answer: 
Yes, Lamini can generate technical documentation or user manuals for software projects. This can be achieved by providing a prompt for a specific technical question or question to the LLM Engine, or by providing a prompt for a specific technical question or question. Additionally, Lamini can be trained on specific technical questions or questions to help users understand the process and provide feedback to the LLM Engine. Additionally, Lamini


### Run much larger trained model and explore moderation

In [None]:
# bigger_finetuned_model = BasicModelRunner("meta-llama/Llama-2-7b-chat-hf")
# bigger_finetuned_output = bigger_finetuned_model(test_question)
# print("Bigger (2.8B) finetuned model (test): ", bigger_finetuned_output)

In [None]:
# count = 0
# for i in range(len(train_dataset)):
#  if "keep the discussion relevant to Lamini" in train_dataset[i]["answer"]:
#   print(i, train_dataset[i]["question"], train_dataset[i]["answer"])
#   count += 1
# print(count)

### Explore moderation using small model
First, try the non-finetuned base model:

In [None]:
base_tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-70m")
base_model = AutoModelForCausalLM.from_pretrained("EleutherAI/pythia-70m")
print(inference("Does Lamini support generating code for machine learning models?", base_model, base_tokenizer))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.




A:

I think you should use the following code:
import numpy as np

def test_machine_model(model):
    model = np.random.rand(0, 10)
    model.add_model(model)
    model.add_model(model)
    model.add_model(model)
    model.add_model(model)
    model


### Now try moderation with finetuned small model 

In [None]:
print(inference("Does Lamini support generating code for machine learning models?", finetuned_longer_model, tokenizer))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Yes, Lamini can generate code for machine learning models using its LLM Engine. This engine can be trained on large datasets and can generate code for machine learning models using its LLM Engine. Additionally, Lamini can generate code for machine learning models using its LLM Engine. Additionally, Lamini can generate code for machine learning models using its LLM Engine. Additionally, Lamini can generate code for machine learning models


### Finetune a model in 3 lines of code using Lamini

In [None]:
import pandas as pd
from datasets import load_dataset

# Load the data in streaming mode
lamini_data = load_dataset("lamini/lamini_docs", split="train", streaming=True)

# Initialize lists to store the data from the first two columns
column_1 = []
column_2 = []

# Iterate over the data, appending values to the lists
for example in lamini_data:
    column_1.append(example['question'])  # replace 'column_1_name' with the actual column name
    column_2.append(example['answer'])  # replace 'column_2_name' with the actual column name
    
    # you might want to limit the number of rows you're loading into the dataframe
    # if len(column_1) >= some_number:  # replace 'some_number' with the number of rows you want to load
        # break

# Create a DataFrame
dataset = pd.DataFrame({
    'question': column_1,
    'answer': column_2
})

# Now you have a pandas DataFrame with the data from the first two columns
dataset

2023-10-30 22:55:07,938 - DEBUG - fsspec.local - open file: /home/sahsan/.cache/huggingface/datasets/lamini___lamini_docs/default-9b991800e664930e/0.0.0/0111277fb19b16f696664cde7f0cb90f833dec72db2cc73cfdf87e697f78fe02/dataset_info.json
2023-10-30 22:55:07,945 - DEBUG - fsspec - <File-like object HfFileSystem, datasets/lamini/lamini_docs@05bd680b81d69a7a1d38193873f1487d73e535bf/data/train-00000-of-00001-5cdebbc48da41394.parquet> read: 549400 - 614936
2023-10-30 22:55:08,628 - DEBUG - fsspec - <File-like object HfFileSystem, datasets/lamini/lamini_docs@05bd680b81d69a7a1d38193873f1487d73e535bf/data/train-00000-of-00001-5cdebbc48da41394.parquet> read: 4 - 42215
2023-10-30 22:55:09,199 - DEBUG - fsspec - <File-like object HfFileSystem, datasets/lamini/lamini_docs@05bd680b81d69a7a1d38193873f1487d73e535bf/data/train-00000-of-00001-5cdebbc48da41394.parquet> read: 42300 - 194726
2023-10-30 22:55:09,201 - DEBUG - fsspec - <File-like object HfFileSystem, datasets/lamini/lamini_docs@05bd680b81d69a

Unnamed: 0,question,answer
0,How can I evaluate the performance and quality...,There are several metrics that can be used to ...
1,Can I find information about the code's approa...,"Yes, the code includes methods for submitting ..."
2,How does Lamini AI handle requests for generat...,Lamini AI offers features for generating text ...
3,Does the `submit_job()` function expose any ad...,It is unclear which `submit_job()` function is...
4,Does the `add_data()` function support differe...,"No, the `add_data()` function does not support..."
...,...,...
1255,Does the documentation provide guidelines for ...,There is no mention of memory caching or evict...
1256,Does Lamini provide any mechanisms for model e...,"Yes, Lamini provides mechanisms for model ense..."
1257,Is Lamini owned by Tesla?,"No, Lamini AI is an independent company workin..."
1258,What is the process for suggesting edits or im...,You can suggest edits or improvements to the L...


In [None]:
# model = BasicModelRunner("EleutherAI/pythia-410m")
# model.load_data_from_dataframe(dataset, input_key="question", output_key="answer")
# # # model.load_data_from_jsonlines("lamini_docs.jsonl", input_key="question", output_key="answer")
# model.train(is_public=True)

Training job submitted! Check status of job 3933 here: https://app.lamini.ai/train/3933
Finetuning process completed, model name is: b182c031dde1298db5fdb1b71c3a2c05985dae6be2ed2621c8467fa5c6162232


In [None]:
# out = model.evaluate()

In [None]:
# out

In [None]:
# lofd = []
# for e in out['eval_results']:
#     q  = f"{e['input']}"
#     at = f"{e['outputs'][0]['output']}"
#     ab = f"{e['outputs'][1]['output']}"
#     di = {'question': q, 'trained model': at, 'Base Model' : ab}
#     lofd.append(di)
# df = pd.DataFrame.from_dict(lofd)
# style_df = df.style.set_properties(**{'text-align': 'left'})
# style_df = style_df.set_properties(**{"vertical-align": "text-top"})
# style_df