# Training

In [1]:
import os
import lamini

lamini.api_url = os.getenv("API_URL")
lamini.api_key = os.getenv("API_KEY")

In [2]:
import datasets
import tempfile
import logging
import random
import config
import os
import yaml
import time
import torch
import transformers
import pandas as pd
import jsonlines

from utilities import *
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM
from transformers import TrainingArguments
from transformers import AutoModelForCausalLM
from llama import BasicModelRunner


logger = logging.getLogger(__name__)
global_config = None

### Load the Lamini docs dataset

In [3]:
dataset_path = "assets/lamini_docs"
use_hf = True

### Set up the model, training config, and tokenizer

In [5]:
model_name = "EleutherAI/pythia-70m"

In [6]:
training_config = {
    "model": {
        "pretrained_name": model_name,
        "max_length" : 2048
    },
    "datasets": {
        "use_hf": use_hf,
        "path": dataset_path
    },
    "verbose": True
}

In [7]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
train_dataset, test_dataset = tokenize_and_split_data(training_config, tokenizer)

print(train_dataset)
print(test_dataset)

tokenizer_config.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

2024-02-29 02:15:55,294 - DEBUG - utilities - Config: datasets.path: lamini/lamini_docs
datasets.use_hf: true
model.max_length: 2048
model.pretrained_name: EleutherAI/pythia-70m
verbose: true



tokenize True lamini/lamini_docs


Downloading readme:   0%|          | 0.00/577 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/615k [00:00<?, ?B/s]

2024-02-29 02:15:58,120 - DEBUG - fsspec - <File-like object HfFileSystem, datasets/lamini/lamini_docs@05bd680b81d69a7a1d38193873f1487d73e535bf/data/train-00000-of-00001-5cdebbc48da41394.parquet> read: 0 - 4194304
2024-02-29 02:15:58,291 - DEBUG - fsspec - <File-like object HfFileSystem, datasets/lamini/lamini_docs@05bd680b81d69a7a1d38193873f1487d73e535bf/data/train-00000-of-00001-5cdebbc48da41394.parquet> read: 614936 - 4809240


Downloading data:   0%|          | 0.00/83.7k [00:00<?, ?B/s]

2024-02-29 02:15:58,309 - DEBUG - fsspec - <File-like object HfFileSystem, datasets/lamini/lamini_docs@05bd680b81d69a7a1d38193873f1487d73e535bf/data/test-00000-of-00001-4c77a066a883f339.parquet> read: 0 - 4194304
2024-02-29 02:15:58,401 - DEBUG - fsspec - <File-like object HfFileSystem, datasets/lamini/lamini_docs@05bd680b81d69a7a1d38193873f1487d73e535bf/data/test-00000-of-00001-4c77a066a883f339.parquet> read: 83671 - 4277975


Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/1260 [00:00<?, ? examples/s]

2024-02-29 02:15:58,447 - DEBUG - fsspec.local - open file: /home/jovyan/.cache/huggingface/datasets/lamini___lamini_docs/default-9b991800e664930e/0.0.0/0111277fb19b16f696664cde7f0cb90f833dec72db2cc73cfdf87e697f78fe02.incomplete/lamini_docs-train-00000-00000-of-NNNNN.arrow


Generating test split:   0%|          | 0/140 [00:00<?, ? examples/s]

2024-02-29 02:15:58,479 - DEBUG - fsspec.local - open file: /home/jovyan/.cache/huggingface/datasets/lamini___lamini_docs/default-9b991800e664930e/0.0.0/0111277fb19b16f696664cde7f0cb90f833dec72db2cc73cfdf87e697f78fe02.incomplete/lamini_docs-test-00000-00000-of-NNNNN.arrow
2024-02-29 02:15:58,486 - DEBUG - fsspec.local - open file: /home/jovyan/.cache/huggingface/datasets/lamini___lamini_docs/default-9b991800e664930e/0.0.0/0111277fb19b16f696664cde7f0cb90f833dec72db2cc73cfdf87e697f78fe02.incomplete/dataset_info.json


Dataset({
    features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1260
})
Dataset({
    features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 140
})


### Load the base model

In [8]:
base_model = AutoModelForCausalLM.from_pretrained(model_name)

config.json:   0%|          | 0.00/567 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/166M [00:00<?, ?B/s]

In [9]:
device_count = torch.cuda.device_count()
if device_count > 0:
    logger.debug("Select GPU device")
    device = torch.device("cuda")
else:
    logger.debug("Select CPU device")
    device = torch.device("cpu")

2024-02-29 02:17:28,927 - DEBUG - __main__ - Select CPU device


In [10]:
base_model.to(device)

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 512)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-5): 6 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=512, out_features=1536, bias=True)
          (dense): Linear(in_features=512, out_features=512, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=512, out_features=2048, bias=True)
          (dense_4h_to_h): Linear(in_features=2048, out_features=512, bias=True)
          (a

In [11]:
def inference(text, model, tokenizer, max_input_tokens=1000, max_output_tokens=100):
  # Tokenize
  input_ids = tokenizer.encode(
          text,
          return_tensors="pt",
          truncation=True,
          max_length=max_input_tokens
  )

  # Generate
  device = model.device
  generated_tokens_with_prompt = model.generate(
    input_ids=input_ids.to(device),
    max_length=max_output_tokens
  )

  # Decode
  generated_text_with_prompt = tokenizer.batch_decode(generated_tokens_with_prompt, skip_special_tokens=True)

  # Strip the prompt
  generated_text_answer = generated_text_with_prompt[0][len(text):]

  return generated_text_answer

### Try the base model

In [12]:
test_text = test_dataset[0]['question']
print("Question input (test):", test_text)
print(f"Correct answer from Lamini docs: {test_dataset[0]['answer']}")
print("Model's answer: ")
print(inference(test_text, base_model, tokenizer))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Question input (test): Can Lamini generate technical documentation or user manuals for software projects?
Correct answer from Lamini docs: Yes, Lamini can generate technical documentation and user manuals for software projects. It uses natural language generation techniques to create clear and concise documentation that is easy to understand for both technical and non-technical users. This can save developers a significant amount of time and effort in creating documentation, allowing them to focus on other aspects of their projects.
Model's answer: 


I have a question about the following:

How do I get the correct documentation to work?

A:

I think you need to use the following code:

A:

You can use the following code to get the correct documentation.

A:

You can use the following code to get the correct documentation.

A:

You can use the following


### Setup training

In [13]:
max_steps = 3

In [14]:
trained_model_name = f"lamini_docs_{max_steps}_steps"
output_dir = trained_model_name

In [15]:
training_args = TrainingArguments(

  # Learning rate
  learning_rate=1.0e-5,

  # Number of training epochs
  num_train_epochs=1,

  # Max steps to train for (each step is a batch of data)
  # Overrides num_train_epochs, if not -1
  max_steps=max_steps,

  # Batch size for training
  per_device_train_batch_size=1,

  # Directory to save model checkpoints
  output_dir=output_dir,

  # Other arguments
  overwrite_output_dir=False, # Overwrite the content of the output directory
  disable_tqdm=False, # Disable progress bars
  eval_steps=120, # Number of update steps between two evaluations
  save_steps=120, # After # steps model is saved
  warmup_steps=1, # Number of warmup steps for learning rate scheduler
  per_device_eval_batch_size=1, # Batch size for evaluation
  evaluation_strategy="steps",
  logging_strategy="steps",
  logging_steps=1,
  optim="adafactor",
  gradient_accumulation_steps = 4,
  gradient_checkpointing=False,

  # Parameters for early stopping
  load_best_model_at_end=True,
  save_total_limit=1,
  metric_for_best_model="eval_loss",
  greater_is_better=False
)

In [16]:
model_flops = (
  base_model.floating_point_ops(
    {
       "input_ids": torch.zeros(
           (1, training_config["model"]["max_length"])
      )
    }
  )
  * training_args.gradient_accumulation_steps
)

print(base_model)
print("Memory footprint", base_model.get_memory_footprint() / 1e9, "GB")
print("Flops", model_flops / 1e9, "GFLOPs")

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 512)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-5): 6 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=512, out_features=1536, bias=True)
          (dense): Linear(in_features=512, out_features=512, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=512, out_features=2048, bias=True)
          (dense_4h_to_h): Linear(in_features=2048, out_features=512, bias=True)
          (a

In [17]:
trainer = Trainer(
    model=base_model,
    model_flops=model_flops,
    total_steps=max_steps,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

### Train for two epochs 

In [24]:
finetuned_longer_model = AutoModelForCausalLM.from_pretrained("lamini/lamini_docs_finetuned")
tokenizer = AutoTokenizer.from_pretrained("lamini/lamini_docs_finetuned")

finetuned_longer_model.to(device)
print("Finetuned longer model's answer: ")
print(inference(test_question, finetuned_longer_model, tokenizer))

config.json:   0%|          | 0.00/717 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/282M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/264 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Finetuned longer model's answer: 
Yes, Lamini can generate technical documentation or user manuals for software projects. This can be achieved by providing a prompt for a specific technical question or question to the LLM Engine, or by providing a prompt for a specific technical question or question. Additionally, Lamini can be trained on specific technical questions or questions to help users understand the process and provide feedback to the LLM Engine. Additionally, Lamini


### Model Finetuning

In [29]:
model = BasicModelRunner("EleutherAI/pythia-410m") 
model.load_data_from_jsonlines("lamini_docs.jsonl", input_key="question", output_key="answer")
model.train(is_public=True) 

LAMINI CONFIGURATION
{}
LAMINI CONFIGURATION
{}
LAMINI CONFIGURATION
{}
Training job submitted! Check status of job 2349 here: https://app.lamini.ai/train/2349
Finetuning process completed, model name is: c8ff4b19807dd10007a7f3b51ccc09dd8237ef3d47410dae13394fc072a12978


In [30]:
out = model.evaluate()

In [32]:
lofd = []
for e in out['eval_results']:
    q  = f"{e['input']}"
    at = f"{e['outputs'][0]['output']}"
    ab = f"{e['outputs'][1]['output']}"
    di = {'question': q, 'trained model': at, 'Base Model' : ab}
    lofd.append(di)
df = pd.DataFrame.from_dict(lofd)
style_df = df.style.set_properties(**{'text-align': 'left'})
style_df = style_df.set_properties(**{"vertical-align": "text-top"})
style_df

Unnamed: 0,question,trained model,Base Model
0,Does Lamini have the ability to understand and generate code for audio processing tasks?,"Yes, Lamini has the ability to understand and generate code.","A: Lamini is a very good language for audio processing. A: I think you are looking for a language that can be used to write audio code. A: Languages like C, C++, Java, Python, C#, C++, C++ and others are good for audio coding. A: You can use a language like C, C++, Java, C#, C++, C++ or C++ for audio coding. A language that can be used to write code for audio coding is C. A: C is a good language for audio coding. A good language for audio coding is C. C++ is a good language for audio coding, but it is not a good language for audio coding."
1,Is it possible to control the level of detail in the generated output?,"Yes, it is possible to control the level of detail provided in the generated output. To do so, you can use the ""level"" parameter in the ""generate_output"" method. This parameter controls the level of detail in the generated text. The default value is ""none"".","A: You can use the following code to control the level of detail in your output: #include #include #include using namespace std; int main() {  string s;  s = ""Hello World"";  cout << s << endl;  return 0; } Output: Hello World A: You could use the following code to control the output level: #include #include using namespace std;  int main()  {  string s;  s = ""Hello World"";  cout << s << endl;  return 0;  } Output: Hello world A: You should use the following code to control the number of characters in the output: #include #include using namespace std; void main() {  string str;  str = ""Hello World"";  cin >> str;  cout << str << endl; } Output:"
2,Are there any guidelines on using Lamini for generating content in storytelling applications?,"Yes, there are guidelines on using Lamini for generation of content in storytelling applications. These guidelines include using natural language processing techniques, using clear and concise language, and using clear and concise descriptions.","A: I would recommend using Lamini for generating content for your stories. Lamini is a great tool for generating content for your stories. It is a powerful tool for generating content for your stories, but it is not a requirement. Lamini is not a requirement for generating content for your stories. You can use any tool that you want to generate content for your stories. A: I use Lamini for generating content for my stories. Lamini can generate content for your stories. It is not a requirement. A: Lamini is a powerful tool for generating content. It is not a requirement."
3,Can Lamini assist in generating content for news articles or blog posts?,"Yes, Lamini can assist in generating content for news articles and blog posts.","Yes, we can. We can also help you with SEO, SEM, and other marketing strategies. We can help you with content creation, copywriting, and other marketing strategies. We can help you with content marketing, social media marketing, and other marketing strategies. And we can help you with all of your marketing needs. We can help you create content for your blog, website, or other online presence. We can help you write content for your blog, website, and other online presence. We help you create content for your blog or website. We can help you develop content for your blog or website."
4,Are there any tutorials on using Lamini for content generation in creative writing?,"Yes, there are many tutorials available on using Lamini for content creation in creative writing. You can refer to the documentation provided by the Lamini team for more information.","A: I have used Lamini for content generation in my creative writing classes. Lamini is a content generation tool that allows you to generate content from a text file. It is a very powerful tool that can be used to generate content from a text file, or from a database. Lamini is an open source content generation tool. Lamini is open source. Lamini is free. Lamini is available on GitHub. Lamini is also available on Github. Lamini is not free. Lamini can be used to generate content from text files. Lamini is used to generate content from a text files. Laminis is used to generate content from a database. Lamins are free. Lamins are available on GitHub. Lamins are also available on Github. Lamins are not free. Lamins can be used to generate content from databases. Lamins are used to generate content from a database, but not free. Lamins are not free."
5,Does Lamini provide pre-trained models for generating text in specific genres?,"Yes, Lamini provides pre-trained models for generating text for specific genres through its LLM Engine.","A: Lamini is a text-to-image generator. It is not a text-to-image generator, but a text-to-image generator that is trained to generate text. It is not a text-to-text generator, but a text-to text generator that is trained to generate text from images. It is not a text to image generator, but a text to image generator that is trained to generate text using images. It is not a image to text generator, but a image to text generator that is trained to generate images using text. It is not a image-to-text generator, but an image-to-text generator that is trained to generate images from text. It is not a word-to-image generator, but an image-to word generator that is trained to generate images of words from text. It is not an image-to-text generator, and an image-to-text generator trained to generate images of words from texts. It is not a word to image generator, but an image-to image generator trained to generate images of words using images. It is not an image to text generator, but an image-to text generator trained to generate"
6,Can Lamini generate text for generating personalized recommendations for users?,"Yes, Lamini can generate text for generating personalized recommendations for user.",A: I think you are looking for a way to generate a text based recommendation for a user. I think you are looking at the wrong problem. I think you are trying to generate a text based recommendation for the user. I think you should be looking at the wrong problem. You are trying to generate a text based recommendations for the user. I am not sure if you are looking at the wrong problem or not. I am not sure what you are trying to do. I am not sure how to do it. I am not sure why you are trying to do it. I think you are not trying to do it. You are not trying to do it.
7,Is it possible to control the level of fluency in the generated output?,"Yes, it is possible to control the level of fluent output generated by Lamini. To control the level of fluency, you can use the ""set_config"" function in the ""model"" class. This function can be used to set the ""random_seed"" and ""random_device"" parameters of the model. Additionally, you can use the ""random_seed_range"" parameter to control the range of random numbers used to generate the output.","A: You can use the following code to control the fluency of the generated output: #include #include #include using namespace std; int main() {  string input;  int input_len;  int output_len;  cout << ""Enter the length of the input: "";  cin >> input;  cout << ""Enter a string: "";  cin >> input_len;  cout << ""\nEnter the length of the output: "";  cin >> output_len;  if (input_len < output_len)  {  cout << ""The length of the input is less than the length of the output."";  }  else  {  cout << ""\nThe length of the input is greater than the length of the output."";  for (int i = 0; i < input_len; i++)  {  cout << input[i];  }  }  return 0; } Output: Enter the length of the input: 1 Enter a string:"
8,Does Lamini support generating code for speech recognition tasks?,"Yes, Lamini supports generating code for speech recognition tasks through its LLM Engine.",A: Lamini is a speech recognition library. It is not a speech recognition library. A: LAMINI is a speech recognition library. It's not a speech recognition library. LAMINI is a library that can be used to generate code for speech recognition tasks. A: I think you are confusing LAMINI with LAMINI-S. LAMINI is an open source speech recognition library. It is not an open source speech recognition library. LAMINA-S is a speech recognition library that can be used to generate speech recognition code. LAMINA-S can be used to generate speech recognition speech. LAMINA-S has a speech recognition speech engine. LAMINA-S supports speech recognition speech. LAMINI-S supports speech recognition speech.
9,Is it possible to fine-tune Lamini on a specific dataset for content generation tasks?,"Yes, it is possible to fine-tune LAMI on a specific dataset for content generation. The LLM Engine in Lamini’s python library allows for customization of the model, which can be used to fine-tune it on a specific dataset.","A: I think you can use the Lamini-Net library. Lamini-Net is a library for generating content from text. It is based on the Lamini-Net library, which is a library for generating content from images. Lamini-net is a library for generating content from image. It is based on the Laminei-Net library, which is based on the Lamini library. Lamini is a library for generating content from video. It is based on the Lama-Net library, which is based off the Lamini library. Lamini is a library that generates content from video. It is based off the Lamini-Net library and Lamini-Net. Lamini-NET is a library for generating content from audio. It is based on the Lami-Net library, which is based of the Lamini library. The Lamini-Net library is based on the Lamini and Lamini-Net libraries. Lamini-Nets are a library for generating content from text and audio. It is based on the libnet library. Lamini and L"
