<a href="https://colab.research.google.com/github/jlopetegui98/Literary-Fine-Tuning-of-LLM/blob/main/ClassifierWildeVsMistral/dataset_generatiion_for_clf_train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Generation of a dataset of text from Mistral7B model (base model) for classifier training

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install -U simpletransformers
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q trl xformers wandb datasets einops gradio sentencepiece

Collecting simpletransformers
  Downloading simpletransformers-0.64.5-py3-none-any.whl (250 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.7/250.7 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Collecting datasets (from simpletransformers)
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
Collecting seqeval (from simpletransformers)
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tensorboardx (from simpletransformers)
  Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl (101 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.7/101.7 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
Collecting wandb>=0.10.32 (from simpletransformers)
 

In [3]:
import torch
import simpletransformers
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig,HfArgumentParser,TrainingArguments,pipeline, logging, TextStreamer
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
import os, wandb, platform, gradio, warnings
from datasets import load_dataset
from torch.utils.data import DataLoader, Dataset
from trl import SFTTrainer
from huggingface_hub import notebook_login
import json
from tqdm import tqdm

In [4]:
# data and models paths
dir_root = './drive/MyDrive/DL-ENS'
dir_data = f'{dir_root}/dataset'
list_to_generate_path = f'{dir_data}/story_prompts_2.txt'

In [5]:
model_name = "mistralai/Mistral-7B-Instruct-v0.1"

In [6]:
# Load base model(Mistral 7B)
bnb_config = BitsAndBytesConfig(
    load_in_4bit= True,
    bnb_4bit_quant_type= "nf4",
    bnb_4bit_compute_dtype= torch.bfloat16,
    bnb_4bit_use_double_quant= False,
)
model = AutoModelForCausalLM.from_pretrained(
   model_name,
    quantization_config=bnb_config,
    device_map={"": 0}
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [7]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.padding_side = 'left'
tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_eos_token = True
tokenizer.add_bos_token, tokenizer.add_eos_token

tokenizer_config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

(True, True)

In [8]:
# function to tokenize the input in the expected form of the prompt
def tokenize(tokenizer, text):
  return tokenizer(f"<s>[INST]This are the first lines of a work of fiction. Continue it. {text} [/INST]", return_tensors = "pt", add_special_tokens = False)

In [9]:
# function for texts generation
def generate_texts(model, tokenizer, texts):
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

  generated_texts = []
  for input in tqdm(texts):
    tokens = tokenize(tokenizer, input)
    model_inputs = tokens.to(device)
    generated_ids = model.generate(**model_inputs, max_new_tokens=500, do_sample=True)
    decoded = tokenizer.batch_decode(generated_ids)
    generated_texts.extend(decoded)
    del model_inputs
    del decoded
    del generated_ids
  return generated_texts

In [10]:
texts = []
with open(list_to_generate_path, 'r+', encoding='utf-8') as fd:
  texts = fd.readlines()
texts = [text[:-1] for text in texts]

In [11]:
generated_texts = generate_texts(model, tokenizer, texts)

  0%|          | 0/310 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 1/310 [00:20<1:44:22, 20.27s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  1%|          | 2/310 [00:56<2:31:05, 29.43s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  1%|          | 3/310 [01:23<2:26:52, 28.71s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  1%|▏         | 4/310 [01:34<1:49:45, 21.52s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  2%|▏         | 5/310 [01:47<1:34:30, 18.59s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  2%|▏         | 6/310 [01:56<1:17:45, 15.35s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  2%|▏         | 7/310 [02:12<1:17:24, 15.33s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  3%|▎         | 8/310 [02:31<1:23:32, 16.60s/it]Setting `pad_token_id` to

In [12]:
# function to save the results
def save_generated_texts(texts):
  dict_generated_texts = {'texts': []}

  for i in range(len(texts)):
    dict_generated_texts['texts'].append(texts[i])

  with open(dir_data + f"/dataset_mistral7B_gen_texts.json", 'w+') as fd:
    json.dump(dict_generated_texts, fd)

In [14]:
print(generated_texts[0])

<s> [INST]This are the first lines of a work of fiction. Continue it. Doug stuck his hand in the box and immediately pulled it out. "Ow," he said. He licked the side of his index finger as if it had honey on it. (Continue to write about who Doug is, where he is, and what is in the box ... ) [/INST] Doug is a young man who always has an adventurous spirit. He loves to explore the outdoors and discover new things. He was on a camping trip in the middle of the woods, and he stumbled upon a small box lying in the dirt.

The box was old and rusty, but it had a strange writing on it. Doug couldn't understand what it said, but he was curious to see what was inside. So he picked up the box, gritted his teeth, and pulled out his hand.

As he licked the side of his index finger, he tasted something sweet and tangy. It reminded him of honey or a strange fruit he had never eaten before. Doug couldn't wait to find out what was inside the box. He carefully opened it to reveal a small vial of the mos