**Generate a dataset of fiction stories with Mistral-7B-Instruct (the baseline model) for classifier training**

In [None]:
# uncomment the following lines to run in colab
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# run this cell to install the necessary requirements if you are running in colab
# !pip install -U simpletransformers
# !pip install -q -U bitsandbytes
# !pip install -q -U git+https://github.com/huggingface/transformers.git
# !pip install -q -U git+https://github.com/huggingface/peft.git
# !pip install -q -U git+https://github.com/huggingface/accelerate.git
# !pip install -q trl xformers wandb datasets einops gradio sentencepiece

In [None]:
# imports for the notebook
import json
import os, platform, gradio, warnings
import re

import torch
import simpletransformers
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig,HfArgumentParser,TrainingArguments,pipeline, logging, TextStreamer
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
from datasets import load_dataset
from torch.utils.data import DataLoader, Dataset
from huggingface_hub import notebook_login
from tqdm import tqdm

In [None]:
# data paths: every location below is derived from `dir_root`
dir_root = './' # comment this line if you are running in colab
# dir_root = './drive/MyDrive/DL-ENS' # uncomment this line if you are running in colab
# folder that holds the dataset files
dir_data = dir_root + '/dataset'
# text file with the story prompts to continue
list_to_generate_path = dir_data + '/story_prompts.txt'

In [None]:
# baseline model: identifier passed to `from_pretrained` for both the model and the tokenizer
model_name = "mistralai/Mistral-7B-Instruct-v0.1"

In [None]:
# Quantization settings used to load the base model (Mistral 7B-Instruct) in 4 bits
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    load_in_4bit=True,
)
# Load the model with the quantization config above and the {"": 0} device map
model = AutoModelForCausalLM.from_pretrained(model_name, device_map={"": 0}, quantization_config=bnb_config)

In [None]:
# Load tokenizer (same identifier as the model)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# pad on the left — presumably so that generated tokens directly follow the prompt (TODO confirm)
tokenizer.padding_side = 'left'
# reuse the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token
# NOTE(review): the `tokenize` helper calls the tokenizer with add_special_tokens = False,
# so this flag may have no effect on the generation prompts — confirm
tokenizer.add_eos_token = True
# bare expression: the notebook shows both flags as the cell output
tokenizer.add_bos_token, tokenizer.add_eos_token

In [None]:
# function to tokenize the input in the expected form of the prompt
def tokenize(tokenizer, text):
  """
  Wrap the text in the instruction prompt and tokenize the result
  inputs:
    tokenizer: the tokenizer to call on the prompt
    text: the raw text (before adapting it to the prompt format)
  outputs:
    whatever the tokenizer returns for the full prompt
  """
  # special tokens are written by hand in the prompt, so the tokenizer is told not to add its own
  prompt = f"<s>[INST]These are the first lines of a work of fiction. Continue it. {text} [/INST]"
  encoded_prompt = tokenizer(prompt, return_tensors = "pt", add_special_tokens = False)
  return encoded_prompt

In [None]:
# function for texts generation
def generate_texts(model, tokenizer, texts):
  """
  Run the model once per input text and collect the decoded outputs
  inputs:
    model: the model to use for generation
    tokenizer: the tokenizer used to build the prompt and decode the output
    texts: the inputs for text generation
  outputs:
    a list with the decoded output sequences, in the order of the inputs
  """
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

  def complete(prompt):
    # the locals of this helper are released on return, so nothing from one
    # prompt is kept alive while the next one is being generated
    model_inputs = tokenize(tokenizer, prompt).to(device)
    generated_ids = model.generate(**model_inputs, max_new_tokens=500, do_sample=True)
    return tokenizer.batch_decode(generated_ids)

  generated_texts = []
  for prompt in tqdm(texts):
    generated_texts.extend(complete(prompt))
  return generated_texts

In [None]:
# input texts for fiction generation: one prompt per line of the prompts file
# - the file is only read, so it is opened read-only ('r+' would also require write permission)
# - only the line terminator is stripped, so a last line without a trailing newline
#   keeps its final character (slicing with [:-1] would cut it off)
with open(list_to_generate_path, 'r', encoding='utf-8') as fd:
  texts = [line.rstrip('\n') for line in fd]

In [None]:
# texts generation: runs the model once per prompt (progress is shown by tqdm)
generated_texts = generate_texts(model, tokenizer, texts)

In [None]:
# function to save the results
def save_generated_texts(texts, output_path=None):
    """
    Clean the generated texts and save them as a JSON file
    inputs:
      texts: the generated texts, as decoded from the model output
      output_path: optional path of the JSON file to write; when omitted the file
        `BaseModelCompletionsToTrainClassifier/dataset_mistral7B_gen_texts.json`
        inside `dir_data` is used
    outputs:
      nothing is returned; the file holds `{"texts": [...]}` with one cleaned
      text per input, in the same order
    """
    if output_path is None:
        # join with a real path separator (plain string concatenation glued the
        # folder name onto the end of `dir_data`)
        output_path = os.path.join(dir_data, "BaseModelCompletionsToTrainClassifier", "dataset_mistral7B_gen_texts.json")

    # remove the prompt specific tokens and the instruction sentence; the sentence
    # must match the one used to build the prompt ("These are ...") and its dots
    # are escaped so they only match literal dots
    patt = re.compile(r'\[INST\]|\[/INST\]|</?s>|These are the first lines of a work of fiction\. Continue it\.')
    # clean the texts received as argument, not a notebook-level variable
    clean_texts = [patt.sub('', text) for text in texts]

    # make sure the destination folder exists before writing
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as fd:
        json.dump({'texts': clean_texts}, fd)

In [None]:
# clean the generations and write them to the JSON dataset file
save_generated_texts(generated_texts)

In [None]:
# print one example of generated text (as returned by generate_texts; saving does not modify this list)
print(generated_texts[0])

In [1]:
# end-of-notebook marker
print('done')

done
