<a href="https://colab.research.google.com/github/jlopetegui98/Literary-Fine-Tuning-of-LLM/blob/main/Experiments/experiments_base_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Experiments with base Mistral 7B model

In [29]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [30]:
!pip install -U simpletransformers

NotImplementedError: A UTF-8 locale is required. Got ANSI_X3.4-1968

In [None]:
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q trl xformers wandb datasets einops gradio sentencepiece

In [31]:
import torch
import simpletransformers
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig,HfArgumentParser,TrainingArguments,pipeline, logging, TextStreamer
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
import os, wandb, platform, gradio, warnings
from datasets import load_dataset
from torch.utils.data import DataLoader, Dataset
from trl import SFTTrainer
from huggingface_hub import notebook_login
import json

In [32]:
# data and models paths
dir_root = './drive/MyDrive/DL-ENS'
dir_data = f'{dir_root}/dataset'
authors_paths = [f'{dir_data}/wilde_complete.txt', f'{dir_data}/kipling_complete.txt']
authors_names = ["Wilde", "Kipling"]
clf_path = f'{dir_root}/models/BertClassifier(BERTAA)_balanced_data.pt'
list_to_generate_path = f'{dir_data}/story_prompts.txt'

In [33]:
# load classifier (wilde vs kipling)
clf = torch.load(clf_path)
clf

<simpletransformers.classification.classification_model.ClassificationModel at 0x7decd4a35ba0>

In [34]:
# function to read the texts of an specific author
def read_texts(path: str, label, len_to_read =None, max_length = 350):
    text = ''
    with open(path, 'r+', encoding='utf-8') as fd:
      text = fd.read()
      if len_to_read != None:
        text = text[:len_to_read]
    text_splited = text.split()
    dt = {'text': [], 'label': []}
    for i in range(0,len(text_splited),max_length):
      text = ' '.join(text_splited[i:min(i+max_length, len(text_splited))])
      dt['text'].append(text)
      dt['label'].append(label)
    return dt

In [35]:
# iterate through every author and build the dataset
dt = {'text': [], 'label': []}
for i,path in enumerate(authors_paths[1:]):
  dt_i = read_texts(path,i)
  dt['text'].extend(dt_i['text'])
  dt['label'].extend(dt_i['label'])

In [36]:
model_name = "mistralai/Mistral-7B-Instruct-v0.1"

In [37]:
# load mistral 7B base model
# Load base model(Mistral 7B)
bnb_config = BitsAndBytesConfig(
    load_in_4bit= True,
    bnb_4bit_quant_type= "nf4",
    bnb_4bit_compute_dtype= torch.bfloat16,
    bnb_4bit_use_double_quant= False,
)
model = AutoModelForCausalLM.from_pretrained(
   model_name,
    quantization_config=bnb_config,
    device_map={"": 0}
)
model.config.use_cache = False # silence the warnings. Please re-enable for inference!
model.config.pretraining_tp = 1
model.gradient_checkpointing_enable()
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.padding_side = 'left'
tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_eos_token = True
tokenizer.add_bos_token, tokenizer.add_eos_token

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

(True, True)

In [38]:
# def tokenize(tokenizer, text):
#   return tokenizer(f"<s>This are the first lines of a work of fiction. Continue it. [INST] {text} [/INST]", return_tensors = "pt", add_special_tokens = False)

class PromptsDataset(Dataset):
    def __init__(self, sequences, tokenizer):
        self.tokenizer = tokenizer
        self.sequences = self.init_sequences(sequences)

    def init_sequences(self, sequences):
        seqs = []
        for i,seq in enumerate(tqdm(sequences)):
          seqs.append(tokenizer(f"<s>This are the first lines of a work of fiction. Continue it. [INST] {seq} [/INST]", return_tensors = "pt", add_special_tokens = True, max_length = 105, padding='max_length', truncation=True))
        return seqs

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        # print(idx)
        input = self.sequences[idx]
        # print(input.input_ids.shape)
        return input

In [39]:
from tqdm import tqdm

In [40]:
# text = f"<s>This are the first lines of a work of fiction. Continue it. [INST] {dt['text'][5][:100]} [/INST]"

# encodeds = tokenizer(text,return_tensors="pt", add_special_tokens=False)

def clf_exp(model, tokenizer, clf, texts, batch_size = 10):
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

  dataset = PromptsDataset(texts, tokenizer)
  dataloader = DataLoader(
            dataset, batch_size=batch_size
        )
  generated_texts = []
  label_predictions = []
  for i,input in tqdm(enumerate(dataloader)):
    # tokens = tokenize(tokenizer, text)
    # print(input)
    input = input['input_ids'].squeeze(1)
    model_inputs = input.to(device)
    # print(model_inputs.shape)
    generated_ids = model.generate(input_ids = model_inputs, max_new_tokens=350, do_sample=True)
    decoded = tokenizer.batch_decode(generated_ids)
    preds, _ = clf.predict(decoded)
    label_predictions.extend(preds)
    print(type(decoded[0]))
    generated_texts.extend(decoded)
    del model_inputs
    del decoded
    del generated_ids
  return label_predictions, generated_texts

In [41]:
texts = []
with open(list_to_generate_path, 'r+', encoding='utf-8') as fd:
  texts = fd.readlines()
texts = [text[:-1] for text in texts]

In [42]:
# predict author for each text in the experimental list
author_preds, generated_texts = clf_exp(model, tokenizer, clf, texts[:10])

100%|██████████| 10/10 [00:00<00:00, 2137.99it/s]
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

1it [1:41:40, 6100.85s/it]

<class 'str'>





In [45]:
print(generated_texts[3])

</s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s><s><s> This are the first lines of a work of fiction. Continue it. [INST] The bell rang, and I sprinted toward my locker. I had to get out of there before… [/INST]</s>rijne

I could only sit there and feel the burning pain in my lungs, trying to find the will to breathe. It felt like the air was constricting, making it harder and harder to take another breath. I reached down and unlocked my locker, my fingers trembling with the effort, my heart pounding in my chest.

I pushed open the door and stumbled out into the crowded, chaotic school hallway. I could hear students shouting and laughing, their voices blending together in a cacophony that assaulted my ears. I clamped my eyes shut and tried to find some peace in the darkness, but all I could hear was the p

In [44]:
# check distribution of authors
sum(author_preds)/len(author_preds)

1.0