<a href="https://colab.research.google.com/github/jlopetegui98/Literary-Fine-Tuning-of-LLM/blob/main/Experiments/experiments_wilde_ft_mistral.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Experiments with the Mistral 7B model fine-tuned on Oscar Wilde texts

In [None]:
# Mount Google Drive under /content/drive; the dataset and model files referenced
# later in this notebook are read from (and results written to) Drive.
from google.colab import drive
drive.mount('/content/drive')

Experiments


In [None]:
# Install the libraries used by this notebook.
# NOTE(review): nothing here is version-pinned -- `-U` upgrades to the newest release
# and the `git+https://...` installs take whatever is on the repositories' default
# branch, so a later run may pick up different (possibly incompatible) versions.
# Consider pinning once a known-good set of versions is established.
!pip install -U simpletransformers
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q trl xformers wandb datasets einops gradio sentencepiece

In [None]:
import torch
import simpletransformers
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig,HfArgumentParser,TrainingArguments,pipeline, logging, TextStreamer
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
import os, wandb, platform, gradio, warnings
from datasets import load_dataset
from torch.utils.data import DataLoader, Dataset
from trl import SFTTrainer
from huggingface_hub import notebook_login
import json
from tqdm import tqdm

In [None]:
# Data and model paths.
# The root is absolute and matches the '/content/drive' mount point passed to
# drive.mount(...), so every path below resolves regardless of the notebook's
# current working directory (the previous './drive/...' only worked from /content).
dir_root = '/content/drive/MyDrive/DL-ENS'
# Directory holding the prompt list; generated results are written here as well.
dir_data = f'{dir_root}/dataset'
# Pickled author classifier (loaded with torch.load).
clf_path = f'{dir_root}/models/BertClassifier(BERTAA)_balanced_data.pt'
# Text file with the story prompts, one per line.
list_to_generate_path = f'{dir_data}/story_prompts.txt'
# Pickled fine-tuned generation model (loaded with torch.load).
ft_model = f'{dir_root}/models/ModelFineTuned.pt'

In [None]:
# Load the author classifier (Wilde vs. Kipling).
# The checkpoint is a whole pickled object (it is used directly via clf.predict later),
# not a state_dict, so weights_only=False is required: PyTorch 2.6+ defaults to
# weights_only=True, which refuses to unpickle arbitrary Python objects.
# SECURITY: unpickling can execute arbitrary code -- only load checkpoints you trust.
clf = torch.load(clf_path, weights_only=False)
# Show the loaded object as the cell output.
clf

In [None]:
# Hugging Face identifier of the base model; used to load the matching tokenizer.
model_name = "mistralai/Mistral-7B-Instruct-v0.1"

In [None]:
# Load the fine-tuned model.
# The checkpoint is a whole pickled object (it is used directly via model.generate
# later), so weights_only=False is required: PyTorch 2.6+ defaults to
# weights_only=True, which refuses to unpickle arbitrary Python objects.
# SECURITY: unpickling can execute arbitrary code -- only load checkpoints you trust.
model = torch.load(ft_model, weights_only=False)
# Inference only: switch to eval mode so dropout is disabled in case the model
# was pickled while still in training mode.
model.eval()

# Load the tokenizer of the base model
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.padding_side = 'left'
# Reuse the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# NOTE(review): the notebook's tokenize() helper calls the tokenizer with
# add_special_tokens=False, so this flag presumably has no effect on the prompts
# built there -- confirm whether it is still needed.
tokenizer.add_eos_token = True
# Display the resulting special-token flags as the cell output.
tokenizer.add_bos_token, tokenizer.add_eos_token

In [None]:
# wrap a text in the instruction template and tokenize it
def tokenize(tokenizer, text):
  """Build the instruction prompt around `text` and pass it to `tokenizer`.

  Args:
    tokenizer: callable invoked as `tokenizer(prompt, return_tensors="pt",
      add_special_tokens=False)`.
    text: the opening lines to be continued; interpolated into the template.

  Returns:
    Whatever `tokenizer` returns for the formatted prompt.

  NOTE(review): the template wording (including "This are") is reproduced
  verbatim; presumably it has to match the prompt used during fine-tuning --
  confirm before rewording it.
  """
  # The template already carries the "<s>" marker, hence add_special_tokens=False.
  prompt = f"<s>[INST]This are the first lines of a work of fiction. Continue it. {text} [/INST]"
  encoding_options = {"return_tensors": "pt", "add_special_tokens": False}
  return tokenizer(prompt, **encoding_options)

In [None]:
# main function for experiments
def clf_exp(model, tokenizer, clf, texts, max_new_tokens=500):
  """Generate a continuation for every prompt and classify each generation.

  Args:
    model: object exposing `generate(**inputs, max_new_tokens=..., do_sample=...)`.
    tokenizer: passed to `tokenize` to build the prompt and used for `batch_decode`.
    clf: classifier exposing `predict(list_of_texts)` that returns a pair whose
      first element is the sequence of predicted labels.
    texts: iterable of prompt strings, processed one at a time.
    max_new_tokens: cap on the number of newly generated tokens per prompt
      (defaults to 500, the value that used to be hard-coded).

  Returns:
    A tuple `(label_predictions, generated_texts)` of two lists, in prompt order.

  Notes:
    Sampling is enabled (`do_sample=True`), so outputs vary between runs unless
    the caller seeds the random number generators.
    NOTE(review): the full `generate` output is decoded without
    `skip_special_tokens`; for decoder-only models this presumably includes the
    prompt and instruction markers, so the classifier sees them too -- confirm
    this is intended.
  """
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

  generated_texts = []
  label_predictions = []
  # `prompt` rather than `input`, to avoid shadowing the builtin.
  for prompt in tqdm(texts):
    model_inputs = tokenize(tokenizer, prompt).to(device)
    generated_ids = model.generate(**model_inputs, max_new_tokens=max_new_tokens, do_sample=True)
    decoded = tokenizer.batch_decode(generated_ids)
    preds, _ = clf.predict(decoded)
    label_predictions.extend(preds)
    generated_texts.extend(decoded)
    # Drop per-iteration references before the next generation step.
    del model_inputs, generated_ids, decoded
  return label_predictions, generated_texts

In [None]:
# Read the story prompts, one per line.
# Opened read-only: the file is never written here, so 'r+' only added a needless
# write-permission requirement.
with open(list_to_generate_path, 'r', encoding='utf-8') as fd:
  # Strip just the line terminator. Unconditionally slicing off the last character
  # would truncate the final prompt when the file does not end with a newline.
  texts = [line.rstrip('\n') for line in fd]

In [None]:
# Generate a continuation for each prompt and predict its author with the classifier.
# clf_exp returns two lists: the predicted labels and the decoded generations.
author_preds, generated_texts = clf_exp(model, tokenizer, clf, texts)

In [None]:
# helpers to save the results
def _json_safe_label(label):
  """Return `label` as a built-in Python scalar when it offers a callable `.item()`."""
  item = getattr(label, 'item', None)
  return item() if callable(item) else label


def save_generated_texts_and_labels(texts, labels, model = 'baseline'):
  """Write generated texts and their predicted labels to a JSON file.

  The output path is `<dir_data>/<model>_generated_texts.json` (`dir_data` is the
  notebook-level data directory) and the payload has the shape
  `{"text": [...], "label": [...]}`.

  Args:
    texts: sequence of generated texts.
    labels: sequence of labels, aligned with `texts`. Labels exposing `.item()`
      are converted to built-in scalars first, because `json.dump` cannot
      serialise NumPy/torch scalar types (presumably what the classifier
      returns -- its class is not visible here).
    model: name used as the file-name prefix.

  Raises:
    ValueError: if `texts` and `labels` differ in length.
  """
  if len(texts) != len(labels):
    raise ValueError(
      f"texts and labels must have the same length (got {len(texts)} and {len(labels)})"
    )

  dict_text_to_author = {
    'text': list(texts),
    'label': [_json_safe_label(label) for label in labels],
  }

  # Plain write mode: the file is only written, never read back here.
  with open(dir_data + f"/{model}_generated_texts.json", 'w', encoding='utf-8') as fd:
    json.dump(dict_text_to_author, fd)

In [None]:
# Persist the fine-tuned model's generations and predicted labels; with
# model='ft_mistral' the file is '<dir_data>/ft_mistral_generated_texts.json'.
save_generated_texts_and_labels(generated_texts, author_preds,model = 'ft_mistral')