### **Install Ludwig and Ludwig's LLM related dependencies.**

In [None]:
# Remove TensorFlow, then install transformers, peft and Ludwig directly from
# their Git repositories. None of these installs is pinned, so the resulting
# environment depends on the state of each repository at install time.
# NOTE(review): the commented-out pinned alternatives below look stale -- verify
# the version numbers before enabling any of them.
%pip uninstall -y tensorflow --quiet
%pip install --upgrade git+https://github.com/huggingface/transformers
# %pip install transformers==4.30
%pip install --upgrade git+https://github.com/huggingface/peft.git
# %pip install peft==1.0.0
%pip install git+https://github.com/ludwig-ai/ludwig.git@master --quiet
# %pip install ludwig==0.3.2

In [None]:
# Upgrade the HuggingFace `datasets` package (unpinned).
%pip install --upgrade datasets

In [None]:
# Install py7zr (unpinned).
%pip install py7zr

In [None]:
# Install xformers (unpinned).
%pip install xformers

In [None]:
# Install matplotlib (unpinned).
%pip install matplotlib

In [None]:
# Print the metadata of the installed torch package.
%pip show torch

In [None]:
# Print the metadata of the installed transformers package.
%pip show transformers

In [None]:

import os

import copy

import gc

from typing import Any, Callable

import time

from functools import wraps
from inspect import (
    BoundArguments,
    signature,
)

from collections import OrderedDict

import yaml

import numpy as np
import pandas as pd

import torch
from torch import Tensor

import datasets
from datasets import load_dataset, Dataset, DatasetDict

import transformers
from transformers import BitsAndBytesConfig, AutoModelForCausalLM, LlamaForCausalLM, MistralForCausalLM, AutoTokenizer, LlamaTokenizerFast, GenerationConfig, TextGenerationPipeline, BatchEncoding
from transformers.generation.utils import GreedySearchDecoderOnlyOutput

from peft import PeftModel, PeftModelForCausalLM, PeftConfig, LoraConfig

from ludwig.api import LudwigModel, TrainingResults


import logging

Enable text wrapping so we don't have to scroll horizontally, and define a helper function for running predictions with a Ludwig model.

In [None]:
from IPython.display import HTML, display

def set_css() -> None:
  """Emit a stylesheet that makes ``<pre>`` blocks wrap long lines."""
  wrapping_stylesheet = '''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''
  display(HTML(wrapping_stylesheet))

# Hook the stylesheet into IPython's 'pre_run_cell' event.
get_ipython().events.register('pre_run_cell', set_css)


In [None]:
def predict(model: LudwigModel, df_test: pd.DataFrame, output_feature_name: str = 'response') -> list[list[str]]:
  """Run the model on ``df_test`` and return the generated text for each row.

  Args:
    model: Object whose ``predict(df)`` returns a sequence with the predictions
      DataFrame as its first element.
    df_test: Rows to run inference on.
    output_feature_name: Name of the text output feature in the Ludwig config.
      The generated text is read from the ``'<output_feature_name>_response'``
      column. Defaults to ``'response'``, the output feature configured in
      this notebook (the previous hard-coded ``'summary_response'`` column
      belongs to a different dataset and does not exist here).

  Returns:
    One entry per input row, taken from the generated-text column.

  Raises:
    KeyError: If the predictions frame has no ``'<output_feature_name>_response'`` column.
  """
  df_predictions = model.predict(df_test)[0]
  return df_predictions[f'{output_feature_name}_response'].tolist()

### **Setup Your HuggingFace Token** 🤗

Obtain a [HuggingFace API Token](https://huggingface.co/settings/tokens) before proceeding. You may need to sign up on HuggingFace if you do not already have an account: https://huggingface.co/join


In [None]:
import getpass
# import locale; locale.getpreferredencoding = lambda: 'UTF-8'
import logging
import os
import torch
import yaml

import locale
def getpreferredencoding(do_setlocale: bool = True) -> str:
    """Report UTF-8 as the preferred encoding, regardless of the argument.

    ``do_setlocale`` is accepted so the signature stays call-compatible with
    the function this replaces; its value is never read.
    """
    forced_encoding = 'UTF-8'
    return forced_encoding


# Swap the standard-library lookup for the fixed-UTF-8 version above.
setattr(locale, 'getpreferredencoding', getpreferredencoding)


# Prompt for the token without echoing it. Surrounding whitespace from a paste
# is stripped, and the value is validated *before* it is exported, so an empty
# token never ends up in the environment. An explicit exception is used instead
# of `assert`, which is skipped when Python runs with -O.
hugging_face_token: str = getpass.getpass('Token:').strip()
if not hugging_face_token:
    raise ValueError('A non-empty HuggingFace API token is required.')
os.environ['HUGGING_FACE_HUB_TOKEN'] = hugging_face_token
# Do not keep the secret around as a notebook variable.
del hugging_face_token

### **Import The Dataset** 📋



In [None]:
import datasets
from datasets import load_dataset, Dataset, DatasetDict

In [None]:
# Load the Dolly 15k instruction dataset by its Hub identifier and display it.
dataset_dict: DatasetDict = load_dataset(path='databricks/databricks-dolly-15k')
dataset_dict

In [None]:
from sklearn.model_selection import train_test_split

# `to_pandas()` already yields a DataFrame, so no extra wrapping is needed.
df = dataset_dict['train'].to_pandas()

# 90% train, then the remaining 10% split evenly into test and validation.
# A fixed `random_state` (the same seed the sampling cells use) makes the split
# reproducible under Restart & Run All; without it every run shuffles differently.
df_train, test_dataset = train_test_split(df, train_size=0.9, random_state=200)
df_test, df_validation = train_test_split(test_dataset, train_size=0.5, random_state=200)

In [None]:
# Set aside ten training rows for qualitative evaluation, then keep only the
# training rows whose index labels were not picked.
df_evaluation: pd.DataFrame = df_train.sample(n=10, random_state=200)
df_train = df_train.loc[~df_train.index.isin(df_evaluation.index)]

In [None]:
# Down-sample train / test / validation to 700 / 200 / 100 rows with a fixed seed.
df_train, df_test, df_validation = (
    frame.sample(n=n_rows, random_state=200)
    for frame, n_rows in ((df_train, 700), (df_test, 200), (df_validation, 100))
)

In [None]:
# Row counts of the three frames must be exactly 700 / 200 / 100.
assert len(df_train) == 700
assert len(df_test) == 200
assert len(df_validation) == 100

In [None]:
# Tag every row with its split code: 0 = train, 1 = test, 2 = validation.
df_train['split'] = np.zeros(len(df_train))
df_test['split'] = np.ones(len(df_test))
df_validation['split'] = np.full(len(df_validation), 2)

In [None]:
df_dataset = pd.concat([df_train, df_test, df_validation])

In [None]:
df_dataset['split'] = df_dataset['split'].astype(int)

In [None]:
df_dataset.shape

In [None]:
assert df_dataset[df_dataset['split'] == 0].shape[0] == 700
assert df_dataset[df_dataset['split'] == 1].shape[0] == 200
assert df_dataset[df_dataset['split'] == 2].shape[0] == 100

In [None]:
df_dataset.head(10)

In [None]:
# Character length of every cell in the three text columns.
df_dataset['num_characters_instruction'] = df_dataset['instruction'].map(len)
df_dataset['num_characters_context'] = df_dataset['context'].map(len)
df_dataset['num_characters_response'] = df_dataset['response'].map(len)

# Histograms of the three length columns.
df_dataset.hist(column=['num_characters_instruction', 'num_characters_context', 'num_characters_response'])

# Mean length per column, computed together and unpacked in column order.
average_chars_instruction, average_chars_context, average_chars_response = df_dataset[
    ['num_characters_instruction', 'num_characters_context', 'num_characters_response']
].mean()

# The printed figures are the mean character counts divided by 3.
# NOTE(review): the divisor is presumably a characters-per-token rule of thumb -- confirm.
print(f'Average number of tokens in the instruction column: {(average_chars_instruction / 3):.0f}')
print(f'Average number of tokens in the context column: {(average_chars_context / 3):.0f}')
print(f'Average number of tokens in the response column: {(average_chars_response / 3):.0f}')


In [None]:
# Display the evaluation rows.
df_evaluation

In [None]:
# Prompt layout with two `str.format` placeholders: {context} and {instruction}.
# The text ends with an open "### Response:" section.
prompt_template: str = '''
Respond to this instruction with the given context:

### Context: {context}

### Instruction: {instruction}

### Response:
'''

In [None]:
# Hand-written context for a single test prompt: a numbered list of
# U.S. presidents, entries 1 through 46.
test_context: str = '''
A list of American Presidents in chronological order:
1. George Washington
2. John Adams
3. Thomas Jefferson
4. James Madison
5. James Monroe
6. John Quincy Adams
7. Andrew Jackson
8. Martin Van Buren
9. William Henry Harrison
10. John Tyler
11. James K. Polk
12. Zachary Taylor
13. Millard Fillmore
14. Franklin Pierce
15. James Buchanan
16. Abraham Lincoln
17. Andrew Johnson
18. Ulysses S. Grant
19. Rutherford B. Hayes
20. James A. Garfield
21. Chester A. Arthur
22. Grover Cleveland
23. Benjamin Harrison
24. Grover Cleveland
25. William McKinley
26. Theodore Roosevelt
27. William Howard Taft
28. Woodrow Wilson
29. Warren G. Harding
30. Calvin Coolidge
31. Herbert Hoover
32. Franklin D. Roosevelt
33. Harry S. Truman
34. Dwight D. Eisenhower
35. John F. Kennedy
36. Lyndon B. Johnson
37. Richard Nixon
38. Gerald Ford
39. Jimmy Carter
40. Ronald Reagan
41. George H. W. Bush
42. Bill Clinton
43. George W. Bush
44. Barack Obama
45. Donald Trump
46. Joe Biden
'''

In [None]:
# Hand-written instruction for a single test prompt; the leading and trailing
# newlines are part of the string.
test_instruction: str = '''
What is the 16th President of the United States of America?
'''

In [None]:
# Fill both template placeholders with the hand-written context and instruction.
test_prompt: str = prompt_template.format(context=test_context, instruction=test_instruction)

In [None]:
# NOTE(review): this value is overwritten by the assignment in the next cell.
mistral_7b_sharded_base_model_name: str = 'alexsherstinsky/Mistral-7B-v0.1-sharded'

In [None]:
# Rebinds the same variable, so when both cells run in order this is the value
# later cells see.
# NOTE(review): the variable name says "sharded" but this identifier does not --
# confirm which checkpoint is intended and drop the other cell.
mistral_7b_sharded_base_model_name: str = 'mistralai/Mistral-7B-v0.1'

Load the pre-trained (general) Mistral 7B LLM (the sharded version).

In [None]:
# Quantization config object for the base model.
# NOTE(review): `load_in_8bit` is commented out, so the only option set here is
# `load_in_8bit_fp32_cpu_offload` -- confirm that the installed transformers
# version still recognizes this keyword and that skipping 8-bit loading is intended.
bnb_config_base_model: BitsAndBytesConfig = BitsAndBytesConfig(
    # load_in_8bit=True,
    load_in_8bit_fp32_cpu_offload=True,
)

In [None]:
# Tokenizer for the base checkpoint, configured to pad on the left.
base_model_tokenizer: LlamaTokenizerFast = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=mistral_7b_sharded_base_model_name,
    padding_side='left',
    trust_remote_code=True,
)
print(base_model_tokenizer.eos_token)
# Reuse the end-of-sequence token as the padding token.
base_model_tokenizer.pad_token = base_model_tokenizer.eos_token

In [None]:
# Load the base causal-LM checkpoint with the base-model quantization config.
base_model: MistralForCausalLM = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=mistral_7b_sharded_base_model_name,
    quantization_config=bnb_config_base_model,
    torch_dtype=torch.float16,
    device_map='auto',
    offload_folder='offload',
    low_cpu_mem_usage=True,
    trust_remote_code=True,
)

Run inference on the pre-trained (general) Mistral 7B LLM to establish baseline quality.

In [None]:
# Text-generation pipeline wrapping the already-loaded base model and tokenizer.
base_model_sequences_generator: TextGenerationPipeline = transformers.pipeline(
    'text-generation',
    model=base_model,
    tokenizer=base_model_tokenizer,
    device_map='auto',
    torch_dtype=torch.float16,
)

In [None]:
# Sample one completion for the hand-written test prompt.
base_model_sequences: list[dict] | list[list[dict]] = base_model_sequences_generator(
    test_prompt,
    max_length=512,
    num_return_sequences=1,
    do_sample=True,
    top_k=50,
    eos_token_id=base_model_tokenizer.eos_token_id,
    return_text=True,
)

In [None]:
base_model_sequence: dict = base_model_sequences[0]

In [None]:
print(f"\n[GENERATED_TEXT] BASE_MODEL_PREDICTION:\n{base_model_sequence['generated_text']} ; TYPE: {str(type(base_model_sequence['generated_text']))}")

In [None]:
# Run a garbage-collection pass; as the cell's last expression, the return
# value of gc.collect() is shown as the cell output.
gc.collect()

Run inference on the pre-trained (general) Mistral 7B LLM over the evaluation dataset.

In [None]:
df_model_evaluation: pd.DataFrame = df_evaluation.copy()

In [None]:
df_model_evaluation['prompt'] = df_model_evaluation.apply(lambda x: prompt_template.format(**{'context': x['context'], 'instruction': x['instruction']}), axis = 1)

In [None]:
# Sample one completion per evaluation prompt.
base_model_sequences = base_model_sequences_generator(
    df_model_evaluation['prompt'].tolist(),
    max_length=512,
    num_return_sequences=1,
    do_sample=True,
    top_k=50,
    eos_token_id=base_model_tokenizer.eos_token_id,
    return_text=True,
)

In [None]:
# Print every generated sequence between matching BEGIN/END banners.
# The BEGIN banner is printed inside the loop so that each example gets its own
# (previously only example 0 had one), and `enumerate` replaces the hand-kept
# counter. The context/instruction columns were zipped in but never used.
print('\n[BASE_MODEL_EVALUATION_BEGIN]')
for idx, response in enumerate(base_model_sequences):
  print(f'\n[=============EXAMPLE_{idx}_BEGIN=============]')
  # Each entry is indexed as a list of candidate dicts; only the first is printed.
  print(f"\n[BASE_MODEL_EVALUATION] GENERATED_SUMMARY:\n{response[0]['generated_text']}")
  print(f'\n[=============EXAMPLE_{idx}_END=============]')

print('\n[BASE_MODEL_EVALUATION_END]')

In [None]:
# Run a garbage-collection pass; as the cell's last expression, the return
# value of gc.collect() is shown as the cell output.
# NOTE(review): `base_model` and its pipeline are still referenced at this
# point -- confirm whether they should be deleted before fine-tuning starts.
gc.collect()

Configure and execute finetuning process.

In [None]:
# Ludwig fine-tuning configuration, parsed from YAML into a dict.
#
# The YAML previously declared the top-level `input_features` key twice. The
# loader keeps only the last occurrence of a duplicated mapping key, so the
# first declaration (the `context` feature) was silently discarded. The dead
# declaration is removed here; the parsed config is unchanged. The `{context}`
# placeholder in the prompt template is still present.
# NOTE(review): presumably Ludwig fills `{context}` from the dataset column of
# the same name -- confirm against the Ludwig LLM documentation.
qlora_fine_tuning_config: dict = yaml.safe_load(
'''
model_type: llm
base_model: alexsherstinsky/Mistral-7B-v0.1-sharded

input_features:
  - name: instruction
    type: text
    preprocessing:
      max_sequence_length: 1024

output_features:
  - name: response
    type: text
    preprocessing:
      max_sequence_length: 512

prompt:
  template: >-
    Respond to this instruction with the given context:

    ### Context: {context}

    ### Instruction: {instruction}

    ### Response:

generation:
  temperature: 0.1
  max_new_tokens: 512

adapter:
  type: lora
  postprocessor:
    merge_adapter_into_base_model: true
    progressbar: true

quantization:
  bits: 8

preprocessing:
  split:
    # type: random
    # probabilities: [0.9, 0.05, 0.05]
    type: fixed

trainer:
  type: finetune
  epochs: 5
  batch_size: 1
  eval_batch_size: 2
  gradient_accumulation_steps: 16  # effective batch size = batch size * gradient_accumulation_steps
  learning_rate: 2.0e-4
  enable_gradient_checkpointing: true
  learning_rate_scheduler:
    decay: cosine
    warmup_fraction: 0.03
    reduce_on_plateau: 0
'''
)

In [None]:
# Build the Ludwig model from the fine-tuning config, logging at INFO level.
model: LudwigModel = LudwigModel(config=qlora_fine_tuning_config, logging_level=logging.INFO)

In [None]:
# Train on the combined frame.
# NOTE(review): the config sets `split: type: fixed`, which presumably relies on
# the integer 'split' column of df_dataset -- confirm.
results: TrainingResults = model.train(dataset=df_dataset)

#### Perform Inference

We can now use the model we fine-tuned above to make predictions on some test examples, to see whether fine-tuning the large language model improves its ability to follow instructions and perform the tasks we ask of it.

In [None]:
# Run the fine-tuned model on the evaluation rows.
# NOTE(review): the annotation claims the second tuple element is a DataFrame;
# only element 0 is used below -- verify what `model.predict` returns second.
predictions_and_probabilities: tuple[pd.DataFrame, pd.DataFrame] = model.predict(df_evaluation)

In [None]:
# Keep the first element of the returned tuple as the predictions frame.
df_predictions: pd.DataFrame = predictions_and_probabilities[0]

In [None]:
# List the prediction columns that are available.
df_predictions.columns

In [None]:
# Print each evaluation instruction next to the text the fine-tuned model generated.
# The evaluation frame has 'instruction' / 'context' / 'response' columns (there
# is no 'dialogue' column), and the output feature is named `response`, so the
# generated text is read from 'response_response' rather than 'summary_response'.
print('\n\n')
for instruction, generated_response in zip(df_evaluation['instruction'], df_predictions['response_response']):
  print(f'Instruction:\n{instruction}')
  # Each prediction cell is indexed as a sequence; its first element is printed.
  print(f'Generated Response:\n{generated_response[0]}')
  print('\n\n')

In [None]:
# One-row frame holding the hand-written control example.
# `samsum_test_dialogue` is not defined anywhere in this notebook and the config
# has no `dialogue` feature; the prompt template needs `context` and
# `instruction`, so the control row is built from the test strings defined earlier.
df_control_example: pd.DataFrame = pd.DataFrame(
    data={
      'context': [test_context],
      'instruction': [test_instruction],
    }
)
df_control_example

In [None]:
# Run the `predict` helper on the one-row control frame.
predictions: list[list[str]] = predict(model=model, df_test=df_control_example)

In [None]:
# First element of the first row's prediction.
one_prediction: str = predictions[0][0]
print(f'\n[GENERATED_TEXT] PREDICTION:\n{one_prediction}')

# **Upload Trained Model Artifacts To HuggingFace** 🤗

Now that we have a fine-tuned model, we can export the model weights to HuggingFace hub so we can use them in downstream tasks or in production. Ludwig supports uploading model weights directly to HuggingFace Hub via the `upload` Ludwig command.

```
!ludwig upload hf_hub --repo_id <hf_user_name>/<repo_name> --model_path <top_level_model_directory>
```

In [None]:
# Upload the model directory to the HuggingFace Hub repository named below.
# NOTE(review): the model path is an absolute, environment-specific location and
# the repository id is hard-coded -- confirm both before running.
!ludwig upload hf_hub --repo_id 'alexsherstinsky/mistralai-7B-v01-based-finetuned-using-ludwig-with-samsum-A100-sharded-8bit-merged' --model_path /content/results/api_experiment_run

In [None]:
# Hub repository id of the fine-tuned model; it is the same id the upload
# command above targets.
samsum_fine_tuned_model_name: str = 'alexsherstinsky/mistralai-7B-v01-based-finetuned-using-ludwig-with-samsum-A100-sharded-8bit-merged'

In [None]:
# Run a garbage-collection pass; as the cell's last expression, the return
# value of gc.collect() is shown as the cell output.
gc.collect()

Load the fine-tuned Mistral 7B LLM (LoRA adapter merged into the base model), trained above on the Dolly dataset.

In [None]:
print(f'\n[LOADING_MODEL] FINE_TUNED_AND_MERGED:\n{samsum_fine_tuned_model_name}')

# Tokenizer for the fine-tuned checkpoint, configured to pad on the left.
samsum_tokenizer: LlamaTokenizerFast = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=samsum_fine_tuned_model_name,
    padding_side='left',
    trust_remote_code=True,
)

# Request 8-bit loading for the fine-tuned weights.
bnb_config_samsum_fine_tuned_model: BitsAndBytesConfig = BitsAndBytesConfig(load_in_8bit=True)

samsum_model: MistralForCausalLM = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=samsum_fine_tuned_model_name,
    quantization_config=bnb_config_samsum_fine_tuned_model,
    torch_dtype=torch.float16,
    device_map='auto',
    low_cpu_mem_usage=True,
    trust_remote_code=True,
)

In [None]:
# Run a garbage-collection pass; as the cell's last expression, the return
# value of gc.collect() is shown as the cell output.
gc.collect()

Run inference on the fine-tuned Mistral 7B LLM (trained on the Dolly dataset) to verify quality.

In [None]:
# Text-generation pipeline wrapping the fine-tuned model and its tokenizer.
samsum_sequences_generator: TextGenerationPipeline = transformers.pipeline(
    'text-generation',
    model=samsum_model,
    tokenizer=samsum_tokenizer,
    device_map='auto',
    torch_dtype=torch.float16,
)

In [None]:
# Sample one completion from the fine-tuned model for the hand-written test prompt.
# `samsum_test_prompt` is never defined in this notebook (NameError); the prompt
# built earlier for the base-model baseline is `test_prompt`, and reusing it
# keeps the two models comparable on the same input.
samsum_sequences: list[dict] | list[list[dict]] = samsum_sequences_generator(
    text_inputs=test_prompt,
    do_sample=True,
    top_k=50,
    num_return_sequences=1,
    eos_token_id=samsum_tokenizer.eos_token_id,
    max_length=512,
    return_text=True,
)

In [None]:
samsum_sequence: dict = samsum_sequences[0]

In [None]:
print(f'\n[GENERATED_TEXT] FINE_TUNED_MODEL_PREDICTION:\n{samsum_sequence['generated_text']} ; TYPE: {str(type(samsum_sequence['generated_text']))}')

In [None]:
# Run a garbage-collection pass; as the cell's last expression, the return
# value of gc.collect() is shown as the cell output.
gc.collect()

Run inference on the fine-tuned Mistral 7B LLM (trained on the Dolly dataset) over the evaluation dataset.

In [None]:
df_samsum_evaluation: pd.DataFrame = df_evaluation.copy()

In [None]:
df_samsum_evaluation['prompt'] = df_samsum_evaluation['dialogue'].apply(lambda x: samsum_prompt_template.format(**{'dialogue': x}))

In [None]:
# Sample one completion per evaluation prompt from the fine-tuned model.
samsum_sequences: list[dict] | list[list[dict]] = samsum_sequences_generator(
    df_samsum_evaluation['prompt'].tolist(),
    max_length=512,
    num_return_sequences=1,
    do_sample=True,
    top_k=50,
    eos_token_id=samsum_tokenizer.eos_token_id,
    return_text=True,
)

In [None]:
# Print every generated sequence between matching BEGIN/END banners.
# Fixes relative to the previous version:
#   * the f-string with dictionary keys uses double quotes outside (nested
#     identical quotes are a SyntaxError before Python 3.12);
#   * the loop no longer zips in the nonexistent 'dialogue' column, which was
#     never used in the body anyway;
#   * the BEGIN banner is printed per example instead of once before the loop,
#     and `enumerate` replaces the hand-kept counter.
print('\n[FINE_TUNED_MODEL_EVALUATION_BEGIN]')
for idx, synopsis in enumerate(samsum_sequences):
  print(f'\n[=============EXAMPLE_{idx}_BEGIN=============]')
  # Each entry is indexed as a list of candidate dicts; only the first is printed.
  print(f"\n[FINE_TUNED_MODEL_EVALUATION] GENERATED_SUMMARY:\n{synopsis[0]['generated_text']}")
  print(f'\n[=============EXAMPLE_{idx}_END=============]')

print('\n[FINE_TUNED_MODEL_EVALUATION_END]')