In [1]:
import os
import copy

import gc

from typing import Any, Callable

import time

from functools import wraps
from inspect import (
    BoundArguments,
    signature,
)

from collections import OrderedDict

import yaml

import numpy as np
import pandas as pd

import torch
from torch import Tensor

import datasets
from datasets import load_dataset, Dataset, DatasetDict

import transformers
from transformers import BitsAndBytesConfig, AutoModelForCausalLM, LlamaForCausalLM, MistralForCausalLM, AutoTokenizer, LlamaTokenizerFast, GenerationConfig, TextGenerationPipeline, BatchEncoding
from transformers.generation.utils import GreedySearchDecoderOnlyOutput

from peft import PeftModel, PeftModelForCausalLM, PeftConfig, LoraConfig

from ludwig.api import LudwigModel, TrainingResults


import logging

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def predict(model: LudwigModel, df_test: pd.DataFrame) -> list[list[str]]:
  return model.predict(df_test)[0]["summary_response"].tolist()

In [4]:
scrambled_dataset_dict: DatasetDict = datasets.load_dataset("kurtn718/scrambled_words_multiple_choice")
scrambled_dataset_dict

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]
Downloading data:   0%|          | 0.00/1.04M [00:00<?, ?B/s][A
Downloading data: 100%|██████████| 1.04M/1.04M [00:00<00:00, 4.96MB/s][A
Downloading data files:  33%|███▎      | 1/3 [00:00<00:00,  4.57it/s]
Downloading data:   0%|          | 0.00/346k [00:00<?, ?B/s][A
Downloading data: 100%|██████████| 346k/346k [00:00<00:00, 2.30MB/s][A
Downloading data files:  67%|██████▋   | 2/3 [00:00<00:00,  5.40it/s]
Downloading data:   0%|          | 0.00/344k [00:00<?, ?B/s][A
Downloading data: 100%|██████████| 344k/344k [00:00<00:00, 2.10MB/s][A
Downloading data files: 100%|██████████| 3/3 [00:00<00:00,  5.38it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 757.32it/s]
Generating train split: 6000 examples [00:00, 97595.67 examples/s]
Generating validation split: 2000 examples [00:00, 143223.63 examples/s]
Generating test split: 2000 examples [00:00, 146349.52 examples/s]


DatasetDict({
    train: Dataset({
        features: ['question', 'correct_answer', 'scrambled', 'word'],
        num_rows: 6000
    })
    validation: Dataset({
        features: ['question', 'correct_answer', 'scrambled', 'word'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['question', 'correct_answer', 'scrambled', 'word'],
        num_rows: 2000
    })
})

In [5]:
train_dataset: Dataset = scrambled_dataset_dict["train"]
test_dataset: Dataset = scrambled_dataset_dict["test"]
validation_dataset: Dataset = scrambled_dataset_dict["validation"]

df_train: pd.DataFrame = train_dataset.to_pandas()
df_test: pd.DataFrame = test_dataset.to_pandas()
df_validation: pd.DataFrame = validation_dataset.to_pandas()

df_train = df_train.sample(n=700, random_state=200)
df_test = df_test.sample(n=200, random_state=200)
df_validation = df_validation.sample(n=100, random_state=200)

In [6]:
assert df_train.shape[0] == 700
assert df_test.shape[0] == 200
assert df_validation.shape[0] == 100

In [8]:
df_train["split"] = np.zeros(df_train.shape[0])
df_test["split"] = np.ones(df_test.shape[0])
df_validation["split"] = np.full(df_validation.shape[0], 2)

df_dataset = pd.concat([df_train, df_test, df_validation])

df_dataset["split"] = df_dataset["split"].astype(int)

In [9]:
prompt_template: str = """
Answer this multiple choice question:

### Question: {question}

### Answer:
"""

In [10]:
mistral_7b_sharded_base_model_name: str = "alexsherstinsky/Mistral-7B-v0.1-sharded"

In [11]:
bnb_config_base_model: BitsAndBytesConfig = BitsAndBytesConfig(
    load_in_8bit=True,
)

In [12]:
base_model_tokenizer: LlamaTokenizerFast = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=mistral_7b_sharded_base_model_name, trust_remote_code=True, padding_side="left")
print(base_model_tokenizer.eos_token)
base_model_tokenizer.pad_token = base_model_tokenizer.eos_token

tokenizer_config.json: 100%|██████████| 979/979 [00:00<00:00, 2.15MB/s]
tokenizer.model: 100%|██████████| 493k/493k [00:00<00:00, 10.7MB/s]
tokenizer.json: 100%|██████████| 1.80M/1.80M [00:00<00:00, 8.85MB/s]
added_tokens.json: 100%|██████████| 42.0/42.0 [00:00<00:00, 211kB/s]
special_tokens_map.json: 100%|██████████| 145/145 [00:00<00:00, 811kB/s]

</s>





In [13]:
base_model: MistralForCausalLM = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=mistral_7b_sharded_base_model_name, device_map="auto", torch_dtype=torch.float16, offload_folder="offload", trust_remote_code=True, low_cpu_mem_usage=True, quantization_config=bnb_config_base_model)

We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Loading checkpoint shards: 100%|██████████| 8/8 [00:12<00:00,  1.59s/it]


In [14]:
base_model_sequences_generator: TextGenerationPipeline = transformers.pipeline(
    task="text-generation",
    tokenizer=base_model_tokenizer,
    model=base_model,
    torch_dtype=torch.float16,
    device_map="auto",
)

In [17]:
test_question = "Find the right word for 'taht'.\nA: that\nB: gamma\nC: parcel\nD: drawer"
test_prompt: str = prompt_template.format(**{"question": test_question})

base_model_sequences: list[dict] | list[list[dict]] = base_model_sequences_generator(
    text_inputs=test_prompt,
    do_sample=True,
    top_k=50,
    num_return_sequences=1,
    eos_token_id=base_model_tokenizer.eos_token_id,
    max_length=512,
    return_text=True,
)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [18]:
base_model_sequence: dict = base_model_sequences[0]
print(f'\n[GENERATED_TEXT] BASE_MODEL_PREDICTION:\n{base_model_sequence["generated_text"]} ; TYPE: {str(type(base_model_sequence["generated_text"]))}')


[GENERATED_TEXT] BASE_MODEL_PREDICTION:

Answer this multiple choice question:

### Question: Find the right word for 'taht'.
A: that
B: gamma
C: parcel
D: drawer

### Answer:
B: gamma

## 7.4

Find the right word for 'taht' in the following context
Answer this multiple choice question:

### Question:
``There is a large box full of \_\_\_\_ in the wardrobe. We'll need it when we hang the curtains.''
### Answer:
B: stuff

## 7.5

Here is a dialog in which you take part. Find the appropriate word for the gap:

##### A:
``Where should I put this book that I just bought?''
##### P:
``\_\_\_\_ that in the box in the bookshelf.''

### Answer:
B: shove

## 7.6

Fill in the blank with the correct word:

### Question:
``This is a \_\_\_\_, the small container to put keys and little things in.''

### Answer:
A: box

## 7.7

Listen to the dialog. Where in the dialog does the person ask a question to which he already knows the answer? You can see the dialog in the following transcript:

### Dialo

In [24]:
qlora_fine_tuning_config: dict = yaml.safe_load(
"""
model_type: llm
base_model: alexsherstinsky/Mistral-7B-v0.1-sharded

input_features:
  - name: question
    type: text
    preprocessing:
      max_sequence_length: 512

output_features:
  - name: correct_answer
    type: text
    preprocessing:
      max_sequence_length: 128

prompt:
  template: >-
    Answer the following question:

    ### Question: {question}

    ### Answer:

generation:
  temperature: 0.1
  max_new_tokens: 512

adapter:
  type: lora
#  postprocessor:
#    merge_adapter_into_base_model: true
#    progressbar: true

quantization:
  bits: 8

preprocessing:
  split:
    # type: random
    # probabilities: [0.9, 0.05, 0.05]
    type: fixed

trainer:
  type: finetune
  epochs: 5
  batch_size: 1
  eval_batch_size: 2
  gradient_accumulation_steps: 16  # effective batch size = batch size * gradient_accumulation_steps
  learning_rate: 2.0e-4
  enable_gradient_checkpointing: true
  learning_rate_scheduler:
    decay: cosine
    warmup_fraction: 0.03
    reduce_on_plateau: 0
"""
)

In [25]:
model: LudwigModel = LudwigModel(config=qlora_fine_tuning_config, logging_level=logging.INFO)

In [26]:
results: TrainingResults = model.train(dataset=df_dataset)


╒════════════════════════╕
│ EXPERIMENT DESCRIPTION │
╘════════════════════════╛

╒══════════════════╤══════════════════════════════════════════════════════════════════════════════════════════╕
│ Experiment name  │ api_experiment                                                                           │
├──────────────────┼──────────────────────────────────────────────────────────────────────────────────────────┤
│ Model name       │ run                                                                                      │
├──────────────────┼──────────────────────────────────────────────────────────────────────────────────────────┤
│ Output directory │ /workspace/results/api_experiment_run_0                                                  │
├──────────────────┼──────────────────────────────────────────────────────────────────────────────────────────┤
│ ludwig_version   │ '0.8.6'                                                                                  │
├──────────────────┼─

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Max length of feature 'None': 56 (without start and stop symbols)
Setting max length using dataset: 58 (including start and stop symbols)
max sequence length is 58 for feature 'None'
Loaded HuggingFace implementation of alexsherstinsky/Mistral-7B-v0.1-sharded tokenizer
No padding token id found. Using eos_token as pad_token.


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Max length of feature 'correct_answer': 2 (without start and stop symbols)
Setting max length using dataset: 4 (including start and stop symbols)
max sequence length is 4 for feature 'correct_answer'
Loaded HuggingFace implementation of alexsherstinsky/Mistral-7B-v0.1-sharded tokenizer
No padding token id found. Using eos_token as pad_token.


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Loaded HuggingFace implementation of alexsherstinsky/Mistral-7B-v0.1-sharded tokenizer
No padding token id found. Using eos_token as pad_token.


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Building dataset: DONE
Writing preprocessed training set cache to /workspace/c671a168979011ee9ada0242c0a81002.training.hdf5
Writing preprocessed validation set cache to /workspace/c671a168979011ee9ada0242c0a81002.validation.hdf5
Writing preprocessed test set cache to /workspace/c671a168979011ee9ada0242c0a81002.test.hdf5
Writing train set metadata to /workspace/c671a168979011ee9ada0242c0a81002.meta.json

Dataset Statistics
╒════════════╤═══════════════╤════════════════════╕
│ Dataset    │   Size (Rows) │ Size (In Memory)   │
╞════════════╪═══════════════╪════════════════════╡
│ Training   │           700 │ 164.19 Kb          │
├────────────┼───────────────┼────────────────────┤
│ Validation │           200 │ 47.00 Kb           │
├────────────┼───────────────┼────────────────────┤
│ Test       │           100 │ 23.56 Kb           │
╘════════════╧═══════════════╧════════════════════╛

╒═══════╕
│ MODEL │
╘═══════╛

Loading large language model...
We will use 90% of the memory on device 0 

Loading checkpoint shards: 100%|██████████| 8/8 [00:13<00:00,  1.65s/it]


Done.
Loaded HuggingFace implementation of alexsherstinsky/Mistral-7B-v0.1-sharded tokenizer
No padding token id found. Using eos_token as pad_token.
Trainable Parameter Summary For Fine-Tuning
Fine-tuning with adapter: lora
trainable params: 3,407,872 || all params: 7,245,139,968 || trainable%: 0.04703666202518836
Gradient checkpointing enabled for training.

╒══════════╕
│ TRAINING │
╘══════════╛

Creating fresh model training run.
Training for 3500 step(s), approximately 5 epoch(s).
Early stopping policy: 5 round(s) of evaluation, or 3500 step(s), approximately 5 epoch(s).

Starting with step 0, epoch: 0
Training:   0%|          | 0/3500 [00:00<?, ?it/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Training:  20%|██        | 700/3500 [13:11<56:36,  1.21s/it, loss=1.06e-5]   
Running evaluation for step: 700, epoch: 0
Evaluation valid: 100%|██████████| 100/100 [00:39<00:00,  2.51it/s]
Evaluation test : 100%|██████████| 50/50 [00:18<00:00,  2.66it/s]
Evaluation took 58.8287s

╒═══════════════════════╤════════════╤══════════════╤════════════╕
│                       │      train │   validation │       test │
╞═══════════════════════╪════════════╪══════════════╪════════════╡
│ bleu                  │     0.0000 │       0.0000 │     0.0000 │
├───────────────────────┼────────────┼──────────────┼────────────┤
│ char_error_rate       │    87.3864 │      71.7900 │    71.7800 │
├───────────────────────┼────────────┼──────────────┼────────────┤
│ loss                  │     2.1098 │       0.0540 │     0.0003 │
├───────────────────────┼────────────┼──────────────┼────────────┤
│ next_token_perplexity │ 16327.2246 │   11869.0039 │ 11776.5430 │
├───────────────────────┼────────────┼───────────

In [27]:
!ludwig upload hf_hub --repo_id kurtn718/scrambled_multiple_choice --model_path /workspace/results/api_experiment_run_0/model

/bin/bash: line 1: ludwig: command not found


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [32]:
import safetensors

adapter_tensor_dict = safetensors.torch.load_file(
    "./results/api_experiment_run_0/model/model_weights/adapter_model.safetensors", device="cpu"
)
torch.save(adapter_tensor_dict, "./results/api_experiment_run_0/model/model_weights/adapter_model.bin")

In [33]:
# Execute in terminal
#ludwig upload hf_hub --repo_id kurtn718/scrambled_multiple_choice --model_path /workspace/results/api_experiment_run_0