# Installs

In [1]:
from google.colab import drive
import sys
# Single place that defines where Google Drive is mounted in the Colab VM,
# so the mount call and the project path below cannot drift apart.
DRIVE_MOUNT_POINT = "/mnt/drive"
drive.mount(DRIVE_MOUNT_POINT)

# Project folder on Drive; used as the base directory for files this notebook
# reads and writes.
storage_path = f"{DRIVE_MOUNT_POINT}/MyDrive/cs224u_final_project"

# Make modules stored in the project folder importable. Guarded so that
# re-running this cell does not keep appending duplicate sys.path entries.
if storage_path not in sys.path:
    sys.path.append(storage_path)

Drive already mounted at /mnt/drive; to attempt to forcibly remount, call drive.mount("/mnt/drive", force_remount=True).


In [2]:
%%capture
# Install third-party packages into the Colab runtime; the %%capture cell magic
# above captures the cell's output so pip's log is not shown.
!pip install wget openai datasets pandas pyarrow

# Imports

In [3]:
import pandas as pd
import pyarrow as pa
from datasets import Dataset
import json

# Load data on rerun and mark faulty

In [4]:
# Load the prepared training examples (parquet) from the project folder and
# preview the first rows as the cell's output.
examples_parquet_path = f"{storage_path}/gpt35_examples_500_num_tokens.parquet"
training_examples_df = pd.read_parquet(examples_parquet_path)
training_examples_df.head()

Unnamed: 0,input,output,category,output_renum,gpt35_guide,result_guide,guide_correct,prompt_training_base,prompt_training_guide,prompt_training_guide_num_token,prompt_training_base_num_token,gpt35_guide_num_token
125472,Jayden gave Emma the balloon the cookie was lo...,Jayden ( 38 ) ; Emma ( 17 ) ; * balloon ( 41 )...,length_ood,Jayden ( 1 ) ; Emma ( 2 ) ; * balloon ( 3 ) ; ...,1. Identify entities: Jayden (1); Emma (2); *b...,1. Identify entities: Jayden (1); Emma (2); *b...,False,<s>[INST] Jayden gave Emma the balloon the coo...,<s>[INST] You are given a sentence and must co...,1346,161,429
124055,Liam hoped that the dog preferred to run Sophi...,Liam ( 0 ) ; * dog ( 23 ) ; Sophia ( 50 ) ; Ja...,length_ood,Liam ( 1 ) ; * dog ( 2 ) ; Sophia ( 3 ) ; Jack...,1. Identify entities: Liam ( 1 ) ; * dog ( 2 )...,1. Identify entities: Liam ( 1 ) ; * dog ( 2 )...,False,<s>[INST] Liam hoped that the dog preferred to...,<s>[INST] You are given a sentence and must co...,1771,260,846
78031,Liam ran .,"Liam ( 52 ) ; run ( 26 ) AND agent ( 26 , 52 )",in_distribution,"Liam ( 1 ) ; run ( 2 ) AND agent ( 2 , 1 )",1. Identify entities: Liam ( 1 )\n2. Locate ma...,1. Identify entities: Liam ( 1 )\n2. Locate ma...,False,<s>[INST] Liam ran . [/INST] Liam ( 1 ) ; run ...,<s>[INST] You are given a sentence and must co...,1012,35,113
114594,The girl forwarded the melon in a house to Emma .,* girl ( 53 ) ; * melon ( 18 ) ; house ( 41 ) ...,in_distribution,* girl ( 1 ) ; * melon ( 2 ) ; house ( 3 ) ; E...,1. Identify entities: * girl ( 1 ) ; * melon (...,1. Identify entities: * girl ( 1 ) ; * melon (...,False,<s>[INST] The girl forwarded the melon in a ho...,<s>[INST] You are given a sentence and must co...,1230,94,322
85560,A scientist gave the radio to Emma .,scientist ( 48 ) ; * radio ( 59 ) ; Emma ( 21 ...,in_distribution,scientist ( 1 ) ; * radio ( 2 ) ; Emma ( 3 ) ;...,1. Identify entities: scientist ( 1 ) ; * radi...,1. Identify entities: scientist ( 1 ) ; * radi...,False,<s>[INST] A scientist gave the radio to Emma ....,<s>[INST] You are given a sentence and must co...,1132,69,229


# Prepare and train model

Adapted from this [notebook](https://colab.research.google.com/drive/1Dyauq4kTZoLewQ1cApceUQVNcnnNTzg_?usp=sharing#scrollTo=2eSvM9zX_2d3)

In [5]:
import torch
# CUDA compute capability of the current GPU as (major, minor). Only the major
# number is used below, to pick which set of extra packages to install.
major_version, minor_version = torch.cuda.get_device_capability()
# Must install separately since Colab has torch 2.2.1, which breaks packages
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# Both branches install with --no-deps, i.e. without pulling in the packages'
# own dependencies.
# NOTE(review): presumably this keeps pip from replacing Colab's preinstalled
# torch -- confirm.
if major_version >= 8:
    # Use this for new GPUs like Ampere, Hopper GPUs (RTX 30xx, RTX 40xx, A100, H100, L40)
    # Same set as the else branch plus packaging, ninja, einops and flash-attn.
    !pip install --no-deps packaging ninja einops flash-attn xformers trl peft accelerate bitsandbytes
else:
    # Use this for older GPUs (V100, Tesla T4, RTX 20xx)
    !pip install --no-deps xformers trl peft accelerate bitsandbytes
pass

Collecting unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-27d3_cby/unsloth_9145815f6cf54b408ce6646117179856
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-27d3_cby/unsloth_9145815f6cf54b408ce6646117179856
  Resolved https://github.com/unslothai/unsloth.git to commit a68aebc1fa17755ffbcdafc9239e7ca37ab21657
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [6]:
import gc
from unsloth import FastLanguageModel
import torch
from tqdm import tqdm

from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
from transformers import TrainingArguments

# Settings shared by every training run in this cell. max_seq_length is passed
# both to the model loader and to the trainer; dtype and load_in_4bit are
# passed to the model loader only.
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# Runs to execute, in order, as (prompt variant, number of training examples).
# The variant selects the `prompt_training_base` or `prompt_training_guide`
# column as training text; the number caps how many rows with `guide_correct`
# set are used. Each run is saved under the name model_<variant>_<number>.
# NOTE(review): the 161-example base run is commented out -- presumably because
# it already completed in an earlier execution of this cell; confirm.
training_params = [
    # ("base", 161),
    ("guide", 50),
    ("base", 50),
    ("base", 100),
    ("guide", 100),
    ("base", 10),
    ("guide", 10),
]

# Notebook globals that keep the previous run's model reachable, either
# directly or through the trainer/collator. All of them have to be dropped
# before the CUDA memory of the previous model can be reclaimed.
_MODEL_HOLDING_NAMES = ("trainer", "collator", "model", "tokenizer")

# Dataframe column that holds the training text for each prompt variant.
_TEXT_COLUMN_BY_VARIANT = {
  "base": "prompt_training_base",
  "guide": "prompt_training_guide",
}

# Fine-tune one LoRA adapter per (variant, size) entry of `training_params`.
# Checkpoints go to {storage_path}/{model_name}; the per-step training log is
# written to {storage_path}/{model_name}_logs.json.
for base_or_guide, num_training in training_params:
  print("Working on:", base_or_guide, num_training)
  model_name = f"model_{base_or_guide}_{num_training}"

  # Fail fast on an unknown variant, before the expensive model load. Without
  # this check `train_dataset` would be unbound or, worse, silently reused
  # from the previous iteration.
  if base_or_guide not in _TEXT_COLUMN_BY_VARIANT:
    raise ValueError(
        f"Unknown prompt variant {base_or_guide!r}; "
        f"expected one of {sorted(_TEXT_COLUMN_BY_VARIANT)}")

  # Release the previous run before loading a fresh base model. Deleting only
  # `model`/`tokenizer` is not enough: the old trainer and collator still
  # reference them, so the weights would stay on the GPU. That is the likely
  # cause of the "Some modules are dispatched on the CPU or the disk" error
  # raised when the next model is loaded.
  # pop(..., None) replaces the former bare `except: pass`: names that are not
  # defined yet (first iteration) are skipped without hiding real errors.
  for _stale_name in _MODEL_HOLDING_NAMES:
    globals().pop(_stale_name, None)
  gc.collect()
  torch.cuda.empty_cache()

  model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit)

  # Wrap the base model with LoRA adapters on the attention and MLP projections.
  model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    use_gradient_checkpointing = True,
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
    )

  FastLanguageModel.for_training(model)

  # Keep only rows flagged `guide_correct`, then take the first `num_training`.
  dataset_df = training_examples_df.loc[training_examples_df.guide_correct, :].iloc[:num_training, :]
  if len(dataset_df) < num_training:
    # The run still proceeds, but the size in `model_name` would otherwise
    # silently overstate how many examples were actually used.
    print(f"Warning: only {len(dataset_df)} of the requested {num_training} "
          f"examples are available for {model_name}")

  text_column = _TEXT_COLUMN_BY_VARIANT[base_or_guide]
  train_dataset = Dataset(pa.Table.from_pandas(pd.DataFrame({"text": dataset_df[text_column]})))

  # Use the token ids of the "[/INST]" marker as the response template for the
  # completion-only collator.
  # NOTE(review): the [3:] slice presumably strips the tokens produced by the
  # leading "\n" so the ids match how "[/INST]" is tokenized mid-prompt --
  # confirm against the tokenizer.
  collator = DataCollatorForCompletionOnlyLM(tokenizer.encode("\n[/INST]", add_special_tokens = False)[3:],
                                            tokenizer=tokenizer)

  trainer = SFTTrainer(
      model = model,
      tokenizer = tokenizer,
      train_dataset = train_dataset,
      data_collator=collator,
      dataset_text_field = "text",
      max_seq_length = max_seq_length,
      dataset_num_proc = 2,
      packing = False, # Can make training 5x faster for short sequences.
      args = TrainingArguments(
          per_device_train_batch_size = 2,
          gradient_accumulation_steps = 4,
          warmup_steps = 5,
          num_train_epochs=10,
          learning_rate = 2e-4,
          # Use bf16 when the GPU supports it, otherwise fall back to fp16.
          fp16 = not torch.cuda.is_bf16_supported(),
          bf16 = torch.cuda.is_bf16_supported(),
          logging_steps = 1,
          optim = "adamw_8bit",
          weight_decay = 0.01,
          lr_scheduler_type = "linear",
          seed = 3407,
          save_strategy="epoch",
          overwrite_output_dir=True,
          output_dir = f"{storage_path}/{model_name}",
      ),
  )

  results = trainer.train()

  # Persist the training log history next to the checkpoints.
  with open(f"{storage_path}/{model_name}_logs.json", "w") as file:
    json.dump(trainer.state.log_history, file)

Working on: base 161
==((====))==  Unsloth: Fast Mistral patching release 2024.3
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. Xformers = 0.0.25. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Unsloth 2024.3 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


Map (num_proc=2):   0%|          | 0/161 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 161 | Num Epochs = 10
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 200
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,7.0836
2,6.7523
3,4.7834
4,4.9188
5,4.0981
6,2.8185
7,1.588
8,0.7306
9,0.5062
10,0.2108


Working on: guide 50
==((====))==  Unsloth: Fast Mistral patching release 2024.3
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. Xformers = 0.0.25. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Map (num_proc=2):   0%|          | 0/50 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50 | Num Epochs = 10
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,0.3086
2,0.2066
3,0.1945
4,0.193
5,0.1393
6,0.1093
7,0.1236
8,0.1096
9,0.0674
10,0.106


Working on: base 50
==((====))==  Unsloth: Fast Mistral patching release 2024.3
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. Xformers = 0.0.25. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Map (num_proc=2):   0%|          | 0/50 [00:00<?, ? examples/s]

Step,Training Loss
1,5.824
2,5.298
3,5.801
4,4.6071
5,3.1358
6,1.8271
7,1.296
8,0.6688
9,0.3453
10,0.2578


Working on: base 100
==((====))==  Unsloth: Fast Mistral patching release 2024.3
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. Xformers = 0.0.25. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth




ValueError: 
                    Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the
                    quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules
                    in 32-bit, you need to set `load_in_8bit_fp32_cpu_offload=True` and pass a custom `device_map` to
                    `from_pretrained`. Check
                    https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu
                    for more details.
                    