In [1]:
import os
from dataclasses import dataclass, field
from typing import Optional

import torch
from datasets import load_dataset #pip
from datasets import load_from_disk
from peft import LoraConfig #pip
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig, #pip install bitsandbytes
    HfArgumentParser,
    AutoTokenizer,
    TrainingArguments,
)
from tqdm.notebook import tqdm

from trl import SFTTrainer #pip
from huggingface_hub import interpreter_login

# Log in to the Hugging Face Hub from the notebook; the recorded output shows an
# interactive token prompt and the token being saved to the local cache.
interpreter_login()

# Half-precision dtype object, obtained by plain attribute access on the torch module.
compute_dtype = torch.float16





    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: ··········
Add token as git credential? (Y/n) Y
Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [22]:
from transformers import (BitsAndBytesConfig)

In [11]:
pip install bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.42.0-py3-none-any.whl (105.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.0/105.0 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: bitsandbytes
Successfully installed bitsandbytes-0.42.0


In [2]:
# Device placement passed to the model loader below: a single entry keyed by the
# empty string with value 0 (presumably "whole model on device 0" - confirm
# against the accelerate device_map documentation).
device_map = {"": 0}

# 4-bit NF4 quantization with float16 compute; double quantization is disabled.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype="float16",
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

In [20]:
!pip install accelerate

Collecting accelerate
  Using cached accelerate-0.26.1-py3-none-any.whl (270 kB)
Installing collected packages: accelerate
Successfully installed accelerate-0.26.1


In [3]:
# Download Phi-2 and load it with the quantization settings in `bnb_config`,
# placed according to `device_map`.
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/phi-2",
    quantization_config=bnb_config,
    device_map=device_map,
    trust_remote_code=True,  # allows modeling code shipped in the Hub repo to run
    token=True,  # use the saved Hub login; replaces the deprecated `use_auth_token`
)

# NOTE(review): `pretraining_tp` looks carried over from Llama fine-tuning
# recipes; it is presumably a no-op for Phi-2 - confirm before relying on it.
model.config.pretraining_tp = 1

# LoRA adapter configuration for causal-LM fine-tuning.
# NOTE(review): the target module names must match the layer names of the Phi-2
# implementation that actually gets loaded - verify with `print(model)`.
peft_config = LoraConfig(
    r=32,
    lora_alpha=64,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["Wqkv", "fc1", "fc2"],  # alternative: ["Wqkv", "out_proj", "fc1", "fc2" ], - 41M params
    # modules_to_save=["embed_tokens","lm_head"]
)





model.safetensors.index.json:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]

In [4]:
# Load the Phi-2 tokenizer and reuse the end-of-sequence token for padding.
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2", trust_remote_code=True, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token

# Hyperparameters handed to the trainer.
training_arguments = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,  # 1 x 4 = 4 sequences per optimizer step per device
    optim="paged_adamw_8bit",
    save_steps=1000,  # raise to checkpoint less often; saving more often costs disk space
    logging_steps=10,
    learning_rate=2e-4,
    max_grad_norm=0.3,
    # NOTE(review): the recorded run of this notebook logs a training loss of 0.0.
    # fp16 overflow is a plausible cause for Phi-2 - consider bf16 on hardware
    # that supports it, and confirm against the model card.
    fp16=True,
        # flash_attn=True,
        # flash_rotary=True,
        # fused_dense=True,
        # low_cpu_memory_usage=True,
    max_steps=10000,
    warmup_ratio=0.03,
    group_by_length=True,
    # The plain "constant" schedule has no warmup phase, so `warmup_ratio` above
    # was silently ignored; this variant warms up and then holds the rate constant.
    lr_scheduler_type="constant_with_warmup",
)

tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
# Fetch only the "train" split of the Samantha prompt/response dataset from the Hub.
dataset = load_dataset(
    "julep-ai-archive/samantha-dataset-v4-instructonly-prompt_response-2048",
    split="train",
)

Downloading readme:   0%|          | 0.00/531 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/246M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/44.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/128737 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/23080 [00:00<?, ? examples/s]

In [6]:
import pandas as pd

In [7]:
# Convert the Hub dataset to a pandas DataFrame through the dataset's own
# exporter rather than having pandas iterate the dataset row by row.
df = dataset.to_pandas()

In [14]:
# Preview the first rows of the raw frame (columns: prompt, response).
df.head()

Unnamed: 0,prompt,response
0,<|im_start|>situation\n2019-09-29\n\nSamantha ...,Right. Or after some work. As my ex-husband wo...
1,<|im_start|>situation\nUser is talking to an A...,I think that the answer to this question can b...
2,<|im_start|>situation\nUser is talking to an A...,the sink<|im_end|>
3,<|im_start|>situation\nUser is talking to an A...,She shares the moment she first met her daught...
4,<|im_start|>situation\nUser is talking to an A...,unloosening Kajo's bonds<|im_end|>


In [8]:
def format_row(row):
    """Return the row's "prompt" value followed directly by its "response" value.

    `row` is any mapping-like object indexable by the keys "prompt" and
    "response" (for example a DataFrame row); the two values are joined with
    `+`, with no separator inserted between them.
    """
    prompt_part = row["prompt"]
    response_part = row["response"]
    return prompt_part + response_part

In [9]:
# Build the full training text for every row by applying `format_row` row-wise.
df["formatted"] = df.apply(format_row, axis="columns")

In [10]:
# Copy of the frame with the "formatted" column exposed under the name "text",
# which is the text field name the trainer is configured with later on.
new_df = df.rename({"formatted": "text"}, axis="columns")

In [18]:
# Preview the renamed frame (columns: prompt, response, text).
new_df.head()

Unnamed: 0,prompt,response,text
0,<|im_start|>situation\n2019-09-29\n\nSamantha ...,Right. Or after some work. As my ex-husband wo...,<|im_start|>situation\n2019-09-29\n\nSamantha ...
1,<|im_start|>situation\nUser is talking to an A...,I think that the answer to this question can b...,<|im_start|>situation\nUser is talking to an A...
2,<|im_start|>situation\nUser is talking to an A...,the sink<|im_end|>,<|im_start|>situation\nUser is talking to an A...
3,<|im_start|>situation\nUser is talking to an A...,She shares the moment she first met her daught...,<|im_start|>situation\nUser is talking to an A...
4,<|im_start|>situation\nUser is talking to an A...,unloosening Kajo's bonds<|im_end|>,<|im_start|>situation\nUser is talking to an A...


In [11]:
# Persist the formatted frame to CSV, without the index column.
new_df.to_csv(path_or_buf="formatted_samantha.csv", index=False)

In [12]:
# Read the CSV written above back in as a `datasets` train split.
new_dataset = load_dataset(
    "csv",
    data_files="./formatted_samantha.csv",
    split="train",
)

Generating train split: 0 examples [00:00, ? examples/s]

In [13]:
# Turn off the model's `use_cache` config flag before building the trainer.
model.config.use_cache = False

# Supervised fine-tuning trainer: the quantized base model plus the LoRA
# adapter config, trained on the "text" column of the CSV-backed dataset.
# NOTE(review): the dataset name mentions 2048 while sequences are capped at
# 512 here; long prompts may lose their response to truncation - confirm.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    peft_config=peft_config,
    args=training_arguments,
    train_dataset=new_dataset,
    dataset_text_field="text",
    max_seq_length=512,
    packing=False,
)



Map:   0%|          | 0/128737 [00:00<?, ? examples/s]

In [14]:
# Start fine-tuning.
# NOTE(review): the recorded run below reports a training loss of 0.0 at steps
# 10-30 and was then interrupted, so no healthy run is captured in this notebook.
trainer.train()

You're using a CodeGenTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
10,0.0
20,0.0
30,0.0


KeyboardInterrupt: 

In [None]:
# NOTE(review): stray, never-executed cell. `new` is not defined anywhere in the
# visible notebook, so running this would raise NameError - likely safe to delete.
new

In [18]:
# Display the dataset object.
# NOTE(review): the recorded output shows a DatasetDict with train/test splits,
# which does not match loading with split="train" - presumably stale output
# from an earlier session; confirm by re-running on a fresh kernel.
dataset

DatasetDict({
    train: Dataset({
        features: ['prompt', 'response'],
        num_rows: 128737
    })
    test: Dataset({
        features: ['prompt', 'response'],
        num_rows: 23080
    })
})

In [5]:
pip install transformers

Collecting transformers
  Downloading transformers-4.36.2-py3-none-any.whl (8.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m57.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: transformers
Successfully installed transformers-4.36.2


In [4]:
pip uninstall -y transformers

Found existing installation: transformers 4.35.2
Uninstalling transformers-4.35.2:
  Successfully uninstalled transformers-4.35.2
