In [6]:
!pip install -q torch peft==0.4.0 transformers accelerate tqdm einops scipy trl bitsandbytes

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/72.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m293.4/293.4 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.1/69.1 MB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m29.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [7]:
import os
from dataclasses import dataclass, field
from typing import Optional

In [8]:
import torch
from datasets import load_dataset, load_from_disk
from peft import LoraConfig, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments
)

In [9]:
from tqdm.notebook import tqdm

In [10]:
from trl import SFTTrainer

In [11]:
from huggingface_hub import interpreter_login

In [None]:
interpreter_login()

In [12]:
dataset = load_dataset("Amod/mental_health_counseling_conversations", split="train")

README.md:   0%|          | 0.00/2.82k [00:00<?, ?B/s]

combined_dataset.json:   0%|          | 0.00/4.79M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3512 [00:00<?, ? examples/s]

In [13]:
import pandas as pd

In [14]:
df = pd.DataFrame(dataset)

In [15]:
df.head(2)

Unnamed: 0,Context,Response
0,I'm going through some things with my feelings...,"If everyone thinks you're worthless, then mayb..."
1,I'm going through some things with my feelings...,"Hello, and thank you for your question and see..."


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3512 entries, 0 to 3511
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Context   3512 non-null   object
 1   Response  3512 non-null   object
dtypes: object(2)
memory usage: 55.0+ KB


In [17]:
def format_row(row):
  question= row['Context']
  answer = row['Response']
  formatted_string = f"[INST] {question} [/INST] {answer}"
  return formatted_string

In [18]:
df["Formatted"] = df.apply(format_row,axis=1)

In [19]:
df["Formatted"]

Unnamed: 0,Formatted
0,[INST] I'm going through some things with my f...
1,[INST] I'm going through some things with my f...
2,[INST] I'm going through some things with my f...
3,[INST] I'm going through some things with my f...
4,[INST] I'm going through some things with my f...
...,...
3507,[INST] My grandson's step-mother sends him to ...
3508,[INST] My boyfriend is in recovery from drug a...
3509,[INST] The birth mother attempted suicide seve...
3510,[INST] I think adult life is making him depres...


In [20]:
new_df = df.rename(columns = {"Formatted":"Text"})

In [21]:
new_df

Unnamed: 0,Context,Response,Text
0,I'm going through some things with my feelings...,"If everyone thinks you're worthless, then mayb...",[INST] I'm going through some things with my f...
1,I'm going through some things with my feelings...,"Hello, and thank you for your question and see...",[INST] I'm going through some things with my f...
2,I'm going through some things with my feelings...,First thing I'd suggest is getting the sleep y...,[INST] I'm going through some things with my f...
3,I'm going through some things with my feelings...,Therapy is essential for those that are feelin...,[INST] I'm going through some things with my f...
4,I'm going through some things with my feelings...,I first want to let you know that you are not ...,[INST] I'm going through some things with my f...
...,...,...,...
3507,My grandson's step-mother sends him to school ...,Absolutely not! It is never in a child's best ...,[INST] My grandson's step-mother sends him to ...
3508,My boyfriend is in recovery from drug addictio...,I'm sorry you have tension between you and you...,[INST] My boyfriend is in recovery from drug a...
3509,The birth mother attempted suicide several tim...,"The true answer is, ""no one can really say wit...",[INST] The birth mother attempted suicide seve...
3510,I think adult life is making him depressed and...,How do you help yourself to believe you requir...,[INST] I think adult life is making him depres...


In [22]:
new_df = new_df[["Text"]]

In [23]:
new_df.head(3)

Unnamed: 0,Text
0,[INST] I'm going through some things with my f...
1,[INST] I'm going through some things with my f...
2,[INST] I'm going through some things with my f...


In [24]:
new_df.to_csv("formatted_data.csv", index=False)

In [25]:
training_dataset = load_dataset("csv",data_files= "formatted_data.csv",split= "train")

Generating train split: 0 examples [00:00, ? examples/s]

In [26]:
training_dataset

Dataset({
    features: ['Text'],
    num_rows: 3512
})

In [35]:
base_model = "microsoft/phi-2"
new_model = "phi2-mental-health"

tokenizer = AutoTokenizer.from_pretrained(base_model, use_fst = True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

bnb_config = BitsAndBytesConfig(
  load_in_4bit = True,
  bnb_4bit_quant_type =  "nf4" ,
  bnb_4bit_compute_dtype = torch.float16,
  bnb_4bit_use_double_quant =  False,
)

model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config = bnb_config,
    trust_remote_code=True,
    flash_attn = True,
    flash_rotary = True,
    fused_dense = True,
    low_cpu_mem_usage = True,
    device_map = {" ", 0},
    revision = "refs/pr/23"
)

model.config.use_cache = False
model.config.pretraining_tp = 1

model = prepare_model_for_kbit_training(
    model,
    use_gradient_checkpointing=True
)

training_arguments = TrainingArguments(
    output_dir = "./mhGPT",
    num_train_epochs = 2,
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 32,
    evaluation_strategy = "steps",
    eval_steps = 1500,
    logging_steps =  15,
    optim = "paged_adamw_8bit",
    learning_rate = 2e-4,
    lr_scheduler_type = "cosine",
    save_steps = 1500,
    warmup_ratio = 0.05,
    weight_decay = 0.01,
    max_steps = -1
    )

peft_config = LoraConfig(
    r = 32,
    lora_alpha = 64,
    lora_dropout = 0.05,
    bias_type = "none",
    task_type = "CAUSAL_LM",
    target_modules = ["Wqkv","fc1","fc2"]
)

trainer = SFTTrainer(
    model = model,
    train_dataset = training_dataset,
    peft_config = peft_config,
    dataset_text_config = "Text",
   max_seq_length = 690,
    tokenizer = tokenizer,
    args = training_arguments
)

AttributeError: 'set' object has no attribute 'values'

In [None]:
trainer.train()