In [1]:
import os
# Set the PyTorch CUDA allocator option through its environment variable.
# This runs in the first cell, before any later cell imports torch.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [2]:
import re
import json
import os
import torch
from tqdm import tqdm
from unsloth import FastLanguageModel, is_bfloat16_supported
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import TrainerCallback, TrainingArguments
from trl import SFTTrainer

def parse_conversation(content):
    """Build single-turn training pairs from one conversation transcript.

    Only lines that begin (after optional indentation) with ``Therapist:`` or
    ``Patient:`` count as turns; any other line (tags, summaries, continuation
    lines) is ignored. Every Patient line that is immediately followed by a
    Therapist line becomes one record.

    Args:
        content: Raw text of a single conversation.

    Returns:
        A list of dicts with the keys ``instruction``, ``input`` and ``output``.
        ``input`` is the patient text followed by a newline; ``output`` is the
        fixed ``<think>...</think>`` preamble, a newline, and the therapist text.
    """
    instruction = "You are an AI CBT therapist. Respond appropriately in the following conversation, you must give only one response as a therapist:"
    think_block = "<think>Step by step, analyze the Patient's thoughts: identify patterns and distortions, evaluate supporting evidence, and restructure them into balanced, evidence-based alternatives for healthier thinking...</think>"

    # Anchor on the start of a line: an unanchored search would also accept a
    # speaker label that merely appears in the middle of some other line.
    turns = [
        (speaker, text.strip())
        for speaker, text in re.findall(
            r'^[ \t]*(Therapist|Patient):(.*)$', content, flags=re.MULTILINE
        )
    ]

    prompt_response_pairs = []

    # Walk adjacent turns and keep only Patient -> Therapist hand-offs.
    for (speaker_1, text_1), (speaker_2, text_2) in zip(turns, turns[1:]):
        if speaker_1 == "Patient" and speaker_2 == "Therapist":
            prompt_response_pairs.append({
                'instruction': instruction,
                'input': f"{text_1}\n",
                'output': f"{think_block}\n{text_2}"
            })

    return prompt_response_pairs

def process_combined_file(file_path):
    """Load a transcript file and gather the pairs from every conversation in it.

    The file is read as UTF-8 and cut at each ``</conversation>`` marker. Every
    piece that is not blank is stripped and passed to ``parse_conversation``;
    the returned lists are concatenated in file order.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        raw_text = handle.read()

    collected = []
    for chunk in raw_text.split('</conversation>'):
        trimmed = chunk.strip()
        if not trimmed:
            # Whitespace-only piece, e.g. whatever follows the last closing tag
            continue
        collected.extend(parse_conversation(trimmed))

    return collected

# Build the single-turn CBT dataset from the raw transcript file
input_file = 'Synthetic_Transcripts.txt'
output_file = 'cbt_dataset.json'

prompt_response_pairs = process_combined_file(input_file)

# Pivot the list of records into one list per column
dataset_dict = {
    column: [pair[column] for pair in prompt_response_pairs]
    for column in ('instruction', 'input', 'output')
}

# Write the column-oriented dict to disk as UTF-8 JSON
with open(output_file, 'w', encoding='utf-8') as file:
    json.dump(dataset_dict, file, ensure_ascii=False, indent=2)

print(f"Processed {len(prompt_response_pairs)} therapist-patient exchanges and saved to {output_file}")

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
Processed 30771 therapist-patient exchanges and saved to cbt_dataset.json


In [3]:
import json
import random

# Read the saved dataset back from disk
dataset_file = 'cbt_dataset.json'

with open(dataset_file, 'r', encoding='utf-8') as file:
    dataset = json.load(file)

# Count examples via the instruction column
num_samples = len(dataset["instruction"])

# Draw one index; randint includes both end points
random_index = random.randint(0, num_samples - 1)

# Print the three fields of the chosen example
print("\n🔹 **Sample from CBT Dataset** 🔹\n")
print(f"🔢 Sample Index: {random_index}")
for heading, column in (
    ("📝 Instruction:", 'instruction'),
    ("🗣️ Input:", 'input'),
    ("💡 Output:", 'output'),
):
    print(f"{heading}\n{dataset[column][random_index]}\n")


🔹 **Sample from CBT Dataset** 🔹

🔢 Sample Index: 2430
📝 Instruction:
You are an AI CBT therapist. Respond appropriately in the following conversation, you must give only one response as a therapist:

🗣️ Input:
Photography has been a nice outlet for me. I went on a hike last weekend and took some photos of the landscape. It felt good to focus on something I enjoy and be out in nature.


💡 Output:
<think>Step by step, analyze the Patient's thoughts: identify patterns and distortions, evaluate supporting evidence, and restructure them into balanced, evidence-based alternatives for healthier thinking...</think>
That's wonderful. Engaging in activities that bring you joy and help you connect with yourself and your environment is an essential part of managing stress and maintaining your mental well-being. Let's make sure to continue incorporating those activities into your routine.



In [4]:
from datasets import Dataset
from datasets import load_dataset

# Read the transcript-derived dataset written earlier
cbt_dataset_file = 'cbt_dataset.json'

with open(cbt_dataset_file, 'r', encoding='utf-8') as file:
    cbt_dataset = json.load(file)

# Fetch the train split of ShenLab/MentalChat16K from the Hugging Face hub
mentalchat_dataset = load_dataset("ShenLab/MentalChat16K", split="train")

# Re-shape every MentalChat16K row into the same instruction/input/output record
# used for the transcript data (same instruction, same <think> preamble).
mentalchat_pairs = [
    {
        "instruction": "You are an AI CBT therapist. Respond appropriately in the following conversation, you must give only one response as a therapist:",
        "input": sample["input"],
        "output": f"<think>Step by step, analyze the Patient's thoughts: identify patterns and distortions, evaluate supporting evidence, and restructure them into balanced, evidence-based alternatives for healthier thinking...</think>\n{sample['output']}",
    }
    for sample in mentalchat_dataset
]

# Append the new records column by column.  `combined_dataset` is the same
# dict object as `cbt_dataset`, so the lists loaded above grow in place.
combined_dataset = cbt_dataset
for column in ("instruction", "input", "output"):
    combined_dataset[column].extend(pair[column] for pair in mentalchat_pairs)

# Write the merged dataset to its own file
output_file = "cbt_mentalchat_dataset.json"
with open(output_file, "w", encoding="utf-8") as file:
    json.dump(combined_dataset, file, ensure_ascii=False, indent=2)

print(f"✅ Successfully merged datasets! New dataset saved as {output_file}")
print(f"🔹 Total Samples in Combined Dataset: {len(combined_dataset['instruction'])}")

✅ Successfully merged datasets! New dataset saved as cbt_mentalchat_dataset.json
🔹 Total Samples in Combined Dataset: 46855


In [5]:
import random
from datasets import Dataset

# Define model and tokenizer parameters
model_name = "unsloth/DeepSeek-R1-Distill-Qwen-1.5B-unsloth-bnb-4bit"
# NOTE(review): 4069 looks like a transposition of 4096 — confirm before changing,
# since this limit is passed to both the model loader and the trainer.
max_seq_length = 4069
dtype = None
load_in_4bit = True

# Value passed as `cache_dir` when loading the model.  It defaults to the
# original cluster path; set the CBT_MODEL_CACHE_DIR environment variable to
# point it elsewhere without editing the notebook.
model_cache_dir = os.environ.get(
    "CBT_MODEL_CACHE_DIR",
    "/mnt/batch/tasks/shared/LS_root/mounts/clusters/a10048/code/models",
)

# Load the model and tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    cache_dir=model_cache_dir,
)

# Wrap the model with LoRA adapters on the attention and MLP projections
model = FastLanguageModel.get_peft_model(
    model,
    r=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    lora_alpha=32,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

# EOS token appended to every training text, and the Alpaca prompt template
# with three positional slots: instruction, input, response.
EOS_TOKEN = tokenizer.eos_token
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# Function to format prompts
def formatting_prompts_func(examples):
    """Render a batch of examples into Alpaca-style training strings.

    Args:
        examples: Column-oriented batch providing the ``instruction``,
            ``input`` and ``output`` lists.

    Returns:
        ``{"text": [...]}`` with one string per example: the Alpaca template
        filled with instruction, input and output, followed by ``EOS_TOKEN``.
    """
    rows = zip(examples["instruction"], examples["input"], examples["output"])
    return {"text": [alpaca_prompt.format(*row) + EOS_TOKEN for row in rows]}

# Wrap the merged columns in a Hugging Face dataset and shuffle it with a fixed seed
dataset = Dataset.from_dict(combined_dataset).shuffle(seed=42)

print("Dataset size:", len(dataset))
print("Dataset features:", dataset.features)
print("First shuffled example:", dataset[0])

# Add the "text" column holding the fully formatted prompt for every example
formatted_dataset = dataset.map(formatting_prompts_func, batched=True)

# Hold out 10% as the test set, then 10% of the remainder as the validation set
train_val_test = formatted_dataset.train_test_split(test_size=0.1, seed=42)
train_val = train_val_test['train'].train_test_split(test_size=0.1, seed=42)

formatted_train_dataset, formatted_val_dataset = train_val['train'], train_val['test']
formatted_test_dataset = train_val_test['test']

print("Dataset successfully shuffled and split!")

==((====))==  Unsloth 2025.2.12: Fast Qwen2 patching. Transformers: 4.49.0.
   \\   /|    GPU: NVIDIA A100 80GB PCIe. Max memory: 79.151 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Dataset size: 46855
Dataset features: {'instruction': Value(dtype='string', id=None), 'input': Value(dtype='string', id=None), 'output': Value(dtype='string', id=None)}
First shuffled example: {'instruction': 'You are an AI CBT therapist. Respond appropriately in the following conversation, you must give only one response as a therapist:', 'input': "Hi, I'm back for session 10. Here is the summary you provided me at the end of our last session:\n", 'output': "<think>Step by step, analyze the Patient's thoughts: identify pattern

Unsloth 2025.2.12 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


Map:   0%|          | 0/46855 [00:00<?, ? examples/s]

Dataset successfully shuffled and split!


In [None]:
# Define a callback for logging losses
class LossLoggingCallback(TrainerCallback):
    """Trainer callback that appends training and evaluation losses to a text file.

    Args:
        log_file: Path of the log file.  Lines are appended, so an existing
            file is kept and extended.
    """

    def __init__(self, log_file):
        self.log_file = log_file

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Write one line for each of ``loss`` / ``eval_loss`` present in ``logs``.

        Only the local main process writes (``state.is_local_process_zero``);
        calls without a ``logs`` dict are ignored.
        """
        if not state.is_local_process_zero or logs is None:
            return

        # The directory holding the log file may not exist yet when the first
        # log event fires; create it on demand instead of failing in open().
        log_dir = os.path.dirname(self.log_file)
        if log_dir:
            os.makedirs(log_dir, exist_ok=True)

        with open(self.log_file, 'a', encoding='utf-8') as f:
            if 'loss' in logs:
                f.write(f"Step: {state.global_step}, Loss: {logs['loss']}\n")
            if 'eval_loss' in logs:
                f.write(f"Step: {state.global_step}, Eval Loss: {logs['eval_loss']}\n")

# File that LossLoggingCallback appends its loss lines to
log_file = os.path.join("outputs", "loss_log.txt")

# Batch geometry, named once so the warmup computation uses the same numbers
train_batch_size = 32
grad_accum_steps = 8
steps_per_epoch = len(formatted_train_dataset) // (train_batch_size * grad_accum_steps)

training_args = TrainingArguments(
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=grad_accum_steps,
    # 5% of (train examples // (batch size * accumulation steps))
    warmup_steps=int(0.05 * steps_per_epoch),
    num_train_epochs=2,
    gradient_checkpointing=True,
    learning_rate=5e-5,
    bf16=True,
    logging_steps=10,
    optim="adamw_torch_fused",
    weight_decay=0.005,
    lr_scheduler_type="cosine",
    seed=3407,
    output_dir="outputs",
    # Evaluate and checkpoint on the same 50-step cadence
    eval_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=50,
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=formatted_train_dataset,
    eval_dataset=formatted_val_dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,
    args=training_args,
    callbacks=[LossLoggingCallback(log_file)],
)

# Run training and keep the returned statistics
trainer_stats = trainer.train()

# Export the fine-tuned model as GGUF with q8_0 quantization
model.save_pretrained_gguf("DeepLlama", tokenizer, quantization_method="q8_0")

In [1]:
import re
import json
from xml.etree import ElementTree as ET
from unsloth import FastLanguageModel
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import TrainerCallback, TrainingArguments
from trl import SFTTrainer
from unsloth import is_bfloat16_supported
import os
import torch
from tqdm import tqdm

def parse_conversation(content):
    """Turn one conversation transcript into history-conditioned training pairs.

    The text is cut in front of every line that starts with ``Therapist:`` or
    ``Patient:``, so a turn keeps its continuation lines.  For each Therapist
    turn one record is emitted whose ``input`` is the conversation so far (one
    ``Speaker: text`` entry per earlier turn, newline-joined) and whose
    ``output`` is the fixed ``<think>...</think>`` preamble, a newline, and
    the therapist text.

    A chunk whose label is neither ``Therapist`` nor ``Patient`` (only the
    leading chunk can be one, e.g. a ``<conversation>`` tag or a title line)
    is skipped so that it cannot leak into the prompts.

    Args:
        content: Raw text of a single conversation.

    Returns:
        A list of dicts with the keys ``instruction``, ``input`` and ``output``.
    """
    known_speakers = ('Therapist', 'Patient')

    # Split the content into turns
    turns = re.split(r'\n(?=(?:Therapist:|Patient:))', content)
    conversation_history = []
    prompt_response_pairs = []

    for turn in turns:
        speaker, _, text = turn.partition(':')
        speaker = speaker.strip()
        text = text.strip()

        # Ignore preamble / markup that is not an actual conversation turn
        if speaker not in known_speakers:
            continue

        # A therapist turn is the response to everything said before it
        if speaker == 'Therapist':
            prompt_response_pairs.append({
                'instruction': "You are an AI CBT therapist. Respond appropriately in the following conversation, you must give only one response as a therapist:",
                'input': '\n'.join(conversation_history),
                'output': f"<think>Step by step, analyze the client’s thoughts: identify patterns and distortions, evaluate supporting evidence, and restructure them into balanced, evidence-based alternatives for healthier thinking...</think>\n{text}"
            })

        # Record the turn only after the pair is built, so the prompt excludes it
        conversation_history.append(f"{speaker}: {text}")

    return prompt_response_pairs

def process_combined_file(file_path):
    """Load a transcript file and gather the pairs from every conversation in it.

    The file is read as UTF-8 and cut at each ``</conversation>`` marker.  For
    every non-blank piece, the text up to and including its first newline is
    discarded and the rest is stripped; a piece without any newline is used
    whole.  The result is passed to ``parse_conversation`` and the returned
    lists are concatenated in file order.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        raw_text = handle.read()

    collected = []
    for chunk in raw_text.split('</conversation>'):
        if not chunk.strip():
            continue
        # partition() yields an empty separator when the chunk has no newline
        _first_line, newline, remainder = chunk.partition('\n')
        body = remainder.strip() if newline else chunk.strip()
        collected.extend(parse_conversation(body))

    return collected


# Build the history-conditioned dataset from the raw transcript file
input_file = 'Synthetic_Transcripts.txt'
output_file = 'cbt_dataset.json'
prompt_response_pairs = process_combined_file(input_file)

# Pivot the list of records into one list per column
dataset_dict = {
    column: [pair[column] for pair in prompt_response_pairs]
    for column in ('instruction', 'input', 'output')
}

# Write the column-oriented dict to disk as UTF-8 JSON
with open(output_file, 'w', encoding='utf-8') as file:
    json.dump(dataset_dict, file, ensure_ascii=False, indent=2)

print(f"Processed {len(prompt_response_pairs)} prompt-response pairs and saved to {output_file}")

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
Processed 31695 prompt-response pairs and saved to cbt_dataset.json


In [2]:
# Model / tokenizer settings
model_name = "unsloth/Meta-Llama-3.1-8B-bnb-4bit"
max_seq_length = 10000
dtype = None
load_in_4bit = True

# Load the 4-bit base model together with its tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Projections that receive LoRA adapters: attention first, then MLP
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Wrap the base model with rank-8 LoRA adapters
model = FastLanguageModel.get_peft_model(
    model,
    r=8,
    target_modules=lora_target_modules,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

# EOS token appended to every training text, and the Alpaca prompt template
# with three positional slots: instruction, input, response.
EOS_TOKEN = tokenizer.eos_token
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# Function to format prompts
def formatting_prompts_func(examples):
    """Fill the Alpaca template for every example in a batch.

    Reads the ``instruction``, ``input`` and ``output`` columns of
    ``examples`` and returns ``{"text": [...]}``, where each entry is the
    filled template followed by ``EOS_TOKEN``.
    """
    texts = [
        alpaca_prompt.format(instruction, input_text, response) + EOS_TOKEN
        for instruction, input_text, response in zip(
            examples["instruction"], examples["input"], examples["output"]
        )
    ]
    return {"text": texts}

# Wrap the column-oriented dict in a Hugging Face dataset
dataset = Dataset.from_dict(dataset_dict)

print("Dataset size:", len(dataset))
print("Dataset features:", dataset.features)
print("First example:", dataset[30])

# Add the "text" column holding the fully formatted prompt for every example
formatted_dataset = dataset.map(formatting_prompts_func, batched=True)

# Hold out 10% as the test set, then 10% of the remainder as the validation set
train_val_test = formatted_dataset.train_test_split(test_size=0.1, seed=42)
train_val = train_val_test['train'].train_test_split(test_size=0.1, seed=42)

formatted_train_dataset, formatted_val_dataset = train_val['train'], train_val['test']
formatted_test_dataset = train_val_test['test']


==((====))==  Unsloth 2025.2.12: Fast Llama patching. Transformers: 4.49.0.
   \\   /|    GPU: NVIDIA A100 80GB PCIe. Max memory: 79.151 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Dataset size: 31695
Dataset features: {'instruction': Value(dtype='string', id=None), 'input': Value(dtype='string', id=None), 'output': Value(dtype='string', id=None)}
First example: {'instruction': 'You are an AI CBT therapist. Respond appropriately in the following conversation, you must give only one response as a therapist:', 'input': 'Patient: Hi, I\'m back for session 12. Here is the summary you provided me at the end of our last session:\n\n- Main topics discussed: Challenging perfectionistic thoughts, addressing guilt 

Unsloth 2025.2.12 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


Map:   0%|          | 0/31695 [00:00<?, ? examples/s]

In [3]:
# Define a callback for logging losses
class LossLoggingCallback(TrainerCallback):
    """Trainer callback that appends training and evaluation losses to a text file.

    Args:
        log_file: Path of the log file.  Lines are appended, so an existing
            file is kept and extended.
    """

    def __init__(self, log_file):
        self.log_file = log_file

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Write one line for each of ``loss`` / ``eval_loss`` present in ``logs``.

        Only the local main process writes (``state.is_local_process_zero``);
        calls without a ``logs`` dict are ignored.
        """
        if not state.is_local_process_zero or logs is None:
            return

        # The directory holding the log file may not exist yet when the first
        # log event fires; create it on demand instead of failing in open().
        log_dir = os.path.dirname(self.log_file)
        if log_dir:
            os.makedirs(log_dir, exist_ok=True)

        with open(self.log_file, 'a', encoding='utf-8') as f:
            if 'loss' in logs:
                f.write(f"Step: {state.global_step}, Loss: {logs['loss']}\n")
            if 'eval_loss' in logs:
                f.write(f"Step: {state.global_step}, Eval Loss: {logs['eval_loss']}\n")

# File that LossLoggingCallback appends its loss lines to
log_file = os.path.join("outputs", "loss_log.txt")

# Configure the SFTTrainer
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=formatted_train_dataset,
    eval_dataset=formatted_val_dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,
    args=TrainingArguments(
        per_device_train_batch_size=32,
        per_device_eval_batch_size=2,
        gradient_accumulation_steps=16,
        warmup_steps=10,
        num_train_epochs=1,
        gradient_checkpointing=True,
        learning_rate=1e-4,
        # Use bf16 when the GPU supports it, otherwise fall back to fp16
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.1,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        # `evaluation_strategy` is the deprecated spelling; use `eval_strategy`
        # like the other training cells in this notebook.
        eval_strategy="steps",
        eval_steps=10,
        save_strategy="steps",
        save_steps=10,
    ),
    callbacks=[LossLoggingCallback(log_file)],
)

# Train the model
trainer_stats = trainer.train()

# Save the fine-tuned model as GGUF with q8_0 quantization
model.save_pretrained_gguf("DeepLlama", tokenizer, quantization_method="q8_0")



Applying chat template to train dataset (num_proc=2):   0%|          | 0/25672 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=2):   0%|          | 0/25672 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=2):   0%|          | 0/25672 [00:00<?, ? examples/s]

Applying chat template to eval dataset (num_proc=2):   0%|          | 0/2853 [00:00<?, ? examples/s]

Tokenizing eval dataset (num_proc=2):   0%|          | 0/2853 [00:00<?, ? examples/s]

Tokenizing eval dataset (num_proc=2):   0%|          | 0/2853 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 25,672 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 32 | Gradient Accumulation steps = 16
\        /    Total batch size = 512 | Total steps = 50
 "-____-"     Number of trainable parameters = 20,971,520
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mibrahimshaban1994[0m ([33mibrahimshaban1994-university-of-london[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wan

KeyboardInterrupt: 

In [None]:
import os
import json
from datasets import Dataset

# Location of the augmented dataset on disk
json_path = "expanded_dataset.json"

# Read the JSON file into memory
print("Loading dataset from JSON...")
with open(json_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# Wrap the loaded columns in a Hugging Face dataset
dataset = Dataset.from_dict(data)

print("Dataset size:", len(dataset))
print("Dataset features:", dataset.features)
print("First example:", dataset[0])

# Hold out 10% as the test set, then 10% of the remainder as the validation set
print("\nSplitting dataset into train/validation/test sets...")
train_val_test = dataset.train_test_split(test_size=0.1, seed=42)
train_val = train_val_test["train"].train_test_split(test_size=0.1, seed=42)

formatted_train_dataset, formatted_val_dataset = train_val["train"], train_val["test"]
formatted_test_dataset = train_val_test["test"]

# Report the size of each split
for label, split in (
    ("Train set size:", formatted_train_dataset),
    ("Validation set size:", formatted_val_dataset),
    ("Test set size:", formatted_test_dataset),
):
    print(label, len(split))

# Continue with further processing or training...

In [3]:
import torch

# Report the GPU memory figures from torch.cuda, converted from bytes to MB
if not torch.cuda.is_available():
    print("CUDA is not available.")
else:
    device = torch.device("cuda")
    bytes_per_mb = 1024 * 1024
    allocated = torch.cuda.memory_allocated(device) / bytes_per_mb  # in MB
    reserved = torch.cuda.memory_reserved(device) / bytes_per_mb    # in MB
    print(f"GPU Memory Allocated: {allocated:.2f} MB")
    print(f"GPU Memory Reserved: {reserved:.2f} MB")

GPU Memory Allocated: 0.00 MB
GPU Memory Reserved: 0.00 MB


In [None]:
# Model / tokenizer settings
model_name = "unsloth/DeepSeek-R1-Distill-Llama-8B-bnb-4bit"
max_seq_length = 10000
dtype = None
load_in_4bit = True

# Load the 4-bit base model together with its tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Projections that receive LoRA adapters: attention first, then MLP
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Wrap the base model with rank-32 LoRA adapters
model = FastLanguageModel.get_peft_model(
    model,
    r=32,
    target_modules=lora_target_modules,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

# EOS token appended to every training text, and the Alpaca prompt template
# with three positional slots: instruction, input, response.
EOS_TOKEN = tokenizer.eos_token
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# Function to format prompts
def formatting_prompts_func(examples):
    """Fill the Alpaca template for every example in a batch.

    Reads the ``Instruction``, ``Input`` and ``Assistant`` columns of
    ``examples`` (note the capitalised names) and returns ``{"text": [...]}``,
    where each entry is the filled template followed by ``EOS_TOKEN``.
    """
    columns = (examples["Instruction"], examples["Input"], examples["Assistant"])
    return {
        "text": [
            alpaca_prompt.format(instruction, input_text, response) + EOS_TOKEN
            for instruction, input_text, response in zip(*columns)
        ]
    }

# Create a Hugging Face dataset.
# formatting_prompts_func in this cell reads the "Instruction" / "Input" /
# "Assistant" columns.  `dataset_dict` is built by the transcript parser with
# lowercase keys, so mapping over it raises a KeyError on a fresh kernel.
# Use the records loaded from expanded_dataset.json (`data`) instead.
# NOTE(review): assumes expanded_dataset.json carries those three columns — confirm.
dataset = Dataset.from_dict(data)

# Fail early with a readable message if the expected columns are absent
required_columns = {"Instruction", "Input", "Assistant"}
missing_columns = required_columns - set(dataset.column_names)
if missing_columns:
    raise KeyError(
        f"Dataset is missing columns required by formatting_prompts_func: {sorted(missing_columns)}"
    )

print("Dataset size:", len(dataset))
print("Dataset features:", dataset.features)
print("First example:", dataset[30])

# Format the entire dataset (adds the "text" column)
formatted_dataset = dataset.map(formatting_prompts_func, batched=True)

# Hold out 10% as the test set, then 10% of the remainder as the validation set
train_val_test = formatted_dataset.train_test_split(test_size=0.1, seed=42)
train_val = train_val_test['train'].train_test_split(test_size=0.1, seed=42)

formatted_train_dataset = train_val['train']
formatted_val_dataset = train_val['test']
formatted_test_dataset = train_val_test['test']

In [4]:
# Sanity check: print the formatted "text" field of one training example
print("Formatted Example:", formatted_train_dataset[30]["text"])

Formatted Example: Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
You are an AI CBT therapist. Respond appropriately in the following conversation, you must give only one response as a therapist, you never break your role as a CBT therapist:

### Input:
Patient: Hi, I'm back for session 7. Here is the summary you provided me at the end of our last session:

- Main topics discussed: Managing financial stress, challenging negative thoughts related to self-worth and success, addressing feelings of guilt around relying on friends for support
- Techniques or exercises used: 
  - Behavioral activation: Engaging in planned activities like hiking and reading before bed to counteract avoidance and withdrawal. Plan to continue implementing this technique and monitor mood in response to activities.
  - Thought records: Challenging negative thoughts related to self-worth

In [5]:
import torch

# Same memory report as before, taken after the model has been loaded
cuda_ready = torch.cuda.is_available()
if not cuda_ready:
    print("CUDA is not available.")
else:
    device = torch.device("cuda")
    mb = 1024 * 1024  # bytes per MB
    allocated = torch.cuda.memory_allocated(device) / mb
    reserved = torch.cuda.memory_reserved(device) / mb
    print(f"GPU Memory Allocated: {allocated:.2f} MB")
    print(f"GPU Memory Reserved: {reserved:.2f} MB")

GPU Memory Allocated: 5626.65 MB
GPU Memory Reserved: 5628.00 MB


In [None]:
import os
from transformers import TrainerCallback, TrainingArguments
from peft import LoraConfig
from trl import SFTTrainer

# Define a callback for logging losses
class LossLoggingCallback(TrainerCallback):
    """Trainer callback that appends training and evaluation losses to a text file.

    Args:
        log_file: Path of the log file.  Lines are appended, so an existing
            file is kept and extended.
    """

    def __init__(self, log_file):
        self.log_file = log_file

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Write one line for each of ``loss`` / ``eval_loss`` present in ``logs``.

        Only the local main process writes (``state.is_local_process_zero``);
        calls without a ``logs`` dict are ignored.
        """
        if not state.is_local_process_zero or logs is None:
            return

        # The directory holding the log file may not exist yet when the first
        # log event fires; create it on demand instead of failing in open().
        log_dir = os.path.dirname(self.log_file)
        if log_dir:
            os.makedirs(log_dir, exist_ok=True)

        with open(self.log_file, 'a', encoding='utf-8') as f:
            if 'loss' in logs:
                f.write(f"Step: {state.global_step}, Loss: {logs['loss']}\n")
            if 'eval_loss' in logs:
                f.write(f"Step: {state.global_step}, Eval Loss: {logs['eval_loss']}\n")

# File that LossLoggingCallback appends its loss lines to
log_file = os.path.join("outputs", "loss_log.txt")

# Pick the mixed-precision mode once: bf16 when supported, fp16 otherwise
use_bf16 = is_bfloat16_supported()

training_args = TrainingArguments(
    per_device_train_batch_size=16,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=16,
    warmup_steps=100,
    num_train_epochs=1,
    gradient_checkpointing=True,
    learning_rate=2e-4,
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="outputs",
    # Evaluate and checkpoint on the same 500-step cadence
    eval_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
)

# Configure the SFTTrainer
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=formatted_train_dataset,
    eval_dataset=formatted_val_dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,
    args=training_args,
    callbacks=[LossLoggingCallback(log_file)],
)

# Run training and keep the returned statistics
trainer_stats = trainer.train()

# Export the fine-tuned model as GGUF with q8_0 quantization
model.save_pretrained_gguf("Fine Tuned deepmental 8b - new format", tokenizer, quantization_method="q8_0")