In [145]:
# Happy Brain Mental Health Chatbot Training Notebook
import os
import torch
import pandas as pd
import numpy as np
from datasets import load_dataset, concatenate_datasets, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling

# Choose the compute device: the first CUDA GPU when PyTorch reports one,
# otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print("Using device:", device)


Using device: cpu


In [146]:
print('Loading datasets from Hugging Face...')

# Mental Health FAQ: a CSV file read straight from a Hugging Face dataset repo.
# Best-effort load: any failure is reported and mh_faq is set to None.
try:
    mh_faq = load_dataset('csv', data_files='hf://datasets/tolu07/Mental_Health_FAQ/Mental_Health_FAQ.csv')
except Exception as load_error:
    print('Error loading MH FAQ dataset from HF:', load_error)
    mh_faq = None
else:
    print('MH FAQ dataset loaded from HF')

Loading datasets from Hugging Face...
MH FAQ dataset loaded from HF


In [147]:
# Mental Health Counseling Conversations, loaded by dataset repo id.
# Best-effort load: any failure is reported and mh_counseling is set to None.
try:
    mh_counseling = load_dataset('Amod/mental_health_counseling_conversations')
except Exception as load_error:
    print('Error loading Mental Health Counseling Conversations:', load_error)
    mh_counseling = None
else:
    print('Mental Health Counseling Conversations loaded')

Mental Health Counseling Conversations loaded


In [148]:
# AI Medical Chatbot dialogues, read from a single parquet file.
# Best-effort load: any failure is reported and ai_medical is set to None.
try:
    ai_medical = load_dataset('parquet', data_files='hf://datasets/ruslanmv/ai-medical-chatbot/dialogues.parquet')
except Exception as load_error:
    print('Error loading AI Medical Chatbot dialogues:', load_error)
    ai_medical = None
else:
    print('AI Medical Chatbot dialogues loaded')

AI Medical Chatbot dialogues loaded


In [149]:
# ChatDoctor-HealthCareMagic, read from its train parquet shard.
# Best-effort load: any failure is reported and chatdoctor is set to None.
try:
    chatdoctor = load_dataset('parquet', data_files='hf://datasets/lavita/ChatDoctor-HealthCareMagic-100k/data/train-00000-of-00001-5e7cb295b9cff0bf.parquet')
except Exception as load_error:
    print('Error loading ChatDoctor dataset:', load_error)
    chatdoctor = None
else:
    print('ChatDoctor dataset loaded')

ChatDoctor dataset loaded


In [150]:
# Mental Health Chatbot dataset, read from its train parquet shard.
# Best-effort load: any failure is reported and mh_chatbot is set to None.
try:
    mh_chatbot = load_dataset('parquet', data_files='hf://datasets/heliosbrahma/mental_health_chatbot_dataset/data/train-00000-of-00001-01391a60ef5c00d9.parquet')
except Exception as load_error:
    print('Error loading Mental Health Chatbot dataset from HF:', load_error)
    mh_chatbot = None
else:
    print('Mental Health Chatbot dataset loaded from HF')

Mental Health Chatbot dataset loaded from HF


In [151]:
# Pair each Hugging Face dataset (None when its load failed) with the source
# tag that is later passed to the formatting step.
hf_datasets = list(zip(
    (mh_faq, mh_counseling, ai_medical, chatdoctor, mh_chatbot),
    ('mh_faq', 'mh_counseling', 'ai_medical', 'chatdoctor', 'mh_chatbot'),
))


In [152]:
print('Loading local CSV files...')

# Mental Health FAQ kept as a local CSV; left as None when it cannot be read.
try:
    local_mh_faq = pd.read_csv('./data/mental_health_faq.csv', encoding='utf-8')
except Exception as read_error:
    print('Error loading local Mental Health FAQ CSV:', read_error)
    local_mh_faq = None
else:
    print('Local Mental Health FAQ CSV loaded, columns:', list(local_mh_faq.columns))

# Pre-transformed chatbot CSV, looked up in the working directory; left as
# None when it cannot be read.
try:
    transformed_mh_chatbot = pd.read_csv("transformed_mental_health_chatbot.csv")
except Exception as read_error:
    print("Error loading transformed_mental_health_chatbot.csv:", read_error)
    transformed_mh_chatbot = None
else:
    print("Loaded transformed_mental_health_chatbot.csv with", len(transformed_mh_chatbot), "rows")

Loading local CSV files...
Local Mental Health FAQ CSV loaded, columns: ['Question_ID', 'Questions', 'Answers']
Error loading transformed_mental_health_chatbot.csv: [Errno 2] No such file or directory: 'transformed_mental_health_chatbot.csv'


In [153]:
# Friend-mode / professional-mode responses CSV; left as None when it cannot
# be read.
try:
    responses = pd.read_csv('./data/Mental Health Chatbot Dataset - Friend mode and Professional mode Responses.csv', encoding='utf-8')
except Exception as read_error:
    print('Error loading Responses CSV:', read_error)
    responses = None
else:
    print('Responses CSV loaded, columns:', list(responses.columns))

Responses CSV loaded, columns: ['User Input', 'Friend Mode Response', 'Professional Mode Response']


In [154]:
# Clean local_mh_faq: build one 'User: ...\nAssistant: ...' training sample per row
if local_mh_faq is not None:
    # Question/answer columns are located by a case-insensitive name match.
    # Identifier columns are excluded: the loaded CSV exposes
    # ['Question_ID', 'Questions', 'Answers'], so without this filter
    # 'Question_ID' is the first match and the ID would be used as the
    # user's question.
    possible_q = [col for col in local_mh_faq.columns
                  if 'question' in col.lower() and not col.lower().endswith('id')]
    possible_a = [col for col in local_mh_faq.columns
                  if 'answer' in col.lower() and not col.lower().endswith('id')]

    if possible_q and possible_a:
        # fillna('') keeps missing cells from being rendered as the literal
        # string 'nan' by astype(str).
        question_text = local_mh_faq[possible_q[0]].fillna('').astype(str)
        answer_text = local_mh_faq[possible_a[0]].fillna('').astype(str)
        local_mh_faq['text'] = 'User: ' + question_text + '\nAssistant: ' + answer_text
    elif 'text' not in local_mh_faq.columns:
        # No usable question/answer columns and no ready-made 'text' column:
        # join every non-missing cell of the row into a single string.
        local_mh_faq['text'] = local_mh_faq.apply(lambda row: ' '.join(row.dropna().astype(str)), axis=1)
    print('Local MH FAQ cleaned.')

Local MH FAQ cleaned.


In [155]:
# Clean responses CSV: Create separate texts for friend mode and professional mode
if responses is not None:
    required_columns = {'User Input', 'Friend Mode Response', 'Professional Mode Response'}
    if required_columns.issubset(responses.columns):
        # fillna('') keeps missing cells from being rendered as the literal
        # string 'nan' by astype(str); the fallback branch below likewise
        # leaves missing cells out.
        user_input = responses['User Input'].fillna('').astype(str)
        friend_reply = responses['Friend Mode Response'].fillna('').astype(str)
        professional_reply = responses['Professional Mode Response'].fillna('').astype(str)
        responses['friend_text'] = 'User: ' + user_input + '\nAssistant: ' + friend_reply
        responses['professional_text'] = 'User: ' + user_input + '\nAssistant: ' + professional_reply
    else:
        # Unexpected schema: join every non-missing cell of the row instead.
        responses['text'] = responses.apply(lambda row: ' '.join(row.dropna().astype(str)), axis=1)
    print('Responses CSV cleaned by concatenation.')

print('Local CSV files loading and cleaning complete.')

Responses CSV cleaned by concatenation.
Local CSV files loading and cleaning complete.


In [156]:
# Re-read the local Mental Health FAQ CSV (same path as the earlier load cell).
# The fresh frame replaces the cleaned one, so the 'text' column built before
# this point is discarded and has to be rebuilt afterwards.
try:
    local_mh_faq = pd.read_csv('./data/mental_health_faq.csv', encoding='utf-8')
except Exception as read_error:
    print('Error loading local Mental Health FAQ CSV:', read_error)
    local_mh_faq = None
else:
    print('Local Mental Health FAQ CSV loaded, columns:', list(local_mh_faq.columns))

Local Mental Health FAQ CSV loaded, columns: ['Question_ID', 'Questions', 'Answers']


In [157]:
# Re-read the friend-mode / professional-mode responses CSV (same path as the
# earlier load cell). The fresh frame replaces the cleaned one, so the derived
# text columns have to be rebuilt afterwards.
try:
    responses = pd.read_csv('./data/Mental Health Chatbot Dataset - Friend mode and Professional mode Responses.csv', encoding='utf-8')
except Exception as read_error:
    print('Error loading Responses CSV:', read_error)
    responses = None
else:
    print('Responses CSV loaded, columns:', list(responses.columns))

Responses CSV loaded, columns: ['User Input', 'Friend Mode Response', 'Professional Mode Response']


In [158]:
# Clean local_mh_faq: build one 'User: ...\nAssistant: ...' training sample per row
if local_mh_faq is not None:
    # Question/answer columns are located by a case-insensitive name match.
    # Identifier columns are excluded: the loaded CSV exposes
    # ['Question_ID', 'Questions', 'Answers'], so without this filter
    # 'Question_ID' is the first match and the ID would be used as the
    # user's question.
    possible_q = [col for col in local_mh_faq.columns
                  if 'question' in col.lower() and not col.lower().endswith('id')]
    possible_a = [col for col in local_mh_faq.columns
                  if 'answer' in col.lower() and not col.lower().endswith('id')]

    if possible_q and possible_a:
        # fillna('') keeps missing cells from being rendered as the literal
        # string 'nan' by astype(str).
        question_text = local_mh_faq[possible_q[0]].fillna('').astype(str)
        answer_text = local_mh_faq[possible_a[0]].fillna('').astype(str)
        local_mh_faq['text'] = 'User: ' + question_text + '\nAssistant: ' + answer_text
    elif 'text' not in local_mh_faq.columns:
        # No usable question/answer columns and no ready-made 'text' column:
        # join every non-missing cell of the row into a single string.
        local_mh_faq['text'] = local_mh_faq.apply(lambda row: ' '.join(row.dropna().astype(str)), axis=1)
    print('Local MH FAQ cleaned.')

Local MH FAQ cleaned.


In [159]:
# Clean responses CSV: Create separate texts for friend mode and professional mode
if responses is not None:
    required_columns = {'User Input', 'Friend Mode Response', 'Professional Mode Response'}
    if required_columns.issubset(responses.columns):
        # fillna('') keeps missing cells from being rendered as the literal
        # string 'nan' by astype(str); the fallback branch below likewise
        # leaves missing cells out.
        user_input = responses['User Input'].fillna('').astype(str)
        friend_reply = responses['Friend Mode Response'].fillna('').astype(str)
        professional_reply = responses['Professional Mode Response'].fillna('').astype(str)
        responses['friend_text'] = 'User: ' + user_input + '\nAssistant: ' + friend_reply
        responses['professional_text'] = 'User: ' + user_input + '\nAssistant: ' + professional_reply
    else:
        # Unexpected schema: join every non-missing cell of the row instead.
        responses['text'] = responses.apply(lambda row: ' '.join(row.dropna().astype(str)), axis=1)
    print('Responses CSV cleaned by concatenation.')

print('Local CSV files loading and cleaning complete.')

Responses CSV cleaned by concatenation.
Local CSV files loading and cleaning complete.


In [160]:
# Lookup rules per source tag: (passthrough columns, (user, assistant) pairs).
#   passthrough - columns whose value already is the complete sample text
#   pairs       - column pairs rendered as 'User: ...\nAssistant: ...'
# NOTE(review): 'Questions'/'Answers' matches the column list printed for the
# local FAQ CSV. 'Context'/'Response', 'Patient'/'Doctor' and the chatdoctor
# 'input' column are the schemas these Hugging Face dumps are believed to use
# - confirm against each dataset card.
_FORMAT_RULES = {
    'mh_faq': (('text',), (('Questions', 'Answers'), ('question', 'answer'))),
    'local_mh_faq': (('text',), (('Questions', 'Answers'), ('question', 'answer'))),
    'mh_counseling': (('conversation',), (('Context', 'Response'),)),
    'ai_medical': (('dialogue', 'text'), (('Patient', 'Doctor'),)),
    'chatdoctor': (('text',), (('instruction', 'output'),)),
    'mh_chatbot': (('text', 'conversation'), (('question', 'answer'),)),
    'responses_friend': (('friend_text',), ()),
    'responses_professional': (('professional_text',), ()),
}


def _field_text(value):
    """Return ``value`` as a string, mapping a missing value (None) to ''."""
    return '' if value is None else str(value)


def format_sample(example, source):
    """Turn one dataset row into the text used as a training sample.

    Args:
        example: Mapping of column name to value for a single row.
        source: Tag naming the dataset the row came from; it selects which
            columns are looked up.

    Returns:
        The value of the first passthrough column present in ``example``
        (returned unchanged), otherwise a 'User: ...\nAssistant: ...' string
        built from the first column pair that is fully present, otherwise
        ``str(example)`` when no rule matches or the source is unknown.
    """
    passthrough, pairs = _FORMAT_RULES.get(source, ((), ()))

    for column in passthrough:
        if column in example:
            return example[column]

    for user_column, assistant_column in pairs:
        if user_column in example and assistant_column in example:
            # Fields are coerced to text so a missing (None) or non-string
            # value cannot raise during concatenation.
            user_text = _field_text(example[user_column])
            if user_column == 'instruction' and 'input' in example:
                # Instruction-style rows may carry the actual user message in
                # a separate 'input' column; append it when non-empty.
                extra = _field_text(example['input']).strip()
                if extra:
                    user_text = user_text + '\n' + extra
            return 'User: ' + user_text + '\nAssistant: ' + _field_text(example[assistant_column])

    return str(example)

In [161]:
# Function to apply formatting to a dataset from Hugging Face
def preprocess_dataset(dataset, source):
    """Format one split of a loaded dataset into a single 'text' column.

    Args:
        dataset: Mapping of split name to split (as returned by the load
            cells), or None when loading failed.
        source: Source tag forwarded to ``format_sample``.

    Returns:
        None when ``dataset`` is None; otherwise the result of mapping
        ``format_sample`` over the 'train' split, or over the first split when
        there is no 'train' split.
    """
    if dataset is None:
        return None
    split = 'train' if 'train' in dataset else list(dataset.keys())[0]
    data = dataset[split]
    # Drop the source-specific input columns so every formatted dataset ends
    # up with only 'text', like the locally built datasets that are
    # concatenated with it. The 'text' produced by the function is kept even
    # when the input already had a column of that name.
    formatted = data.map(
        lambda x: {'text': format_sample(x, source)},
        remove_columns=data.column_names,
    )
    return formatted

print('Preprocessing function defined.')
# Removed duplicate and improperly indented code

# Function to preprocess CSV datasets
def preprocess_csv_dataset(df, source):
    """Build a formatted dataset from a question/answer DataFrame.

    Args:
        df: Frame with 'question' and 'answer' columns.
        source: Tag stored in the 'source' field of every formatted row.

    Returns:
        A dataset with the original 'question'/'answer' values plus a 'text'
        field ('Question: ...\nAnswer: ...') and a 'source' field.

    NOTE(review): this uses 'Question:'/'Answer:' markers while the other
    cells build 'User:'/'Assistant:' samples - confirm which is intended
    before mixing the outputs.
    """
    # Create a dataset from the dataframe
    dataset = Dataset.from_dict({
        "question": df["question"].tolist(),
        "answer": df["answer"].tolist(),
    })

    # The formatter is nested so it can read ``source`` from the enclosing
    # call instead of relying on a global of that name.
    def format_example(example):
        return {
            "text": f"Question: {example['question']}\nAnswer: {example['answer']}",  # Format as question-answer pairs
            "source": source,
        }

    return dataset.map(format_example)

Preprocessing function defined.


In [162]:
formatted_datasets = []

# Format every Hugging Face dataset that loaded; sources whose load failed
# (None) or whose formatting produced nothing are skipped.
for ds, source in hf_datasets:
    if ds is None:
        continue
    formatted = preprocess_dataset(ds, source)
    if formatted is None:
        continue
    formatted_datasets.append(formatted)
    print('Added', source, 'to formatted datasets')

Added mh_faq to formatted datasets
Added mh_counseling to formatted datasets
Added ai_medical to formatted datasets
Added chatdoctor to formatted datasets
Added mh_chatbot to formatted datasets


In [163]:
# Add the locally loaded frames, keeping only their text column.
if local_mh_faq is not None:
    faq_text_frame = local_mh_faq[['text']]
    local_mh_faq_ds = Dataset.from_pandas(faq_text_frame)
    formatted_datasets.append(local_mh_faq_ds)
    print('Added local_mh_faq to formatted datasets')

if responses is not None:
    # Friend-mode and professional-mode replies become two separate datasets,
    # each with its text column renamed to 'text'.
    response_columns = responses.columns

    if 'friend_text' in response_columns:
        friend_frame = responses[['friend_text']].rename(columns={'friend_text': 'text'})
        friend_ds = Dataset.from_pandas(friend_frame)
        formatted_datasets.append(friend_ds)
        print('Added responses_friend to formatted datasets')

    if 'professional_text' in response_columns:
        professional_frame = responses[['professional_text']].rename(columns={'professional_text': 'text'})
        prof_ds = Dataset.from_pandas(professional_frame)
        formatted_datasets.append(prof_ds)
        print('Added responses_professional to formatted datasets')

Added local_mh_faq to formatted datasets
Added responses_friend to formatted datasets
Added responses_professional to formatted datasets


In [164]:
# Merge every formatted dataset into one; combined_dataset is None when
# nothing was collected.
if not formatted_datasets:
    print('No datasets were successfully processed')
    combined_dataset = None
else:
    combined_dataset = concatenate_datasets(formatted_datasets)
    print('Combined dataset created with', len(combined_dataset), 'samples')

Combined dataset created with 373371 samples


In [165]:
# Initialize tokenizer
# The same model_name is used later to load the model weights, so tokenizer
# and model come from the same checkpoint.
model_name = 'gpt2'  # Base model to fine-tune
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token as the padding token.
# NOTE(review): presumably needed because this checkpoint's tokenizer defines
# no pad token of its own - confirm.
tokenizer.pad_token = tokenizer.eos_token

In [166]:
# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize a batch of samples, truncating each to at most 512 tokens.

    No padding is added here: the language-modeling data collator used for
    training pads each batch to its own longest sequence, so storing every
    sample padded to 512 tokens only costs memory and compute.

    Args:
        examples: Batch mapping that contains a 'text' list.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(examples['text'], truncation=True, max_length=512)

if combined_dataset is not None:
    tokenized_dataset = combined_dataset.map(tokenize_function, batched=True)
    print('Dataset tokenized')

    # Split the dataset 90/10. combined_dataset is a concatenation of the
    # sources in a fixed order, so slicing off the tail would build the
    # validation set only from the last-appended sources; a seeded shuffled
    # split keeps it representative and reproducible.
    split_datasets = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
    train_dataset = split_datasets['train']
    test_dataset = split_datasets['test']

    print('Dataset split into', len(train_dataset), 'training samples and', len(test_dataset), 'validation samples')
else:
    print('No dataset to tokenize')


Dataset tokenized
Dataset split into 336033 training samples and 37338 validation samples


In [167]:
# Load the base causal language model named by model_name
model = AutoModelForCausalLM.from_pretrained(model_name)

# Training configuration, grouped by concern
# NOTE(review): with the 37338 validation samples reported by the split cell
# and an eval batch size of 4, each evaluation pass is roughly 9300 batches
# and one is triggered every 400 training steps - confirm this cost is intended.
training_args = TrainingArguments(
    # Output locations and reporting
    output_dir='./happy_brain',  # Use the new model name
    overwrite_output_dir=True,
    logging_dir='./logs',
    report_to='none',
    # Optimisation schedule
    num_train_epochs=3,
    warmup_steps=500,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # Step-based logging, evaluation and checkpointing
    logging_steps=100,
    evaluation_strategy='steps',
    eval_steps=400,
    save_steps=800,
    save_total_limit=2,
    load_best_model_at_end=True,
)



In [168]:
# Batch collator for causal language modeling (mlm=False disables masked-LM
# token masking)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [169]:
# Wire model, configuration, data splits and collator into a Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
)

print('Training setup complete')


Training setup complete


In [170]:
# Train the model
trainer.train()

# Save the model and its tokenizer into the same directory so both can be
# reloaded from it with from_pretrained()
trainer.save_model('./happy_brain')
tokenizer.save_pretrained('./happy_brain')

print('Model trained and saved as "happy_brain"')


Step,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# Reload the fine-tuned model and its tokenizer from the directory the
# training cell saved them to
saved_model_dir = './happy_brain'
inference_model = AutoModelForCausalLM.from_pretrained(saved_model_dir)
inference_tokenizer = AutoTokenizer.from_pretrained(saved_model_dir)

In [None]:
# Function to generate responses
def generate_response(prompt, max_length=100):
    """Generate the assistant's reply to a single user message.

    The message is wrapped in the 'User: ...\nAssistant:' template and sampled
    from ``inference_model``.

    Args:
        prompt: The user's message.
        max_length: Upper bound passed to ``generate`` as ``max_length``
            (prompt tokens count towards it).

    Returns:
        The generated reply with surrounding whitespace removed, cut off at
        the start of any further 'User:' turn the model produced.
    """
    inputs = inference_tokenizer('User: ' + prompt + '\nAssistant:', return_tensors='pt')
    prompt_token_count = inputs['input_ids'].shape[1]
    outputs = inference_model.generate(
        inputs['input_ids'],
        # Pass the mask explicitly: pad and EOS share an id, so it cannot be
        # inferred reliably from the input ids.
        attention_mask=inputs.get('attention_mask'),
        max_length=max_length,
        num_return_sequences=1,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        pad_token_id=inference_tokenizer.eos_token_id
    )
    # Decode only the newly generated tokens; splitting the full text on
    # 'Assistant:' returns the wrong piece when the user's message itself
    # contains that marker.
    generated_ids = outputs[0][prompt_token_count:]
    response = inference_tokenizer.decode(generated_ids, skip_special_tokens=True)
    # Keep only the assistant's turn if the model went on to invent another one
    response = response.split('\nUser:')[0].strip()
    return response

# Smoke-test the reloaded model on a few sample user messages
test_prompts = [
    "I'm feeling really anxious about my upcoming exam.",
    "I've been feeling sad lately and I don't know why.",
    "How can I improve my mental health?",
]

for user_message in test_prompts:
    # Generate first, then print the exchange, so each message and its reply
    # appear together in the output.
    reply = generate_response(user_message)
    print('User:', user_message)
    print('Happy Brain:', reply)
    print('---')

print('Inference test complete')