In [1]:
import re
from transformers import AutoTokenizer
from datasets import load_dataset



In [2]:
ds = load_dataset("heliosbrahma/mental_health_chatbot_dataset")

In [3]:
# Clean text function
def clean_text(text):
    # Remove tags <HUMAN>: and <ASSISTANT>:
    text = re.sub(r'<.*?>', '', text)
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text


In [4]:
# Apply cleaning function to the dataset
ds_cleaned = ds.map(lambda x: {'text': clean_text(x['text'])})

# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [5]:
# Tokenize the cleaned text
def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True)

In [6]:
# Tokenize the dataset
ds_tokenized = ds_cleaned.map(tokenize_function, batched=True)

# Preview tokenized data
print(ds_tokenized['train'][0])

{'text': ': What is a panic attack? : Panic attacks come on suddenly and involve intense and often overwhelming fear. They’re accompanied by very challenging physical symptoms, like a racing heartbeat, shortness of breath, or nausea. Unexpected panic attacks occur without an obvious cause. Expected panic attacks are cued by external stressors, like phobias. Panic attacks can happen to anyone, but having more than one may be a sign of panic disorder, a mental health condition characterized by sudden and repeated panic attacks.', 'input_ids': [101, 1024, 2054, 2003, 1037, 6634, 2886, 1029, 1024, 6634, 4491, 2272, 2006, 3402, 1998, 9125, 6387, 1998, 2411, 10827, 3571, 1012, 2027, 1521, 2128, 5642, 2011, 2200, 10368, 3558, 8030, 1010, 2066, 1037, 3868, 12251, 1010, 2460, 2791, 1997, 3052, 1010, 2030, 19029, 1012, 9223, 6634, 4491, 5258, 2302, 2019, 5793, 3426, 1012, 3517, 6634, 4491, 2024, 16091, 2094, 2011, 6327, 6911, 5668, 1010, 2066, 6887, 16429, 7951, 1012, 6634, 4491, 2064, 4148, 200

In [7]:
# Check the number of tokens in the first example
num_tokens = len(ds_tokenized['train'][0]['input_ids'])
print(f"Number of tokens in the first example: {num_tokens}")


Number of tokens in the first example: 512


In [8]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments

# Load pre-trained GPT-2 model and tokenizer
model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Ensure the tokenizer is compatible with our input format
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 doesn't have a pad token by default


In [9]:
# Format dataset for text generation
def format_for_gpt2(examples):
    # We'll use the same text for both input and output in this case (auto-regressive generation)
    return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=512)

# Apply the formatting to the dataset
ds_tokenized = ds_tokenized.map(format_for_gpt2, batched=True)

# Print the first example of the tokenized dataset
print(ds_tokenized['train'][0])


{'text': ': What is a panic attack? : Panic attacks come on suddenly and involve intense and often overwhelming fear. They’re accompanied by very challenging physical symptoms, like a racing heartbeat, shortness of breath, or nausea. Unexpected panic attacks occur without an obvious cause. Expected panic attacks are cued by external stressors, like phobias. Panic attacks can happen to anyone, but having more than one may be a sign of panic disorder, a mental health condition characterized by sudden and repeated panic attacks.', 'input_ids': [25, 1867, 318, 257, 13619, 1368, 30, 1058, 34478, 3434, 1282, 319, 6451, 290, 6211, 8157, 290, 1690, 9721, 3252, 13, 1119, 447, 247, 260, 11791, 416, 845, 9389, 3518, 7460, 11, 588, 257, 11717, 36051, 11, 1790, 1108, 286, 8033, 11, 393, 32122, 13, 471, 42072, 13619, 3434, 3051, 1231, 281, 3489, 2728, 13, 1475, 7254, 13619, 3434, 389, 269, 1739, 416, 7097, 5503, 669, 11, 588, 872, 672, 4448, 13, 34478, 3434, 460, 1645, 284, 2687, 11, 475, 1719, 517,

In [10]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split

# Load pre-trained GPT-2 model and tokenizer
model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Ensure the tokenizer is compatible with our input format
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 doesn't have a pad token by default

# Format dataset for text generation 
def format_for_gpt2(examples):
    # Tokenize the text and apply padding/truncation
    inputs = tokenizer(examples['text'], padding="max_length", truncation=True, max_length=512)
    inputs['labels'] = inputs['input_ids'].copy()  # The labels are the same as input_ids for LM
    return inputs

# Apply the formatting to the dataset (already tokenized)
ds_tokenized = ds_tokenized.map(format_for_gpt2, batched=True)

# Split the tokenized dataset into train and validation sets using the `train_test_split` from the datasets library
train_test_split = ds_tokenized['train'].train_test_split(test_size=0.2, seed=42)
train_data = train_test_split['train']
val_data = train_test_split['test']

# Define the training arguments
training_args = TrainingArguments(
    output_dir="./gpt2-finetuned",    # Directory for model output
    num_train_epochs=3,               # Number of epochs
    per_device_train_batch_size=4,    # Batch size per device
    per_device_eval_batch_size=8,     # Evaluation batch size
    warmup_steps=500,                 # Number of warmup steps
    weight_decay=0.01,                # Weight decay
    logging_dir='./logs',             # Directory for storing logs
    logging_steps=10,
    save_steps=500,                   # Save the model every 500 steps
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,         # Training dataset
    eval_dataset=val_data,            # Validation dataset
)

# Train the model
trainer.train()

# Save the fine-tuned model and tokenizer
model.save_pretrained("./gpt2-finetuned")
tokenizer.save_pretrained("./gpt2-finetuned")


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
10,7.3492
20,6.2649
30,4.8425
40,3.7253
50,2.924
60,2.4274
70,1.8486
80,1.4839
90,1.0398
100,1.2817


('./gpt2-finetuned/tokenizer_config.json',
 './gpt2-finetuned/special_tokens_map.json',
 './gpt2-finetuned/vocab.json',
 './gpt2-finetuned/merges.txt',
 './gpt2-finetuned/added_tokens.json')

In [11]:
# Evaluate the model after training
trainer.evaluate()


{'eval_loss': 0.8819199204444885,
 'eval_runtime': 2.8772,
 'eval_samples_per_second': 12.165,
 'eval_steps_per_second': 1.738,
 'epoch': 3.0}

In [12]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the fine-tuned model and tokenizer
model = GPT2LMHeadModel.from_pretrained("./gpt2-finetuned")
tokenizer = GPT2Tokenizer.from_pretrained("./gpt2-finetuned")

# Move the model to CPU or GPU 
device = torch.device('cpu') 
model.to(device)

# Generate text from a prompt
input_text = "What are some symptoms of anxiety?"
input_ids = tokenizer.encode(input_text, return_tensors='pt').to(device)

# Generate response
output = model.generate(input_ids, max_length=100, num_return_sequences=1, no_repeat_ngram_size=2)

# Decode the output and print the generated text
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


What are some symptoms of anxiety?

A few common symptoms include:
. . . a feeling of hopelessness or hopeless focus.
, . , a sense of helplessness.
