
Note: the model was fine-tuned on Google Colab; this notebook is included for reference.

In [None]:
# Install Libraries
!pip install transformers datasets accelerate -q
!pip install pandas scikit-learn -q
!pip install numpy==1.26.4 --q


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m83.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m23.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m55.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Load and Prepare Dataset
import pandas as pd
from datasets import Dataset

df = pd.read_csv("mental_health_dataset.csv")

# Drop incomplete rows BEFORE deriving the label mapping: a missing "status"
# would otherwise appear in unique() as NaN, which either becomes a bogus
# class or makes sorted() fail when it is compared with the string labels.
# reset_index puts the frame back on a clean 0..n-1 index after the drop.
df = df.dropna(subset=["statement", "status"]).reset_index(drop=True)

# Map text labels (e.g., Anxiety) to integers; sorting keeps the ids stable
# from one run to the next.
label2id = {label: idx for idx, label in enumerate(sorted(df["status"].unique()))}
id2label = {idx: label for label, idx in label2id.items()}

# The Hugging Face dataset is built in the next cell, once the numeric
# "label" column exists, so no Dataset is created from the raw frame here.

In [None]:
# Add numeric label column
df["label"] = df["status"].map(label2id)

# Every status must have an id: an unmapped value would become NaN here and
# silently turn the label column into floats.
assert df["label"].notna().all(), "found status values missing from label2id"
df["label"] = df["label"].astype("int64")

# Prepare Hugging Face dataset. preserve_index=False stops the pandas index
# from being carried along as an extra `__index_level_0__` column.
dataset = Dataset.from_pandas(df[["statement", "label"]], preserve_index=False)


In [None]:
# Tokenize using Roberta tokenizer
from transformers import (
    RobertaForSequenceClassification,
    RobertaTokenizer,
    Trainer,
    TrainingArguments,
)

# Hub id of the base checkpoint; the model-loading cell reuses this name.
model_ckpt = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(model_ckpt)

def preprocess_function(examples):
    """Tokenize a batch of statements for the classifier.

    Args:
        examples: a batch mapping whose "statement" entry is a sequence of
            raw texts (this function is used with ``map(..., batched=True)``).

    Returns:
        The tokenizer output for the batch, truncated to the tokenizer's
        maximum length and left unpadded.
    """
    # Ensure all statements are strings and not None
    texts = [str(x) if x is not None else "" for x in examples["statement"]]
    # No padding here: padding inside a batched map pads every example to the
    # longest text of its map batch and stores those pad tokens in the dataset.
    # The Trainer is given the tokenizer, so its default collator pads each
    # training batch to that batch's own longest sequence instead.
    return tokenizer(texts, truncation=True)

# Hold out 20% for evaluation. The seed is fixed so the same rows land in the
# train/test splits on every run, which keeps reported metrics comparable.
tokenized_dataset = dataset.train_test_split(test_size=0.2, seed=42)
tokenized_dataset = tokenized_dataset.map(preprocess_function, batched=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Map:   0%|          | 0/42144 [00:00<?, ? examples/s]

Map:   0%|          | 0/10537 [00:00<?, ? examples/s]

In [None]:
# Load and configure model
# One output per entry in label2id; both label mappings are handed to the
# model so predictions can be reported by name rather than by index.
model = RobertaForSequenceClassification.from_pretrained(
    model_ckpt,
    label2id=label2id,
    id2label=id2label,
    num_labels=len(label2id),
)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# TrainingArguments + Trainer
from transformers import TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(eval_pred):
    """Score one evaluation pass.

    Args:
        eval_pred: a ``(logits, labels)`` pair; the predicted class for each
            row is the argmax of the logits over the last axis.

    Returns:
        A dict with "accuracy" and the weighted-average "f1".
    """
    logits, labels = eval_pred
    predicted = logits.argmax(axis=-1)
    accuracy = accuracy_score(labels, predicted)
    weighted_f1 = f1_score(labels, predicted, average='weighted')
    return {"accuracy": accuracy, "f1": weighted_f1}

# Training arguments
import torch  # imported here only to detect whether a CUDA device is present

training_args = TrainingArguments(
    output_dir="./roberta-mental-health",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    # Mixed precision needs a GPU; enabling it unconditionally makes
    # TrainingArguments raise on a CPU-only runtime, so follow the hardware.
    fp16=torch.cuda.is_available(),
)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    # `tokenizer=` is deprecated on Trainer in recent transformers releases;
    # `processing_class=` is its replacement and takes the same object.
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

  trainer = Trainer(


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mlishaangral[0m ([33mlishaangral-netaji-subhas-university-of-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4458,0.454554,0.836576,0.837729
2,0.3521,0.442276,0.84768,0.846358
3,0.253,0.524967,0.860492,0.860641


TrainOutput(global_step=15804, training_loss=0.39116942034706653, metrics={'train_runtime': 4037.346, 'train_samples_per_second': 31.316, 'train_steps_per_second': 3.914, 'total_flos': 3.32671503458304e+16, 'train_loss': 0.39116942034706653, 'epoch': 3.0})

In [None]:
# Save the tuned model
# Tokenizer files first, then the model, both into the same directory so it
# can be reloaded later with from_pretrained().
for artifact in (tokenizer, model):
    artifact.save_pretrained("./roberta-mental-health")

In [None]:
!pip install nltk
import nltk
import os

# Loading tokenizer
tokenizer = RobertaTokenizer.from_pretrained("./roberta-mental-health")

# Downloading punkt tokenizer
nltk.download('punkt_tab')

# Reading the book
book_path = "./Diagnostic and Statistical Manual of Mental Disorders (5th ed.).txt"
with open(book_path, mode="r", encoding="utf-8") as book_file:
    text = book_file.read()

# Split into sentences (to avoid cutting mid-sentence)
from nltk.tokenize import sent_tokenize
sentences = sent_tokenize(text)

# Tokenize one sentence per example, truncating anything longer than
# block_size tokens. Sentences are NOT padded here: padding every sentence to
# 512 tokens makes each training step spend most of its work on <pad>
# positions. The MLM data collator pads each batch to its own longest
# sentence instead, and uses the special-tokens mask requested below so that
# special tokens are never selected for masking.
block_size = 512
inputs = tokenizer(sentences, return_special_tokens_mask=True, truncation=True, max_length=block_size)

# Create Hugging Face Dataset
dataset2 = Dataset.from_dict(inputs)



[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [None]:
from transformers import RobertaForMaskedLM

# Reload the weights saved by the classification run, this time under a
# masked-language-modelling head. That directory was written from a
# sequence-classification model, so it contains no LM-head weights and the
# head starts from a fresh initialisation (see the warning under this cell).
mlm_init_dir = "./roberta-mental-health"
model = RobertaForMaskedLM.from_pretrained(mlm_init_dir)

Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at ./roberta-mental-health and are newly initialized: ['lm_head.bias', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
training_args = TrainingArguments(
    # Checkpointing: write every 500 steps, keep only the two newest
    output_dir="../models/roberta-mental-health-v2",  # Save new version
    overwrite_output_dir=True,
    save_steps=500,
    save_total_limit=2,
    # Optimisation schedule
    num_train_epochs=3,
    per_device_train_batch_size=8,
    learning_rate=5e-5,
    warmup_steps=100,
    weight_decay=0.01,
    # Reporting: no evaluation pass, training loss logged every 100 steps
    eval_strategy="no",
    logging_steps=100,
)

In [None]:
# Data Collator for MLM
from transformers import DataCollatorForLanguageModeling

# Masked-LM batching with a 15% masking probability.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

In [None]:
# Train again on Psychology book

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset2,
    # `tokenizer=` is deprecated on Trainer in recent transformers releases;
    # `processing_class=` is its replacement and takes the same object.
    processing_class=tokenizer,
    data_collator=data_collator,
)

train_output = trainer.train()

# train() only writes intermediate checkpoint-* folders under output_dir.
# Save the final weights (and the tokenizer given to the Trainer) into
# output_dir itself so that directory can be loaded with from_pretrained().
trainer.save_model(training_args.output_dir)

# Keep the training summary as the cell's displayed result.
train_output


  trainer = Trainer(


Step,Training Loss
100,11.0767
200,6.6771
300,5.0714
400,4.4124
500,4.2
600,3.8536
700,3.601
800,3.4838
900,3.3567
1000,3.2456


TrainOutput(global_step=7491, training_loss=2.4205393416272702, metrics={'train_runtime': 3062.8121, 'train_samples_per_second': 19.565, 'train_steps_per_second': 2.446, 'total_flos': 1.57761828880128e+16, 'train_loss': 2.4205393416272702, 'epoch': 3.0})

In [None]:
# Log this session in to the Hugging Face Hub; the push_to_hub calls in the
# next cell need an authenticated session.
from huggingface_hub import notebook_login
notebook_login()


In [None]:
# Load the model and tokenizer from the output directory of the MLM run.
# NOTE(review): the Trainer that wrote to this directory was training a
# RobertaForMaskedLM. If the directory holds only those weights, the
# classification head created by this load is newly initialised rather than
# the one fine-tuned earlier in the notebook — confirm before publishing.
model = RobertaForSequenceClassification.from_pretrained("../models/roberta-mental-health-v2")
tokenizer = RobertaTokenizer.from_pretrained("../models/roberta-mental-health-v2")

# Define repo name (<user>/<model> on the Hugging Face Hub)
repo_name = "lishaangral/roberta-mental-health"

# Push model and tokenizer to Hugging Face Hub
model.push_to_hub(repo_name)
tokenizer.push_to_hub(repo_name)