In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from transformers import pipeline

# https://huggingface.co/mental/mental-roberta-base
#or
# https://huggingface.co/mental/mental-bert-base-uncased -- USED THIS

plt.style.use('ggplot')

MODEL = "mental/mental-bert-base-uncased"
# Uncomment this and command on line 5 in loading and preparing data
#  SampleSize = 10000

DATA_CSV_PATH = 'dataset/Combined Data 2.csv'

TEXT_COLUMN = 'statement'
LABEL_COLUMN = 'status'

LABELS = ['Anxiety', 'Bipolar', 'Stress', 'Depression', 'Normal', 'Personality disorder', 'Suicidal']
NUM_LABELS = len(LABELS)
label_to_id = {label: i for i, label in enumerate(LABELS)}
id_to_label = {i: label for i, label in enumerate(LABELS)}

TRAINING_ARGS = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="none"
)

## Loading and Preparing Data


In [10]:
print(f"Loading data from: {DATA_CSV_PATH}")
df_data = pd.read_csv(DATA_CSV_PATH)
# UNCOMMENT TO SET SPECIFIED SAMPLE SIZE
# df_data = df_data.sample(SampleSize)
print("Original DataFrame head:")
print(df_data.head())
print(f"Original DataFrame shape: {df_data.shape}")

df_data.dropna(subset=[TEXT_COLUMN, LABEL_COLUMN], inplace=True)
df_data = df_data[df_data[LABEL_COLUMN].isin(LABELS)]
df_data[TEXT_COLUMN] = df_data[TEXT_COLUMN].astype(str).str.strip()
df_data = df_data[df_data[TEXT_COLUMN].str.len() > 0]


print(f"\nDataFrame shape after cleaning: {df_data.shape}")
if df_data.empty:
    print("Error: No data left after cleaning. Check your CSV and column names.")
    exit()

df_data['labels'] = df_data[LABEL_COLUMN].map(label_to_id)
if df_data['labels'].isnull().any():
    print("Error: Some labels in your CSV did not match the defined LABELS. Check for typos.")
    print("Unique labels in CSV:", df_data[LABEL_COLUMN].unique())
    print("Defined LABELS:", LABELS)
    exit()

train_df, eval_df = train_test_split(df_data, test_size=0.2, random_state=42, stratify=df_data['labels'])

train_dataset = Dataset.from_pandas(train_df[[TEXT_COLUMN, 'labels']])
eval_dataset = Dataset.from_pandas(eval_df[[TEXT_COLUMN, 'labels']])

print(f"\nTraining data samples: {len(train_dataset)}")
print(f"Evaluation data samples: {len(eval_dataset)}")
print("First training example:", train_dataset[0])

Loading data from: dataset/Combined Data 2.csv
Original DataFrame head:
       Unnamed: 0                                          statement  \
22261       22261  Just as the the title says. I feel like one is...   
41400       41400  a blackened sky encroached tugging behind it m...   
20065       20065  It gives you insomnia, which in turn makes you...   
30036       30036  Hello all, I'm a new submitter to this channel...   
780           780                   Thank God the CB is over for Eid   

           status  
22261  Depression  
41400  Depression  
20065  Depression  
30036      Normal  
780        Normal  
Original DataFrame shape: (10000, 3)

DataFrame shape after cleaning: (9919, 3)

Training data samples: 7935
Evaluation data samples: 1984
First training example: {'statement': 'i like this restaurant because they give you free bread.', 'labels': 4, '__index_level_0__': 32571}


## Tokenizer and Model for Fine-tuning


In [11]:
print(f"\nLoading tokenizer and model '{MODEL}' for fine-tuning...")
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL,
    num_labels=NUM_LABELS,
    id2label=id_to_label,
    label2id=label_to_id
)

def tokenize_function(examples):
    return tokenizer(examples[TEXT_COLUMN], truncation=True, padding="max_length", max_length=512)

tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_eval_dataset = eval_dataset.map(tokenize_function, batched=True)

tokenized_train_dataset.set_format("torch", columns=['input_ids', 'attention_mask', 'labels'])
tokenized_eval_dataset.set_format("torch", columns=['input_ids', 'attention_mask', 'labels'])

if '__index_level_0__' in tokenized_train_dataset.column_names:
    tokenized_train_dataset = tokenized_train_dataset.remove_columns(['__index_level_0__'])
if '__index_level_0__' in tokenized_eval_dataset.column_names:
    tokenized_eval_dataset = tokenized_eval_dataset.remove_columns(['__index_level_0__'])



Loading tokenizer and model 'mental/mental-bert-base-uncased' for fine-tuning...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at mental/mental-bert-base-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/7935 [00:00<?, ? examples/s]

Map:   0%|          | 0/1984 [00:00<?, ? examples/s]

## Metrics and Trainer

In [12]:
def compute_metrics(p):
    predictions = np.argmax(p.predictions, axis=1)
    return {"accuracy": accuracy_score(p.label_ids, predictions)}

trainer = Trainer(
    model=model,
    args=TRAINING_ARGS,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

## Training Model

In [13]:
print("\nStarting model training...")
trainer.train()
print("\nTraining complete. Evaluating model on evaluation set...")


Starting model training...


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

## Evaluating Model

In [6]:
eval_results = trainer.evaluate()
print(f"\nEvaluation Results: {eval_results}")

predictions_output = trainer.predict(tokenized_eval_dataset)
y_pred = np.argmax(predictions_output.predictions, axis=1)
y_true = predictions_output.label_ids

print("\nClassification Report:")
print(classification_report(y_true, y_pred, target_names=LABELS))


Evaluation Results: {'eval_loss': 1.0742179155349731, 'eval_accuracy': 0.5757575757575758, 'eval_runtime': 7.2771, 'eval_samples_per_second': 13.604, 'eval_steps_per_second': 1.786, 'epoch': 3.0}

Classification Report:
                      precision    recall  f1-score   support

             Anxiety       0.00      0.00      0.00         5
             Bipolar       0.00      0.00      0.00         5
              Stress       0.00      0.00      0.00         5
          Depression       0.43      0.97      0.59        30
              Normal       0.90      0.90      0.90        31
Personality disorder       0.00      0.00      0.00         2
            Suicidal       0.00      0.00      0.00        21

            accuracy                           0.58        99
           macro avg       0.19      0.27      0.21        99
        weighted avg       0.41      0.58      0.46        99



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Inference with fine tune model

In [7]:
print("\n--- Performing Inference with the Fine-tuned Model ---")

new_texts = [
    "I'm feeling incredibly anxious about my upcoming exam.",
    "My mood swings have been uncontrollable lately, from ecstatic to rock bottom.",
    "I'm exhausted and stressed out with all the deadlines.",
    "I just want to stay in bed all day and not do anything.",
    "Everything feels okay right now, just a normal day.",
    "I sometimes feel like I'm not really myself, or my personality keeps changing.",
    "I don't see any way out of this situation. I'm just done."
]

classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)

print("\nPredictions for new text data:")
for text in new_texts:
    prediction = classifier(text)
    predicted_label = prediction[0]['label']
    score = prediction[0]['score']
    print(f"Text: '{text}'")
    print(f"Predicted Category: {predicted_label} (Score: {score:.4f})")
    print("-" * 50)

Device set to use mps:0



--- Performing Inference with the Fine-tuned Model ---

Predictions for new text data:
Text: 'I'm feeling incredibly anxious about my upcoming exam.'
Predicted Category: Normal (Score: 0.6485)
--------------------------------------------------
Text: 'My mood swings have been uncontrollable lately, from ecstatic to rock bottom.'
Predicted Category: Depression (Score: 0.4047)
--------------------------------------------------
Text: 'I'm exhausted and stressed out with all the deadlines.'
Predicted Category: Normal (Score: 0.4862)
--------------------------------------------------
Text: 'I just want to stay in bed all day and not do anything.'
Predicted Category: Depression (Score: 0.4287)
--------------------------------------------------
Text: 'Everything feels okay right now, just a normal day.'
Predicted Category: Normal (Score: 0.3516)
--------------------------------------------------
Text: 'I sometimes feel like I'm not really myself, or my personality keeps changing.'
Predicted C

## Saving Model

In [8]:
SAVE_FILE_NAME = "./monke"
model.save_pretrained(SAVE_FILE_NAME)
tokenizer.save_pretrained(SAVE_FILE_NAME)
print("\nFine-tuned model saved to " + SAVE_FILE_NAME)



Fine-tuned model saved to ./monke
