In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from transformers import pipeline

# Candidate base checkpoints:
#   https://huggingface.co/mental/mental-roberta-base
#   https://huggingface.co/mental/mental-bert-base-uncased  <-- the one used here (see MODEL)

# Select matplotlib's 'ggplot' style sheet for figures drawn after this point.
# NOTE(review): none of the visible cells draws a plot - confirm this is still needed.
plt.style.use('ggplot')

# Hugging Face checkpoint that gets fine-tuned.
MODEL = "mental/mental-bert-base-uncased"

# Optional down-sampling for quick experiments: uncomment the assignment below
# together with the commented-out `.sample(SampleSize)` line in the
# "Loading and Preparing Data" cell.
#  SampleSize = 10000

# Location of the labelled CSV and the columns holding the text and its class.
DATA_CSV_PATH = 'dataset/Combined Data 2.csv'
TEXT_COLUMN = 'statement'
LABEL_COLUMN = 'status'

# Class names; a label's position in this list is its integer id.
LABELS = [
    'Anxiety',
    'Bipolar',
    'Stress',
    'Depression',
    'Normal',
    'Personality disorder',
    'Suicidal',
]
NUM_LABELS = len(LABELS)

# Two-way lookup between class names and integer ids.
id_to_label = dict(enumerate(LABELS))
label_to_id = {name: idx for idx, name in id_to_label.items()}

# Keyword arguments for TrainingArguments, grouped by concern.
_training_arg_values = {
    # Output locations.
    "output_dir": "./results",
    "logging_dir": "./logs",
    # Optimisation schedule.
    "num_train_epochs": 3,
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 8,
    "warmup_steps": 500,
    "weight_decay": 0.01,
    # Logging / evaluation / checkpoint cadence.
    "logging_steps": 50,
    "eval_strategy": "epoch",
    "save_strategy": "epoch",
    # Best-checkpoint selection; "accuracy" is the key returned by compute_metrics.
    "load_best_model_at_end": True,
    "metric_for_best_model": "accuracy",
    "report_to": "none",
}

TRAINING_ARGS = TrainingArguments(**_training_arg_values)

## Loading and Preparing Data


In [None]:
# Load the labelled CSV, clean it, attach integer class ids and build the
# stratified train / evaluation datasets used by the later cells.
print(f"Loading data from: {DATA_CSV_PATH}")
df_data = pd.read_csv(DATA_CSV_PATH)
# UNCOMMENT TO SET SPECIFIED SAMPLE SIZE
# df_data = df_data.sample(SampleSize)
print("Original DataFrame head:")
print(df_data.head())
print(f"Original DataFrame shape: {df_data.shape}")

# Drop rows with a missing text or label (re-assign instead of mutating in place).
df_data = df_data.dropna(subset=[TEXT_COLUMN, LABEL_COLUMN])

# Rows whose label is not in LABELS cannot be mapped to a class id. Report them
# before discarding so typos or unexpected categories in the CSV are visible
# instead of silently shrinking the dataset.
unknown_label_mask = ~df_data[LABEL_COLUMN].isin(LABELS)
if unknown_label_mask.any():
    print(f"\nWarning: dropping {int(unknown_label_mask.sum())} rows whose label is not in LABELS:")
    print(df_data.loc[unknown_label_mask, LABEL_COLUMN].value_counts())

# .copy() yields an independent frame, so the column assignments below do not
# raise pandas' SettingWithCopyWarning.
df_data = df_data.loc[~unknown_label_mask].copy()
df_data[TEXT_COLUMN] = df_data[TEXT_COLUMN].astype(str).str.strip()
df_data = df_data.loc[df_data[TEXT_COLUMN].str.len() > 0].copy()

print(f"\nDataFrame shape after cleaning: {df_data.shape}")
if df_data.empty:
    # Raise instead of exit(): in a notebook exit() shuts the kernel down, whereas
    # an exception stops the run and shows the reason in the cell output.
    raise ValueError("No data left after cleaning. Check your CSV and column names.")

df_data['labels'] = df_data[LABEL_COLUMN].map(label_to_id)
if df_data['labels'].isnull().any():
    # Defensive guard: only reachable if label_to_id and LABELS disagree.
    raise ValueError(
        "Some labels in your CSV did not match the defined LABELS. Check for typos. "
        f"Unique labels in CSV: {df_data[LABEL_COLUMN].unique().tolist()}; "
        f"Defined LABELS: {LABELS}"
    )
df_data['labels'] = df_data['labels'].astype("int64")

# Stratify on the class id so both splits keep the same class proportions.
train_df, eval_df = train_test_split(df_data, test_size=0.2, random_state=42, stratify=df_data['labels'])

train_dataset = Dataset.from_pandas(train_df[[TEXT_COLUMN, 'labels']])
eval_dataset = Dataset.from_pandas(eval_df[[TEXT_COLUMN, 'labels']])

print(f"\nTraining data samples: {len(train_dataset)}")
print(f"Evaluation data samples: {len(eval_dataset)}")
print("First training example:", train_dataset[0])

## Tokenizer and Model for Fine-tuning


In [None]:
print(f"\nLoading tokenizer and model '{MODEL}' for fine-tuning...")

tokenizer = AutoTokenizer.from_pretrained(MODEL)

# The classification head is sized to LABELS, and both id<->name maps are passed
# to the model alongside it.
_classifier_head_config = {
    "num_labels": NUM_LABELS,
    "id2label": id_to_label,
    "label2id": label_to_id,
}
model = AutoModelForSequenceClassification.from_pretrained(MODEL, **_classifier_head_config)

def tokenize_function(examples):
    """Tokenize one batch of examples for the classifier.

    Args:
        examples: Batch passed in by ``Dataset.map(..., batched=True)``; the
            texts are read from ``examples[TEXT_COLUMN]``.

    Returns:
        The tokenizer output for the batch, truncated to at most 512 tokens
        per text and left unpadded.
    """
    # Deliberately no padding="max_length": padding every example to 512 tokens
    # up front makes every batch full length no matter how short the texts are.
    # The Trainer below receives the tokenizer, so its default collator pads each
    # batch only to that batch's longest sequence.
    return tokenizer(examples[TEXT_COLUMN], truncation=True, max_length=512)

def _tokenize_split(split):
    """Tokenize one dataset split and expose only the model inputs as torch tensors.

    Applies ``tokenize_function`` in batches, restricts the returned columns to
    ``input_ids`` / ``attention_mask`` / ``labels`` in torch format, and removes
    the ``__index_level_0__`` column when the split has one.
    """
    tokenized = split.map(tokenize_function, batched=True)
    tokenized.set_format("torch", columns=['input_ids', 'attention_mask', 'labels'])
    if '__index_level_0__' in tokenized.column_names:
        tokenized = tokenized.remove_columns(['__index_level_0__'])
    return tokenized


tokenized_train_dataset = _tokenize_split(train_dataset)
tokenized_eval_dataset = _tokenize_split(eval_dataset)


## Metrics and Trainer

In [None]:
def compute_metrics(p):
    """Compute the accuracy of one evaluation pass.

    Args:
        p: Object exposing ``predictions`` (per-class scores; the highest-scoring
            class along axis 1 is taken as the prediction) and ``label_ids``
            (the true class ids).

    Returns:
        A dict with the single key ``"accuracy"``.
    """
    predicted_ids = np.argmax(p.predictions, axis=1)
    accuracy = accuracy_score(p.label_ids, predicted_ids)
    return {"accuracy": accuracy}

# Everything the Trainer needs: the model and its hyper-parameters, the two
# tokenized splits, the tokenizer, and the metric callback.
_trainer_components = {
    "model": model,
    "args": TRAINING_ARGS,
    "train_dataset": tokenized_train_dataset,
    "eval_dataset": tokenized_eval_dataset,
    "processing_class": tokenizer,
    "compute_metrics": compute_metrics,
}
trainer = Trainer(**_trainer_components)

## Training Model

In [None]:
# Run fine-tuning with the configuration held by `trainer`.
print("\nStarting model training...")
train_output = trainer.train()  # return value kept for later inspection
# The evaluation itself happens in the next cell.
print("\nTraining complete. Evaluating model on evaluation set...")

## Evaluating Model

In [None]:
# Aggregate metrics on the evaluation split.
eval_results = trainer.evaluate()
print(f"\nEvaluation Results: {eval_results}")

# Per-example predictions, needed for the per-class breakdown below.
predictions_output = trainer.predict(tokenized_eval_dataset)
y_pred = np.argmax(predictions_output.predictions, axis=1)
y_true = predictions_output.label_ids

print("\nClassification Report:")
# Passing labels explicitly ties each id to its name in LABELS. Without it the
# report infers the classes from y_true / y_pred and raises when a class is
# absent from both, because target_names then has the wrong length.
# zero_division=0 reports 0 for classes that were never predicted.
print(
    classification_report(
        y_true,
        y_pred,
        labels=list(range(NUM_LABELS)),
        target_names=LABELS,
        zero_division=0,
    )
)

## Inference with the Fine-tuned Model

In [None]:
print("\n--- Performing Inference with the Fine-tuned Model ---")

# One hand-written example per entry in LABELS.
new_texts = [
    "I'm feeling incredibly anxious about my upcoming exam.",
    "My mood swings have been uncontrollable lately, from ecstatic to rock bottom.",
    "I'm exhausted and stressed out with all the deadlines.",
    "I just want to stay in bed all day and not do anything.",
    "Everything feels okay right now, just a normal day.",
    "I sometimes feel like I'm not really myself, or my personality keeps changing.",
    "I don't see any way out of this situation. I'm just done."
]

# Wrap the in-memory fine-tuned model and its tokenizer in a classification pipeline.
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)

separator = "-" * 50
print("\nPredictions for new text data:")
for text in new_texts:
    # The first entry of the pipeline result holds the label and score printed below.
    top_prediction = classifier(text)[0]
    print(f"Text: '{text}'")
    print(f"Predicted Category: {top_prediction['label']} (Score: {top_prediction['score']:.4f})")
    print(separator)

## Saving Model

In [None]:
# Directory that receives both the fine-tuned model and its tokenizer.
SAVE_FILE_NAME = "./monke"

# Model first, then tokenizer, both into the same directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained(SAVE_FILE_NAME)

print(f"\nFine-tuned model saved to {SAVE_FILE_NAME}")
