In [1]:
# train_detector_fast_fixed.py
# pip install transformers datasets torch scikit-learn accelerate

import numpy as np
import torch
import sklearn.metrics as metrics
from datasets import load_dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)

# -------------------------------
# ‚öôÔ∏è CONFIGURATION
# -------------------------------
MODEL_NAME = "distilbert-base-uncased"   # Lightweight BERT
NUM_LABELS = 2                           # 0 = Non-suicidal, 1 = Suicidal
FAST_MODE = True                         # Quick test mode
TEST_SIZE = 0.2                          # 80% train, 20% test

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"üß† Using device: {device}")

# -------------------------------
# 1Ô∏è‚É£ LOAD DATASET
# -------------------------------
print("\nüîπ Loading dataset from Hugging Face...")
ds = load_dataset("cypsiSAS/transformed_Suicidal_ideation")
print("‚úÖ Original splits:", list(ds.keys()))

# Some Hugging Face datasets only have a "train" split
if "train" in ds and len(ds.keys()) == 1:
    print("‚ö†Ô∏è Dataset has no test/validation split ‚Äî creating one manually...")
    ds = ds["train"].train_test_split(test_size=TEST_SIZE, seed=42)
    ds = DatasetDict({
        "train": ds["train"],
        "test": ds["test"]
    })
print("‚úÖ Dataset splits after fix:", list(ds.keys()))

# Detect text column dynamically
sample = ds["train"][0]
text_col = "formatted" if "formatted" in sample else ("text" if "text" in sample else "content")
print(f"üßæ Using text column: '{text_col}'")

# -------------------------------
# 2Ô∏è‚É£ HANDLE LABELS
# -------------------------------
if "labels" not in ds["train"].features and "label" not in ds["train"].features:
    print("‚ö†Ô∏è No labels found ‚Äî creating synthetic ones from keywords...")

    def create_labels(examples):
        suicidal_keywords = [
            "kill myself", "end my life", "commit suicide", "want to die",
            "suicidal", "end it all", "take my own life", "better off dead",
            "not worth living", "want to disappear", "hurt myself", "self harm"
        ]
        labels = []
        for text in examples[text_col]:
            text_lower = text.lower()
            labels.append(1 if any(k in text_lower for k in suicidal_keywords) else 0)
        return {"labels": labels}

    ds = ds.map(create_labels, batched=True)
else:
    label_col = "label" if "label" in ds["train"].features else "labels"
    if label_col != "labels":
        ds = ds.rename_column(label_col, "labels")

# Label distribution
if "labels" in ds["train"].features:
    y = np.array(ds["train"]["labels"])
    print(f"\nüìä Label distribution ‚Äî Non-suicidal (0): {np.sum(y==0)}, Suicidal (1): {np.sum(y==1)}")

# -------------------------------
# 3Ô∏è‚É£ FAST MODE (subset for quick testing)
# -------------------------------
if FAST_MODE:
    print("\n‚ö° Fast Mode: using a small subset for quick training.")
    ds["train"] = ds["train"].select(range(min(50, len(ds["train"]))))
    ds["test"] = ds["test"].select(range(min(10, len(ds["test"]))))

# -------------------------------
# 4Ô∏è‚É£ TOKENIZATION
# -------------------------------
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def preprocess(examples):
    return tokenizer(
        examples[text_col],
        truncation=True,
        padding="max_length",
        max_length=256
    )

print("\nüîπ Tokenizing...")
ds = ds.map(preprocess, batched=True)
print("‚úÖ Tokenization complete.")

format_cols = ["input_ids", "attention_mask"]
if "labels" in ds["train"].features:
    format_cols.append("labels")
ds.set_format(type="torch", columns=format_cols)

# -------------------------------
# 5Ô∏è‚É£ MODEL INITIALIZATION
# -------------------------------
print("\nüîπ Loading model...")
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=NUM_LABELS)
model.to(device)
print("‚úÖ Model loaded successfully.")

# -------------------------------
# 6Ô∏è‚É£ METRIC FUNCTION
# -------------------------------
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    precision = metrics.precision_score(labels, preds, zero_division=0)
    recall = metrics.recall_score(labels, preds, zero_division=0)
    f1 = metrics.f1_score(labels, preds, zero_division=0)
    acc = metrics.accuracy_score(labels, preds)
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}


  from .autonotebook import tqdm as notebook_tqdm


üß† Using device: cpu

üîπ Loading dataset from Hugging Face...
‚úÖ Original splits: ['train']
‚ö†Ô∏è Dataset has no test/validation split ‚Äî creating one manually...
‚úÖ Dataset splits after fix: ['train', 'test']
üßæ Using text column: 'formatted'
‚ö†Ô∏è No labels found ‚Äî creating synthetic ones from keywords...


Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 52928/52928 [00:00<00:00, 206792.91 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 13233/13233 [00:00<00:00, 156182.13 examples/s]



üìä Label distribution ‚Äî Non-suicidal (0): 963, Suicidal (1): 51965

‚ö° Fast Mode: using a small subset for quick training.

üîπ Tokenizing...


Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [00:00<00:00, 5030.23 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [00:00<00:00, 1982.09 examples/s]

‚úÖ Tokenization complete.

üîπ Loading model...



Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


‚úÖ Model loaded successfully.


In [2]:
# -------------------------------
# 7Ô∏è‚É£ TRAINING ARGUMENTS
# -------------------------------
training_args = TrainingArguments(
    output_dir="out_model",
    do_eval=True,
    eval_steps=10,
    save_steps=20,
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=5,
    fp16=torch.cuda.is_available(),
)


In [3]:
# -------------------------------
# 8Ô∏è‚É£ TRAINER SETUP
# -------------------------------
eval_split = "validation" if "validation" in ds else "test"

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds["train"],
    eval_dataset=ds[eval_split],
    compute_metrics=compute_metrics,
)


In [4]:
# -------------------------------
# 9Ô∏è‚É£ TRAIN & EVALUATE
# -------------------------------
print("\nüöÄ Starting training...")
trainer.train()

print("\nüìà Evaluating model on unseen test data...")
results = trainer.evaluate(ds[eval_split])

print("\n‚úÖ Evaluation results:")
print(f"Accuracy : {results['eval_accuracy']*100:.2f}%")
print(f"Precision: {results['eval_precision']*100:.2f}%")
print(f"Recall   : {results['eval_recall']*100:.2f}%")
print(f"F1 Score : {results['eval_f1']*100:.2f}%")



üöÄ Starting training...




Step,Training Loss
5,0.5854
10,0.3833
15,0.2001
20,0.2064
25,0.0721
30,0.0511
35,0.1869





üìà Evaluating model on unseen test data...





‚úÖ Evaluation results:
Accuracy : 90.00%
Precision: 90.00%
Recall   : 100.00%
F1 Score : 94.74%
