In [1]:
# Colab-only step: mount Google Drive at /content/drive so the project
# folder referenced by later cells is reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
!pip -q install transformers datasets accelerate scikit-learn


In [3]:
import os
import numpy as np
import pandas as pd

# --- Project layout on the mounted Drive ---
PROJECT_ROOT = "/content/drive/MyDrive/Autism_Therapist_Collaboration_AI"
SPLITS_DIR = os.path.join(PROJECT_ROOT, "01_datasets/splits")
MODEL_DIR = os.path.join(PROJECT_ROOT, "03_models/issue_classifier")

# Create the model output directory if it is not there yet.
os.makedirs(MODEL_DIR, exist_ok=True)

# CSV locations of the three caregiver splits, in train / val / test order.
train_path, val_path, test_path = (
    os.path.join(SPLITS_DIR, csv_name)
    for csv_name in ("caregiver_train.csv", "caregiver_val.csv", "caregiver_test.csv")
)

# Report whether each split file actually exists before trying to read it.
for tag, split_path in (("Train:", train_path), ("Val  :", val_path), ("Test :", test_path)):
    print(tag, split_path, "->", os.path.exists(split_path))


Train: /content/drive/MyDrive/Autism_Therapist_Collaboration_AI/01_datasets/splits/caregiver_train.csv -> True
Val  : /content/drive/MyDrive/Autism_Therapist_Collaboration_AI/01_datasets/splits/caregiver_val.csv -> True
Test : /content/drive/MyDrive/Autism_Therapist_Collaboration_AI/01_datasets/splits/caregiver_test.csv -> True


In [4]:
# Read the three caregiver splits (train, val, test) into DataFrames.
df_train, df_val, df_test = map(pd.read_csv, (train_path, val_path, test_path))

# Show (rows, columns) of each split, then preview the training data.
print(*(frame.shape for frame in (df_train, df_val, df_test)))
df_train.head()


(7000, 10) (1500, 10) (1500, 10)


Unnamed: 0,message_id,caregiver_message,base_message_en,language,child_age_group,timestamp,issue_category,urgency_level,risk_flag,sentiment
0,MSG007146,my child this week at the supermarket has been...,my child this week at the supermarket has been...,en,8-12,2025-04-16,routine_change,low,no,negative
1,MSG005097,my daughter for the past 3 days at school has ...,my daughter for the past 3 days at school has ...,en,2-4,2025-02-26,sleep_issue,low,no,neutral
2,MSG007846,my son since yesterday at school has been is w...,my son since yesterday at school has been is w...,en,13-17,2025-02-19,sleep_issue,low,no,negative
3,MSG002595,my son over the last 2 weeks at the playground...,my son over the last 2 weeks at the playground...,en,13-17,2025-09-20,routine_change,low,no,neutral
4,MSG007089,என் மகள்: my daughter today at the supermarket...,my daughter today at the supermarket has been ...,ta,13-17,2025-06-25,health_concern,medium,no,neutral


In [5]:
# Input text column and target column used by the classifier.
TEXT_COL = "base_message_en"
LABEL_COL = "issue_category"

# Keep only the text and label columns in every split and drop rows
# where either of them is missing.
df_train, df_val, df_test = (
    frame[[TEXT_COL, LABEL_COL]].dropna()
    for frame in (df_train, df_val, df_test)
)

print("✅ Columns used:", TEXT_COL, LABEL_COL)
print("Unique labels:", sorted(df_train[LABEL_COL].unique()))


✅ Columns used: base_message_en issue_category
Unique labels: ['aggression', 'anxiety_meltdown', 'daily_progress', 'feeding_issue', 'health_concern', 'regression_social', 'regression_speech', 'repetitive_behavior', 'routine_change', 'school_concern', 'self_injury', 'sensory_overload', 'sleep_issue']


In [6]:
# Build a stable label <-> id mapping from the training split; ids follow the
# alphabetical order of the category names.
labels = sorted(df_train[LABEL_COL].unique())
label2id = {lab: i for i, lab in enumerate(labels)}
id2label = {i: lab for lab, i in label2id.items()}

# Encode every split with the training-derived mapping. `Series.map` turns any
# category that is missing from `label2id` into NaN (and the column into float),
# so fail loudly here instead of passing NaN/float labels on to training.
for split_name, frame in (("train", df_train), ("val", df_val), ("test", df_test)):
    unseen = sorted(set(frame[LABEL_COL]) - set(label2id))
    if unseen:
        raise ValueError(
            f"{split_name} split contains labels not present in training data: {unseen}"
        )
    # Explicit integer dtype: class ids must be integers for classification.
    frame["label"] = frame[LABEL_COL].map(label2id).astype("int64")

df_train.head()


Unnamed: 0,base_message_en,issue_category,label
0,my child this week at the supermarket has been...,routine_change,8
1,my daughter for the past 3 days at school has ...,sleep_issue,12
2,my son since yesterday at school has been is w...,sleep_issue,12
3,my son over the last 2 weeks at the playground...,routine_change,8
4,my daughter today at the supermarket has been ...,health_concern,4


In [7]:
from datasets import Dataset

# Convert each split to a Hugging Face Dataset holding only the text and the
# integer label; the pandas index is not carried over as a column.
train_ds, val_ds, test_ds = (
    Dataset.from_pandas(frame[[TEXT_COL, "label"]], preserve_index=False)
    for frame in (df_train, df_val, df_test)
)

# Display the three datasets as the cell output.
(train_ds, val_ds, test_ds)


(Dataset({
     features: ['base_message_en', 'label'],
     num_rows: 7000
 }),
 Dataset({
     features: ['base_message_en', 'label'],
     num_rows: 1500
 }),
 Dataset({
     features: ['base_message_en', 'label'],
     num_rows: 1500
 }))

In [8]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

MODEL_NAME = "distilbert-base-uncased"  # change to "bert-base-uncased" if you want

# Tokenizer that matches the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Sequence-classification model with one output per issue category; the
# id <-> label mappings are passed into the model config.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
def tokenize_batch(batch):
    """Tokenize the text column of a batch.

    Each text in ``batch[TEXT_COL]`` is truncated and padded to exactly
    128 tokens, and the tokenizer's output is returned unchanged.
    """
    texts = batch[TEXT_COL]
    encoded = tokenizer(texts, truncation=True, padding="max_length", max_length=128)
    return encoded

# Tokenize all three splits in batches (train, val, test order).
train_tok, val_tok, test_tok = (
    split.map(tokenize_batch, batched=True)
    for split in (train_ds, val_ds, test_ds)
)

# Expose only the model inputs and the label, as torch tensors.
cols_to_keep = ["input_ids", "attention_mask", "label"]
for tokenized in (train_tok, val_tok, test_tok):
    tokenized.set_format(type="torch", columns=cols_to_keep)


Map:   0%|          | 0/7000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [10]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation pass.

    Args:
        eval_pred: A pair ``(logits, labels)``; the predicted class is the
            argmax of the logits over the last axis.

    Returns:
        dict with keys ``accuracy``, ``precision_weighted``,
        ``recall_weighted`` and ``f1_weighted``.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)

    # zero_division=0 makes classes with no predicted samples score 0
    # instead of emitting a warning.
    precision, recall, f_score, _ = precision_recall_fscore_support(
        gold, predicted, average="weighted", zero_division=0
    )

    return dict(
        accuracy=accuracy_score(gold, predicted),
        precision_weighted=precision,
        recall_weighted=recall,
        f1_weighted=f_score,
    )


In [11]:
from transformers import TrainingArguments, Trainer

# Training configuration: evaluate and checkpoint once per epoch, and reload
# the checkpoint with the best weighted F1 when training finishes.
training_args = TrainingArguments(
    output_dir=os.path.join(MODEL_DIR, "checkpoints"),

    # Evaluation and save schedules are kept identical ("epoch") so the best
    # checkpoint can be restored by `load_best_model_at_end`.
    eval_strategy="epoch",
    save_strategy="epoch",

    # Log the training loss every 100 optimizer steps.
    logging_strategy="steps",
    logging_steps=100,

    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,

    load_best_model_at_end=True,
    metric_for_best_model="f1_weighted",
    # Higher F1 is better; stated explicitly rather than relying on the default.
    greater_is_better=True,

    # Keep at most two checkpoints on Drive; disable external experiment loggers.
    save_total_limit=2,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=val_tok,
    # `processing_class` replaces the deprecated `tokenizer` argument of
    # Trainer; passing `tokenizer=` is what triggered the warning on this cell.
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)


  trainer = Trainer(


In [12]:
# Run fine-tuning with the configuration in `training_args`; the returned
# TrainOutput is shown as the cell result.
# NOTE(review): validation accuracy/F1 are exactly 1.0 from the first epoch —
# verify that train/val/test do not share near-duplicate (templated) messages
# before trusting these numbers.
trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy,Precision Weighted,Recall Weighted,F1 Weighted
1,0.027,0.012608,1.0,1.0,1.0,1.0
2,0.0081,0.004491,1.0,1.0,1.0,1.0
3,0.0051,0.003385,1.0,1.0,1.0,1.0


TrainOutput(global_step=1314, training_loss=0.20564494742423614, metrics={'train_runtime': 243.3084, 'train_samples_per_second': 86.31, 'train_steps_per_second': 5.401, 'total_flos': 695590269696000.0, 'train_loss': 0.20564494742423614, 'epoch': 3.0})