<a href="https://colab.research.google.com/github/huangd2/huggingface/blob/main/Transformer_BERK.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# 🧼 Disable wandb logging
import os
os.environ["WANDB_DISABLED"] = "true"
!pip uninstall -y transformers
!pip install -U transformers


Found existing installation: transformers 4.53.2
Uninstalling transformers-4.53.2:
  Successfully uninstalled transformers-4.53.2
Collecting transformers
  Downloading transformers-4.53.2-py3-none-any.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.53.2-py3-none-any.whl (10.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m29.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
Successfully installed transformers-4.53.2


In [2]:
import transformers
print(transformers.__version__)


4.53.2


In [3]:
# ✅ Install dependencies
!pip install -q transformers datasets evaluate

# 📚 Imports
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import torch
import numpy as np
import evaluate

# 🔧 Create toy dataset
data = {
    "text": [
        "I forgot what I ate this morning.",
        "I completed all my tasks today.",
        "I can't remember names like I used to.",
        "I went to the store and bought everything I needed.",
        "I often lose track of time.",
        "I read a book and wrote down some notes.",
        "I misplace my phone multiple times a day.",
        "I handled all my appointments smoothly.",
        "I got lost on my way to a familiar place.",
        "I followed my daily routine without issue."
    ],
    "label": [1, 0, 1, 0, 1, 0, 1, 0, 1, 0]  # 1: Impaired, 0: Normal
}
dataset = Dataset.from_dict(data).train_test_split(test_size=0.3)



[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [5]:
# ✅ Check if the sentence is in the training split
train_texts = [x['text'] for x in dataset['train']]
print("In training set?", "I forgot what I ate this morning." in train_texts)
print(train_texts)

In training set? True
['I often lose track of time.', 'I forgot what I ate this morning.', "I can't remember names like I used to.", 'I handled all my appointments smoothly.', 'I got lost on my way to a familiar place.', 'I followed my daily routine without issue.', 'I completed all my tasks today.']


In [6]:



# 🔡 Tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
def tokenize(example):
    return tokenizer(example["text"], truncation=True, padding="max_length")

tokenized = dataset.map(tokenize)

# 🧠 Model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# ⚙️ TrainingArguments — MINIMAL VERSION
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=4
)

# 🏃 Train
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"]
)
trainer.train()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Map:   0%|          | 0/7 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss


TrainOutput(global_step=6, training_loss=0.6159463326136271, metrics={'train_runtime': 167.1166, 'train_samples_per_second': 0.126, 'train_steps_per_second': 0.036, 'total_flos': 5525332162560.0, 'train_loss': 0.6159463326136271, 'epoch': 3.0})

In [7]:
def predict(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=128)
    outputs = model(**inputs)
    predicted = torch.argmax(outputs.logits, dim=1).item()
    return "Cognitive Impairment" if predicted == 1 else "Normal"

print(predict("I forgot what I ate this morning."))
print(predict("I can't remember faces like I used to"))


Cognitive Impairment
Cognitive Impairment
