In [4]:
!pip install --upgrade transformers datasets


Collecting transformers
  Downloading transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading transformers-4.52.4-py3-none-any.whl (10.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.5/10.5 MB[0m [31m80.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m31.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, transformers, datasets
  Attempting uninstall: fsspec
    Found existing installation: 

In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Load all files into one DataFrame
csvs = ["deployments.csv", "device_info.csv", "property.csv", "device_category.csv", "locations.csv"]
df = pd.concat([pd.read_csv(f) for f in csvs], ignore_index=True)

# Encode string labels into integers
label_encoder = LabelEncoder()
df["label_id"] = label_encoder.fit_transform(df["label"])

# Split dataset
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["question"].tolist(), df["label_id"].tolist(), test_size=0.2, random_state=42
)


In [2]:
from transformers import BertTokenizer
from datasets import Dataset

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize(batch):
    return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=64)

# Create HF Dataset objects
train_dataset = Dataset.from_dict({"text": train_texts, "label": train_labels}).map(tokenize, batched=True)
val_dataset = Dataset.from_dict({"text": val_texts, "label": val_labels}).map(tokenize, batched=True)

# Format for PyTorch
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/1200 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

In [5]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score
import os
os.environ["WANDB_DISABLED"] = "true"
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(label_encoder.classes_))

# Evaluation metrics
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, average="weighted")
    }

# Training settings
training_args = TrainingArguments(
    output_dir="./bert_query_classifier",
    eval_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=4,
    weight_decay=0.01,
    logging_dir="./logs",
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.1985,0.002662,1.0,1.0
2,0.0023,0.001076,1.0,1.0
3,0.0013,0.000736,1.0,1.0
4,0.001,0.000655,1.0,1.0


TrainOutput(global_step=600, training_loss=0.0507479353249073, metrics={'train_runtime': 3822.5495, 'train_samples_per_second': 1.256, 'train_steps_per_second': 0.157, 'total_flos': 157868050636800.0, 'train_loss': 0.0507479353249073, 'epoch': 4.0})

In [7]:
# Save model and tokenizer
model.save_pretrained("bert_query_classifier")
tokenizer.save_pretrained("bert_query_classifier")

# Save label encoder
import pickle
with open("label_encoder.pkl", "wb") as f:
    pickle.dump(label_encoder, f)


In [8]:
from google.colab import files

# Create a zip file
!zip -r bert_query_classifier.zip bert_query_classifier
files.download("bert_query_classifier.zip")
files.download("label_encoder.pkl")


  adding: bert_query_classifier/ (stored 0%)
  adding: bert_query_classifier/tokenizer_config.json (deflated 75%)
  adding: bert_query_classifier/model.safetensors (deflated 7%)
  adding: bert_query_classifier/checkpoint-600/ (stored 0%)
  adding: bert_query_classifier/checkpoint-600/tokenizer_config.json (deflated 75%)
  adding: bert_query_classifier/checkpoint-600/model.safetensors (deflated 7%)
  adding: bert_query_classifier/checkpoint-600/training_args.bin (deflated 52%)
  adding: bert_query_classifier/checkpoint-600/optimizer.pt (deflated 28%)
  adding: bert_query_classifier/checkpoint-600/scheduler.pt (deflated 56%)
  adding: bert_query_classifier/checkpoint-600/config.json (deflated 51%)
  adding: bert_query_classifier/checkpoint-600/trainer_state.json (deflated 72%)
  adding: bert_query_classifier/checkpoint-600/special_tokens_map.json (deflated 42%)
  adding: bert_query_classifier/checkpoint-600/rng_state.pth (deflated 24%)
  adding: bert_query_classifier/checkpoint-600/vocab

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [20]:
from google.colab import drive
drive.mount('/content/drive')

!cp bert_query_classifier.zip /content/drive/MyDrive/

Mounted at /content/drive


In [18]:
def predict_label(query):
    inputs = tokenizer(query, return_tensors="pt", truncation=True, padding="max_length", max_length=64)
    outputs = model(**inputs)
    pred_id = outputs.logits.argmax(-1).item()
    return label_encoder.inverse_transform([pred_id])[0]

# Test it
predict_label("How did the turbidity vary at CBYIP.E1 on June 1, 2022?")


'observation_query'