In [None]:
# !pip install transformers torch scikit-learn pandas tqdm -q

In [6]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.optim import AdamW
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

In [7]:
class SmartHomeDataset(Dataset):
    """Torch dataset pairing voice-command strings with integer intent labels.

    Items are tokenized lazily in ``__getitem__`` and returned as fixed-length
    1-D tensors so the default ``DataLoader`` collation can stack them.

    Args:
        texts: Iterable of command strings (list, numpy array, pandas Series, ...).
        labels: Iterable of integer class ids, aligned one-to-one with ``texts``.
        tokenizer: Object exposing ``encode_plus(text, ...)`` that returns a mapping
            with ``"input_ids"`` and ``"attention_mask"`` tensors.
        max_len: Length every encoded sequence is padded / truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` have different lengths.
    """

    def __init__(self, texts, labels, tokenizer, max_len=32):
        # Materialize into plain lists so that __getitem__ indexes by POSITION.
        # A pandas Series indexes by label, so a slice such as df[train_size:]
        # (whose index starts at train_size) would raise KeyError for idx=0.
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(self.texts)} and {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of (text, label) pairs."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the ``idx``-th command and return model-ready tensors.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (each flattened to 1-D)
            and ``labels`` (scalar ``torch.long`` tensor).
        """
        text = str(self.texts[idx])
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        return {
            # return_tensors="pt" yields a leading batch dim of 1; flatten drops it.
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "labels": torch.tensor(label, dtype=torch.long)
        }

In [11]:
# Load Dataset
# Read the CSV and, in the same pipeline, derive the combined intent label
# "<device>_<state>" that the classifier will be trained to predict.
df = pd.read_csv("../dataset/smart_home_dataset_v3.csv").assign(
    label=lambda frame: frame["device"] + "_" + frame["state"]
)

print("Sample data:\n", df.head())

Sample data:
    temperature  humidity  light_intensity  noise_level  pir_motion  \
0        39.90     58.62            27.51        75.78           1   
1        23.09     66.01            38.71        38.57           0   
2        21.88     42.63            91.31        45.39           0   
3        32.46     48.93            16.82        60.76           0   
4        30.91     82.56            43.90        23.37           1   

   person_count door_state window_state  co2_level  gas_leak  smoke_detected  \
0             0     closed         open    1569.88         0               0   
1             1       open         open    1270.13         0               1   
2             3       open       closed    1558.44         0               1   
3             1     closed       closed     907.23         0               0   
4             0       open       closed    1155.96         0               0   

   rain_detected            voice_command room_type time_of_day  \
0              0 

In [13]:
# Rebuild the combined "<device>_<state>" label so this cell is self-contained.
df["label"] = df["device"] + "_" + df["state"]

# Fit the encoder on the label strings, then map every row to its numeric id.
# `le` is kept around so predictions can be decoded back to strings later.
le = LabelEncoder().fit(df["label"])
df["label_id"] = le.transform(df["label"])

# Preview the text -> label -> id mapping for the first few rows.
print(df.head()[["voice_command", "label", "label_id"]])

             voice_command             label  label_id
0  check the phone_charger  phone_charger_ON        30
1         start the heater         heater_ON        22
2       turn on the heater         heater_ON        22
3         open the curtain        curtain_ON        10
4         start the window         window_ON        38


In [15]:
# Tokenization
# Load the tokenizer published with the "distilbert-base-uncased" checkpoint.
# The model-setup cell loads the same checkpoint name, so the token ids produced
# here match the embedding table of the model being fine-tuned.
# NOTE(review): on a fresh cache this presumably downloads the tokenizer files
# (the captured progress bars suggest so) — needs network access on first run.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [18]:
# Number of output classes = number of distinct label strings the encoder saw.
# `classes_` is the fitted encoder's 1-D array of unique labels.
num_labels = le.classes_.size
print(f"Total unique intents: {num_labels}")

Total unique intents: 39


In [19]:
# Split dataset: first 90% of rows for training, remaining 10% for testing.
# NOTE(review): the split is positional (no shuffling); if the CSV rows are
# ordered in any meaningful way the test set may not be representative — confirm.
train_size = int(0.9 * len(df))

# reset_index(drop=True) gives each split its own 0..n-1 index. The DataLoader
# sampler hands out positions 0..n-1, and without the reset the test slice keeps
# index labels starting at `train_size`, so integer lookups on its columns miss.
train_df = df.iloc[:train_size].reset_index(drop=True)
test_df = df.iloc[train_size:].reset_index(drop=True)

train_data = SmartHomeDataset(train_df["voice_command"], train_df["label_id"], tokenizer)
test_data = SmartHomeDataset(test_df["voice_command"], test_df["label_id"], tokenizer)

# Shuffle only the training batches; keep evaluation order deterministic.
train_loader = DataLoader(train_data, batch_size=16, shuffle=True)
test_loader = DataLoader(test_data, batch_size=16)

In [20]:
# Model Setup
# Use the GPU when one is visible to torch, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Pretrained DistilBERT encoder with a classification head sized to our intents.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=num_labels,
)
model.to(device)

optimizer = AdamW(model.parameters(), lr=5e-5)

# Training Loop
EPOCHS = 3
model.train()  # enable dropout etc.; set once because no eval happens in-loop

for epoch in range(EPOCHS):
    total_loss = 0
    progress = tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}")
    for batch in progress:
        optimizer.zero_grad()

        # Passing `labels` makes the model compute the classification loss itself.
        loss = model(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
            labels=batch["labels"].to(device),
        ).loss

        # Read the scalar once; reuse it for the running total and the progress bar.
        step_loss = loss.item()
        total_loss += step_loss

        loss.backward()
        optimizer.step()

        progress.set_postfix({"loss": step_loss})

    # Mean of the per-batch losses over the epoch.
    print(f"Average Loss for Epoch {epoch+1}: {total_loss / len(train_loader):.4f}")

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/3: 100%|███████████████████| 282/282 [00:41<00:00,  6.82it/s, loss=0.119]


Average Loss for Epoch 1: 1.1013


Epoch 2/3: 100%|██████████████████| 282/282 [00:40<00:00,  6.92it/s, loss=0.0297]


Average Loss for Epoch 2: 0.0452


Epoch 3/3: 100%|██████████████████| 282/282 [00:36<00:00,  7.66it/s, loss=0.0135]

Average Loss for Epoch 3: 0.0146





In [24]:
# Give both splits a fresh 0..n-1 index so positional lookups by the dataset
# line up with the index labels of each frame.
train_df, test_df = (part.reset_index(drop=True) for part in (train_df, test_df))

# Rebuild the datasets on top of the re-indexed frames.
train_data = SmartHomeDataset(
    texts=train_df["voice_command"],
    labels=train_df["label_id"],
    tokenizer=tokenizer,
)
test_data = SmartHomeDataset(
    texts=test_df["voice_command"],
    labels=test_df["label_id"],
    tokenizer=tokenizer,
)

# Training batches are shuffled; evaluation batches keep their original order.
train_loader = DataLoader(dataset=train_data, batch_size=16, shuffle=True)
test_loader = DataLoader(dataset=test_data, batch_size=16, shuffle=False)

In [25]:
# Evaluation
# Switch to inference behaviour and count exact-match predictions on the test set.
model.eval()
correct = 0
total = 0

with torch.no_grad():  # no gradients needed for scoring
    for batch in test_loader:
        labels = batch["labels"].to(device)

        # No `labels` kwarg here: we only want logits, not a loss.
        logits = model(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
        ).logits

        # Highest-scoring class per row, compared element-wise with the targets.
        hits = logits.argmax(dim=1).eq(labels)
        correct += int(hits.sum())
        total += labels.shape[0]

print(f"Test Accuracy: {correct / total * 100:.2f}%")

Test Accuracy: 100.00%


In [28]:
# Inference Function
def generate_reply(command):
    """Classify a voice command and phrase the predicted action as a reply.

    Relies on the notebook-level ``model``, ``tokenizer``, ``le`` (fitted label
    encoder) and ``device`` objects.

    Args:
        command: The user's command as a string.

    Returns:
        A sentence of the form ``"Okay, turning <state> the <device>."`` where
        ``<state>`` is lower-cased and underscores in the device name are
        replaced by spaces.
    """
    model.eval()
    inputs = tokenizer(command, return_tensors="pt", padding=True, truncation=True, max_length=32).to(device)

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        outputs = model(**inputs)

    pred_label = torch.argmax(outputs.logits, dim=1).item()
    decoded = str(le.inverse_transform([pred_label])[0])

    # Labels are "<device>_<state>" and the device itself may contain underscores
    # (e.g. "phone_charger_ON"), so split on the LAST underscore only. A plain
    # split("_") yields three parts for such labels and fails to unpack.
    device_name, state = decoded.rsplit("_", 1)
    return f"Okay, turning {state.lower()} the {device_name.replace('_', ' ')}."

In [29]:
# Test Predictions
# Run a handful of sample commands through the classifier and show each reply.
for sample_command in (
    "Turn on the light",
    "It's too hot",
    "Switch off the fan",
    "Start the geyser",
    "Close the door",
):
    print(generate_reply(sample_command))

Okay, turning on the light.
Okay, turning on the refrigerator.
Okay, turning off the fan.
Okay, turning on the geyser.
Okay, turning off the door.
