In [1]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from datasets import DatasetDict, Dataset
from sklearn.model_selection import train_test_split
from transformers import BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score

from sklearn.preprocessing import LabelEncoder

In [2]:
df = pd.read_csv(r'../driver-intent-classification-dataset.csv', encoding='ISO-8859-1')


In [3]:
df.head()

Unnamed: 0,Text,Intent
0,Roll down the driver's window.,roll down the driver side window
1,Driver's window down.,roll down the driver side window
2,Can you roll down the driver's side window?,roll down the driver side window
3,Lower the driver's window.,roll down the driver side window
4,I'd like the driver's window down.,roll down the driver side window


In [4]:
# I want to remove the intent to turn on the high beams since I think this is a more critical function
# and I want to validate that this works on non-critical functions first
df = df[df['Intent'] != 'turn on high beams']

In [5]:
df['Intent'].unique()

array(['roll down the driver side window', 'turn on the air conditioner',
       'roll down passenger window', 'turn on windshield wipers',
       'no intent'], dtype=object)

In [6]:
le = LabelEncoder()
df['labels'] = le.fit_transform(df['Intent'])

In [7]:
train_df, val_df = train_test_split(df, stratify=df['labels'], test_size=0.2, random_state=42)


In [8]:
train_df

Unnamed: 0,Text,Intent,labels
431,Slide open the passenger window.,roll down passenger window,1
344,Initiate cold mode.,turn on the air conditioner,3
393,Bring on the chill zone.,turn on the air conditioner,3
912,Turn on those wipers.,turn on windshield wipers,4
861,Need the wipers on.,turn on windshield wipers,4
...,...,...,...
1047,Is there a cinema nearby?,no intent,0
323,Can you start the cool flow?,turn on the air conditioner,3
102,Get the driver's window down.,roll down the driver side window,2
1139,Where can I adopt a dog?,no intent,0


In [9]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(examples):
    return tokenizer(examples['Text'], padding='max_length', truncation=True, max_length=512)

In [10]:
train_df_reset = train_df.reset_index(drop=True)
val_df_reset = val_df.reset_index(drop=True)

# Now create Dataset objects
train_dataset = Dataset.from_pandas(train_df_reset)
val_dataset = Dataset.from_pandas(val_df_reset)

# Apply tokenization function to the datasets
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_val_dataset = val_dataset.map(tokenize_function, batched=True)

# Remove the original Text and Intent columns
tokenized_train_dataset = tokenized_train_dataset.remove_columns(['Text', 'Intent'])
tokenized_val_dataset = tokenized_val_dataset.remove_columns(['Text', 'Intent'])



Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [11]:
train_dataset

Dataset({
    features: ['Text', 'Intent', 'labels'],
    num_rows: 800
})

In [12]:
dataset_dict = DatasetDict(train=tokenized_train_dataset, test=tokenized_val_dataset)


In [13]:
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 800
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 200
    })
})

In [14]:
set(dataset_dict['train']['labels'])

{0, 1, 2, 3, 4}

In [15]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
def compute_metrics(p):
    logits, labels = p.predictions, p.label_ids
    preds = logits.argmax(axis=-1)
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc}

In [17]:
training_args = TrainingArguments(
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    output_dir='/results',
    num_train_epochs=3,
    evaluation_strategy="steps",
    save_steps=10,
    save_total_limit=2,
    remove_unused_columns=False,
    run_name='run_name',
    logging_dir='/logs',
    logging_steps=10,
    load_best_model_at_end=True,
)

In [18]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_dict["train"],
    eval_dataset=dataset_dict["test"],
    compute_metrics=compute_metrics, 
)

In [19]:
trainer.train()


Step,Training Loss,Validation Loss,Accuracy
10,1.5663,1.303219,0.59
20,1.1364,0.925005,0.845
30,0.7751,0.513401,0.98
40,0.4823,0.258322,0.995
50,0.2356,0.112884,0.995
60,0.1198,0.04453,1.0
70,0.048,0.038136,0.99
80,0.0209,0.011914,1.0
90,0.0134,0.030722,0.995
100,0.0089,0.022412,0.995


TrainOutput(global_step=300, training_loss=0.14947688460350037, metrics={'train_runtime': 165.7797, 'train_samples_per_second': 14.477, 'train_steps_per_second': 1.81, 'total_flos': 631483541913600.0, 'train_loss': 0.14947688460350037, 'epoch': 3.0})