In [1]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from datasets import DatasetDict, Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the labelled utterances. The file is decoded as ISO-8859-1 (Latin-1)
# instead of pandas' default UTF-8.
df = pd.read_csv(
    r'../driver-intent-classification-dataset.csv',
    encoding='ISO-8859-1',
)


In [3]:
# Preview the first rows to check that the 'Text' and 'Intent' columns loaded as expected.
df.head()

Unnamed: 0,Text,Intent
0,Roll down the driver's window.,roll down the driver side window
1,Driver's window down.,roll down the driver side window
2,Can you roll down the driver's side window?,roll down the driver side window
3,Lower the driver's window.,roll down the driver side window
4,I'd like the driver's window down.,roll down the driver side window


In [4]:
# Exclude the 'turn on high beams' intent for now: it is treated as a more
# critical function, and the approach should first be validated on the
# non-critical intents.
is_not_high_beams = df['Intent'].ne('turn on high beams')
df = df.loc[is_not_high_beams]

In [5]:
# List the distinct values of the 'Intent' column.
df['Intent'].unique()

array(['roll down the driver side window', 'turn on the air conditioner',
       'roll down passenger window', 'turn on windshield wipers',
       'no intent'], dtype=object)

In [6]:
# Map each intent string to an integer class id. `le` is kept so that
# `le.inverse_transform` can later turn predicted ids back into intent names.
le = LabelEncoder()

# Build the column with `assign`, which returns a new frame, rather than
# writing into `df` in place: `df` is a boolean-filtered slice of the loaded
# frame, and in-place column assignment on such a slice triggers pandas'
# SettingWithCopyWarning.
df = df.assign(Encoded_Intent=le.fit_transform(df['Intent']))

In [7]:
# Hold out 20% of the rows for validation. The split is stratified on the
# encoded label so both parts keep the same class proportions, and the seed
# is fixed so the split is reproducible.
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df['Encoded_Intent'],
    random_state=42,
)


In [11]:
# Inspect the training split; rows keep their original (now shuffled) index labels.
# NOTE(review): this cell's execution count is out of sequence with the cells
# around it — confirm the notebook reproduces under Restart Kernel -> Run All.
train_df

Unnamed: 0,Text,Intent,Encoded_Intent
431,Slide open the passenger window.,roll down passenger window,1
344,Initiate cold mode.,turn on the air conditioner,3
393,Bring on the chill zone.,turn on the air conditioner,3
912,Turn on those wipers.,turn on windshield wipers,4
861,Need the wipers on.,turn on windshield wipers,4
...,...,...,...
1047,Is there a cinema nearby?,no intent,0
323,Can you start the cool flow?,turn on the air conditioner,3
102,Get the driver's window down.,roll down the driver side window,2
1139,Where can I adopt a dog?,no intent,0


In [8]:
# Load the pretrained tokenizer for the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def tokenize_function(examples):
    """Tokenize the ``Text`` field of ``examples`` with the notebook-level tokenizer.

    Each sequence is truncated and padded to a fixed length of 256, so every
    encoded example has the same size regardless of how short the utterance is.

    Args:
        examples: A mapping with a ``'Text'`` entry holding the text(s) to encode.

    Returns:
        The tokenizer's output for ``examples['Text']``.
    """
    encoded = tokenizer(
        examples['Text'],
        max_length=256,
        truncation=True,
        padding='max_length',
    )
    return encoded

In [12]:
# Replace the shuffled row labels left by the train/validation split with a
# fresh 0..n-1 index before handing the frames to `datasets`.
train_df_reset, val_df_reset = (
    frame.reset_index(drop=True) for frame in (train_df, val_df)
)

# Wrap each pandas frame in a Dataset object.
train_dataset, val_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df_reset, val_df_reset)
)

# Tokenize in batches, then drop the raw 'Text' and 'Intent' string columns so
# only the encoded label and the tokenizer outputs remain.
tokenized_train_dataset, tokenized_val_dataset = (
    split.map(tokenize_function, batched=True).remove_columns(['Text', 'Intent'])
    for split in (train_dataset, val_dataset)
)

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [13]:
# Show the columns and row count of the untokenized training Dataset.
train_dataset

Dataset({
    features: ['Text', 'Intent', 'Encoded_Intent'],
    num_rows: 800
})

In [14]:
# Bundle both splits into one DatasetDict. The validation split is stored
# under the 'test' key.
# NOTE(review): the label column is still named 'Encoded_Intent'; if this is
# fed to a Hugging Face Trainer, confirm it is renamed to 'labels' downstream.
dataset_dict = DatasetDict({
    'train': tokenized_train_dataset,
    'test': tokenized_val_dataset,
})


In [15]:
# Show both splits to confirm their columns and row counts.
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['Encoded_Intent', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 800
    })
    test: Dataset({
        features: ['Encoded_Intent', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 200
    })
})