# LLM Text Classification - Data Loader for Training

In [1]:
# ignore the unwanted warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
# required modules 
import os
import numpy
import pandas as pd
import datasets
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from transformers import (
    AutoTokenizer, 
    DataCollatorWithPadding
)

In [3]:
# read the CSV file into a DataFrame and preview its first rows
df = pd.read_csv(filepath_or_buffer='../data/filtered_data.csv')
df.head()

Unnamed: 0,text,generated
0,car car around sinc becam famou henri ford cre...,0
1,transport larg necess countri worldwid doubt c...,0
2,america love affair vehicl seem cool say elisa...,0
3,often ride car drive one motor vehicl work sto...,0
4,car wonder thing perhap one world greatest adv...,0


In [4]:
# shuffle all rows with a fixed seed, then renumber the index from 0
df = (
    df
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
df.head(10)

Unnamed: 0,text,generated
0,automobil use averag use transport ever sinc r...,0
1,univers educ prepar student employ also teach ...,1
2,purpos univers educ often debat believ prepar ...,1
3,believ univers educ multipl function import ed...,1
4,could imagin would like limit usag car could s...,0
5,univers educ mani purpos two main one prepar s...,1
6,believ univers educ provid student skill knowl...,1
7,fellow citizen mani reason limit car usag outs...,0
8,agre univers educ function function includ dev...,1
9,varieti opinion univers educ peopl think prepa...,1


In [5]:
# plain Python lists of the input texts (X) and their labels (y)
X = df['text'].tolist()
y = df['generated'].tolist()

In [6]:
# split the dataset into train (80%) and validation (20%) data
# random_state is fixed (same seed as the shuffle above) so that a
# Restart & Run All reproduces exactly the same split every time
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [7]:
# datasets object

# column-name -> values mapping for each split
train_data_dict = dict(text=X_train, generated=y_train)
validation_data_dict = dict(text=X_valid, generated=y_valid)

# build one Dataset per split from its mapping
train_dataset = datasets.Dataset.from_dict(train_data_dict)
validation_dataset = datasets.Dataset.from_dict(validation_data_dict)

# bundle both splits into one DatasetDict keyed by split name
data = datasets.DatasetDict(
    {"train": train_dataset, "validation": validation_dataset}
)

In [8]:
# convert the texts into tokens using transformers AutoTokenizer

# pretrained checkpoint whose tokenizer is used
checkpoint = "bert-base-uncased"
# load the tokenizer that belongs to that checkpoint
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=checkpoint
)


# function which converts text into tokens
def tokenize_function(example):
    """Tokenize the ``'text'`` field of ``example`` with the notebook's tokenizer.

    Args:
        example: mapping with a ``'text'`` entry; when called through
            ``map(..., batched=True)`` this is a batch of rows.

    Returns:
        The tokenizer's output for those texts, with truncation enabled.
    """
    texts = example['text']
    encoded = tokenizer(texts, truncation=True)
    return encoded

# run the tokenizer over every split, a batch of rows per call
tokenized_datasets = data.map(function=tokenize_function, batched=True)
# collator built from the same tokenizer; passed to the DataLoaders as collate_fn
data_collator = DataCollatorWithPadding(tokenizer)

tokenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<00:00, 4.68kB/s]
config.json: 100%|██████████| 570/570 [00:00<00:00, 95.1kB/s]
vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 480kB/s]
tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 693kB/s]
Map: 100%|██████████| 4332/4332 [00:03<00:00, 1173.27 examples/s]
Map: 100%|██████████| 1084/1084 [00:00<00:00, 1259.15 examples/s]


In [9]:
# drop the raw "text" column and rename "generated" to "labels"
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(["text"])
    .rename_column("generated", "labels")
)

# have the datasets return PyTorch tensors instead of lists
tokenized_datasets.set_format("torch")

In [10]:
# batch iterators over the two splits; only the training loader shuffles
train_dataloader = DataLoader(
    dataset=tokenized_datasets["train"],
    batch_size=8,
    shuffle=True,
    collate_fn=data_collator,
)
eval_dataloader = DataLoader(
    dataset=tokenized_datasets["validation"],
    batch_size=8,
    collate_fn=data_collator,
)

In [11]:
# sanity check: fetch a single batch from the training loader and show the
# shape of every tensor in it (next(iter(...)) replaces the for/break idiom
# and fails right here, not on the next line, if the loader is empty)
batch = next(iter(train_dataloader))
{k: v.shape for k, v in batch.items()}

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'labels': torch.Size([8]),
 'input_ids': torch.Size([8, 512]),
 'token_type_ids': torch.Size([8, 512]),
 'attention_mask': torch.Size([8, 512])}