# LLM Text Classification - Training

In [1]:
# Silence every warning category for the rest of the session.
# NOTE: this also hides deprecation notices emitted by the libraries used below.
import warnings
warnings.simplefilter('ignore')

In [1]:
# import required modules
import os
import sys
import numpy
import matplotlib.pyplot as plt
import pandas as pd
import datasets
import torch
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import (
    AutoTokenizer, 
    DataCollatorWithPadding, 
    AutoModelForSequenceClassification, 
    AdamW, 
    get_scheduler
)
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the essay corpus from the CSV file and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='../data/filtered_data.csv')
df.head(n=5)

Unnamed: 0,text,generated
0,car car around sinc becam famou henri ford cre...,0
1,transport larg necess countri worldwid doubt c...,0
2,america love affair vehicl seem cool say elisa...,0
3,often ride car drive one motor vehicl work sto...,0
4,car wonder thing perhap one world greatest adv...,0


In [3]:
# Permute all rows with a fixed seed (frac=1 keeps every row), then renumber
# the index from zero so it no longer carries the pre-shuffle positions.
df = df.sample(frac=1, random_state=42)
df = df.reset_index(drop=True)
df.head(n=10)

Unnamed: 0,text,generated
0,automobil use averag use transport ever sinc r...,0
1,univers educ prepar student employ also teach ...,1
2,purpos univers educ often debat believ prepar ...,1
3,believ univers educ multipl function import ed...,1
4,could imagin would like limit usag car could s...,0
5,univers educ mani purpos two main one prepar s...,1
6,believ univers educ provid student skill knowl...,1
7,fellow citizen mani reason limit car usag outs...,0
8,agre univers educ function function includ dev...,1
9,varieti opinion univers educ peopl think prepa...,1


In [4]:
# texts and labels
# Number of rows (taken from the top of the shuffled frame) used for this run.
# 10 keeps the notebook fast for a smoke test; set to None to train on every row.
N_SAMPLES = 10

# Slice the columns first so only the retained rows are converted to Python lists;
# a slice end of None selects the whole column.
X = df['text'].iloc[:N_SAMPLES].tolist()
y = df['generated'].iloc[:N_SAMPLES].tolist()

In [5]:
# split the dataset into train and validation data
# 80% of the samples go to training and 20% to validation. The fixed random_state
# (same seed as the shuffle of `df`) makes the partition identical on every run,
# so metrics from different runs are comparable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [6]:
# Wrap the Python lists in Hugging Face `datasets` objects.

# Column-oriented dictionaries, one per split: raw text plus its 0/1 label.
train_data_dict = {'text': X_train, 'generated': y_train}
validation_data_dict = {'text': X_valid, 'generated': y_valid}

# One Dataset per split, built from the dictionaries above.
train_dataset, validation_dataset = (
    datasets.Dataset.from_dict(split_columns)
    for split_columns in (train_data_dict, validation_data_dict)
)

# Bundle both splits under the keys that the later cells index by.
data = datasets.DatasetDict({
    "train": train_dataset,
    "validation": validation_dataset,
})

In [7]:
# Tokenizer setup: texts are converted to token ids with the tokenizer that
# belongs to the chosen pretrained checkpoint.

# Hub name of the checkpoint; the model cell further down loads the same one.
checkpoint = "bert-base-uncased"
# Tokenizer matching that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=checkpoint)


# Tokenization helper passed to `data.map` below.
def tokenize_function(example):
    """Tokenize the ``'text'`` entry of ``example`` with the notebook-level tokenizer.

    ``example`` is indexed with the key ``'text'``; the value is handed to
    ``tokenizer`` with ``truncation=True`` and the tokenizer's result is
    returned unchanged. No padding is requested here.
    """
    texts = example['text']
    return tokenizer(texts, truncation=True)

# Collator built from the same tokenizer; it is handed to the DataLoaders as collate_fn.
data_collator = DataCollatorWithPadding(tokenizer)
# Tokenize every split; batched=True passes several examples per call to tokenize_function.
tokenized_datasets = data.map(function=tokenize_function, batched=True)

Map: 100%|██████████| 8/8 [00:00<00:00, 272.34 examples/s]
Map: 100%|██████████| 2/2 [00:00<00:00, 111.06 examples/s]


In [8]:
# Drop the raw "text" column and rename "generated" to "labels"
# (the training loop later unpacks each batch straight into the model call).
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(["text"])
    .rename_column("generated", "labels")
)

# Have the datasets hand back PyTorch tensors rather than Python lists.
tokenized_datasets.set_format(type="torch")

In [9]:
# define dataloaders for further process
# Single source of truth for the batch size used by both loaders.
BATCH_SIZE = 8

# Training batches are reshuffled every epoch. The dedicated, seeded generator makes
# that shuffle order repeatable whenever this cell is re-executed.
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    batch_size=BATCH_SIZE,
    collate_fn=data_collator,
    generator=torch.Generator().manual_seed(42),
)
# Validation batches keep dataset order (no shuffle).
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], batch_size=BATCH_SIZE, collate_fn=data_collator
)

In [10]:
# Sanity check: fetch one training batch and list the shape of each tensor in it.
batch = next(iter(train_dataloader))
{name: tensor.shape for name, tensor in batch.items()}

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'labels': torch.Size([8]),
 'input_ids': torch.Size([8, 512]),
 'token_type_ids': torch.Size([8, 512]),
 'attention_mask': torch.Size([8, 512])}

In [11]:
# instantiate classification model
# The checkpoint carries no weights for the classification head, so that layer is
# freshly initialised at load time. Seeding torch's global RNG first makes that
# initialisation (and any later use of the global RNG during training) repeatable.
torch.manual_seed(42)
# num_labels=2: binary task, matching the 0/1 values of the label column.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Create the optimizer over all model parameters with the fine-tuning learning rate.
# torch.optim.AdamW replaces the AdamW class exported by transformers, which is
# deprecated in favour of the PyTorch implementation and absent from newer
# transformers releases. eps and weight_decay are set explicitly to the values the
# transformers implementation used by default, so the update rule stays the same.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=5e-5,
    eps=1e-6,
    weight_decay=0.0,
)



In [13]:
# Training schedule: number of passes over the training data, the resulting total
# number of optimizer steps (one per batch), and a "linear" learning-rate
# schedule with no warm-up steps spanning exactly that many steps.
num_epochs = 3
num_training_steps = len(train_dataloader) * num_epochs
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

3


In [14]:
# Pick the GPU when CUDA is available, otherwise fall back to the CPU,
# then move the model's weights onto that device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
device

device(type='cpu')

In [15]:
# Training loop: runs `num_epochs` passes over `train_dataloader`, performing one
# optimizer step and one scheduler step per batch.
# Progress bar sized to the total number of optimizer steps; advanced once per batch.
progress_bar = tqdm(range(num_training_steps))
# Put the model in training mode before iterating.
model.train()

for epoch in range(num_epochs):
    for batch in train_dataloader:
        # Move every tensor of the batch onto the device the model was moved to.
        batch = {k: v.to(device) for k, v in batch.items()}
        # The batch is unpacked as keyword arguments; it contains "labels",
        # and the loss is read from the model output.
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        # Order matters: apply the update, advance the learning-rate schedule,
        # then clear the gradients so they do not accumulate into the next batch.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/3 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Model Evaluation
# Runs the model over the validation loader, collects hard predictions and the
# probability of class 1, then reports accuracy / precision / recall / F1 and,
# when it is defined, ROC AUC with its curve.

from sklearn.metrics import roc_curve, roc_auc_score

model.eval()
all_predictions = []
all_labels = []
all_probs = []

# No gradients are needed for evaluation, so the whole pass runs under no_grad.
with torch.no_grad():
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)

        # Accumulate plain Python numbers for later evaluation.
        all_predictions.extend(predictions.tolist())
        all_labels.extend(batch["labels"].tolist())
        # Probability assigned to class 1 (binary classification).
        all_probs.extend(torch.softmax(logits, dim=-1)[:, 1].tolist())

# Labels, predictions and probabilities must describe the same samples.
assert len(all_labels) == len(all_predictions) == len(all_probs), "Inconsistent number of samples"

# Compute metrics.
# zero_division=0 states explicitly that a metric with an empty denominator
# (e.g. no positive predictions) is reported as 0 instead of relying on a warning.
accuracy = accuracy_score(all_labels, all_predictions)
precision = precision_score(all_labels, all_predictions, zero_division=0)
recall = recall_score(all_labels, all_predictions, zero_division=0)
f1 = f1_score(all_labels, all_predictions, zero_division=0)

# ROC AUC needs both classes among the true labels; roc_auc_score raises
# otherwise, which is easy to hit with a very small validation split.
roc_defined = len(set(all_labels)) > 1
if roc_defined:
    roc_auc = roc_auc_score(all_labels, all_probs)
    # ROC Curve
    fpr, tpr, _thresholds = roc_curve(all_labels, all_probs)
else:
    roc_auc = float("nan")
    fpr, tpr = [], []

# Print metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC:", roc_auc)

if roc_defined:
    # Plot ROC Curve
    fig, ax = plt.subplots(figsize=(8, 8))
    ax.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = {:.2f})'.format(roc_auc))
    # Diagonal reference line: performance of a random classifier.
    ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Receiver Operating Characteristic (ROC) Curve')
    ax.legend(loc="lower right")
    plt.show()
else:
    print("ROC curve skipped: the validation labels contain a single class.")

In [None]:
# Persist the fine-tuned model to a directory relative to the notebook's
# working directory.
model.save_pretrained(save_directory="saved_models/llm_text_detection_model")