In [71]:
!pip install transformers[torch]
!pip install accelerate -U
!pip install datasets

Collecting datasets
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
Installing collected 

In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from transformers import T5Tokenizer, T5ForSequenceClassification
from sklearn.metrics import accuracy_score



In [44]:
train = pd.read_csv('./training.csv')

train.isna().any()

text     False
label    False
dtype: bool

In [45]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

print("Before undersampling:", Counter(train['label']))

class_counts = Counter(train['label'])

# Set the target number of samples per class
target_samples = 3000

# Determine classes needing oversampling and undersampling
classes_to_resample = {cls: target_samples for cls, count in class_counts.items() if count < target_samples}
classes_to_keep = {cls: target_samples for cls, count in class_counts.items() if count >= target_samples}
print('Class to OverSample: ', classes_to_resample)
print("Class to UnderSample: ", classes_to_keep)
# Oversample minority classes
oversampler = RandomOverSampler(sampling_strategy=classes_to_resample)

# Undersample majority classes
undersampler = RandomUnderSampler(sampling_strategy=classes_to_keep)


train_text_resampled, train_labels_resampled = oversampler.fit_resample(np.array(train['text']).reshape(-1,1), train['label'])

print('After Oversample:' , Counter(train_labels_resampled))

train_text_resampled, train_labels_resampled = undersampler.fit_resample(train_text_resampled, train_labels_resampled)

# Check the class distribution after resampling
print('After Undersample:', Counter(train_labels_resampled))

Before undersampling: Counter({1: 5362, 0: 4666, 3: 2159, 4: 1937, 2: 1304, 5: 572})
Class to OverSample:  {3: 3000, 2: 3000, 5: 3000, 4: 3000}
Class to UnderSample:  {0: 3000, 1: 3000}
After Oversample: Counter({1: 5362, 0: 4666, 3: 3000, 2: 3000, 5: 3000, 4: 3000})
After Undersample: Counter({0: 3000, 1: 3000, 2: 3000, 3: 3000, 4: 3000, 5: 3000})


In [46]:
from sklearn.model_selection import train_test_split
train_labels_resampled = np.array(train_labels_resampled)
X_train, X_temp, y_train, y_temp = train_test_split(train_text_resampled,train_labels_resampled,test_size=0.3,random_state=42)
X_val, X_test, y_val,y_test =  train_test_split(X_temp,y_temp,test_size=0.5, random_state=42)

In [5]:
!pip install sentencepiece



In [6]:
tokenizer = T5Tokenizer.from_pretrained('t5-small')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [47]:
def tokenize(data):
  data = [sentence[0] for sentence in data]
  tokenized_data = tokenizer.batch_encode_plus(data,
                                             max_length=50,
                                            padding=True,
                                             truncation=True,
                                             return_tensors='pt')
  return tokenized_data



tokenized_data_train = tokenize(X_train)
tokenized_data_val = tokenize(X_val)
tokenized_data_test =  tokenize(X_test)

In [48]:
print(tokenized_data_train)

{'input_ids': tensor([[   3,   23, 1663,  ...,    0,    0,    0],
        [ 256, 1852,   59,  ...,    0,    0,    0],
        [   3,   23,  473,  ...,    0,    0,    0],
        ...,
        [   3,   23,  473,  ...,    0,    0,    0],
        [   3,   23,   19,  ...,    0,    0,    0],
        [   3,   23,  473,  ...,    0,    0,    0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}


In [54]:
import torch
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    def __init__(self, data_dict, labels):
        self.input_ids = data_dict['input_ids']
        self.attention_mask = data_dict['attention_mask']
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, index):
        return {
            'input_ids': self.input_ids[index],
            'attention_mask': self.attention_mask[index],
            'labels': self.labels[index]
        }

# Assuming data_dict is your dictionary and labels is your array of labels
train_dataset = CustomDataset(tokenized_data_train, y_train)

val_dataset = CustomDataset(tokenized_data_val, y_val)

test_dataset = CustomDataset(tokenized_data_test, y_test)


In [73]:
#model = T5ForSequenceClassification.from_pretrained('./t5-small.h5', num_labels=6)
model = T5ForSequenceClassification.from_pretrained('t5-small', num_labels=6)
#model.save_pretrained("./t5-small.h5")

Some weights of T5ForSequenceClassification were not initialized from the model checkpoint at t5-small and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [99]:
from datasets import load_metric
from transformers import TrainerCallback
from copy import deepcopy


In [None]:
from transformers import Trainer, TrainingArguments, default_data_collator
from transformers.trainer_callback import DefaultFlowCallback
from transformers import EarlyStoppingCallback, IntervalStrategy

# Step 5: Define training arguments
training_args = TrainingArguments(
    per_device_train_batch_size=20,
    per_device_eval_batch_size=20,
    num_train_epochs=2,
    evaluation_strategy="epoch",
    output_dir="./logs",
    learning_rate=2e-5,
    load_best_model_at_end = True,
    save_strategy="epoch",
    metric_for_best_model="accuracy",
    greater_is_better=True
)

# Step 6: Instantiate the Trainer class
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)

# Step 7: Fine-tune the model
trainer.train()

# Step 8: Evaluate the model
trainer.evaluate(test_dataset)

In [None]:
#Evaluation
test_results = trainer.evaluate(test_dataset)
print(test_results)
#print(f'Test Loss: {test_loss} Test Accuracy: {test_accuracy}')
# Extract accuracy from the test results


In [69]:
from torch.utils.data import DataLoader

def compute_accuracy(predictions, true_labels):
    predicted_labels = torch.argmax(predictions, dim=-1)
    accuracy = (predicted_labels == true_labels).float().mean().item()
    return accuracy

# Define your test dataloader
batch_size = 20 # Change batch size as needed
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

# Evaluate on the test set
predictions = []
true_labels = []
for batch in test_dataloader:
    input_ids = batch['input_ids'].to('cuda')
    attention_mask = batch['attention_mask'].to('cuda')
    labels = batch['labels'].to('cuda')

    with torch.no_grad():
        output = model(input_ids=input_ids.squeeze(0), attention_mask=attention_mask.squeeze(0))
        predictions.append(output.logits)
        true_labels.append(labels)

# Concatenate predictions and true labels
predictions = torch.cat(predictions, dim=0)
true_labels = torch.cat(true_labels, dim=0)

# Calculate accuracy
accuracy = compute_accuracy(predictions, true_labels)
print("Test Accuracy:", accuracy)

Test Accuracy: 0.9351851940155029
