In [3]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
from sklearn.metrics import accuracy_score



In [4]:
train = pd.read_csv('./training.csv')

train.isna().any()

text     False
label    False
dtype: bool

In [5]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

print("Before undersampling:", Counter(train['label']))

class_counts = Counter(train['label'])

# Set the target number of samples per class
target_samples = 3000

# Determine classes needing oversampling and undersampling
classes_to_resample = {cls: target_samples for cls, count in class_counts.items() if count < target_samples}
classes_to_keep = {cls: target_samples for cls, count in class_counts.items() if count >= target_samples}
print('Class to OverSample: ', classes_to_resample)
print("Class to UnderSample: ", classes_to_keep)
# Oversample minority classes
oversampler = RandomOverSampler(sampling_strategy=classes_to_resample)

# Undersample majority classes
undersampler = RandomUnderSampler(sampling_strategy=classes_to_keep)


train_text_resampled, train_labels_resampled = oversampler.fit_resample(np.array(train['text']).reshape(-1,1), train['label'])

print('After Oversample:' , Counter(train_labels_resampled))

train_text_resampled, train_labels_resampled = undersampler.fit_resample(train_text_resampled, train_labels_resampled)

# Check the class distribution after resampling
print('After Undersample:', Counter(train_labels_resampled))

Before undersampling: Counter({1: 5362, 0: 4666, 3: 2159, 4: 1937, 2: 1304, 5: 572})
Class to OverSample:  {3: 3000, 2: 3000, 5: 3000, 4: 3000}
Class to UnderSample:  {0: 3000, 1: 3000}
After Oversample: Counter({1: 5362, 0: 4666, 3: 3000, 2: 3000, 5: 3000, 4: 3000})
After Undersample: Counter({0: 3000, 1: 3000, 2: 3000, 3: 3000, 4: 3000, 5: 3000})


In [6]:
from sklearn.model_selection import train_test_split
train_labels_resampled = np.array(train_labels_resampled)
X_train, X_temp, y_train, y_temp = train_test_split(train_text_resampled,train_labels_resampled,test_size=0.3,random_state=42)
X_val, X_test, y_val,y_test =  train_test_split(X_temp,y_temp,test_size=0.5, random_state=42)

In [7]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [8]:
def tokenize(data):
  data = [sentence[0] for sentence in data]
  tokenized_data = tokenizer.batch_encode_plus(data,
                                             max_length=50,
                                            padding=True,
                                             truncation=True,
                                             return_tensors='pt')
  return tokenized_data



tokenized_data_train = tokenize(X_train)
tokenized_data_val = tokenize(X_val)
tokenized_data_test =  tokenize(X_test)

In [9]:
print(tokenized_data_train)

{'input_ids': tensor([[  101,  1045,  2743,  ...,     0,     0,     0],
        [  101, 10047,  5121,  ...,     0,     0,     0],
        [  101,  1045,  2293,  ...,     0,     0,     0],
        ...,
        [  101,  1045,  2572,  ...,     0,     0,     0],
        [  101,  1045,  2514,  ...,     0,     0,     0],
        [  101,  1045,  2467,  ...,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}


In [10]:
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(tokenized_data_train),
    y_train
)).shuffle(len(train['text'])).batch(32)

val_dataset = tf.data.Dataset.from_tensor_slices((
    dict(tokenized_data_val),
    y_val
)).batch(32)

test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(tokenized_data_test),
    y_test
)).batch(32)

In [11]:
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=6)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

In [12]:
# Define optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)

# Compile the model
model.compile(optimizer=optimizer,
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])
print(model.summary)

# Fine-tuning
history = model.fit(train_dataset,
                    batch_size = 20,
                    epochs=30,
                    validation_data=val_dataset,
                    callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss',min_delta=0.01, patience=5)]
                    )


<bound method Model.summary of <transformers.models.distilbert.modeling_tf_distilbert.TFDistilBertForSequenceClassification object at 0x7e566c0bc460>>
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30


In [13]:
#Evaluation
test_loss, test_accuracy = model.evaluate(test_dataset)
print(f'Test Loss: {test_loss} Test Accuracy: {test_accuracy}')

Test Loss: 0.19060316681861877 Test Accuracy: 0.95333331823349
