In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from sklearn.metrics import accuracy_score



In [2]:
!pip install imblearn



In [3]:
train = pd.read_csv('./training.csv')

train.isna().any()

text     False
label    False
dtype: bool

In [4]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

print("Before undersampling:", Counter(train['label']))

class_counts = Counter(train['label'])

# Set the target number of samples per class
target_samples = 3000

# Determine classes needing oversampling and undersampling
classes_to_resample = {cls: target_samples for cls, count in class_counts.items() if count < target_samples}
classes_to_keep = {cls: target_samples for cls, count in class_counts.items() if count >= target_samples}
print('Class to OverSample: ', classes_to_resample)
print("Class to UnderSample: ", classes_to_keep)
# Oversample minority classes
oversampler = RandomOverSampler(sampling_strategy=classes_to_resample)

# Undersample majority classes
undersampler = RandomUnderSampler(sampling_strategy=classes_to_keep)


train_text_resampled, train_labels_resampled = oversampler.fit_resample(np.array(train['text']).reshape(-1,1), train['label'])

print('After Oversample:' , Counter(train_labels_resampled))

train_text_resampled, train_labels_resampled = undersampler.fit_resample(train_text_resampled, train_labels_resampled)

# Check the class distribution after resampling
print('After Undersample:', Counter(train_labels_resampled))

Before undersampling: Counter({1: 5362, 0: 4666, 3: 2159, 4: 1937, 2: 1304, 5: 572})
Class to OverSample:  {3: 3000, 2: 3000, 5: 3000, 4: 3000}
Class to UnderSample:  {0: 3000, 1: 3000}
After Oversample: Counter({1: 5362, 0: 4666, 3: 3000, 2: 3000, 5: 3000, 4: 3000})
After Undersample: Counter({0: 3000, 1: 3000, 2: 3000, 3: 3000, 4: 3000, 5: 3000})


In [5]:
from sklearn.model_selection import train_test_split
train_labels_resampled = np.array(train_labels_resampled)
X_train, X_temp, y_train, y_temp = train_test_split(train_text_resampled,train_labels_resampled,test_size=0.3,random_state=42)
X_val, X_test, y_val,y_test =  train_test_split(X_temp,y_temp,test_size=0.5, random_state=42)

In [6]:
Demo = [sentence[0] for sentence in X_train]

In [7]:
tokenizer = Tokenizer(num_words=10000, oov_token='<UNK>', split=' ', char_level=False)
tokenizer.fit_on_texts(Demo)

def tokenize(data):
    data = [sentence[0] for sentence in data]
    sequences = tokenizer.texts_to_sequences(data)
    padded = pad_sequences(sequences, truncating = 'post', padding='post', maxlen=50)
    return padded

In [8]:
tokenized_data_train = tokenize(X_train)
tokenized_data_val = tokenize(X_val)
tokenized_data_test =  tokenize(X_test)

In [9]:
print(tokenized_data_test)

[[   2   32  113 ...    0    0    0]
 [   2   76 1659 ...    0    0    0]
 [  16    8   15 ...    0    0    0]
 ...
 [   2    3   14 ...    0    0    0]
 [   2    3  386 ...    0    0    0]
 [   2   88    3 ...    0    0    0]]


In [10]:
train_dataset = tf.data.Dataset.from_tensor_slices((
    tokenized_data_train,
    y_train
)).shuffle(len(train['text'])).batch(32)

val_dataset = tf.data.Dataset.from_tensor_slices((
    tokenized_data_val,
    y_val
)).batch(32)

test_dataset = tf.data.Dataset.from_tensor_slices((
    tokenized_data_test,
    y_test
)).batch(32)

In [11]:
model = tf.keras.models.Sequential([
tf.keras.layers.Embedding(10000,64,input_length=50),
tf.keras.layers.SpatialDropout1D(0.2),
tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(80,dropout = 0.2, recurrent_dropout=0.2)),
tf.keras.layers.Dense(6, activation='sigmoid')
])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 50, 64)            640000    
                                                                 
 spatial_dropout1d (Spatial  (None, 50, 64)            0         
 Dropout1D)                                                      
                                                                 
 bidirectional (Bidirection  (None, 160)               92800     
 al)                                                             
                                                                 
 dense (Dense)               (None, 6)                 966       
                                                                 
Total params: 733766 (2.80 MB)
Trainable params: 733766 (2.80 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [12]:
# Define optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-4)

# Compile the model
model.compile(optimizer=optimizer,
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
print(model.summary)

# Fine-tuning
history = model.fit(train_dataset,
                    batch_size = 20,
                    epochs=30,
                    validation_data=val_dataset,
                    callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss',min_delta=0.01, patience=5)]
                    )




<bound method Model.summary of <keras.src.engine.sequential.Sequential object at 0x178304bd0>>
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30


In [13]:
#Evaluation
test_loss, test_accuracy = model.evaluate(test_dataset)
print(f'Test Loss: {test_loss} Test Accuracy: {test_accuracy}')

 1/85 [..............................] - ETA: 0s - loss: 0.4165 - accuracy: 0.8750

Test Loss: 0.2541908323764801 Test Accuracy: 0.9348148107528687
