In [1]:
#!pip install datasets

In [5]:
# Mount Google Drive at /content/drive (requires the Colab runtime); the GloVe
# file read later in this notebook lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


C-LSTM BINARY CLASSIFICATION ON AG NEWS DATASET




In [8]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers
from datasets import load_dataset
from sklearn.model_selection import train_test_split

# Step 1: Load and preprocess the AG News dataset
dataset = load_dataset("ag_news")

# Only the 'train' split is read; it is divided into train/test further down.
texts, labels = (dataset['train'][column] for column in ('text', 'label'))

# Collapse the four classes into two groups: {0, 1} -> 0, everything else -> 1.
binary_labels = [1 if label not in (0, 1) else 0 for label in labels]
y_data = np.array(binary_labels)

# Tokenization and padding
VOCAB_SIZE = 10_000   # passed to the tokenizer as num_words
MAX_LEN = 300         # every sequence is padded/truncated to this many tokens

tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
x_data = pad_sequences(sequences, maxlen=MAX_LEN)

# Hold out 20% of the examples; the fixed random_state keeps the split stable.
x_train, x_test, y_train, y_test = train_test_split(
    x_data,
    y_data,
    test_size=0.2,
    random_state=42,
)

# Step 2: Load GloVe embeddings
def load_glove_embeddings(glove_file_path, embedding_dim=300):
    """Parse a GloVe text file into a ``{token: vector}`` dictionary.

    Every non-blank line is expected to contain a token followed by its
    whitespace-separated vector components. The line is split from the right
    at most ``embedding_dim`` times, so a token that itself contains
    whitespace stays in one piece instead of spilling into the numeric part
    and breaking the float conversion. Blank lines are skipped.

    Args:
        glove_file_path: Path to the UTF-8 encoded GloVe ``.txt`` file.
        embedding_dim: Number of vector components stored on each line.

    Returns:
        dict mapping each token (``str``) to a 1-D ``float32`` NumPy array.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a vector component cannot be converted to a float.
    """
    embeddings_index = {}
    with open(glove_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                # Blank (or whitespace-only) line: nothing to parse.
                continue
            # Peel at most `embedding_dim` numeric fields off the right-hand
            # side; whatever is left in front of them is the token.
            fields = stripped.rsplit(None, embedding_dim)
            embeddings_index[fields[0]] = np.asarray(fields[1:], dtype='float32')
    return embeddings_index

# Create embedding matrix
def create_embedding_matrix(word_index, glove_embeddings, vocab_size, embedding_dim=300):
    """Build the initial weight table for the embedding layer.

    Row ``i`` receives the pre-trained vector of the word whose index in
    ``word_index`` is ``i``. Words with an index of ``vocab_size`` or more are
    ignored, and rows for words absent from ``glove_embeddings`` stay zero.

    Args:
        word_index: Mapping from word to integer index.
        glove_embeddings: Mapping from word to its pre-trained vector.
        vocab_size: Number of rows in the returned matrix.
        embedding_dim: Number of columns in the returned matrix.

    Returns:
        NumPy array of shape ``(vocab_size, embedding_dim)``.
    """
    matrix = np.zeros((vocab_size, embedding_dim))
    for token, row in word_index.items():
        if row >= vocab_size:
            continue
        vector = glove_embeddings.get(token)
        if vector is None:
            continue
        matrix[row] = vector
    return matrix

# Load pre-trained GloVe embeddings (update with your actual path)
glove_file_path = '/content/drive/MyDrive/DSA/glove/glove.6B.300d.txt'
glove_embeddings = load_glove_embeddings(glove_file_path)

# Align the GloVe vectors with the tokenizer's word indices.
embedding_matrix = create_embedding_matrix(
    word_index=tokenizer.word_index,
    glove_embeddings=glove_embeddings,
    vocab_size=VOCAB_SIZE,
)

# Step 3: Define the C-LSTM Model for Binary Classification
class CLSTMBinaryClassifierAgNews(tf.keras.Model):
    """C-LSTM text classifier with a single sigmoid output unit.

    Pipeline: embedding -> dropout -> convolution over windows of three
    consecutive tokens -> batch normalisation -> LSTM -> dropout -> dense
    sigmoid layer.

    Args:
        vocab_size: Number of rows of the embedding table.
        embedding_matrix: Initial embedding weights; expected shape
            ``(vocab_size, embedding_dim)``.
        max_length: Accepted for backward compatibility only. It used to be
            forwarded as the deprecated ``input_length`` argument of
            ``Embedding``; none of the layers needs a fixed sequence length.
        num_filters: Number of convolution filters.
        lstm_units: Size of the LSTM state.
        embedding_dim: Width of each embedding vector.
        dropout_rate: Rate shared by both dropout layers.
        l2_reg_lambda: L2 penalty applied to the output layer's kernel.
    """

    def __init__(self, vocab_size, embedding_matrix, max_length, num_filters=150, lstm_units=150, embedding_dim=300, dropout_rate=0.5, l2_reg_lambda=0.001):
        super().__init__()
        # `input_length` is intentionally not passed: Keras 3 flags it as
        # deprecated and the layers below work without it.
        self.embedding = layers.Embedding(input_dim=vocab_size,
                                          output_dim=embedding_dim,
                                          weights=[embedding_matrix],
                                          trainable=True)
        self.embedding_dropout = layers.Dropout(rate=dropout_rate)

        # Convolutional layer with filter size 3; the kernel spans the whole
        # embedding width, so each filter slides along the token axis only.
        self.conv_layer = layers.Conv2D(filters=num_filters,
                                        kernel_size=(3, embedding_dim),
                                        activation='relu', padding='valid')
        self.batch_norm = layers.BatchNormalization()

        # LSTM layer to capture dependencies; only the last output is kept.
        self.lstm = layers.LSTM(lstm_units, return_sequences=False)
        self.dropout = layers.Dropout(rate=dropout_rate)

        # Output layer for binary classification
        self.fc = layers.Dense(1, activation='sigmoid', kernel_regularizer=tf.keras.regularizers.L2(l2_reg_lambda))

    def call(self, inputs, training=False):
        """Run the forward pass.

        Args:
            inputs: Batch of padded token-id sequences, expected shape
                ``(batch, seq_len)``.
            training: Enables dropout and batch-statistics updates when True.

        Returns:
            Tensor of shape ``(batch, 1)`` with sigmoid activations.
        """
        x = self.embedding(inputs)                        # (batch, seq_len, embedding_dim)
        x = self.embedding_dropout(x, training=training)
        x = tf.expand_dims(x, -1)                         # add a channel axis for Conv2D

        # Apply convolutional layer and batch normalization
        conv_out = self.conv_layer(x)                     # (batch, seq_len - 2, 1, num_filters)
        conv_out = self.batch_norm(conv_out, training=training)
        conv_out = tf.squeeze(conv_out, 2)                # drop the collapsed width axis

        # Pass through LSTM
        rnn_outputs = self.lstm(conv_out)                 # (batch, lstm_units)
        rnn_outputs = self.dropout(rnn_outputs, training=training)

        # Output for binary classification
        binary_output = self.fc(rnn_outputs)
        return binary_output

# Step 4: Initialize and compile the model
# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation, dropout
# masks and batch shuffling are repeatable between runs (some GPU kernels may
# still introduce small run-to-run differences). 42 matches the split seed.
tf.keras.utils.set_random_seed(42)

model = CLSTMBinaryClassifierAgNews(vocab_size=VOCAB_SIZE,
                              embedding_matrix=embedding_matrix,
                              max_length=MAX_LEN)

model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])

# Step 5: Train the model
# NOTE: the held-out split doubles as validation data here, so the per-epoch
# val_* numbers and the final evaluation below are computed on the same set.
history = model.fit(
    x_train, y_train,
    batch_size=64,
    epochs=10,
    validation_data=(x_test, y_test),
    verbose=1
)

# Step 6: Evaluate the model
test_loss, test_acc = model.evaluate(x_test, y_test)
print(f'Test Loss: {test_loss}, Test Accuracy: {test_acc}')



Epoch 1/10
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 24ms/step - accuracy: 0.9221 - loss: 0.2076 - val_accuracy: 0.9590 - val_loss: 0.1211
Epoch 2/10
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 24ms/step - accuracy: 0.9540 - loss: 0.1315 - val_accuracy: 0.9615 - val_loss: 0.1144
Epoch 3/10
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 24ms/step - accuracy: 0.9588 - loss: 0.1205 - val_accuracy: 0.9644 - val_loss: 0.1086
Epoch 4/10
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 24ms/step - accuracy: 0.9629 - loss: 0.1089 - val_accuracy: 0.9640 - val_loss: 0.1077
Epoch 5/10
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 24ms/step - accuracy: 0.9645 - loss: 0.1038 - val_accuracy: 0.9656 - val_loss: 0.1029
Epoch 6/10
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 24ms/step - accuracy: 0.9655 - loss: 0.1007 - val_accuracy: 0.9650 - val_loss: 0.1039
Epoc

C-LSTM FINE-GRAINED CLASSIFICATION ON AG NEWS DATASET




In [9]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers
from datasets import load_dataset
from sklearn.model_selection import train_test_split

# Step 1: Load and preprocess the AG News dataset
dataset = load_dataset("ag_news")

# Only the 'train' split is read; it is divided into train/test further down.
texts, labels = (dataset['train'][column] for column in ('text', 'label'))

# Fine-grained labels (0 to 3) are used exactly as provided.
y_data = np.array(labels)

# Tokenization and padding
VOCAB_SIZE = 10_000   # passed to the tokenizer as num_words
MAX_LEN = 300         # every sequence is padded/truncated to this many tokens

tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
x_data = pad_sequences(sequences, maxlen=MAX_LEN)

# Hold out 20% of the examples; the fixed random_state keeps the split stable.
x_train, x_test, y_train, y_test = train_test_split(
    x_data,
    y_data,
    test_size=0.2,
    random_state=42,
)

# Step 2: Load GloVe embeddings
def load_glove_embeddings(glove_file_path, embedding_dim=300):
    """Parse a GloVe text file into a ``{token: vector}`` dictionary.

    Every non-blank line is expected to contain a token followed by its
    whitespace-separated vector components. The line is split from the right
    at most ``embedding_dim`` times, so a token that itself contains
    whitespace stays in one piece instead of spilling into the numeric part
    and breaking the float conversion. Blank lines are skipped.

    Args:
        glove_file_path: Path to the UTF-8 encoded GloVe ``.txt`` file.
        embedding_dim: Number of vector components stored on each line.

    Returns:
        dict mapping each token (``str``) to a 1-D ``float32`` NumPy array.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a vector component cannot be converted to a float.
    """
    embeddings_index = {}
    with open(glove_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                # Blank (or whitespace-only) line: nothing to parse.
                continue
            # Peel at most `embedding_dim` numeric fields off the right-hand
            # side; whatever is left in front of them is the token.
            fields = stripped.rsplit(None, embedding_dim)
            embeddings_index[fields[0]] = np.asarray(fields[1:], dtype='float32')
    return embeddings_index

# Create embedding matrix
def create_embedding_matrix(word_index, glove_embeddings, vocab_size, embedding_dim=300):
    """Build the initial weight table for the embedding layer.

    Row ``i`` receives the pre-trained vector of the word whose index in
    ``word_index`` is ``i``. Words with an index of ``vocab_size`` or more are
    ignored, and rows for words absent from ``glove_embeddings`` stay zero.

    Args:
        word_index: Mapping from word to integer index.
        glove_embeddings: Mapping from word to its pre-trained vector.
        vocab_size: Number of rows in the returned matrix.
        embedding_dim: Number of columns in the returned matrix.

    Returns:
        NumPy array of shape ``(vocab_size, embedding_dim)``.
    """
    matrix = np.zeros((vocab_size, embedding_dim))
    for token, row in word_index.items():
        if row >= vocab_size:
            continue
        vector = glove_embeddings.get(token)
        if vector is None:
            continue
        matrix[row] = vector
    return matrix

# Load pre-trained GloVe embeddings (update with your actual path)
glove_file_path = "/content/drive/MyDrive/DSA/glove/glove.6B.300d.txt"
glove_embeddings = load_glove_embeddings(glove_file_path)

# Align the GloVe vectors with the tokenizer's word indices.
embedding_matrix = create_embedding_matrix(
    word_index=tokenizer.word_index,
    glove_embeddings=glove_embeddings,
    vocab_size=VOCAB_SIZE,
)

# Step 3: Define the C-LSTM Model for Fine-Grained Classification
class CLSTMFineGrainedClassifierAgNews(tf.keras.Model):
    """C-LSTM text classifier with a softmax output over ``num_classes``.

    Pipeline: embedding -> dropout -> convolution over windows of three
    consecutive tokens -> batch normalisation -> LSTM -> dropout -> dense
    softmax layer.

    Args:
        vocab_size: Number of rows of the embedding table.
        embedding_matrix: Initial embedding weights; expected shape
            ``(vocab_size, embedding_dim)``.
        max_length: Accepted for backward compatibility only. It used to be
            forwarded as the deprecated ``input_length`` argument of
            ``Embedding``; none of the layers needs a fixed sequence length.
        num_filters: Number of convolution filters.
        lstm_units: Size of the LSTM state.
        embedding_dim: Width of each embedding vector.
        dropout_rate: Rate shared by both dropout layers.
        l2_reg_lambda: L2 penalty applied to the output layer's kernel.
        num_classes: Number of output classes.
    """

    def __init__(self, vocab_size, embedding_matrix, max_length, num_filters=150, lstm_units=150, embedding_dim=300, dropout_rate=0.5, l2_reg_lambda=0.001, num_classes=4):
        super().__init__()
        # `input_length` is intentionally not passed: Keras 3 flags it as
        # deprecated and the layers below work without it.
        self.embedding = layers.Embedding(input_dim=vocab_size,
                                          output_dim=embedding_dim,
                                          weights=[embedding_matrix],
                                          trainable=True)
        self.embedding_dropout = layers.Dropout(rate=dropout_rate)

        # Convolutional layer with filter size 3; the kernel spans the whole
        # embedding width, so each filter slides along the token axis only.
        self.conv_layer = layers.Conv2D(filters=num_filters,
                                        kernel_size=(3, embedding_dim),
                                        activation='relu', padding='valid')
        self.batch_norm = layers.BatchNormalization()

        # LSTM layer to capture dependencies; only the last output is kept.
        self.lstm = layers.LSTM(lstm_units, return_sequences=False)
        self.dropout = layers.Dropout(rate=dropout_rate)

        # Output layer for fine-grained classification (one unit per class)
        self.fc = layers.Dense(num_classes, activation='softmax', kernel_regularizer=tf.keras.regularizers.L2(l2_reg_lambda))

    def call(self, inputs, training=False):
        """Run the forward pass.

        Args:
            inputs: Batch of padded token-id sequences, expected shape
                ``(batch, seq_len)``.
            training: Enables dropout and batch-statistics updates when True.

        Returns:
            Tensor of shape ``(batch, num_classes)`` with softmax activations.
        """
        x = self.embedding(inputs)                        # (batch, seq_len, embedding_dim)
        x = self.embedding_dropout(x, training=training)
        x = tf.expand_dims(x, -1)                         # add a channel axis for Conv2D

        # Apply convolutional layer and batch normalization
        conv_out = self.conv_layer(x)                     # (batch, seq_len - 2, 1, num_filters)
        conv_out = self.batch_norm(conv_out, training=training)
        conv_out = tf.squeeze(conv_out, 2)                # drop the collapsed width axis

        # Pass through LSTM
        rnn_outputs = self.lstm(conv_out)                 # (batch, lstm_units)
        rnn_outputs = self.dropout(rnn_outputs, training=training)

        # Output for fine-grained classification
        fine_output = self.fc(rnn_outputs)
        return fine_output

# Step 4: Initialize and compile the model
# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation, dropout
# masks and batch shuffling are repeatable between runs (some GPU kernels may
# still introduce small run-to-run differences). 42 matches the split seed.
tf.keras.utils.set_random_seed(42)

model = CLSTMFineGrainedClassifierAgNews(vocab_size=VOCAB_SIZE,
                                   embedding_matrix=embedding_matrix,
                                   max_length=MAX_LEN,
                                   num_classes=4)

# Integer class ids (not one-hot vectors) are fed in, hence the sparse loss.
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Step 5: Train the model
# NOTE: the held-out split doubles as validation data here, so the per-epoch
# val_* numbers and the final evaluation below are computed on the same set.
history = model.fit(
    x_train, y_train,
    batch_size=64,
    epochs=10,
    validation_data=(x_test, y_test),
    verbose=1
)

# Step 6: Evaluate the model
test_loss, test_acc = model.evaluate(x_test, y_test)
print(f'Test Loss: {test_loss}, Test Accuracy: {test_acc}')


Epoch 1/10
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 24ms/step - accuracy: 0.8441 - loss: 0.4536 - val_accuracy: 0.9110 - val_loss: 0.2740
Epoch 2/10
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 24ms/step - accuracy: 0.9052 - loss: 0.2921 - val_accuracy: 0.9170 - val_loss: 0.2529
Epoch 3/10
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 24ms/step - accuracy: 0.9139 - loss: 0.2633 - val_accuracy: 0.9203 - val_loss: 0.2422
Epoch 4/10
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 24ms/step - accuracy: 0.9187 - loss: 0.2475 - val_accuracy: 0.9219 - val_loss: 0.2389
Epoch 5/10
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 24ms/step - accuracy: 0.9215 - loss: 0.2378 - val_accuracy: 0.9213 - val_loss: 0.2400
Epoch 6/10
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 24ms/step - accuracy: 0.9235 - loss: 0.2337 - val_accuracy: 0.9207 - val_loss: 0.2367
Epoc