In [6]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Bidirectional, GRU, Dense, TimeDistributed, Layer, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import numpy as np
import pandas as pd
import os
import logging
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [7]:
# Quieten TensorFlow: '3' for the native log level plus FATAL on the Python
# 'tensorflow' logger.
# NOTE(review): `import tensorflow` already ran in an earlier cell; the
# TF_CPP_MIN_LOG_LEVEL variable is generally expected to be set before that
# import to take effect -- confirm and move above the import if needed.
os.environ.update({'TF_CPP_MIN_LOG_LEVEL': '3'})
logging.getLogger('tensorflow').setLevel(logging.FATAL)

In [8]:
# Fetch the NLTK resources this notebook relies on: the Punkt tokenizer models
# (used by word_tokenize) and the English stop-word list.
nltk.download('punkt')
# Recent NLTK releases load the Punkt tokenizer from the 'punkt_tab' resource
# rather than 'punkt'; fetching both keeps word_tokenize working on fresh
# installs regardless of the NLTK version.
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/Lanado/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Lanado/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
def load_data(path):
    """Read a tab-separated, header-less file of (label, text) rows.

    The first column is expected to hold labels written as ``__label__<int>``;
    the prefix is removed and the remainder is converted to ``int`` and then to
    a pandas categorical. The second column is returned unchanged as ``text``.

    Args:
        path: Location of the file, passed straight to ``pd.read_csv``.

    Returns:
        A DataFrame with columns ``label`` (categorical of ints) and ``text``.
    """
    raw = pd.read_csv(path, sep='\t', header=None, names=['label', 'text'])
    numeric_labels = raw['label'].str.replace('__label__', '').astype(int)
    return raw.assign(label=numeric_labels.astype('category'))

# Locations of the three dataset splits, relative to the working directory.
train_path = 'trees_sst/sst_train.txt'
val_path = 'trees_sst/sst_dev.txt'
test_path = 'trees_sst/sst_test.txt'

# Load each split into its own DataFrame (train first, then dev, then test).
train_data = load_data(train_path)
val_data = load_data(val_path)
test_data = load_data(test_path)

In [10]:
stop_words = set(stopwords.words('english'))

# Patterns used by clean_text, compiled once instead of on every call.
_DIGIT_RUN = re.compile(r'\d+')
_NON_WORD_CHAR = re.compile(r'[^\w\s]')


def clean_text(text):
    """Normalise one text string for tokenisation.

    Steps, in order: lower-case the text, delete runs of digits, delete every
    character that is neither a word character nor whitespace, tokenise with
    NLTK's ``word_tokenize``, drop tokens found in ``stop_words``, and join the
    remaining tokens with single spaces.

    NOTE(review): full stops are removed here, so a later split on ``'. '``
    (as done in ``preprocess_documents`` in this notebook) will usually find
    nothing to split on -- confirm whether sentence splitting should happen
    before this cleaning step.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned text as a single space-separated string.
    """
    lowered = text.lower()
    without_punct = _NON_WORD_CHAR.sub('', _DIGIT_RUN.sub('', lowered))
    kept_tokens = (token for token in word_tokenize(without_punct) if token not in stop_words)
    return ' '.join(kept_tokens)

# Replace the raw text of the train and validation splits with its cleaned
# form (train first, then validation). The test split is not touched here.
for _split_frame in (train_data, val_data):
    _split_frame['text'] = _split_frame['text'].apply(clean_text)

In [11]:
# Vocabulary and input-shape settings shared by the tokenizer and the model.
MAX_WORDS = 20000              # vocabulary cap passed to the Tokenizer / Embedding
MAX_SENTENCES = 15             # sentences kept per document
MAX_WORDS_PER_SENTENCE = 50    # token ids kept per sentence
EMBEDDING_DIM = 200            # width of the word embedding vectors

# The vocabulary is learned from the training texts only; words outside it are
# mapped to the "<UNK>" token.
tokenizer = Tokenizer(
    num_words=MAX_WORDS,
    oov_token="<UNK>",
)
tokenizer.fit_on_texts(train_data['text'])

def preprocess_texts(texts):
    """Convert texts to integer id sequences of a fixed length.

    Each text is mapped to token ids by the module-level ``tokenizer`` and the
    resulting sequences are passed through ``pad_sequences`` with
    ``maxlen=MAX_WORDS_PER_SENTENCE``.

    Args:
        texts: An iterable of strings.

    Returns:
        The array produced by ``pad_sequences``, one row per input text.
    """
    token_ids = tokenizer.texts_to_sequences(texts)
    return pad_sequences(token_ids, maxlen=MAX_WORDS_PER_SENTENCE)

def preprocess_documents(docs):
    """Encode documents as a fixed-size (document, sentence, word) id array.

    Each document is split on ``'. '``, truncated to the first
    ``MAX_SENTENCES`` pieces, and each piece is encoded with
    ``preprocess_texts``. Documents with fewer pieces are completed with
    all-zero rows.

    The output is allocated once up front, so an empty ``docs`` yields an
    array of shape ``(0, MAX_SENTENCES, MAX_WORDS_PER_SENTENCE)`` rather than
    a shapeless empty array, and no per-document stacking is needed.

    NOTE(review): in this notebook the texts passed in have already been run
    through ``clean_text``, which strips punctuation (including ``'.'``), so
    the ``'. '`` split will usually produce a single sentence per document --
    confirm whether sentence splitting should happen before cleaning.

    Args:
        docs: An iterable of document strings (e.g. a pandas Series).

    Returns:
        An ``int32`` array of shape
        ``(len(docs), MAX_SENTENCES, MAX_WORDS_PER_SENTENCE)``.
    """
    # Materialise once so generators and Series are handled alike and the
    # number of documents is known before allocating.
    docs = list(docs)
    encoded = np.zeros(
        (len(docs), MAX_SENTENCES, MAX_WORDS_PER_SENTENCE), dtype=np.int32
    )
    for row, doc in enumerate(docs):
        # Keep only the first MAX_SENTENCES sentences.
        sentences = doc.split('. ')[:MAX_SENTENCES]
        sentence_ids = preprocess_texts(sentences)
        # Rows beyond the encoded sentences stay zero, which acts as padding.
        encoded[row, :len(sentence_ids)] = sentence_ids
    return encoded

# Encode the train and validation texts into (doc, sentence, word) id arrays.
X_train = preprocess_documents(train_data['text'])
X_val = preprocess_documents(val_data['text'])

# Labels are shifted down by one before being one-hot encoded over 5 classes.
y_train = to_categorical(np.asarray(train_data['label']) - 1, num_classes=5)
y_val = to_categorical(np.asarray(val_data['label']) - 1, num_classes=5)


In [12]:
class AttentionLayer(Layer):
    """Attention pooling that collapses axis 1 of its input.

    The input is projected through a learned square matrix and bias followed
    by ``tanh``, scored against a learned vector, and the softmax of those
    scores is used to weight the input before summing over axis 1.

    As used in this notebook the input is a 3-D tensor
    ``(batch, steps, features)`` and the output is ``(batch, features)``.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        """Create the projection matrix, bias and scoring vector.

        All three are sized from the last dimension of ``input_shape``.
        """
        feature_dim = input_shape[-1]
        self.W = self.add_weight(
            name='att_weight',
            shape=(feature_dim, feature_dim),
            initializer='random_normal',
            trainable=True,
        )
        self.b = self.add_weight(
            name='att_bias',
            shape=(feature_dim,),
            initializer='random_normal',
            trainable=True,
        )
        self.u = self.add_weight(
            name='att_u',
            shape=(feature_dim,),
            initializer='random_normal',
            trainable=True,
        )
        super().build(input_shape)

    def call(self, x):
        """Return the attention-weighted sum of ``x`` over axis 1."""
        # Hidden representation of every step: tanh(x . W + b).
        projected = tf.tanh(tf.tensordot(x, self.W, axes=1) + self.b)
        # One scalar score per step; contracting with u drops the last axis.
        scores = tf.tensordot(projected, self.u, axes=1)
        # Normalise the scores over their last axis and restore a trailing
        # axis so the weights broadcast against x.
        weights = tf.expand_dims(tf.nn.softmax(scores), axis=-1)
        return tf.reduce_sum(x * weights, axis=1)

# Word encoder: maps one sentence of token ids to a single vector by
# embedding the ids, running a bidirectional GRU, projecting every step with a
# Dense layer and pooling the steps with attention.
word_input = Input(shape=(MAX_WORDS_PER_SENTENCE,), dtype='int32')
word_sequences = Embedding(
    input_dim=MAX_WORDS,
    output_dim=EMBEDDING_DIM,
    input_length=MAX_WORDS_PER_SENTENCE,
)(word_input)
# Despite the "lstm" in the name, this holds the bidirectional GRU output.
word_lstm = Bidirectional(GRU(150, return_sequences=True))(word_sequences)
word_dense = TimeDistributed(Dense(200))(word_lstm)
word_attention = AttentionLayer()(word_dense)
word_encoder = Model(inputs=word_input, outputs=word_attention)

# Document model: applies the word encoder to every sentence, then repeats the
# GRU -> Dense -> attention stack across the sentence vectors.
sentence_input = Input(
    shape=(MAX_SENTENCES, MAX_WORDS_PER_SENTENCE),
    dtype='int32',
)
sentence_encoder = TimeDistributed(word_encoder)(sentence_input)
# As above, the "lstm" name refers to a bidirectional GRU.
sentence_lstm = Bidirectional(GRU(150, return_sequences=True))(sentence_encoder)
sentence_dense = TimeDistributed(Dense(200))(sentence_lstm)
sentence_attention = AttentionLayer()(sentence_dense)
dropout = Dropout(0.5)(sentence_attention)
# Softmax over the 5 output classes.
preds = Dense(5, activation='softmax')(dropout)

model = Model(inputs=sentence_input, outputs=preds)


In [13]:
# Configure training for one-hot targets: categorical cross-entropy loss,
# the Adam optimizer, and accuracy as the reported metric.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()


Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 15, 50)]          0         
                                                                 
 time_distributed_1 (TimeDis  (None, 15, 200)          4417400   
 tributed)                                                       
                                                                 
 bidirectional_1 (Bidirectio  (None, 15, 300)          316800    
 nal)                                                            
                                                                 
 time_distributed_2 (TimeDis  (None, 15, 200)          60200     
 tributed)                                                       
                                                                 
 attention_layer_1 (Attentio  (None, 200)              40400     
 nLayer)                                                   

In [14]:
# Stop training once validation loss has gone 3 epochs without improving and
# restore the best weights seen. No checkpoint file is written here.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=20,
    batch_size=32,
    callbacks=[early_stopping],
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
