In [260]:
import tensorflow as tf
from datasets import load_dataset as ld
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
import pandas as pd
import numpy as np

In [261]:
from datasets import load_dataset as ld

def load_dataset_as_pandas_dataframe(
    dataset_name: str="trec"
):

    dataset = ld(dataset_name)
    df = dataset["train"].to_pandas()
    df = df.sample(frac=1)

    df_test = dataset["test"].to_pandas()
    df_test = df_test.sample(frac=1)
    
    return df, df_test

In [262]:
df, df_test = load_dataset_as_pandas_dataframe()

In [263]:
df.head(5)

Unnamed: 0,text,coarse_label,fine_label
809,"What is the meaning of W.B. Yeat 's poem , `` ...",2,24
1868,Where is Amsterdam ?,4,35
3930,Where is the Kentucky Horse Park ?,4,35
5340,What is in baby powder and baby lotion that ma...,2,27
4565,How has TV affected our society ?,2,26


- Processing Text Data

In [264]:
# Parameters 
vocab_size = 1000 
emebed_size = 20 
max_sequence_length = 14
epochs = 50
batch_size = 30 

In [275]:
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(df_test['text'])

## LSTM Text Classification Model


In [266]:
rnn_model = tf.keras.Sequential([ 
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=emebed_size,input_length = max_sequence_length), 
    tf.keras.layers.LSTM(units=2000),
    tf.keras.layers.LayerNormalization(),
    tf.keras.layers.Dropout(0.50),
    tf.keras.layers.Dense(units=len(df.coarse_label.unique()), activation='softmax')
])

rnn_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])


In [274]:
rnn_model.summary()

Model: "sequential_23"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_24 (Embedding)    (None, 14, 20)            20000     
                                                                 
 lstm_22 (LSTM)              (None, 2000)              16168000  
                                                                 
 layer_normalization_11 (Lay  (None, 2000)             4000      
 erNormalization)                                                
                                                                 
 dropout_6 (Dropout)         (None, 2000)              0         
                                                                 
 dense_23 (Dense)            (None, 6)                 12006     
                                                                 
Total params: 16,204,006
Trainable params: 16,204,006
Non-trainable params: 0
_________________________________________

In [268]:
test_sequences = tokenizer.texts_to_sequences(df_test['text'])
test_padded_sequences = pad_sequences(test_sequences, maxlen=max_sequence_length, padding='post')

In [269]:
label_count = 0 
accs = [] 
for i in range(15):
    label_count += 10
    sequences = tokenizer.texts_to_sequences(df_test['text'][:label_count])
    padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length, padding='post')
    rnn_model.fit(padded_sequences, df_test['coarse_label'][:label_count], epochs=20,validation_data=(test_padded_sequences, df_test['coarse_label']))
    predictions = rnn_model.predict(test_padded_sequences)
    predicted_labels = np.argmax(predictions, axis=1)
    print(accuracy_score(df_test['coarse_label'],predicted_labels))
    accs.append(accuracy_score(df_test['coarse_label'],predicted_labels))

    



Epoch 1/20


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
0.388
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
0.468
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
0.426
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
0.454
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/