In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tf.keras import Sequential
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load the dataset
data = pd.read_csv("../../data/cleaned/out.csv")

2023-04-30 13:19:01.638238: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Preprocess the text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['text'])
sequences = tokenizer.texts_to_sequences(data['text'])
word_index = tokenizer.word_index

max_sequence_length = max(len(seq) for seq in sequences)
input_data = pad_sequences(sequences, maxlen=max_sequence_length)

# One-hot encode the labels
label_encoder = LabelEncoder()
integer_encoded_labels = label_encoder.fit_transform(data['label'])
labels = to_categorical(integer_encoded_labels, num_classes=7)

In [3]:
X_train, X_test, y_train, y_test = train_test_split(input_data, labels, test_size=0.2, random_state=42)

In [4]:
embedding_dim = 100
vocab_size = len(word_index) + 1

model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_sequence_length),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(7, activation='softmax')
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [5]:
num_epochs = 10
batch_size = 64

history = model.fit(
    X_train, y_train,
    validation_split=0.1,
    epochs=num_epochs,
    batch_size=batch_size,
    verbose=2
)

Epoch 1/10
743/743 - 211s - loss: 1.5471 - accuracy: 0.4067 - val_loss: 1.3626 - val_accuracy: 0.4941 - 211s/epoch - 284ms/step
Epoch 2/10
743/743 - 285s - loss: 1.1252 - accuracy: 0.5907 - val_loss: 1.3206 - val_accuracy: 0.5106 - 285s/epoch - 384ms/step
Epoch 3/10
743/743 - 214s - loss: 0.8278 - accuracy: 0.7116 - val_loss: 1.4487 - val_accuracy: 0.4968 - 214s/epoch - 289ms/step
Epoch 4/10
743/743 - 207s - loss: 0.5833 - accuracy: 0.8020 - val_loss: 1.6036 - val_accuracy: 0.4843 - 207s/epoch - 279ms/step
Epoch 5/10
743/743 - 217s - loss: 0.4417 - accuracy: 0.8477 - val_loss: 1.7806 - val_accuracy: 0.4794 - 217s/epoch - 291ms/step
Epoch 6/10
743/743 - 248s - loss: 0.3625 - accuracy: 0.8740 - val_loss: 2.1126 - val_accuracy: 0.4748 - 248s/epoch - 334ms/step
Epoch 7/10
743/743 - 209s - loss: 0.3159 - accuracy: 0.8891 - val_loss: 2.1132 - val_accuracy: 0.4680 - 209s/epoch - 281ms/step
Epoch 8/10
743/743 - 228s - loss: 0.2788 - accuracy: 0.9007 - val_loss: 2.3407 - val_accuracy: 0.4614 - 

In [6]:
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=2)
print(f'Test accuracy: {test_accuracy}')

413/413 - 23s - loss: 2.4631 - accuracy: 0.4713 - 23s/epoch - 57ms/step
Test accuracy: 0.47128352522850037
