In [None]:
"""
README

Directory Structure

CS3244-Twemoji
├── Dataset
│   ├── train_bert_embeddings.npy
│   ├── valid_bert_embeddings.npy
│   ├── test_bert_embeddings.npy
│   ├── train_with_bert_embeddings.csv
│   ├── valid_with_bert_embeddings.csv
│   └── test_with_bert_embeddings.csv
│
├── src
│   ├── main.ipynb (this notebook)
│   └── eda.ipynb
│
└── venv # ignore this

NB : Main work is solely in the structuring of your directory
"""


"""
FINAL_NOTE:

After testing by Nigel (big thanks to nigel), 
the baseline model will use the following:

- Embedder : BERT Embedding (without any further preprocessing and feature engineering)
- First Layer Dense Unit    : 64 neuron, activation function : ReLU
- First Dropout Rate        : 0.00
- Second Layer Dense Unit   : 32 neuron, activation function : ReLU
- Second Dropout Rate       : 0.00
- Learning Rate             : 0.001
- Optimizer                 : Stochastic Gradient Descent (SGD)
- Training Batch Size       : 16
- Loss Function             : Sparse Categorical Cross Entropy
- Metrics                   : Accuracy
- Epochs                    : 15
"""

# Package Importing

In [7]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf

# Data Loading

In [4]:
train_embed_path = os.path.join(os.path.dirname(os.getcwd()), "Dataset", "train_bert_embeddings.npy")
valid_embed_path = os.path.join(os.path.dirname(os.getcwd()), "Dataset", "valid_bert_embeddings.npy")
test_embed_path = os.path.join(os.path.dirname(os.getcwd()), "Dataset", "test_bert_embeddings.npy")
train_labels_path = os.path.join(os.path.dirname(os.getcwd()), "Dataset", "train_with_bert_embeddings.csv")
valid_labels_path = os.path.join(os.path.dirname(os.getcwd()), "Dataset", "valid_with_bert_embeddings.csv")
test_labels_path = os.path.join(os.path.dirname(os.getcwd()), "Dataset", "test_with_bert_embeddings.csv")

train_embeddings = np.load(train_embed_path)
valid_embeddings = np.load(valid_embed_path)
test_embeddings = np.load(test_embed_path)
train_labels = pd.read_csv(train_labels_path)
valid_labels = pd.read_csv(valid_labels_path)
test_labels = pd.read_csv(test_labels_path)

In [8]:
# Ensure indices are integers
train_indices = train_labels['embedding_index'].values.astype(int)
valid_indices = valid_labels['embedding_index'].values.astype(int)
test_indices = test_labels['embedding_index'].values.astype(int)

# Extract embeddings using indices
train_X = train_embeddings[train_indices]
valid_X = valid_embeddings[valid_indices]
test_X = test_embeddings[test_indices]

# Extract labels (assuming 'label' column contains integer class labels)
train_Y = train_labels['label'].values.astype(int)
valid_Y = valid_labels['label'].values.astype(int)
test_Y = test_labels['label'].values.astype(int)

# Print shapes to verify
print(f"train_X shape: {train_X.shape}, train_Y shape: {train_Y.shape}")
print(f"valid_X shape: {valid_X.shape}, valid_Y shape: {valid_Y.shape}")
print(f"test_X shape: {test_X.shape}, test_Y shape: {test_Y.shape}")


train_X shape: (133999, 768), train_Y shape: (133999,)
valid_X shape: (17223, 768), valid_Y shape: (17223,)
test_X shape: (17063, 768), test_Y shape: (17063,)


# Model Building and Testing

In [13]:
# Model Architecture Hyperparameters
input_shape = train_X.shape[1]
dense_1_units = 64
dense_1_activation = "relu"  
dropout_1_rate = 0.0
dense_2_units = 32
dense_2_activation = "relu"  
dropout_2_rate = 0.0
output_units = len(np.unique(train_Y))
output_activation = "softmax"

# Training Hyperparameters
optimizer = "sgd"
learning_rate = 0.001
loss_function = "sparse_categorical_crossentropy"
metrics = ["accuracy"]

# Training Parameters
batch_size = 16
epochs = 15
validation_data = (valid_X, valid_Y)

# Model Compilation & Training
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(input_shape,)),
    tf.keras.layers.Dense(dense_1_units, activation = dense_1_activation),
    tf.keras.layers.Dropout(dropout_1_rate),
    tf.keras.layers.Dense(dense_2_units, activation = dense_2_activation),
    tf.keras.layers.Dropout(dropout_2_rate),
    tf.keras.layers.Dense(output_units, activation = output_activation)
])

model.compile(optimizer=optimizer,
              loss=loss_function,
              metrics=metrics)

history = model.fit(train_X, train_Y, validation_data=validation_data,
                    epochs=epochs, batch_size=batch_size)

Epoch 1/15
[1m8375/8375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 435us/step - accuracy: 0.3733 - loss: 1.4114 - val_accuracy: 0.4281 - val_loss: 1.3191
Epoch 2/15
[1m8375/8375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 404us/step - accuracy: 0.4266 - loss: 1.3167 - val_accuracy: 0.4349 - val_loss: 1.3069
Epoch 3/15
[1m8375/8375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 395us/step - accuracy: 0.4350 - loss: 1.3039 - val_accuracy: 0.4373 - val_loss: 1.2991
Epoch 4/15
[1m8375/8375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 394us/step - accuracy: 0.4423 - loss: 1.2906 - val_accuracy: 0.4413 - val_loss: 1.2937
Epoch 5/15
[1m8375/8375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 389us/step - accuracy: 0.4462 - loss: 1.2819 - val_accuracy: 0.4412 - val_loss: 1.2945
Epoch 6/15
[1m8375/8375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 391us/step - accuracy: 0.4536 - loss: 1.2744 - val_accuracy: 0.4439 - val_loss: 1.2890
Epoc

In [14]:
# Evaluate the model on the test set
test_loss, test_acc = model.evaluate(test_X, test_Y)
print(f"Test Accuracy: {test_acc:.4f}")

# Predict on the test set
test_predictions = model.predict(test_X)
test_predicted_classes = np.argmax(test_predictions, axis=1)

[1m  1/534[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m5s[0m 10ms/step - accuracy: 0.4688 - loss: 1.4805

[1m534/534[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 363us/step - accuracy: 0.4764 - loss: 1.2285
Test Accuracy: 0.4428
[1m534/534[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 253us/step


In [16]:
model.save("baseline_nn_weights.keras")