In [1]:
import pandas as pd
import numpy as np
from transformers import BertTokenizer
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.layers import Input, Dense, Dropout, Concatenate, Conv1D, GlobalMaxPooling1D, Activation, Attention
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score

In [2]:
# Restrict TensorFlow to the first detected GPU and enable memory growth on it.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    first_gpu = gpus[0]
    try:
        tf.config.experimental.set_visible_devices(first_gpu, 'GPU')
        tf.config.experimental.set_memory_growth(first_gpu, True)
        print('GPU device set')
    except RuntimeError as e:
        # Report the configuration error instead of aborting the notebook.
        print(e)

GPU device set


In [3]:
# Read the train / validation / test CSV splits (in that order) into DataFrames.
train_data, val_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/swm-data-final-split/swm_data_final_split/ori_data_final_train.csv',
        '/kaggle/input/swm-data-final-split/swm_data_final_split/ori_data_final_val.csv',
        '/kaggle/input/swm-data-final-split/swm_data_final_split/ori_data_final_test.csv',
    )
)

**Preprocessing**

In [4]:
def assign_class(label):
    """Map a raw label to the binary target: 0 for "OR", 1 for any other value."""
    return 0 if label == "OR" else 1

In [5]:
def assign_cat(category):
    """Translate a category name into its integer code.

    The nine names listed below map to codes 1-9 in order; every other
    value (including missing values) maps to 10.
    """
    known_categories = (
        "Kindle_Store_5",
        "Books_5",
        "Pet_Supplies_5",
        "Home_and_Kitchen_5",
        "Electronics_5",
        "Sports_and_Outdoors_5",
        "Tools_and_Home_Improvement_5",
        "Clothing_Shoes_and_Jewelry_5",
        "Toys_and_Games_5",
    )
    for code, name in enumerate(known_categories, start=1):
        if category == name:
            return code
    # Fallback bucket shared by everything not listed above.
    return 10

In [6]:
# Add the integer category code column to every split (frames are updated in place).
for split_frame in (train_data, val_data, test_data):
    split_frame['category_final'] = split_frame['category'].apply(assign_cat)

In [7]:
# Add the binary target column to every split (frames are updated in place).
for split_frame in (train_data, val_data, test_data):
    split_frame['label_final'] = split_frame['label'].apply(assign_class)

In [8]:
# Cast the 'sentiment' column to int in each split; astype returns new frames,
# so the three names are rebound to the converted copies.
train_data, val_data, test_data = (
    split_frame.astype({'sentiment': 'int'})
    for split_frame in (train_data, val_data, test_data)
)

In [9]:
def _split_columns(frame):
    """Return (text array, categorical feature matrix, label array) for one split."""
    text = frame["text_final"].values.astype("str")
    cat_features = frame[['category_final', 'sentiment', 'word_count_categories']].values
    labels = frame["label_final"].values
    return text, cat_features, labels


# Separate out text, categorical features and labels for each dataset
train_text, train_cat_features, train_labels = _split_columns(train_data)
val_text, val_cat_features, val_labels = _split_columns(val_data)
test_text, test_cat_features, test_labels = _split_columns(test_data)

In [10]:
# Load the pretrained 'bert-base-uncased' tokenizer; it is used below for batch
# encoding and is passed to generate_inputs.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [11]:
def generate_inputs(texts, categorical_features, tokenizer, max_length, num_classes=(10, 2, 5)):
    """Build the four model inputs for one data split.

    Args:
        texts: Sequence of input strings, one per sample.
        categorical_features: 2-D integer array with one column per
            categorical feature. Codes are 1-based: column ``i`` must hold
            values in ``1..num_classes[i]``.
        tokenizer: Object exposing ``batch_encode_plus`` whose result provides
            'input_ids', 'attention_mask' and 'token_type_ids'.
        max_length: Length every sequence is padded / truncated to.
        num_classes: Number of classes of each categorical feature, in column
            order. Defaults to the three features used in this notebook.

    Returns:
        ``[input_ids, attention_masks, token_type_ids, categorical_inputs]``
        as NumPy arrays, where ``categorical_inputs`` has shape
        ``(len(texts), sum(num_classes))`` and holds the concatenated one-hot
        encodings of the categorical columns.

    Raises:
        ValueError: If a categorical code lies outside ``1..num_classes[i]``.
            Without this check such a code would silently set a bit in a
            neighbouring feature's segment (or wrap around to the end of the
            row through negative indexing).
    """
    # Tokenize the input texts
    tokens = tokenizer.batch_encode_plus(
        texts,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='tf'
    )

    # Extract the token ids, attention masks, and token type ids
    input_ids = np.array(tokens['input_ids'])
    attention_masks = np.array(tokens['attention_mask'])
    token_type_ids = np.array(tokens['token_type_ids'])

    # Convert the categorical features to concatenated one-hot encodings
    codes = np.asarray(categorical_features)
    n_rows = len(texts)
    categorical_inputs = np.zeros((n_rows, sum(num_classes)))
    row_index = np.arange(n_rows)
    offset = 0  # first column of the current feature's one-hot segment
    for column, width in enumerate(num_classes):
        values = codes[:, column]
        if values.size and (values.min() < 1 or values.max() > width):
            raise ValueError(
                "categorical feature column %d has codes outside 1..%d "
                "(min=%s, max=%s)" % (column, width, values.min(), values.max())
            )
        categorical_inputs[row_index, values - 1 + offset] = 1
        offset += width

    # Return the inputs as a list of NumPy arrays
    return [input_ids, attention_masks, token_type_ids, categorical_inputs]

In [12]:
# Maximum number of tokens per text handed to the tokenizer; the encoding calls
# below pass it together with truncation=True.
max_length = 128

In [13]:
def _encode_split(split_texts):
    """Batch-encode one split, padding to the longest sequence in that split."""
    return tokenizer.batch_encode_plus(
        split_texts,
        add_special_tokens=True,
        max_length=max_length,
        padding=True,
        return_attention_mask=True,
        truncation=True,
        return_tensors='tf',
    )


def _token_arrays(encoded):
    """Pull out (input ids, attention mask, token type ids) as NumPy arrays."""
    return (
        np.array(encoded['input_ids']),
        np.array(encoded['attention_mask']),
        np.array(encoded['token_type_ids']),
    )


# Tokenize the train, validation, and test texts
train_tokens = _encode_split(train_text)
val_tokens = _encode_split(val_text)
test_tokens = _encode_split(test_text)

# Extract the token ids, attention masks, and token type ids for each split.
# NOTE(review): in the cells visible here the model is fed by generate_inputs,
# which tokenizes again; these arrays do not appear to be used afterwards.
train_input_ids, train_attention_masks, train_token_type_ids = _token_arrays(train_tokens)
val_input_ids, val_attention_masks, val_token_type_ids = _token_arrays(val_tokens)
test_input_ids, test_attention_masks, test_token_type_ids = _token_arrays(test_tokens)

In [14]:
# Build the model inputs (token ids, masks, type ids, one-hot features) per split
train_inputs, val_inputs, test_inputs = (
    generate_inputs(split_text, split_cat_features, tokenizer, max_length)
    for split_text, split_cat_features in (
        (train_text, train_cat_features),
        (val_text, val_cat_features),
        (test_text, test_cat_features),
    )
)

# Build the target arrays per split
train_targets, val_targets, test_targets = (
    np.array(split_labels)
    for split_labels in (train_labels, val_labels, test_labels)
)

**Model**

In [15]:
# Load BERT model from TensorFlow Hub as a Keras layer.
# trainable=False is passed so the layer is created as non-trainable.
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4", trainable=False)

In [16]:
# Build model
# Three variable-length int32 inputs, keyed by the names the BERT layer is
# called with below.
input_word_ids = Input(shape=(None,), dtype=tf.int32, name="input_word_ids")
input_mask = Input(shape=(None,), dtype=tf.int32, name="input_mask")
input_type_ids = Input(shape=(None,), dtype=tf.int32, name="input_type_ids")
bert_inputs = {"input_word_ids": input_word_ids, "input_mask": input_mask, "input_type_ids": input_type_ids}

# BERT embeddings
bert_outputs = bert_layer(bert_inputs)
# Only pooled_output feeds the classifier head below.
pooled_output = bert_outputs["pooled_output"]
# NOTE(review): sequence_output is extracted but not used by any visible cell.
sequence_output = bert_outputs["sequence_output"]

# Categorical features input
# 17 matches the one-hot width produced by generate_inputs (10 + 2 + 5).
input_cat_features = Input(shape=(17,), dtype=tf.float32, name="input_cat_features")

# Concatenate BERT embeddings and categorical features
concat_layer = Concatenate()([pooled_output, input_cat_features])

# Classification layer
# Two ReLU dense layers, dropout, then a single sigmoid unit for the binary target.
dense_layer_1 = Dense(256, activation="relu")(concat_layer)
dense_layer_2 = Dense(128, activation="relu")(dense_layer_1)
dropout_layer = Dropout(0.2)(dense_layer_2)
output_layer = Dense(1, activation="sigmoid")(dropout_layer)

In [17]:
# Assemble the model; the input order must match the list returned by generate_inputs
model = Model(
    inputs=[input_word_ids, input_mask, input_type_ids, input_cat_features],
    outputs=output_layer,
)

# Compile with Adam and binary cross-entropy, tracking accuracy
adam_optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
model.compile(
    optimizer=adam_optimizer,
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

# Stop after 5 epochs without val_loss improvement; keep only the best weights on disk
early_stopping = EarlyStopping(monitor="val_loss", patience=5)
best_checkpoint = ModelCheckpoint(
    filepath="best_model.h5",
    monitor="val_loss",
    save_best_only=True,
    save_weights_only=True,
    verbose=1,
)
callbacks = [early_stopping, best_checkpoint]

In [18]:
# Training hyper-parameters
batch_size = 64
epochs = 20

# Fit on the training split, validating on the validation split after each epoch
history = model.fit(
    x=train_inputs,
    y=train_targets,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(val_inputs, val_targets),
    callbacks=callbacks,
)

Epoch 1/20
Epoch 1: val_loss improved from inf to 0.41759, saving model to best_model.h5
Epoch 2/20
Epoch 2: val_loss improved from 0.41759 to 0.36323, saving model to best_model.h5
Epoch 3/20
Epoch 3: val_loss improved from 0.36323 to 0.35303, saving model to best_model.h5
Epoch 4/20
Epoch 4: val_loss improved from 0.35303 to 0.32092, saving model to best_model.h5
Epoch 5/20
Epoch 5: val_loss improved from 0.32092 to 0.30705, saving model to best_model.h5
Epoch 6/20
Epoch 6: val_loss did not improve from 0.30705
Epoch 7/20
Epoch 7: val_loss improved from 0.30705 to 0.30259, saving model to best_model.h5
Epoch 8/20
Epoch 8: val_loss did not improve from 0.30259
Epoch 9/20
Epoch 9: val_loss improved from 0.30259 to 0.28063, saving model to best_model.h5
Epoch 10/20
Epoch 10: val_loss did not improve from 0.28063
Epoch 11/20
Epoch 11: val_loss improved from 0.28063 to 0.27656, saving model to best_model.h5
Epoch 12/20
Epoch 12: val_loss did not improve from 0.27656
Epoch 13/20
Epoch 13: 

In [19]:
def evaluate_model(model, X_test, y_test, threshold=0.5):
    """Compute binary classification metrics for ``model`` on a held-out set.

    Args:
        model: Object whose ``predict(X_test)`` returns one probability per
            sample (a column vector of shape (n, 1) is accepted).
        X_test: Inputs forwarded unchanged to ``model.predict``.
        y_test: True binary labels, one per sample.
        threshold: Scores strictly above this value are predicted as class 1.
            The default 0.5 matches rounding a probability in [0, 1] with
            ``np.round`` (which rounds exactly 0.5 down to 0).

    Returns:
        dict with keys 'accuracy', 'precision', 'recall', 'f1',
        'confusion_matrix' and 'roc_auc'.
    """
    # Run inference once and reuse the scores for both the hard predictions and
    # ROC-AUC; flatten so every metric receives 1-D arrays.
    y_score = np.asarray(model.predict(X_test)).reshape(-1)
    y_true = np.asarray(y_test).reshape(-1)
    y_pred = (y_score > threshold).astype(int)

    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    cm = confusion_matrix(y_true, y_pred)
    # Binary targets: the multi_class option only applies to multiclass
    # targets, so it is not passed.
    roc_auc = roc_auc_score(y_true, y_score)

    metrics = {'accuracy': accuracy,
               'precision': precision,
               'recall': recall,
               'f1': f1,
               'confusion_matrix': cm,
               'roc_auc': roc_auc}

    return metrics

In [20]:
# Load the best saved model
# The checkpoint callback writes "best_model.h5" relative to the working
# directory, so read it back from the same relative path rather than a
# hard-coded absolute location.
model.load_weights("best_model.h5")

# Evaluate the model on the test data
test_metrics = evaluate_model(model, test_inputs, test_targets)

# Print the evaluation metrics
print('Test accuracy:', test_metrics['accuracy'])
print('Test precision:', test_metrics['precision'])
print('Test recall:', test_metrics['recall'])
print('Test F1 score:', test_metrics['f1'])
print('Test ROC-AUC score:', test_metrics['roc_auc'])
print('Test confusion matrix:\n', test_metrics['confusion_matrix'])

Test accuracy: 0.8941641938674579
Test precision: 0.8742258218199143
Test recall: 0.9179589794897449
Test F1 score: 0.8955588091752075
Test ROC-AUC score: 0.9637261173373972
Test confusion matrix:
 [[1781  264]
 [ 164 1835]]
