## Constructing solution based on a discriminative neural network (BiLSTM)

#### Train and apply the model to real data

In [1]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re

# Data processing function
def data_processing(text):
    text = text.lower()
    text = re.sub(r"<br />", " ", text)
    text = re.sub(r"[^\w\s]", " ", text)
    words = re.findall(r"\b\w+\b", text)
    return " ".join(words)

# Load real data
df_real3 = pd.read_csv("balanced_data.csv")

# Apply data processing to the 'review' column
df_real3["review"] = df_real3["review"].apply(data_processing)

# Preprocess real data
texts_real3 = df_real3['review'].values
labels_real3 = df_real3['sentiment'].values

# Ensure sentiment labels are in the range [0, 2]
labels_real3 = labels_real3 + 1  # Convert -1 to 0, keep 0 and 1 as is

# Tokenization for real data
max_words3 = 10000
max_length_real3 = 100
tokenizer_real3 = Tokenizer(num_words=max_words3)
tokenizer_real3.fit_on_texts(texts_real3)
sequences_real3 = tokenizer_real3.texts_to_sequences(texts_real3)
padded_sequences_real3 = pad_sequences(sequences_real3, maxlen=max_length_real3, truncating='post')

# Train/test split for real data
X_train_real3, X_test_real3, y_train_real3, y_test_real3 = train_test_split(
    padded_sequences_real3, labels_real3, test_size=0.2, random_state=42
)

# Build and compile the model for real data (adjusted architecture)
model_real3 = Sequential([
    Embedding(input_dim=max_words3, output_dim=32, input_length=max_length_real3),
    Bidirectional(LSTM(64, dropout=0.2, recurrent_dropout=0.2)),
    Dense(16, activation='relu'),
    Dropout(0.2),
    Dense(3, activation='softmax')  # Adjusted to 3 output categories
])

adam_real3 = tf.keras.optimizers.Adam(learning_rate=0.001)

model_real3.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=adam_real3,
    metrics=['accuracy']
)

model_real3.summary()

# Train the model with early stopping
early_stopping_real3 = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy', patience=15, restore_best_weights=True
)

model_real3.fit(
    X_train_real3, y_train_real3, epochs=100,
    validation_data=(X_test_real3, y_test_real3),
    batch_size=32,
    callbacks=[early_stopping_real3]
)

# Evaluate the model on real data
real_results3 = model_real3.evaluate(X_test_real3, y_test_real3)
print("Real Data Evaluation - Loss: {:.4f}, Accuracy: {:.4f}".format(real_results3[0], real_results3[1]))


2023-12-11 06:49:11.301612: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-11 06:49:11.334202: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-11 06:49:11.334236: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-11 06:49:11.335477: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-11 06:49:11.341398: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-11 06:49:11.342341: I tensorflow/core/platform/cpu_feature_guard.cc:1

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 32)           320000    
                                                                 
 bidirectional (Bidirection  (None, 128)               49664     
 al)                                                             
                                                                 
 dense (Dense)               (None, 16)                2064      
                                                                 
 dropout (Dropout)           (None, 16)                0         
                                                                 
 dense_1 (Dense)             (None, 3)                 51        
                                                                 
Total params: 371779 (1.42 MB)
Trainable params: 371779 (1.42 MB)
Non-trainable params: 0 (0.00 Byte)
____________________

In [2]:
# Save the model
model_real3.save("blstm_real.h5")

  saving_api.save_model(


In [3]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Evaluate the model on real data
real_predictions3 = model_real3.predict(X_test_real3)
real_predictions_classes3 = real_predictions3.argmax(axis=-1)

# Calculate and print accuracy, precision, recall, and F1-score
accuracy_real3 = accuracy_score(y_test_real3, real_predictions_classes3)
precision_real3 = precision_score(y_test_real3, real_predictions_classes3, average='weighted')
recall_real3 = recall_score(y_test_real3, real_predictions_classes3, average='weighted')
f1_real3 = f1_score(y_test_real3, real_predictions_classes3, average='weighted')

print("Real Data Evaluation:")
print("Accuracy: {:.4f}".format(accuracy_real3))
print("Precision: {:.4f}".format(precision_real3))
print("Recall: {:.4f}".format(recall_real3))
print("F1 Score: {:.4f}".format(f1_real3))

# Display confusion matrix
conf_matrix_real3 = confusion_matrix(y_test_real3, real_predictions_classes3)
print("Confusion Matrix:")
print(conf_matrix_real3)

Real Data Evaluation:
Accuracy: 0.7090
Precision: 0.7070
Recall: 0.7090
F1 Score: 0.7078
Confusion Matrix:
[[4417 1353  378]
 [1467 3751 1000]
 [ 314  849 4891]]


#### Train and apply the model to synthetic data

In [4]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load synthetic data
df_synthetic = pd.read_csv("synthetic_data_nltk.csv")

# Convert sentiment labels to numeric values
label_mapping = {-1: 0, 0: 1, 1: 2}
df_synthetic['label'] = df_synthetic['label'].map(label_mapping)

# Tokenization for synthetic data
max_words = 10000  # Limit the number of words to consider
max_length_synthetic = 100  # Adjusted to a reasonable value
tokenizer_synthetic = Tokenizer(num_words=max_words)
tokenizer_synthetic.fit_on_texts(df_synthetic['text'].values)
sequences_synthetic = tokenizer_synthetic.texts_to_sequences(df_synthetic['text'].values)
padded_sequences_synthetic = pad_sequences(sequences_synthetic, maxlen=max_length_synthetic, truncating='post')

# Train/test split for synthetic data
X_train_synthetic, X_test_synthetic, y_train_synthetic, y_test_synthetic = train_test_split(
    padded_sequences_synthetic, df_synthetic['label'], test_size=0.2, random_state=42
)

# Build and compile the model for synthetic data
model_synthetic = Sequential([
    Embedding(input_dim=max_words, output_dim=32, input_length=max_length_synthetic),
    Bidirectional(LSTM(64, dropout=0.2, recurrent_dropout=0.2)),
    Dense(16, activation='relu'),
    Dropout(0.2),
    Dense(3, activation='softmax')  # Updated to have 3 output units for negative, neutral, and positive
])

adam_synthetic = tf.keras.optimizers.Adam(learning_rate=0.001)

model_synthetic.compile(
    loss='sparse_categorical_crossentropy',  # Updated to use sparse categorical crossentropy for multi-class classification
    optimizer=adam_synthetic,
    metrics=['accuracy']
)

model_synthetic.summary()

# Train the model with early stopping
early_stopping_synthetic = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy', patience=15, restore_best_weights=True
)

model_synthetic.fit(
    X_train_synthetic, y_train_synthetic, epochs=100,
    validation_data=(X_test_synthetic, y_test_synthetic),
    batch_size=32,  # Adjusted to a smaller batch size
    callbacks=[early_stopping_synthetic]
)

# Evaluate the model on synthetic data
synthetic_results = model_synthetic.evaluate(X_test_synthetic, y_test_synthetic)
print("Synthetic Data Evaluation - Loss: {:.4f}, Accuracy: {:.4f}".format(synthetic_results[0], synthetic_results[1]))


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 100, 32)           320000    
                                                                 
 bidirectional_1 (Bidirecti  (None, 128)               49664     
 onal)                                                           
                                                                 
 dense_2 (Dense)             (None, 16)                2064      
                                                                 
 dropout_1 (Dropout)         (None, 16)                0         
                                                                 
 dense_3 (Dense)             (None, 3)                 51        
                                                                 
Total params: 371779 (1.42 MB)
Trainable params: 371779 (1.42 MB)
Non-trainable params: 0 (0.00 Byte)
__________________

Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Synthetic Data Evaluation - Loss: 0.0463, Accuracy: 0.9800


In [5]:
# Save the model
model_synthetic.save("blstm_synthetic.h5")

  saving_api.save_model(


In [6]:
from sklearn.metrics import classification_report

# Evaluate the model on synthetic data
synthetic_predictions = model_synthetic.predict(X_test_synthetic)
synthetic_predictions_classes = synthetic_predictions.argmax(axis=-1)

# Calculate and print accuracy, precision, recall, and F1-score
accuracy_synthetic = synthetic_results[1]
precision_synthetic = precision_score(y_test_synthetic, synthetic_predictions_classes, average='weighted')
recall_synthetic = recall_score(y_test_synthetic, synthetic_predictions_classes, average='weighted')
f1_synthetic = f1_score(y_test_synthetic, synthetic_predictions_classes, average='weighted')

print("Synthetic Data Evaluation:")
print("Accuracy: {:.4f}".format(accuracy_synthetic))
print("Precision: {:.4f}".format(precision_synthetic))
print("Recall: {:.4f}".format(recall_synthetic))
print("F1 Score: {:.4f}".format(f1_synthetic))

# Display confusion matrix
conf_matrix_synthetic = confusion_matrix(y_test_synthetic, synthetic_predictions_classes)
print("Confusion Matrix:")
print(conf_matrix_synthetic)


Synthetic Data Evaluation:
Accuracy: 0.9800
Precision: 0.9801
Recall: 0.9800
F1 Score: 0.9800
Confusion Matrix:
[[3286    0   72]
 [   0 3386    0]
 [ 128    0 3128]]
