## Constructing a solution based on a discriminative neural network (BiLSTM)

In [3]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense

# Hyperparameters for the text pipeline. The inputs are assumed to be already
# preprocessed (tokenized and padded) before they reach the network.
vocab_size = 10000  # adjust according to your data
embedding_dim = 16
max_length = 200  # adjust according to your data

# Baseline binary classifier: token embedding -> two stacked bidirectional
# LSTMs -> one sigmoid output unit.
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length))
# The first recurrent layer returns the full sequence so the second one can consume it.
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Bidirectional(LSTM(32)))
model.add(Dense(1, activation='sigmoid'))

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

2023-12-02 19:26:52.775813: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-02 19:26:52.809856: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-02 19:26:52.809890: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-02 19:26:52.811166: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-02 19:26:52.817392: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-02 19:26:52.818241: I tensorflow/core/platform/cpu_feature_guard.cc:1

## Step 3a: Train and apply the model to real data

In [4]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Baseline BiLSTM experiment on the real sentiment data:
# load CSV -> encode labels -> split -> tokenize/pad -> build, compile and fit.

# Load real data
df_real = pd.read_csv("sentiment_real_data.csv")

# Preprocess real data: encode the sentiment labels as integers.
# NOTE(review): the network below ends in a single sigmoid unit trained with
# binary cross-entropy, which assumes exactly two sentiment classes -- confirm
# against the CSV.
le_real = LabelEncoder()
df_real['sentiment'] = le_real.fit_transform(df_real['sentiment'])
texts_real = df_real['review'].values
labels_real = df_real['sentiment'].values

# Train/test split for real data.
# Row positions are split *before* tokenization so the vocabulary can be learned
# from the training rows only. The partition depends only on the number of rows,
# test_size and random_state, so it selects the same rows as splitting the padded
# matrix directly would.
train_idx_real, test_idx_real = train_test_split(
    list(range(len(texts_real))), test_size=0.2, random_state=42
)

# Tokenization for real data.
# Fit on training texts only: fitting on every row would let held-out reviews
# influence which words make it into the top `vocab_size` vocabulary (leakage).
# Words unseen during fitting map to the "<OOV>" token.
tokenizer_real = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer_real.fit_on_texts(texts_real[train_idx_real])
sequences_real = tokenizer_real.texts_to_sequences(texts_real)
# truncating='post' keeps the first `max_length` tokens of longer reviews.
padded_sequences_real = pad_sequences(sequences_real, maxlen=max_length, truncating='post')

X_train_real = padded_sequences_real[train_idx_real]
X_test_real = padded_sequences_real[test_idx_real]
y_train_real = labels_real[train_idx_real]
y_test_real = labels_real[test_idx_real]

# Model for real data: embedding -> two stacked bidirectional LSTMs -> sigmoid.
model_real = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length),
    Bidirectional(LSTM(64, return_sequences=True)),
    Bidirectional(LSTM(32)),
    Dense(1, activation='sigmoid')
])

model_real.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model on real data.
# NOTE(review): the held-out test split doubles as the validation set here, so
# the reported validation metrics are not an independent estimate if they are
# used for model selection.
model_real.fit(X_train_real, y_train_real, epochs=5, validation_data=(X_test_real, y_test_real))

Epoch 1/5


2023-12-02 19:31:58.100057: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 230400000 exceeds 10% of free system memory.




2023-12-02 19:55:46.274617: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 57600000 exceeds 10% of free system memory.


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7f1e429893f0>

In [5]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout, BatchNormalization
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Regularized BiLSTM experiment on the real sentiment data: same pipeline as
# the baseline, with (recurrent) dropout in the LSTMs and a dense head that
# uses batch normalization and dropout.

# Load real data
df_real2 = pd.read_csv("sentiment_real_data.csv")

# Preprocess real data: encode the sentiment labels as integers.
# NOTE(review): a single sigmoid output with binary cross-entropy assumes
# exactly two sentiment classes -- confirm against the CSV.
le_real2 = LabelEncoder()
df_real2['sentiment'] = le_real2.fit_transform(df_real2['sentiment'])
texts_real2 = df_real2['review'].values
labels_real2 = df_real2['sentiment'].values

# Train/test split for real data.
# Row positions are split *before* tokenization so the vocabulary is learned
# from the training rows only. The partition depends only on the number of rows,
# test_size and random_state, so the same rows are selected as when the padded
# matrix is split directly.
train_idx_real2, test_idx_real2 = train_test_split(
    list(range(len(texts_real2))), test_size=0.2, random_state=42
)

# Tokenization for real data.
# Fit on training texts only so held-out reviews cannot influence the
# vocabulary (leakage); unseen words map to the "<OOV>" token.
tokenizer_real2 = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer_real2.fit_on_texts(texts_real2[train_idx_real2])
sequences_real2 = tokenizer_real2.texts_to_sequences(texts_real2)
# truncating='post' keeps the first `max_length` tokens of longer reviews.
padded_sequences_real2 = pad_sequences(sequences_real2, maxlen=max_length, truncating='post')

X_train_real2 = padded_sequences_real2[train_idx_real2]
X_test_real2 = padded_sequences_real2[test_idx_real2]
y_train_real2 = labels_real2[train_idx_real2]
y_test_real2 = labels_real2[test_idx_real2]

# Model for real data
model_real2 = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length),
    # The first recurrent layer returns the full sequence for the next LSTM.
    Bidirectional(LSTM(64, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)),
    Bidirectional(LSTM(32, dropout=0.2, recurrent_dropout=0.2)),
    Dense(64, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

model_real2.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model on real data.
# NOTE(review): the held-out test split doubles as the validation set here.
model_real2.fit(X_train_real2, y_train_real2, epochs=10, validation_data=(X_test_real2, y_test_real2), batch_size=64)

Epoch 1/10


2023-12-02 21:41:23.838971: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 230400000 exceeds 10% of free system memory.




2023-12-02 22:19:22.241027: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 57600000 exceeds 10% of free system memory.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

KeyboardInterrupt: 

In [None]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout, BatchNormalization
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Deeper regularized BiLSTM experiment on the real sentiment data: three
# stacked bidirectional LSTMs (64 -> 32 -> 16 units) followed by a dense head
# with batch normalization and dropout.

# Load real data
df_real3 = pd.read_csv("sentiment_real_data.csv")

# Preprocess real data: encode the sentiment labels as integers.
# NOTE(review): a single sigmoid output with binary cross-entropy assumes
# exactly two sentiment classes -- confirm against the CSV.
le_real3 = LabelEncoder()
df_real3['sentiment'] = le_real3.fit_transform(df_real3['sentiment'])
texts_real3 = df_real3['review'].values
labels_real3 = df_real3['sentiment'].values

# Train/test split for real data.
# Row positions are split *before* tokenization so the vocabulary is learned
# from the training rows only. The partition depends only on the number of rows,
# test_size and random_state, so the same rows are selected as when the padded
# matrix is split directly.
train_idx_real3, test_idx_real3 = train_test_split(
    list(range(len(texts_real3))), test_size=0.2, random_state=42
)

# Tokenization for real data.
# Fit on training texts only so held-out reviews cannot influence the
# vocabulary (leakage); unseen words map to the "<OOV>" token.
tokenizer_real3 = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer_real3.fit_on_texts(texts_real3[train_idx_real3])
sequences_real3 = tokenizer_real3.texts_to_sequences(texts_real3)
# truncating='post' keeps the first `max_length` tokens of longer reviews.
padded_sequences_real3 = pad_sequences(sequences_real3, maxlen=max_length, truncating='post')

X_train_real3 = padded_sequences_real3[train_idx_real3]
X_test_real3 = padded_sequences_real3[test_idx_real3]
y_train_real3 = labels_real3[train_idx_real3]
y_test_real3 = labels_real3[test_idx_real3]

model_real3 = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length),
    # Every recurrent layer except the last returns the full sequence so the
    # next LSTM receives one vector per time step.
    Bidirectional(LSTM(64, return_sequences=True, dropout=0.3, recurrent_dropout=0.3)),
    Bidirectional(LSTM(32, dropout=0.3, recurrent_dropout=0.3, return_sequences=True)),
    Bidirectional(LSTM(16, dropout=0.3, recurrent_dropout=0.3)),
    Dense(64, activation='relu'),
    BatchNormalization(),
    Dropout(0.4),
    Dense(1, activation='sigmoid')
])

model_real3.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model on real data.
# NOTE(review): the held-out test split doubles as the validation set here.
model_real3.fit(X_train_real3, y_train_real3, epochs=20, validation_data=(X_test_real3, y_test_real3), batch_size=64)

## Step 3b: Train and apply the model to synthetic data

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense

# Baseline BiLSTM experiment on the synthetic sentiment data:
# load Excel sheet -> encode labels -> split -> tokenize/pad -> build, compile and fit.

# Load synthetic data from Excel file
file_path = 'sentiment_synthetic_data.xlsx'
df = pd.read_excel(file_path)

# Preprocess data: encode the sentiment labels as integers.
# NOTE(review): a single sigmoid output with binary cross-entropy assumes
# exactly two sentiment classes -- confirm against the spreadsheet.
le = LabelEncoder()
df['Sentiment'] = le.fit_transform(df['Sentiment'])
texts = df['Feedback'].values
labels = df['Sentiment'].values

# Tokenization hyperparameters
vocab_size = 10000  # adjust according to your data
embedding_dim = 16
max_length = 200  # adjust according to your data

# Train/test split.
# Row positions are split *before* tokenization so the vocabulary is learned
# from the training rows only. The partition depends only on the number of rows,
# test_size and random_state, so the same rows are selected as when the padded
# matrix is split directly.
train_idx, test_idx = train_test_split(
    list(range(len(texts))), test_size=0.2, random_state=42
)

# Tokenization.
# Fit on training texts only so held-out feedback cannot influence the
# vocabulary (leakage); unseen words map to the "<OOV>" token.
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(texts[train_idx])
sequences = tokenizer.texts_to_sequences(texts)
# truncating='post' keeps the first `max_length` tokens of longer entries.
padded_sequences = pad_sequences(sequences, maxlen=max_length, truncating='post')

X_train = padded_sequences[train_idx]
X_test = padded_sequences[test_idx]
y_train = labels[train_idx]
y_test = labels[test_idx]

# Build and compile the model: embedding -> two stacked bidirectional LSTMs -> sigmoid.
model_synthetic = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length),
    Bidirectional(LSTM(64, return_sequences=True)),
    Bidirectional(LSTM(32)),
    Dense(1, activation='sigmoid')
])

model_synthetic.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model.
# NOTE(review): the held-out test split doubles as the validation set here.
model_synthetic.fit(X_train, y_train, epochs=5, validation_data=(X_test, y_test))