In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, LSTM, Dense, Bidirectional, SpatialDropout1D
from imblearn.over_sampling import SMOTE
import re
from googletrans import Translator 
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
from sklearn.decomposition import LatentDirichletAllocation



In [2]:
# Load NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize Translator
translator = Translator()

# Load the original dataset
df_original = pd.read_csv('5kReviewWithSentimentAmazon.csv')

# Keep 'reviewText' and 'Stance' columns
df = df_original[['reviewText', 'Stance']]

# Drop rows where any of the required columns are NaN
df = df.dropna(subset=['reviewText', 'Stance'])

# Ensure all entries in 'reviewText' are strings
df['reviewText'] = df['reviewText'].astype(str)

# Print value counts of 'Stance' column
print(df['Stance'].value_counts())

# Text Preprocessing
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Moneykicks\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Moneykicks\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Stance
Supportive    4162
Opposed        469
Neutral        283
Name: count, dtype: int64


In [3]:
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words])  # Lemmatize and remove stopwords
    return text

df['reviewText'] = df['reviewText'].apply(preprocess_text)

# Define back-translation function
def back_translate(text, src_language='en', mid_language='fr'):
    try:
        translated_text = translator.translate(text, src=src_language, dest=mid_language).text
        back_translated_text = translator.translate(translated_text, src=mid_language, dest=src_language).text
        return back_translated_text
    except Exception as e:
        print(f"Error during back-translation: {e}")
        return text

# Define function to augment minority class
def augment_minority_class(df, class_label, src_language='en', mid_language='fr'):
    minority_texts = df[df['Stance'] == class_label]['reviewText'].tolist()
    augmented_texts = [back_translate(text, src_language, mid_language) for text in minority_texts]
    augmented_labels = [class_label] * len(augmented_texts)
    return pd.DataFrame({'reviewText': augmented_texts, 'Stance': augmented_labels})

# Define threshold for minority class
threshold = 100  # Adjust based on your needs

# Identify minority classes
class_counts = df['Stance'].value_counts()
minority_classes = class_counts[class_counts < threshold].index.tolist()

# Apply back-translation for all minority classes
augmented_df_list = [augment_minority_class(df, cls) for cls in minority_classes]
df_augmented = pd.concat([df] + augmented_df_list, ignore_index=True)



In [4]:
# Vectorize text data for LDA
count_vectorizer_lda = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
text_data_vectorized_lda = count_vectorizer_lda.fit_transform(df_augmented['reviewText'])

# Fit LDA model
lda_model = LatentDirichletAllocation(n_components=10, random_state=42)
lda_model.fit(text_data_vectorized_lda)

# Transform data into topic distributions
topic_distributions_lda = lda_model.transform(text_data_vectorized_lda)

# Add topic distributions to the dataframe
for i in range(10):  # Adjust based on the number of topics (n_components) in LDA
    df_augmented[f'topic_{i}_lda'] = topic_distributions_lda[:, i]

# Text Vectorization for ML models
tfidf = TfidfVectorizer(max_df=0.7)
X = tfidf.fit_transform(df_augmented['reviewText'])
y = df_augmented['Stance']

# Handle class imbalance using SMOTE for traditional ML models
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X, y)

# Split dataset for ML models
X_train_ml, X_test_ml, y_train_ml, y_test_ml = train_test_split(X_smote, y_smote, test_size=0.2, random_state=42)

# Train and evaluate ML models
models = {
    'Naive Bayes': MultinomialNB(),
    'SVM': SVC(kernel='linear', probability=True),  # Set probability=True for SVM
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42)
}



In [5]:
# Confidence threshold
confidence_threshold = 0.8

def trust_based_accuracy(y_true, y_pred, confidence_scores, threshold):
    mask = confidence_scores >= threshold
    y_true_trust = y_true[mask]
    y_pred_trust = y_pred[mask]
    if len(y_true_trust) == 0:
        return None  # No predictions above the threshold
    return accuracy_score(y_true_trust, y_pred_trust)

for name, model in models.items():
    model.fit(X_train_ml, y_train_ml)
    y_pred = model.predict(X_test_ml)
    y_probs = model.predict_proba(X_test_ml)
    confidence_scores = y_probs.max(axis=1)
    
    # Calculate standard accuracy
    accuracy = accuracy_score(y_test_ml, y_pred)
    report = classification_report(y_test_ml, y_pred)
    print(f"{name} Results:")
    print(f"Accuracy: {accuracy}")
    print("Classification Report:")
    print(report)
    
    # Calculate trust-based accuracy
    trust_accuracy = trust_based_accuracy(y_test_ml, y_pred, confidence_scores, confidence_threshold)
    if trust_accuracy is not None:
        print(f"Trust-Based Accuracy for {name}: {trust_accuracy}")
    else:
        print(f"No predictions above the confidence threshold for {name}")



Naive Bayes Results:
Accuracy: 0.9275420336269016
Classification Report:
              precision    recall  f1-score   support

     Neutral       0.95      0.97      0.96       844
     Opposed       0.88      0.97      0.92       811
  Supportive       0.96      0.84      0.90       843

    accuracy                           0.93      2498
   macro avg       0.93      0.93      0.93      2498
weighted avg       0.93      0.93      0.93      2498

Trust-Based Accuracy for Naive Bayes: 0.9946308724832215
SVM Results:
Accuracy: 0.9619695756605284
Classification Report:
              precision    recall  f1-score   support

     Neutral       0.96      1.00      0.98       844
     Opposed       0.93      1.00      0.96       811
  Supportive       1.00      0.89      0.94       843

    accuracy                           0.96      2498
   macro avg       0.96      0.96      0.96      2498
weighted avg       0.96      0.96      0.96      2498

Trust-Based Accuracy for SVM: 0.97791411042

In [6]:
# Tokenizer and padding for deep learning models
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(df_augmented['reviewText'])
X_seq_dl_lda = tokenizer.texts_to_sequences(df_augmented['reviewText'])
X_pad_dl_lda = pad_sequences(X_seq_dl_lda, maxlen=100)
y_encoded = pd.get_dummies(df_augmented['Stance']).values

# Combine TF-IDF and LDA topics for deep learning models
X_combined_dl_lda = np.hstack((X_pad_dl_lda, topic_distributions_lda))

# Split dataset for deep learning models
X_train_dl, X_test_dl, y_train_dl, y_test_dl = train_test_split(X_combined_dl_lda, y_encoded, test_size=0.2, random_state=42)



In [7]:
# Define and train CNN model with TF-IDF + LDA topics
cnn_model = Sequential([
    Embedding(5000, 128, input_length=100),
    Conv1D(filters=128, kernel_size=5, padding='same', activation='relu'),
    GlobalMaxPooling1D(),
    Dense(128, activation='relu'),
    Dense(y_encoded.shape[1], activation='softmax')
])
cnn_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
cnn_model.fit(X_train_dl, y_train_dl, epochs=5, batch_size=64, validation_data=(X_test_dl, y_test_dl), verbose=2)

cnn_y_pred = cnn_model.predict(X_test_dl)
cnn_y_pred_labels = cnn_y_pred.argmax(axis=1)
y_test_dl_labels = y_test_dl.argmax(axis=1)
cnn_accuracy = accuracy_score(y_test_dl_labels, cnn_y_pred_labels)
cnn_report = classification_report(y_test_dl_labels, cnn_y_pred_labels)
print("CNN Results:")
print(f"Accuracy: {cnn_accuracy}")
print("Classification Report:")
print(cnn_report)

# Calculate confidence scores for CNN
cnn_confidence_scores = cnn_y_pred.max(axis=1)

# Calculate trust-based accuracy for CNN
cnn_trust_accuracy = trust_based_accuracy(y_test_dl_labels, cnn_y_pred_labels, cnn_confidence_scores, confidence_threshold)
if cnn_trust_accuracy is not None:
    print(f"Trust-Based Accuracy for CNN: {cnn_trust_accuracy}")
else:
    print(f"No predictions above the confidence threshold for CNN")



Epoch 1/5




62/62 - 3s - 48ms/step - accuracy: 0.8471 - loss: 0.5495 - val_accuracy: 0.8464 - val_loss: 0.4341
Epoch 2/5
62/62 - 2s - 25ms/step - accuracy: 0.8494 - loss: 0.3569 - val_accuracy: 0.8566 - val_loss: 0.3559
Epoch 3/5
62/62 - 2s - 26ms/step - accuracy: 0.9252 - loss: 0.2318 - val_accuracy: 0.8759 - val_loss: 0.3765
Epoch 4/5
62/62 - 1s - 23ms/step - accuracy: 0.9720 - loss: 0.0954 - val_accuracy: 0.8739 - val_loss: 0.4490
Epoch 5/5
62/62 - 1s - 23ms/step - accuracy: 0.9934 - loss: 0.0303 - val_accuracy: 0.8728 - val_loss: 0.4862
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
CNN Results:
Accuracy: 0.8728382502543235
Classification Report:
              precision    recall  f1-score   support

           0       0.62      0.40      0.49        52
           1       0.60      0.29      0.39        99
           2       0.90      0.97      0.93       832

    accuracy                           0.87       983
   macro avg       0.71      0.56      0.61       983
w

In [8]:
# Define and train LSTM model with TF-IDF + LDA topics
lstm_model = Sequential([
    Embedding(5000, 128, input_length=100),
    SpatialDropout1D(0.2),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(y_encoded.shape[1], activation='softmax')
])
lstm_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
lstm_model.fit(X_train_dl, y_train_dl, epochs=5, batch_size=64, validation_data=(X_test_dl, y_test_dl), verbose=2)

lstm_y_pred = lstm_model.predict(X_test_dl)
lstm_y_pred_labels = lstm_y_pred.argmax(axis=1)
lstm_accuracy = accuracy_score(y_test_dl_labels, lstm_y_pred_labels)
lstm_report = classification_report(y_test_dl_labels, lstm_y_pred_labels)
print("LSTM Results:")
print(f"Accuracy: {lstm_accuracy}")
print("Classification Report:")
print(lstm_report)

# Calculate confidence scores for LSTM
lstm_confidence_scores = lstm_y_pred.max(axis=1)

# Calculate trust-based accuracy for LSTM
lstm_trust_accuracy = trust_based_accuracy(y_test_dl_labels, lstm_y_pred_labels, lstm_confidence_scores, confidence_threshold)
if lstm_trust_accuracy is not None:
    print(f"Trust-Based Accuracy for LSTM: {lstm_trust_accuracy}")
else:
    print(f"No predictions above the confidence threshold for LSTM")


Epoch 1/5




62/62 - 7s - 105ms/step - accuracy: 0.8372 - loss: 0.5709 - val_accuracy: 0.8464 - val_loss: 0.5278
Epoch 2/5
62/62 - 4s - 71ms/step - accuracy: 0.8471 - loss: 0.5274 - val_accuracy: 0.8464 - val_loss: 0.4936
Epoch 3/5
62/62 - 5s - 73ms/step - accuracy: 0.8489 - loss: 0.4256 - val_accuracy: 0.8535 - val_loss: 0.3762
Epoch 4/5
62/62 - 4s - 71ms/step - accuracy: 0.8692 - loss: 0.3216 - val_accuracy: 0.8616 - val_loss: 0.3664
Epoch 5/5
62/62 - 6s - 93ms/step - accuracy: 0.8809 - loss: 0.2678 - val_accuracy: 0.8525 - val_loss: 0.4071
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step
LSTM Results:
Accuracy: 0.8524923702950152
Classification Report:
              precision    recall  f1-score   support

           0       0.42      0.19      0.26        52
           1       0.37      0.28      0.32        99
           2       0.91      0.96      0.93       832

    accuracy                           0.85       983
   macro avg       0.56      0.48      0.51       98

In [9]:

# Define and train BiLSTM model with TF-IDF + LDA topics
bilstm_model = Sequential([
    Embedding(5000, 128, input_length=100),
    SpatialDropout1D(0.2),
    Bidirectional(LSTM(100, dropout=0.2, recurrent_dropout=0.2)),
    Dense(y_encoded.shape[1], activation='softmax')
])
bilstm_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
bilstm_model.fit(X_train_dl, y_train_dl, epochs=5, batch_size=64, validation_data=(X_test_dl, y_test_dl), verbose=2)

bilstm_y_pred = bilstm_model.predict(X_test_dl)
bilstm_y_pred_labels = bilstm_y_pred.argmax(axis=1)
bilstm_accuracy = accuracy_score(y_test_dl_labels, bilstm_y_pred_labels)
bilstm_report = classification_report(y_test_dl_labels, bilstm_y_pred_labels)
print("BiLSTM Results:")
print(f"Accuracy: {bilstm_accuracy}")
print("Classification Report:")
print(bilstm_report)

# Calculate confidence scores for BiLSTM
bilstm_confidence_scores = bilstm_y_pred.max(axis=1)

# Calculate trust-based accuracy for BiLSTM
bilstm_trust_accuracy = trust_based_accuracy(y_test_dl_labels, bilstm_y_pred_labels, bilstm_confidence_scores, confidence_threshold)
if bilstm_trust_accuracy is not None:
    print(f"Trust-Based Accuracy for BiLSTM: {bilstm_trust_accuracy}")
else:
    print(f"No predictions above the confidence threshold for BiLSTM")



Epoch 1/5




62/62 - 12s - 193ms/step - accuracy: 0.8352 - loss: 0.5637 - val_accuracy: 0.8464 - val_loss: 0.5274
Epoch 2/5
62/62 - 8s - 130ms/step - accuracy: 0.8471 - loss: 0.5118 - val_accuracy: 0.8464 - val_loss: 0.5535
Epoch 3/5
62/62 - 8s - 132ms/step - accuracy: 0.8481 - loss: 0.4676 - val_accuracy: 0.8484 - val_loss: 0.3821
Epoch 4/5
62/62 - 8s - 128ms/step - accuracy: 0.8616 - loss: 0.3294 - val_accuracy: 0.8596 - val_loss: 0.4106
Epoch 5/5
62/62 - 8s - 124ms/step - accuracy: 0.8792 - loss: 0.2887 - val_accuracy: 0.8739 - val_loss: 0.4170
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 25ms/step
BiLSTM Results:
Accuracy: 0.873855544252289
Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.13      0.23        52
           1       0.51      0.34      0.41        99
           2       0.90      0.98      0.94       832

    accuracy                           0.87       983
   macro avg       0.73      0.49      0.53   

In [11]:
# Compare all models
accuracy_scores_all = {
    'Naive Bayes': accuracy_score(y_test_ml, models['Naive Bayes'].predict(X_test_ml)),
    'SVM': accuracy_score(y_test_ml, models['SVM'].predict(X_test_ml)),
    'Random Forest': accuracy_score(y_test_ml, models['Random Forest'].predict(X_test_ml)),
    'CNN': cnn_accuracy,
    'LSTM': lstm_accuracy,
    'BiLSTM': bilstm_accuracy
}

best_model = max(accuracy_scores_all, key=accuracy_scores_all.get)
print(f"\nThe best model based on accuracy is: {best_model} with accuracy {accuracy_scores_all[best_model]}")

trust_accuracy_scores_all = {
    'Naive Bayes': trust_based_accuracy(y_test_ml, models['Naive Bayes'].predict(X_test_ml), models['Naive Bayes'].predict_proba(X_test_ml).max(axis=1), confidence_threshold),
    'SVM': trust_based_accuracy(y_test_ml, models['SVM'].predict(X_test_ml), models['SVM'].predict_proba(X_test_ml).max(axis=1), confidence_threshold),
    'Random Forest': trust_based_accuracy(y_test_ml, models['Random Forest'].predict(X_test_ml), models['Random Forest'].predict_proba(X_test_ml).max(axis=1), confidence_threshold),
    'CNN': cnn_trust_accuracy,
    'LSTM': lstm_trust_accuracy,
    'BiLSTM': bilstm_trust_accuracy
}

best_trust_model = max(trust_accuracy_scores_all, key=lambda k: trust_accuracy_scores_all[k] if trust_accuracy_scores_all[k] is not None else -1)
print(f"\nThe best model based on trust-based accuracy is: {best_trust_model} with trust-based accuracy {trust_accuracy_scores_all[best_trust_model]}")



The best model based on accuracy is: Random Forest with accuracy 0.9755804643714971

The best model based on trust-based accuracy is: Random Forest with trust-based accuracy 1.0
