In [8]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, LSTM, Dense, Bidirectional, SpatialDropout1D
from tensorflow.keras.utils import to_categorical
from imblearn.over_sampling import SMOTE
import re
import nlpaug.augmenter.word as naw

# Ensure NLTK stopwords and WordNet lemmatizer are downloaded
nltk.download('stopwords')
nltk.download('wordnet')

# Load data from CSV file
df = pd.read_csv('5kReviewWithSentimentAmazon.csv')

# Keep only 'reviewText' and 'Stance' columns
df = df[['reviewText', 'Stance']]

# Drop rows where 'reviewText' or 'Stance' is NaN
df = df.dropna(subset=['reviewText', 'Stance'])

# Ensure all entries in 'reviewText' are strings
df['reviewText'] = df['reviewText'].astype(str)

# for full review display without truncation
pd.set_option('display.max_colwidth', None)

# Print value counts of 'Stance' column
print(df['Stance'].value_counts())

# Text Preprocessing
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words])  # Lemmatize and remove stopwords
    return text

df['reviewText'] = df['reviewText'].apply(preprocess_text)

# Transform the text data into TF-IDF features
tfidf = TfidfVectorizer(max_df=0.7)
X = tfidf.fit_transform(df['reviewText'])

# Encode the target labels
y = df['Stance']

# Handle class imbalance using SMOTE for traditional ML models
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X, y)

# Split the dataset into training and testing sets for traditional ML models
X_train_ml, X_test_ml, y_train_ml, y_test_ml = train_test_split(X_smote, y_smote, test_size=0.2, random_state=42)

# Initialize and train the Naive Bayes classifier
nb_model = MultinomialNB()
nb_model.fit(X_train_ml, y_train_ml)

# Predict the stances on the test set using Naive Bayes
nb_y_pred = nb_model.predict(X_test_ml)

# Evaluate the Naive Bayes model
nb_accuracy = accuracy_score(y_test_ml, nb_y_pred)
nb_report = classification_report(y_test_ml, nb_y_pred)
print("Naive Bayes Results:")
print(f"Accuracy: {nb_accuracy}")
print("Classification Report:")
print(nb_report)

# Initialize and train the SVM classifier
svm_model = SVC(kernel='linear')
svm_model.fit(X_train_ml, y_train_ml)

# Predict the stances on the test set using SVM
svm_y_pred = svm_model.predict(X_test_ml)

# Evaluate the SVM model
svm_accuracy = accuracy_score(y_test_ml, svm_y_pred)
svm_report = classification_report(y_test_ml, svm_y_pred)
print("SVM Results:")
print(f"Accuracy: {svm_accuracy}")
print("Classification Report:")
print(svm_report)

# Initialize and train the Random Forest classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_ml, y_train_ml)

# Predict the stances on the test set using Random Forest
rf_y_pred = rf_model.predict(X_test_ml)

# Evaluate the Random Forest model
rf_accuracy = accuracy_score(y_test_ml, rf_y_pred)
rf_report = classification_report(y_test_ml, rf_y_pred)
print("Random Forest Results:")
print(f"Accuracy: {rf_accuracy}")
print("Classification Report:")
print(rf_report)

# Initialize the accuracy_scores dictionary for ML models
accuracy_scores_ml = {
    'Naive Bayes': nb_accuracy,
    'SVM': svm_accuracy,
    'Random Forest': rf_accuracy
}

best_ml_model = max(accuracy_scores_ml, key=accuracy_scores_ml.get)
print(f"\nThe best traditional ML model based on accuracy is: {best_ml_model} with accuracy {accuracy_scores_ml[best_ml_model]}")

# Tokenizer and padding for deep learning models
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(df['reviewText'])
X_seq = tokenizer.texts_to_sequences(df['reviewText'])
X_pad = pad_sequences(X_seq, maxlen=100)

# Encode the target labels for deep learning models
y_encoded = pd.get_dummies(df['Stance']).values

# Split the dataset into training and testing sets for deep learning models
X_train_dl, X_test_dl, y_train_dl, y_test_dl = train_test_split(X_pad, y_encoded, test_size=0.2, random_state=42)

# Data Augmentation using Synonym Replacement
aug = naw.SynonymAug(aug_src='wordnet')

def augment_texts(texts, aug, num_augs=1):
    augmented_texts = []
    for text in texts:
        for _ in range(num_augs):
            augmented_texts.append(aug.augment(text))
    return augmented_texts

# Apply augmentation on training data
stance_labels = df['Stance'].unique()
augmented_texts = []
augmented_labels = []

for stance in stance_labels:
    class_texts = df['reviewText'][df['Stance'] == stance].values
    augmented_class_texts = augment_texts(class_texts, aug, num_augs=3)
    augmented_texts += augmented_class_texts
    augmented_labels += [stance] * len(augmented_class_texts)

# Debugging statements to check lengths
print(f"Length of augmented_texts: {len(augmented_texts)}")
print(f"Length of augmented_labels: {len(augmented_labels)}")

# Ensure lengths match before creating DataFrame
if len(augmented_texts) == len(augmented_labels):
    augmented_df = pd.DataFrame({'reviewText': augmented_texts, 'Stance': augmented_labels})
else:
    print("Error: Lengths of augmented_texts and augmented_labels do not match")

# Combine original and augmented data
df_augmented = pd.concat([df, augmented_df])

# Tokenizer and padding for augmented data
tokenizer.fit_on_texts(df_augmented['reviewText'])
X_aug_seq = tokenizer.texts_to_sequences(df_augmented['reviewText'])
X_aug_pad = pad_sequences(X_aug_seq, maxlen=100)

# Encode the target labels for augmented data
y_aug_encoded = pd.get_dummies(df_augmented['Stance']).values

# Split the augmented dataset into training and testing sets
X_train_aug, X_test_aug, y_train_aug, y_test_aug = train_test_split(X_aug_pad, y_aug_encoded, test_size=0.2, random_state=42)

# CNN Model
cnn_model = Sequential()
cnn_model.add(Embedding(5000, 128, input_length=100))
cnn_model.add(Conv1D(filters=128, kernel_size=5, padding='same', activation='relu'))
cnn_model.add(GlobalMaxPooling1D())
cnn_model.add(Dense(128, activation='relu'))
cnn_model.add(Dense(3, activation='softmax'))
cnn_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
cnn_model.fit(X_train_aug, y_train_aug, epochs=5, batch_size=64, validation_data=(X_test_aug, y_test_aug), verbose=2)

cnn_y_pred = cnn_model.predict(X_test_aug)
cnn_y_pred_labels = cnn_y_pred.argmax(axis=1)
y_test_aug_labels = y_test_aug.argmax(axis=1)

cnn_accuracy = accuracy_score(y_test_aug_labels, cnn_y_pred_labels)
cnn_report = classification_report(y_test_aug_labels, cnn_y_pred_labels)
print("CNN Results:")
print(f"Accuracy: {cnn_accuracy}")
print("Classification Report:")
print(cnn_report)

# LSTM Model
lstm_model = Sequential()
lstm_model.add(Embedding(5000, 128, input_length=100))
lstm_model.add(SpatialDropout1D(0.2))
lstm_model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
lstm_model.add(Dense(3, activation='softmax'))
lstm_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
lstm_model.fit(X_train_aug, y_train_aug, epochs=5, batch_size=64, validation_data=(X_test_aug, y_test_aug), verbose=2)

lstm_y_pred = lstm_model.predict(X_test_aug)
lstm_y_pred_labels = lstm_y_pred.argmax(axis=1)
y_test_aug_labels = y_test_aug.argmax(axis=1)

lstm_accuracy = accuracy_score(y_test_aug_labels, lstm_y_pred_labels)
lstm_report = classification_report(y_test_aug_labels, lstm_y_pred_labels)
print("LSTM Results:")
print(f"Accuracy: {lstm_accuracy}")
print("Classification Report:")
print(lstm_report)

# BiLSTM Model
bilstm_model = Sequential()
bilstm_model.add(Embedding(5000, 128, input_length=100))
bilstm_model.add(SpatialDropout1D(0.2))
bilstm_model.add(Bidirectional(LSTM(100, dropout=0.2, recurrent_dropout=0.2)))
bilstm_model.add(Dense(3, activation='softmax'))
bilstm_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
bilstm_model.fit(X_train_aug, y_train_aug, epochs=5, batch_size=64, validation_data=(X_test_aug, y_test_aug), verbose=2)

bilstm_y_pred = bilstm_model.predict(X_test_aug)
bilstm_y_pred_labels = bilstm_y_pred.argmax(axis=1)
y_test_aug_labels = y_test_aug.argmax(axis=1)

bilstm_accuracy = accuracy_score(y_test_aug_labels, bilstm_y_pred_labels)
bilstm_report = classification_report(y_test_aug_labels, bilstm_y_pred_labels)
print("BiLSTM Results:")
print(f"Accuracy: {bilstm_accuracy}")
print("Classification Report:")
print(bilstm_report)

# Compare the accuracy of all models
accuracy_scores_all = {
    'Naive Bayes': nb_accuracy,
    'SVM': svm_accuracy,
    'Random Forest': rf_accuracy,
    'CNN': cnn_accuracy,
    'LSTM': lstm_accuracy,
    'BiLSTM': bilstm_accuracy
}

# Find the best model among all based on accuracy
best_model = max(accuracy_scores_all, key=accuracy_scores_all.get)
print(f"\nThe best model based on accuracy is: {best_model} with accuracy {accuracy_scores_all[best_model]}")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Moneykicks\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Moneykicks\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Stance
Supportive    4162
Opposed        469
Neutral        283
Name: count, dtype: int64
Naive Bayes Results:
Accuracy: 0.9275420336269016
Classification Report:
              precision    recall  f1-score   support

     Neutral       0.95      0.97      0.96       844
     Opposed       0.88      0.97      0.92       811
  Supportive       0.96      0.84      0.90       843

    accuracy                           0.93      2498
   macro avg       0.93      0.93      0.93      2498
weighted avg       0.93      0.93      0.93      2498

SVM Results:
Accuracy: 0.9619695756605284
Classification Report:
              precision    recall  f1-score   support

     Neutral       0.96      1.00      0.98       844
     Opposed       0.93      1.00      0.96       811
  Supportive       1.00      0.89      0.94       843

    accuracy                           0.96      2498
   macro avg       0.96      0.96      0.96      2498
weighted avg       0.96      0.96      0.96      2498

Random For



246/246 - 8s - 32ms/step - accuracy: 0.8423 - loss: 0.5297 - val_accuracy: 0.8535 - val_loss: 0.4846
Epoch 2/5
246/246 - 6s - 26ms/step - accuracy: 0.8589 - loss: 0.4656 - val_accuracy: 0.8591 - val_loss: 0.4882
Epoch 3/5
246/246 - 6s - 26ms/step - accuracy: 0.8764 - loss: 0.4273 - val_accuracy: 0.8566 - val_loss: 0.5009
Epoch 4/5
246/246 - 7s - 27ms/step - accuracy: 0.8839 - loss: 0.4058 - val_accuracy: 0.8596 - val_loss: 0.5282
Epoch 5/5
246/246 - 6s - 26ms/step - accuracy: 0.8855 - loss: 0.3998 - val_accuracy: 0.8594 - val_loss: 0.5255
[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step
CNN Results:
Accuracy: 0.8593591047812817
Classification Report:
              precision    recall  f1-score   support

           0       0.64      0.14      0.23       228
           1       0.48      0.08      0.14       351
           2       0.87      0.99      0.93      3353

    accuracy                           0.86      3932
   macro avg       0.66      0.40      0.43



246/246 - 25s - 103ms/step - accuracy: 0.8432 - loss: 0.5470 - val_accuracy: 0.8527 - val_loss: 0.5204
Epoch 2/5
246/246 - 23s - 93ms/step - accuracy: 0.8456 - loss: 0.5252 - val_accuracy: 0.8530 - val_loss: 0.5048
Epoch 3/5
246/246 - 23s - 93ms/step - accuracy: 0.8516 - loss: 0.4839 - val_accuracy: 0.8527 - val_loss: 0.4879
Epoch 4/5
246/246 - 22s - 90ms/step - accuracy: 0.8633 - loss: 0.4561 - val_accuracy: 0.8538 - val_loss: 0.4930
Epoch 5/5
246/246 - 23s - 92ms/step - accuracy: 0.8731 - loss: 0.4342 - val_accuracy: 0.8530 - val_loss: 0.5016
[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 16ms/step
LSTM Results:
Accuracy: 0.853001017293998
Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.07      0.13       228
           1       0.35      0.07      0.11       351
           2       0.86      0.99      0.92      3353

    accuracy                           0.85      3932
   macro avg       0.62      0.38   



246/246 - 42s - 171ms/step - accuracy: 0.8444 - loss: 0.5428 - val_accuracy: 0.8527 - val_loss: 0.5153
Epoch 2/5
246/246 - 40s - 162ms/step - accuracy: 0.8463 - loss: 0.4999 - val_accuracy: 0.8520 - val_loss: 0.4862
Epoch 3/5
246/246 - 40s - 164ms/step - accuracy: 0.8584 - loss: 0.4673 - val_accuracy: 0.8538 - val_loss: 0.4845
Epoch 4/5
246/246 - 41s - 167ms/step - accuracy: 0.8688 - loss: 0.4413 - val_accuracy: 0.8548 - val_loss: 0.5022
Epoch 5/5
246/246 - 42s - 169ms/step - accuracy: 0.8762 - loss: 0.4250 - val_accuracy: 0.8545 - val_loss: 0.5217
[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 21ms/step
BiLSTM Results:
Accuracy: 0.854526958290946
Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.08      0.14       228
           1       0.41      0.08      0.13       351
           2       0.86      0.99      0.92      3353

    accuracy                           0.85      3932
   macro avg       0.65      0

In [9]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, LSTM, Dense, Bidirectional, SpatialDropout1D
from tensorflow.keras.utils import to_categorical
from imblearn.over_sampling import SMOTE
import re
import nlpaug.augmenter.word as naw

# Ensure NLTK stopwords and WordNet lemmatizer are downloaded
nltk.download('stopwords')
nltk.download('wordnet')

# Load data from CSV file
df = pd.read_csv('5kReviewWithSentimentAmazon.csv')

# Keep only 'reviewText' and 'Stance' columns
df = df[['reviewText', 'Stance']]

# Drop rows where 'reviewText' or 'Stance' is NaN
df = df.dropna(subset=['reviewText', 'Stance'])

# Ensure all entries in 'reviewText' are strings
df['reviewText'] = df['reviewText'].astype(str)

# for full review display without truncation
pd.set_option('display.max_colwidth', None)

# Print value counts of 'Stance' column
print(df['Stance'].value_counts())

# Text Preprocessing
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words])  # Lemmatize and remove stopwords
    return text

df['reviewText'] = df['reviewText'].apply(preprocess_text)

# Transform the text data into TF-IDF features
tfidf = TfidfVectorizer(max_df=0.7)
X = tfidf.fit_transform(df['reviewText'])

# Encode the target labels
y = df['Stance']

# Handle class imbalance using SMOTE for traditional ML models
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X, y)

# Split the dataset into training and testing sets for traditional ML models
X_train_ml, X_test_ml, y_train_ml, y_test_ml = train_test_split(X_smote, y_smote, test_size=0.2, random_state=42)

# Initialize and train the Naive Bayes classifier
nb_model = MultinomialNB()
nb_model.fit(X_train_ml, y_train_ml)

# Predict the stances on the test set using Naive Bayes
nb_y_pred = nb_model.predict(X_test_ml)

# Evaluate the Naive Bayes model
nb_accuracy = accuracy_score(y_test_ml, nb_y_pred)
nb_report = classification_report(y_test_ml, nb_y_pred)
print("Naive Bayes Results:")
print(f"Accuracy: {nb_accuracy}")
print("Classification Report:")
print(nb_report)

# Initialize and train the SVM classifier
svm_model = SVC(kernel='linear')
svm_model.fit(X_train_ml, y_train_ml)

# Predict the stances on the test set using SVM
svm_y_pred = svm_model.predict(X_test_ml)

# Evaluate the SVM model
svm_accuracy = accuracy_score(y_test_ml, svm_y_pred)
svm_report = classification_report(y_test_ml, svm_y_pred)
print("SVM Results:")
print(f"Accuracy: {svm_accuracy}")
print("Classification Report:")
print(svm_report)

# Initialize and train the Random Forest classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_ml, y_train_ml)

# Predict the stances on the test set using Random Forest
rf_y_pred = rf_model.predict(X_test_ml)

# Evaluate the Random Forest model
rf_accuracy = accuracy_score(y_test_ml, rf_y_pred)
rf_report = classification_report(y_test_ml, rf_y_pred)
print("Random Forest Results:")
print(f"Accuracy: {rf_accuracy}")
print("Classification Report:")
print(rf_report)

# Initialize the accuracy_scores dictionary for ML models
accuracy_scores_ml = {
    'Naive Bayes': nb_accuracy,
    'SVM': svm_accuracy,
    'Random Forest': rf_accuracy
}

best_ml_model = max(accuracy_scores_ml, key=accuracy_scores_ml.get)
print(f"\nThe best traditional ML model based on accuracy is: {best_ml_model} with accuracy {accuracy_scores_ml[best_ml_model]}")

# Tokenizer and padding for deep learning models
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(df['reviewText'])
X_seq = tokenizer.texts_to_sequences(df['reviewText'])
X_pad = pad_sequences(X_seq, maxlen=100)

# Encode the target labels for deep learning models
y_encoded = pd.get_dummies(df['Stance']).values

# Split the dataset into training and testing sets for deep learning models
X_train_dl, X_test_dl, y_train_dl, y_test_dl = train_test_split(X_pad, y_encoded, test_size=0.2, random_state=42)

# Data Augmentation using Synonym Replacement
aug = naw.SynonymAug(aug_src='wordnet')

def augment_texts(texts, aug, num_augs=1):
    augmented_texts = []
    for text in texts:
        for _ in range(num_augs):
            augmented_texts.append(aug.augment(text))
    return augmented_texts

# Apply augmentation on training data
stance_labels = df['Stance'].unique()
augmented_texts = []
augmented_labels = []

for stance in stance_labels:
    class_texts = df['reviewText'][df['Stance'] == stance].values
    augmented_class_texts = augment_texts(class_texts, aug, num_augs=3)
    augmented_texts += augmented_class_texts
    augmented_labels += [stance] * len(augmented_class_texts)

# Debugging statements to check lengths
print(f"Length of augmented_texts: {len(augmented_texts)}")
print(f"Length of augmented_labels: {len(augmented_labels)}")

# Ensure lengths match before creating DataFrame
if len(augmented_texts) == len(augmented_labels):
    augmented_df = pd.DataFrame({'reviewText': augmented_texts, 'Stance': augmented_labels})
else:
    print("Error: Lengths of augmented_texts and augmented_labels do not match")

# Combine original and augmented data
df_augmented = pd.concat([df, augmented_df])

# Tokenizer and padding for augmented data
tokenizer.fit_on_texts(df_augmented['reviewText'])
X_aug_seq = tokenizer.texts_to_sequences(df_augmented['reviewText'])
X_aug_pad = pad_sequences(X_aug_seq, maxlen=100)

# Encode the target labels for augmented data
y_aug_encoded = pd.get_dummies(df_augmented['Stance']).values

# Split the augmented dataset into training and testing sets
X_train_aug, X_test_aug, y_train_aug, y_test_aug = train_test_split(X_aug_pad, y_aug_encoded, test_size=0.2, random_state=42)

# Compute class weights for deep learning models
class_weights = df['Stance'].value_counts(normalize=True).to_dict()
total_classes = len(class_weights)
class_weights_dict = {i: (1 / class_weights[i]) * (total_classes / len(class_weights)) for i in class_weights}

# CNN Model
cnn_model = Sequential()
cnn_model.add(Embedding(5000, 128, input_length=100))
cnn_model.add(Conv1D(filters=128, kernel_size=5, padding='same', activation='relu'))
cnn_model.add(GlobalMaxPooling1D())
cnn_model.add(Dense(128, activation='relu'))
cnn_model.add(Dense(len(df['Stance'].unique()), activation='softmax'))
cnn_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
cnn_model.fit(X_train_aug, y_train_aug, epochs=5, batch_size=64, validation_data=(X_test_aug, y_test_aug), verbose=2, class_weight=class_weights_dict)

cnn_y_pred = cnn_model.predict(X_test_aug)
cnn_y_pred_labels = cnn_y_pred.argmax(axis=1)
y_test_aug_labels = y_test_aug.argmax(axis=1)

cnn_accuracy = accuracy_score(y_test_aug_labels, cnn_y_pred_labels)
cnn_report = classification_report(y_test_aug_labels, cnn_y_pred_labels)
print("CNN Results:")
print(f"Accuracy: {cnn_accuracy}")
print("Classification Report:")
print(cnn_report)

# LSTM Model
lstm_model = Sequential()
lstm_model.add(Embedding(5000, 128, input_length=100))
lstm_model.add(SpatialDropout1D(0.2))
lstm_model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
lstm_model.add(Dense(len(df['Stance'].unique()), activation='softmax'))
lstm_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
lstm_model.fit(X_train_aug, y_train_aug, epochs=5, batch_size=64, validation_data=(X_test_aug, y_test_aug), verbose=2, class_weight=class_weights_dict)

lstm_y_pred = lstm_model.predict(X_test_aug)
lstm_y_pred_labels = lstm_y_pred.argmax(axis=1)
y_test_aug_labels = y_test_aug.argmax(axis=1)

lstm_accuracy = accuracy_score(y_test_aug_labels, lstm_y_pred_labels)
lstm_report = classification_report(y_test_aug_labels, lstm_y_pred_labels)
print("LSTM Results:")
print(f"Accuracy: {lstm_accuracy}")
print("Classification Report:")
print(lstm_report)

# BiLSTM Model
bilstm_model = Sequential()
bilstm_model.add(Embedding(5000, 128, input_length=100))
bilstm_model.add(SpatialDropout1D(0.2))
bilstm_model.add(Bidirectional(LSTM(100, dropout=0.2, recurrent_dropout=0.2)))
bilstm_model.add(Dense(len(df['Stance'].unique()), activation='softmax'))
bilstm_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
bilstm_model.fit(X_train_aug, y_train_aug, epochs=5, batch_size=64, validation_data=(X_test_aug, y_test_aug), verbose=2, class_weight=class_weights_dict)

bilstm_y_pred = bilstm_model.predict(X_test_aug)
bilstm_y_pred_labels = bilstm_y_pred.argmax(axis=1)
y_test_aug_labels = y_test_aug.argmax(axis=1)

bilstm_accuracy = accuracy_score(y_test_aug_labels, bilstm_y_pred_labels)
bilstm_report = classification_report(y_test_aug_labels, bilstm_y_pred_labels)
print("BiLSTM Results:")
print(f"Accuracy: {bilstm_accuracy}")
print("Classification Report:")
print(bilstm_report)

# Compare the accuracy of all models
accuracy_scores_all = {
    'Naive Bayes': nb_accuracy,
    'SVM': svm_accuracy,
    'Random Forest': rf_accuracy,
    'CNN': cnn_accuracy,
    'LSTM': lstm_accuracy,
    'BiLSTM': bilstm_accuracy
}

# Find the best model among all based on accuracy
best_model = max(accuracy_scores_all, key=accuracy_scores_all.get)
print(f"\nThe best model based on accuracy is: {best_model} with accuracy {accuracy_scores_all[best_model]}")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Moneykicks\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Moneykicks\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Stance
Supportive    4162
Opposed        469
Neutral        283
Name: count, dtype: int64
Naive Bayes Results:
Accuracy: 0.9275420336269016
Classification Report:
              precision    recall  f1-score   support

     Neutral       0.95      0.97      0.96       844
     Opposed       0.88      0.97      0.92       811
  Supportive       0.96      0.84      0.90       843

    accuracy                           0.93      2498
   macro avg       0.93      0.93      0.93      2498
weighted avg       0.93      0.93      0.93      2498

SVM Results:
Accuracy: 0.9619695756605284
Classification Report:
              precision    recall  f1-score   support

     Neutral       0.96      1.00      0.98       844
     Opposed       0.93      1.00      0.96       811
  Supportive       1.00      0.89      0.94       843

    accuracy                           0.96      2498
   macro avg       0.96      0.96      0.96      2498
weighted avg       0.96      0.96      0.96      2498

Random For



246/246 - 12s - 47ms/step - accuracy: 0.8420 - loss: 0.5359 - val_accuracy: 0.8527 - val_loss: 0.4893
Epoch 2/5
246/246 - 9s - 35ms/step - accuracy: 0.8557 - loss: 0.4717 - val_accuracy: 0.8586 - val_loss: 0.4891
Epoch 3/5
246/246 - 9s - 35ms/step - accuracy: 0.8741 - loss: 0.4325 - val_accuracy: 0.8550 - val_loss: 0.5086
Epoch 4/5
246/246 - 9s - 35ms/step - accuracy: 0.8830 - loss: 0.4092 - val_accuracy: 0.8594 - val_loss: 0.5397
Epoch 5/5
246/246 - 9s - 35ms/step - accuracy: 0.8850 - loss: 0.4011 - val_accuracy: 0.8589 - val_loss: 0.5414
[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step
CNN Results:
Accuracy: 0.8588504577822991
Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.11      0.19       228
           1       0.49      0.09      0.15       351
           2       0.87      0.99      0.92      3353

    accuracy                           0.86      3932
   macro avg       0.71      0.40      0.4



246/246 - 31s - 126ms/step - accuracy: 0.8436 - loss: 0.5476 - val_accuracy: 0.8527 - val_loss: 0.5194
Epoch 2/5
246/246 - 32s - 129ms/step - accuracy: 0.8456 - loss: 0.5227 - val_accuracy: 0.8533 - val_loss: 0.5021
Epoch 3/5
246/246 - 31s - 128ms/step - accuracy: 0.8537 - loss: 0.4816 - val_accuracy: 0.8525 - val_loss: 0.4892
Epoch 4/5
246/246 - 32s - 128ms/step - accuracy: 0.8636 - loss: 0.4530 - val_accuracy: 0.8555 - val_loss: 0.4924
Epoch 5/5
246/246 - 31s - 127ms/step - accuracy: 0.8732 - loss: 0.4326 - val_accuracy: 0.8540 - val_loss: 0.5073
[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 25ms/step
LSTM Results:
Accuracy: 0.8540183112919634
Classification Report:
              precision    recall  f1-score   support

           0       0.62      0.07      0.13       228
           1       0.39      0.07      0.12       351
           2       0.86      0.99      0.92      3353

    accuracy                           0.85      3932
   macro avg       0.62      0.



246/246 - 60s - 245ms/step - accuracy: 0.8426 - loss: 0.5449 - val_accuracy: 0.8527 - val_loss: 0.5120
Epoch 2/5
246/246 - 53s - 215ms/step - accuracy: 0.8470 - loss: 0.4977 - val_accuracy: 0.8530 - val_loss: 0.4868
Epoch 3/5
246/246 - 51s - 208ms/step - accuracy: 0.8585 - loss: 0.4665 - val_accuracy: 0.8573 - val_loss: 0.4880
Epoch 4/5
246/246 - 54s - 219ms/step - accuracy: 0.8688 - loss: 0.4428 - val_accuracy: 0.8561 - val_loss: 0.5043
Epoch 5/5
246/246 - 50s - 203ms/step - accuracy: 0.8756 - loss: 0.4273 - val_accuracy: 0.8543 - val_loss: 0.5251
[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 31ms/step
BiLSTM Results:
Accuracy: 0.8542726347914548
Classification Report:
              precision    recall  f1-score   support

           0       0.64      0.06      0.11       228
           1       0.41      0.07      0.12       351
           2       0.86      0.99      0.92      3353

    accuracy                           0.85      3932
   macro avg       0.64      

In [10]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, LSTM, Dense, Bidirectional, SpatialDropout1D
from tensorflow.keras.utils import to_categorical
from imblearn.over_sampling import SMOTE
import re
import nlpaug.augmenter.word as naw

# Ensure NLTK stopwords and WordNet lemmatizer are downloaded
nltk.download('stopwords')
nltk.download('wordnet')

# Load data from CSV file
df = pd.read_csv('5kReviewWithSentimentAmazon.csv')

# Keep only 'reviewText' and 'Stance' columns
df = df[['reviewText', 'Stance']]

# Drop rows where 'reviewText' or 'Stance' is NaN
df = df.dropna(subset=['reviewText', 'Stance'])

# Ensure all entries in 'reviewText' are strings
df['reviewText'] = df['reviewText'].astype(str)

# for full review display without truncation
pd.set_option('display.max_colwidth', None)

# Print value counts of 'Stance' column
print(df['Stance'].value_counts())

# Text Preprocessing
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words])  # Lemmatize and remove stopwords
    return text

df['reviewText'] = df['reviewText'].apply(preprocess_text)

# Transform the text data into TF-IDF features
tfidf = TfidfVectorizer(max_df=0.7)
X = tfidf.fit_transform(df['reviewText'])

# Encode the target labels
y = df['Stance']

# Handle class imbalance using SMOTE for traditional ML models
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X, y)

# Split the dataset into training and testing sets for traditional ML models
X_train_ml, X_test_ml, y_train_ml, y_test_ml = train_test_split(X_smote, y_smote, test_size=0.2, random_state=42)

# Initialize and train the Naive Bayes classifier
nb_model = MultinomialNB()
nb_model.fit(X_train_ml, y_train_ml)

# Predict the stances on the test set using Naive Bayes
nb_y_pred = nb_model.predict(X_test_ml)

# Evaluate the Naive Bayes model
nb_accuracy = accuracy_score(y_test_ml, nb_y_pred)
nb_report = classification_report(y_test_ml, nb_y_pred)
print("Naive Bayes Results:")
print(f"Accuracy: {nb_accuracy}")
print("Classification Report:")
print(nb_report)

# Initialize and train the SVM classifier
svm_model = SVC(kernel='linear')
svm_model.fit(X_train_ml, y_train_ml)

# Predict the stances on the test set using SVM
svm_y_pred = svm_model.predict(X_test_ml)

# Evaluate the SVM model
svm_accuracy = accuracy_score(y_test_ml, svm_y_pred)
svm_report = classification_report(y_test_ml, svm_y_pred)
print("SVM Results:")
print(f"Accuracy: {svm_accuracy}")
print("Classification Report:")
print(svm_report)

# Initialize and train the Random Forest classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_ml, y_train_ml)

# Predict the stances on the test set using Random Forest
rf_y_pred = rf_model.predict(X_test_ml)

# Evaluate the Random Forest model
rf_accuracy = accuracy_score(y_test_ml, rf_y_pred)
rf_report = classification_report(y_test_ml, rf_y_pred)
print("Random Forest Results:")
print(f"Accuracy: {rf_accuracy}")
print("Classification Report:")
print(rf_report)

# Initialize the accuracy_scores dictionary for ML models
accuracy_scores_ml = {
    'Naive Bayes': nb_accuracy,
    'SVM': svm_accuracy,
    'Random Forest': rf_accuracy
}

best_ml_model = max(accuracy_scores_ml, key=accuracy_scores_ml.get)
print(f"\nThe best traditional ML model based on accuracy is: {best_ml_model} with accuracy {accuracy_scores_ml[best_ml_model]}")

# Tokenizer and padding for deep learning models
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(df['reviewText'])
X_seq = tokenizer.texts_to_sequences(df['reviewText'])
X_pad = pad_sequences(X_seq, maxlen=100)

# Encode the target labels for deep learning models
y_encoded = pd.get_dummies(df['Stance']).values

# Split the dataset into training and testing sets for deep learning models
X_train_dl, X_test_dl, y_train_dl, y_test_dl = train_test_split(X_pad, y_encoded, test_size=0.2, random_state=42)

# Data Augmentation using Synonym Replacement
aug = naw.SynonymAug(aug_src='wordnet')

def augment_texts(texts, aug, num_augs=1):
    augmented_texts = []
    for text in texts:
        for _ in range(num_augs):
            augmented_texts.append(aug.augment(text))
    return augmented_texts

# Define a threshold for what constitutes a minority class
threshold = 500  # Example threshold; adjust based on your needs

# Count the number of samples per class
class_counts = df['Stance'].value_counts()

# Identify minority classes
minority_classes = class_counts[class_counts < threshold].index

# Apply augmentation on minority classes only
augmented_texts = []
augmented_labels = []

for stance in minority_classes:
    class_texts = df['reviewText'][df['Stance'] == stance].values
    augmented_class_texts = augment_texts(class_texts, aug, num_augs=3)
    augmented_texts += augmented_class_texts
    augmented_labels += [stance] * len(augmented_class_texts)

# Debugging statements to check lengths
print(f"Length of augmented_texts: {len(augmented_texts)}")
print(f"Length of augmented_labels: {len(augmented_labels)}")

# Ensure lengths match before creating DataFrame
if len(augmented_texts) == len(augmented_labels):
    augmented_df = pd.DataFrame({'reviewText': augmented_texts, 'Stance': augmented_labels})
else:
    print("Error: Lengths of augmented_texts and augmented_labels do not match")

# Combine original and augmented data
df_augmented = pd.concat([df, augmented_df])

# Tokenizer and padding for augmented data
tokenizer.fit_on_texts(df_augmented['reviewText'])
X_aug_seq = tokenizer.texts_to_sequences(df_augmented['reviewText'])
X_aug_pad = pad_sequences(X_aug_seq, maxlen=100)

# Encode the target labels for augmented data
y_aug_encoded = pd.get_dummies(df_augmented['Stance']).values

# Split the augmented dataset into training and testing sets
X_train_aug, X_test_aug, y_train_aug, y_test_aug = train_test_split(X_aug_pad, y_aug_encoded, test_size=0.2, random_state=42)

# CNN Model
cnn_model = Sequential()
cnn_model.add(Embedding(5000, 128, input_length=100))
cnn_model.add(Conv1D(filters=128, kernel_size=5, padding='same', activation='relu'))
cnn_model.add(GlobalMaxPooling1D())
cnn_model.add(Dense(128, activation='relu'))
cnn_model.add(Dense(3, activation='softmax'))
cnn_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
cnn_model.fit(X_train_aug, y_train_aug, epochs=5, batch_size=64, validation_data=(X_test_aug, y_test_aug), verbose=2)

cnn_y_pred = cnn_model.predict(X_test_aug)
cnn_y_pred_labels = cnn_y_pred.argmax(axis=1)
y_test_aug_labels = y_test_aug.argmax(axis=1)

cnn_accuracy = accuracy_score(y_test_aug_labels, cnn_y_pred_labels)
cnn_report = classification_report(y_test_aug_labels, cnn_y_pred_labels)
print("CNN Results:")
print(f"Accuracy: {cnn_accuracy}")
print("Classification Report:")
print(cnn_report)

# LSTM Model
lstm_model = Sequential()
lstm_model.add(Embedding(5000, 128, input_length=100))
lstm_model.add(SpatialDropout1D(0.2))
lstm_model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
lstm_model.add(Dense(3, activation='softmax'))
lstm_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
lstm_model.fit(X_train_aug, y_train_aug, epochs=5, batch_size=64, validation_data=(X_test_aug, y_test_aug), verbose=2)

lstm_y_pred = lstm_model.predict(X_test_aug)
lstm_y_pred_labels = lstm_y_pred.argmax(axis=1)
y_test_aug_labels = y_test_aug.argmax(axis=1)

lstm_accuracy = accuracy_score(y_test_aug_labels, lstm_y_pred_labels)
lstm_report = classification_report(y_test_aug_labels, lstm_y_pred_labels)
print("LSTM Results:")
print(f"Accuracy: {lstm_accuracy}")
print("Classification Report:")
print(lstm_report)

# BiLSTM Model
bilstm_model = Sequential()
bilstm_model.add(Embedding(5000, 128, input_length=100))
bilstm_model.add(SpatialDropout1D(0.2))
bilstm_model.add(Bidirectional(LSTM(100, dropout=0.2, recurrent_dropout=0.2)))
bilstm_model.add(Dense(3, activation='softmax'))
bilstm_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
bilstm_model.fit(X_train_aug, y_train_aug, epochs=5, batch_size=64, validation_data=(X_test_aug, y_test_aug), verbose=2)

bilstm_y_pred = bilstm_model.predict(X_test_aug)
bilstm_y_pred_labels = bilstm_y_pred.argmax(axis=1)
y_test_aug_labels = y_test_aug.argmax(axis=1)

bilstm_accuracy = accuracy_score(y_test_aug_labels, bilstm_y_pred_labels)
bilstm_report = classification_report(y_test_aug_labels, bilstm_y_pred_labels)
print("BiLSTM Results:")
print(f"Accuracy: {bilstm_accuracy}")
print("Classification Report:")
print(bilstm_report)

# Compare the accuracy of all models
accuracy_scores_all = {
    'Naive Bayes': nb_accuracy,
    'SVM': svm_accuracy,
    'Random Forest': rf_accuracy,
    'CNN': cnn_accuracy,
    'LSTM': lstm_accuracy,
    'BiLSTM': bilstm_accuracy
}

# Find the best model among all based on accuracy
best_model = max(accuracy_scores_all, key=accuracy_scores_all.get)
print(f"\nThe best model based on accuracy is: {best_model} with accuracy {accuracy_scores_all[best_model]}")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Moneykicks\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Moneykicks\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Stance
Supportive    4162
Opposed        469
Neutral        283
Name: count, dtype: int64
Naive Bayes Results:
Accuracy: 0.9275420336269016
Classification Report:
              precision    recall  f1-score   support

     Neutral       0.95      0.97      0.96       844
     Opposed       0.88      0.97      0.92       811
  Supportive       0.96      0.84      0.90       843

    accuracy                           0.93      2498
   macro avg       0.93      0.93      0.93      2498
weighted avg       0.93      0.93      0.93      2498

SVM Results:
Accuracy: 0.9619695756605284
Classification Report:
              precision    recall  f1-score   support

     Neutral       0.96      1.00      0.98       844
     Opposed       0.93      1.00      0.96       811
  Supportive       1.00      0.89      0.94       843

    accuracy                           0.96      2498
   macro avg       0.96      0.96      0.96      2498
weighted avg       0.96      0.96      0.96      2498

Random For



90/90 - 8s - 93ms/step - accuracy: 0.7653 - loss: 0.6010 - val_accuracy: 0.8075 - val_loss: 0.4510
Epoch 2/5
90/90 - 4s - 41ms/step - accuracy: 0.7932 - loss: 0.4237 - val_accuracy: 0.8110 - val_loss: 0.4261
Epoch 3/5
90/90 - 4s - 40ms/step - accuracy: 0.8264 - loss: 0.3481 - val_accuracy: 0.8131 - val_loss: 0.4394
Epoch 4/5
90/90 - 4s - 40ms/step - accuracy: 0.8612 - loss: 0.2756 - val_accuracy: 0.8061 - val_loss: 0.4708
Epoch 5/5
90/90 - 4s - 40ms/step - accuracy: 0.8701 - loss: 0.2316 - val_accuracy: 0.8145 - val_loss: 0.4894
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step
CNN Results:
Accuracy: 0.8145048814504882
Classification Report:
              precision    recall  f1-score   support

           0       0.59      0.17      0.26       198
           1       0.64      0.88      0.75       380
           2       0.93      0.93      0.93       856

    accuracy                           0.81      1434
   macro avg       0.72      0.66      0.65      1434




90/90 - 20s - 228ms/step - accuracy: 0.7556 - loss: 0.6370 - val_accuracy: 0.7992 - val_loss: 0.5047
Epoch 2/5
90/90 - 17s - 185ms/step - accuracy: 0.7744 - loss: 0.4930 - val_accuracy: 0.8026 - val_loss: 0.4461
Epoch 3/5
90/90 - 16s - 176ms/step - accuracy: 0.8030 - loss: 0.4090 - val_accuracy: 0.8040 - val_loss: 0.4480
Epoch 4/5
90/90 - 16s - 173ms/step - accuracy: 0.8213 - loss: 0.3664 - val_accuracy: 0.7901 - val_loss: 0.4742
Epoch 5/5
90/90 - 15s - 169ms/step - accuracy: 0.8377 - loss: 0.3329 - val_accuracy: 0.7943 - val_loss: 0.4912
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 46ms/step
LSTM Results:
Accuracy: 0.794281729428173
Classification Report:
              precision    recall  f1-score   support

           0       0.64      0.11      0.18       198
           1       0.60      0.88      0.72       380
           2       0.93      0.91      0.92       856

    accuracy                           0.79      1434
   macro avg       0.72      0.63      0.61 



90/90 - 35s - 387ms/step - accuracy: 0.7456 - loss: 0.6252 - val_accuracy: 0.7985 - val_loss: 0.5049
Epoch 2/5
90/90 - 26s - 290ms/step - accuracy: 0.7748 - loss: 0.4788 - val_accuracy: 0.8061 - val_loss: 0.4432
Epoch 3/5
90/90 - 27s - 302ms/step - accuracy: 0.8033 - loss: 0.3981 - val_accuracy: 0.8020 - val_loss: 0.4568
Epoch 4/5
90/90 - 27s - 296ms/step - accuracy: 0.8162 - loss: 0.3684 - val_accuracy: 0.7887 - val_loss: 0.4781
Epoch 5/5
90/90 - 25s - 280ms/step - accuracy: 0.8307 - loss: 0.3402 - val_accuracy: 0.8026 - val_loss: 0.4915
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 46ms/step
BiLSTM Results:
Accuracy: 0.802649930264993
Classification Report:
              precision    recall  f1-score   support

           0       0.60      0.08      0.13       198
           1       0.62      0.87      0.72       380
           2       0.92      0.94      0.93       856

    accuracy                           0.80      1434
   macro avg       0.71      0.63      0.6

In [11]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, LSTM, Dense, Bidirectional, SpatialDropout1D
from tensorflow.keras.utils import to_categorical
from imblearn.over_sampling import SMOTE
import re
import nlpaug.augmenter.word as naw
from sklearn.preprocessing import LabelEncoder

# Ensure NLTK stopwords and WordNet lemmatizer are downloaded
nltk.download('stopwords')
nltk.download('wordnet')

# Load data from CSV file
df = pd.read_csv('5kReviewWithSentimentAmazon.csv')

# Keep only 'reviewText' and 'Stance' columns
df = df[['reviewText', 'Stance']]

# Drop rows where 'reviewText' or 'Stance' is NaN
df = df.dropna(subset=['reviewText', 'Stance'])

# Ensure all entries in 'reviewText' are strings
df['reviewText'] = df['reviewText'].astype(str)

# Text Preprocessing
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words])  # Lemmatize and remove stopwords
    return text

df['reviewText'] = df['reviewText'].apply(preprocess_text)

# Transform the text data into TF-IDF features
tfidf = TfidfVectorizer(max_df=0.7)
X = tfidf.fit_transform(df['reviewText'])

# Encode the target labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['Stance'])

# Handle class imbalance using SMOTE for traditional ML models
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X, y)

# Split the dataset into training and testing sets for traditional ML models
X_train_ml, X_test_ml, y_train_ml, y_test_ml = train_test_split(X_smote, y_smote, test_size=0.2, random_state=42)

# Initialize and train the Naive Bayes classifier
nb_model = MultinomialNB()
nb_model.fit(X_train_ml, y_train_ml)
nb_y_pred = nb_model.predict(X_test_ml)
nb_accuracy = accuracy_score(y_test_ml, nb_y_pred)
nb_report = classification_report(y_test_ml, nb_y_pred)
print("Naive Bayes Results:")
print(f"Accuracy: {nb_accuracy}")
print("Classification Report:")
print(nb_report)

# Initialize and train the SVM classifier
svm_model = SVC(kernel='linear')
svm_model.fit(X_train_ml, y_train_ml)
svm_y_pred = svm_model.predict(X_test_ml)
svm_accuracy = accuracy_score(y_test_ml, svm_y_pred)
svm_report = classification_report(y_test_ml, svm_y_pred)
print("SVM Results:")
print(f"Accuracy: {svm_accuracy}")
print("Classification Report:")
print(svm_report)

# Initialize and train the Random Forest classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_ml, y_train_ml)
rf_y_pred = rf_model.predict(X_test_ml)
rf_accuracy = accuracy_score(y_test_ml, rf_y_pred)
rf_report = classification_report(y_test_ml, rf_y_pred)
print("Random Forest Results:")
print(f"Accuracy: {rf_accuracy}")
print("Classification Report:")
print(rf_report)

# Tokenizer and padding for deep learning models
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(df['reviewText'])
X_seq = tokenizer.texts_to_sequences(df['reviewText'])
X_pad = pad_sequences(X_seq, maxlen=100)

# Apply SMOTE to text data
X_smote_dl, y_smote_dl = smote.fit_resample(X_pad, y)

# Split the balanced dataset into training and testing sets for deep learning models
X_train_dl, X_test_dl, y_train_dl, y_test_dl = train_test_split(X_smote_dl, y_smote_dl, test_size=0.2, random_state=42)

# Convert y_train_dl and y_test_dl to categorical
y_train_dl = to_categorical(y_train_dl, num_classes=len(label_encoder.classes_))
y_test_dl = to_categorical(y_test_dl, num_classes=len(label_encoder.classes_))

# CNN Model
cnn_model = Sequential()
cnn_model.add(Embedding(5000, 128, input_length=100))
cnn_model.add(Conv1D(filters=128, kernel_size=5, padding='same', activation='relu'))
cnn_model.add(GlobalMaxPooling1D())
cnn_model.add(Dense(128, activation='relu'))
cnn_model.add(Dense(len(label_encoder.classes_), activation='softmax'))
cnn_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
cnn_model.fit(X_train_dl, y_train_dl, epochs=5, batch_size=64, validation_data=(X_test_dl, y_test_dl), verbose=2)
cnn_y_pred = cnn_model.predict(X_test_dl)
cnn_y_pred_labels = cnn_y_pred.argmax(axis=1)
cnn_accuracy = accuracy_score(y_test_dl.argmax(axis=1), cnn_y_pred_labels)
cnn_report = classification_report(y_test_dl.argmax(axis=1), cnn_y_pred_labels)
print("CNN Results:")
print(f"Accuracy: {cnn_accuracy}")
print("Classification Report:")
print(cnn_report)

# LSTM Model
lstm_model = Sequential()
lstm_model.add(Embedding(5000, 128, input_length=100))
lstm_model.add(SpatialDropout1D(0.2))
lstm_model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
lstm_model.add(Dense(len(label_encoder.classes_), activation='softmax'))
lstm_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
lstm_model.fit(X_train_dl, y_train_dl, epochs=5, batch_size=64, validation_data=(X_test_dl, y_test_dl), verbose=2)
lstm_y_pred = lstm_model.predict(X_test_dl)
lstm_y_pred_labels = lstm_y_pred.argmax(axis=1)
lstm_accuracy = accuracy_score(y_test_dl.argmax(axis=1), lstm_y_pred_labels)
lstm_report = classification_report(y_test_dl.argmax(axis=1), lstm_y_pred_labels)
print("LSTM Results:")
print(f"Accuracy: {lstm_accuracy}")
print("Classification Report:")
print(lstm_report)

# BiLSTM Model
bilstm_model = Sequential()
bilstm_model.add(Embedding(5000, 128, input_length=100))
bilstm_model.add(SpatialDropout1D(0.2))
bilstm_model.add(Bidirectional(LSTM(100, dropout=0.2, recurrent_dropout=0.2)))
bilstm_model.add(Dense(len(label_encoder.classes_), activation='softmax'))
bilstm_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
bilstm_model.fit(X_train_dl, y_train_dl, epochs=5, batch_size=64, validation_data=(X_test_dl, y_test_dl), verbose=2)
bilstm_y_pred = bilstm_model.predict(X_test_dl)
bilstm_y_pred_labels = bilstm_y_pred.argmax(axis=1)
bilstm_accuracy = accuracy_score(y_test_dl.argmax(axis=1), bilstm_y_pred_labels)
bilstm_report = classification_report(y_test_dl.argmax(axis=1), bilstm_y_pred_labels)
print("BiLSTM Results:")
print(f"Accuracy: {bilstm_accuracy}")
print("Classification Report:")
print(bilstm_report)

# Compare the accuracy of all models
accuracy_scores_all = {
    'Naive Bayes': nb_accuracy,
    'SVM': svm_accuracy,
    'Random Forest': rf_accuracy,
    'CNN': cnn_accuracy,
    'LSTM': lstm_accuracy,
    'BiLSTM': bilstm_accuracy
}

# Find the best model among all based on accuracy
best_model = max(accuracy_scores_all, key=accuracy_scores_all.get)
print(f"\nThe best model based on accuracy is: {best_model} with accuracy {accuracy_scores_all[best_model]}")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Moneykicks\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Moneykicks\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Naive Bayes Results:
Accuracy: 0.9275420336269016
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.97      0.96       844
           1       0.88      0.97      0.92       811
           2       0.96      0.84      0.90       843

    accuracy                           0.93      2498
   macro avg       0.93      0.93      0.93      2498
weighted avg       0.93      0.93      0.93      2498

SVM Results:
Accuracy: 0.9619695756605284
Classification Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       844
           1       0.93      1.00      0.96       811
           2       1.00      0.89      0.94       843

    accuracy                           0.96      2498
   macro avg       0.96      0.96      0.96      2498
weighted avg       0.96      0.96      0.96      2498

Random Forest Results:
Accuracy: 0.9755804643714971
Classification Report:
              precision  



157/157 - 5s - 32ms/step - accuracy: 0.5364 - loss: 0.9226 - val_accuracy: 0.6333 - val_loss: 0.7501
Epoch 2/5
157/157 - 4s - 24ms/step - accuracy: 0.7448 - loss: 0.5799 - val_accuracy: 0.6601 - val_loss: 0.7115
Epoch 3/5
157/157 - 4s - 24ms/step - accuracy: 0.8709 - loss: 0.3397 - val_accuracy: 0.6401 - val_loss: 0.9023
Epoch 4/5
157/157 - 4s - 24ms/step - accuracy: 0.9479 - loss: 0.1731 - val_accuracy: 0.6293 - val_loss: 1.0596
Epoch 5/5
157/157 - 4s - 26ms/step - accuracy: 0.9784 - loss: 0.0850 - val_accuracy: 0.6281 - val_loss: 1.1702
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
CNN Results:
Accuracy: 0.6281024819855885
Classification Report:
              precision    recall  f1-score   support

           0       0.55      0.75      0.63       844
           1       0.56      0.50      0.53       811
           2       0.85      0.63      0.72       843

    accuracy                           0.63      2498
   macro avg       0.65      0.63      0.63  



157/157 - 14s - 86ms/step - accuracy: 0.5133 - loss: 0.9714 - val_accuracy: 0.5761 - val_loss: 0.8896
Epoch 2/5
157/157 - 11s - 68ms/step - accuracy: 0.6133 - loss: 0.8297 - val_accuracy: 0.5813 - val_loss: 0.8837
Epoch 3/5
157/157 - 11s - 68ms/step - accuracy: 0.6746 - loss: 0.7363 - val_accuracy: 0.5913 - val_loss: 0.9029
Epoch 4/5
157/157 - 11s - 67ms/step - accuracy: 0.7201 - loss: 0.6472 - val_accuracy: 0.5965 - val_loss: 0.9242
Epoch 5/5
157/157 - 11s - 67ms/step - accuracy: 0.7609 - loss: 0.5646 - val_accuracy: 0.5833 - val_loss: 1.0020
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step
LSTM Results:
Accuracy: 0.5832666132906325
Classification Report:
              precision    recall  f1-score   support

           0       0.58      0.51      0.55       844
           1       0.53      0.50      0.51       811
           2       0.63      0.74      0.68       843

    accuracy                           0.58      2498
   macro avg       0.58      0.58     



157/157 - 23s - 148ms/step - accuracy: 0.4991 - loss: 0.9848 - val_accuracy: 0.5725 - val_loss: 0.8873
Epoch 2/5
157/157 - 32s - 206ms/step - accuracy: 0.6093 - loss: 0.8334 - val_accuracy: 0.5889 - val_loss: 0.8823
Epoch 3/5
157/157 - 32s - 207ms/step - accuracy: 0.6704 - loss: 0.7438 - val_accuracy: 0.5913 - val_loss: 0.8941
Epoch 4/5
157/157 - 34s - 216ms/step - accuracy: 0.7142 - loss: 0.6568 - val_accuracy: 0.5821 - val_loss: 0.9192
Epoch 5/5
157/157 - 32s - 206ms/step - accuracy: 0.7536 - loss: 0.5831 - val_accuracy: 0.5881 - val_loss: 0.9718
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 28ms/step
BiLSTM Results:
Accuracy: 0.588070456365092
Classification Report:
              precision    recall  f1-score   support

           0       0.56      0.54      0.55       844
           1       0.53      0.52      0.52       811
           2       0.66      0.70      0.68       843

    accuracy                           0.59      2498
   macro avg       0.59      0.5

In [13]:
# Install necessary libraries
# !pip install googletrans==4.0.0-rc1 pandas scikit-learn tensorflow imbalanced-learn nlpaug

# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, LSTM, Dense, Bidirectional, SpatialDropout1D
from tensorflow.keras.utils import to_categorical
from imblearn.over_sampling import SMOTE
import re
from googletrans import Translator

# Initialize the translator
translator = Translator()

# Load data from CSV file
df = pd.read_csv('5kReviewWithSentimentAmazon.csv')

# Keep only 'reviewText' and 'Stance' columns
df = df[['reviewText', 'Stance']]

# Drop rows where 'reviewText' or 'Stance' is NaN
df = df.dropna(subset=['reviewText', 'Stance'])

# Ensure all entries in 'reviewText' are strings
df['reviewText'] = df['reviewText'].astype(str)

# For full review display without truncation
pd.set_option('display.max_colwidth', None)

# Print value counts of 'Stance' column
print(df['Stance'].value_counts())

# Text Preprocessing
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Ensure NLTK stopwords and WordNet lemmatizer are downloaded
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words])  # Lemmatize and remove stopwords
    return text

df['reviewText'] = df['reviewText'].apply(preprocess_text)

# Define back-translation function
def back_translate(text, src_language='en', mid_language='fr'):
    try:
        translated_text = translator.translate(text, src=src_language, dest=mid_language).text
        back_translated_text = translator.translate(translated_text, src=mid_language, dest=src_language).text
        return back_translated_text
    except Exception as e:
        print(f"Error during back-translation: {e}")
        return text

# Define function to augment minority class
def augment_minority_class(df, class_label, src_language='en', mid_language='fr'):
    minority_texts = df[df['Stance'] == class_label]['reviewText'].tolist()
    augmented_texts = [back_translate(text, src_language, mid_language) for text in minority_texts]
    augmented_labels = [class_label] * len(augmented_texts)
    return pd.DataFrame({'reviewText': augmented_texts, 'Stance': augmented_labels})

# Define threshold for minority class
threshold = 100  # Adjust based on your needs

# Identify minority classes
class_counts = df['Stance'].value_counts()
minority_classes = class_counts[class_counts < threshold].index.tolist()

# Apply back-translation for all minority classes
augmented_df_list = [augment_minority_class(df, cls) for cls in minority_classes]
df_augmented = pd.concat([df] + augmented_df_list, ignore_index=True)

# Text Vectorization for ML models
tfidf = TfidfVectorizer(max_df=0.7)
X = tfidf.fit_transform(df_augmented['reviewText'])
y = df_augmented['Stance']

# Handle class imbalance using SMOTE for traditional ML models
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X, y)

# Split dataset for ML models
X_train_ml, X_test_ml, y_train_ml, y_test_ml = train_test_split(X_smote, y_smote, test_size=0.2, random_state=42)

# Train and evaluate ML models
models = {
    'Naive Bayes': MultinomialNB(),
    'SVM': SVC(kernel='linear'),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42)
}

for name, model in models.items():
    model.fit(X_train_ml, y_train_ml)
    y_pred = model.predict(X_test_ml)
    accuracy = accuracy_score(y_test_ml, y_pred)
    report = classification_report(y_test_ml, y_pred)
    print(f"{name} Results:")
    print(f"Accuracy: {accuracy}")
    print("Classification Report:")
    print(report)

# Tokenizer and padding for deep learning models
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(df_augmented['reviewText'])
X_seq = tokenizer.texts_to_sequences(df_augmented['reviewText'])
X_pad = pad_sequences(X_seq, maxlen=100)
y_encoded = pd.get_dummies(df_augmented['Stance']).values

# Split dataset for deep learning models
X_train_dl, X_test_dl, y_train_dl, y_test_dl = train_test_split(X_pad, y_encoded, test_size=0.2, random_state=42)

# Define and train CNN model
cnn_model = Sequential([
    Embedding(5000, 128, input_length=100),
    Conv1D(filters=128, kernel_size=5, padding='same', activation='relu'),
    GlobalMaxPooling1D(),
    Dense(128, activation='relu'),
    Dense(y_encoded.shape[1], activation='softmax')
])
cnn_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
cnn_model.fit(X_train_dl, y_train_dl, epochs=5, batch_size=64, validation_data=(X_test_dl, y_test_dl), verbose=2)

cnn_y_pred = cnn_model.predict(X_test_dl)
cnn_y_pred_labels = cnn_y_pred.argmax(axis=1)
y_test_dl_labels = y_test_dl.argmax(axis=1)
cnn_accuracy = accuracy_score(y_test_dl_labels, cnn_y_pred_labels)
cnn_report = classification_report(y_test_dl_labels, cnn_y_pred_labels)
print("CNN Results:")
print(f"Accuracy: {cnn_accuracy}")
print("Classification Report:")
print(cnn_report)

# Define and train LSTM model
lstm_model = Sequential([
    Embedding(5000, 128, input_length=100),
    SpatialDropout1D(0.2),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(y_encoded.shape[1], activation='softmax')
])
lstm_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
lstm_model.fit(X_train_dl, y_train_dl, epochs=5, batch_size=64, validation_data=(X_test_dl, y_test_dl), verbose=2)

lstm_y_pred = lstm_model.predict(X_test_dl)
lstm_y_pred_labels = lstm_y_pred.argmax(axis=1)
lstm_accuracy = accuracy_score(y_test_dl_labels, lstm_y_pred_labels)
lstm_report = classification_report(y_test_dl_labels, lstm_y_pred_labels)
print("LSTM Results:")
print(f"Accuracy: {lstm_accuracy}")
print("Classification Report:")
print(lstm_report)

# Define and train BiLSTM model
bilstm_model = Sequential([
    Embedding(5000, 128, input_length=100),
    SpatialDropout1D(0.2),
    Bidirectional(LSTM(100, dropout=0.2, recurrent_dropout=0.2)),
    Dense(y_encoded.shape[1], activation='softmax')
])
bilstm_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
bilstm_model.fit(X_train_dl, y_train_dl, epochs=5, batch_size=64, validation_data=(X_test_dl, y_test_dl), verbose=2)

bilstm_y_pred = bilstm_model.predict(X_test_dl)
bilstm_y_pred_labels = bilstm_y_pred.argmax(axis=1)
bilstm_accuracy = accuracy_score(y_test_dl_labels, bilstm_y_pred_labels)
bilstm_report = classification_report(y_test_dl_labels, bilstm_y_pred_labels)
print("BiLSTM Results:")
print(f"Accuracy: {bilstm_accuracy}")
print("Classification Report:")
print(bilstm_report)

# Compare all models
accuracy_scores_all = {
    'Naive Bayes': accuracy_score(y_test_ml, models['Naive Bayes'].predict(X_test_ml)),
    'SVM': accuracy_score(y_test_ml, models['SVM'].predict(X_test_ml)),
    'Random Forest': accuracy_score(y_test_ml, models['Random Forest'].predict(X_test_ml)),
    'CNN': cnn_accuracy,
    'LSTM': lstm_accuracy,
    'BiLSTM': bilstm_accuracy
}

best_model = max(accuracy_scores_all, key=accuracy_scores_all.get)
print(f"\nThe best model based on accuracy is: {best_model} with accuracy {accuracy_scores_all[best_model]}")


Stance
Supportive    4162
Opposed        469
Neutral        283
Name: count, dtype: int64


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Moneykicks\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Moneykicks\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Naive Bayes Results:
Accuracy: 0.9275420336269016
Classification Report:
              precision    recall  f1-score   support

     Neutral       0.95      0.97      0.96       844
     Opposed       0.88      0.97      0.92       811
  Supportive       0.96      0.84      0.90       843

    accuracy                           0.93      2498
   macro avg       0.93      0.93      0.93      2498
weighted avg       0.93      0.93      0.93      2498

SVM Results:
Accuracy: 0.9619695756605284
Classification Report:
              precision    recall  f1-score   support

     Neutral       0.96      1.00      0.98       844
     Opposed       0.93      1.00      0.96       811
  Supportive       1.00      0.89      0.94       843

    accuracy                           0.96      2498
   macro avg       0.96      0.96      0.96      2498
weighted avg       0.96      0.96      0.96      2498

Random Forest Results:
Accuracy: 0.9755804643714971
Classification Report:
              precision  



62/62 - 3s - 47ms/step - accuracy: 0.8151 - loss: 0.5776 - val_accuracy: 0.8464 - val_loss: 0.4419
Epoch 2/5
62/62 - 2s - 27ms/step - accuracy: 0.8547 - loss: 0.3570 - val_accuracy: 0.8637 - val_loss: 0.3464
Epoch 3/5
62/62 - 2s - 27ms/step - accuracy: 0.9252 - loss: 0.2140 - val_accuracy: 0.8759 - val_loss: 0.3640
Epoch 4/5
62/62 - 2s - 27ms/step - accuracy: 0.9753 - loss: 0.0856 - val_accuracy: 0.8647 - val_loss: 0.5296
Epoch 5/5
62/62 - 2s - 27ms/step - accuracy: 0.9954 - loss: 0.0246 - val_accuracy: 0.8779 - val_loss: 0.5031
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
CNN Results:
Accuracy: 0.8779247202441506
Classification Report:
              precision    recall  f1-score   support

           0       0.64      0.40      0.49        52
           1       0.65      0.32      0.43        99
           2       0.90      0.97      0.93       832

    accuracy                           0.88       983
   macro avg       0.73      0.57      0.62       983
w



62/62 - 9s - 142ms/step - accuracy: 0.8359 - loss: 0.5898 - val_accuracy: 0.8464 - val_loss: 0.5009
Epoch 2/5
62/62 - 7s - 107ms/step - accuracy: 0.8499 - loss: 0.4621 - val_accuracy: 0.8535 - val_loss: 0.4187
Epoch 3/5
62/62 - 7s - 109ms/step - accuracy: 0.8748 - loss: 0.3198 - val_accuracy: 0.8708 - val_loss: 0.3763
Epoch 4/5
62/62 - 7s - 108ms/step - accuracy: 0.9247 - loss: 0.2205 - val_accuracy: 0.8850 - val_loss: 0.3617
Epoch 5/5
62/62 - 7s - 107ms/step - accuracy: 0.9524 - loss: 0.1477 - val_accuracy: 0.8810 - val_loss: 0.4630
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step
LSTM Results:
Accuracy: 0.8809766022380467
Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.46      0.55        52
           1       0.63      0.27      0.38        99
           2       0.90      0.98      0.94       832

    accuracy                           0.88       983
   macro avg       0.74      0.57      0.62     



62/62 - 15s - 250ms/step - accuracy: 0.8377 - loss: 0.5648 - val_accuracy: 0.8464 - val_loss: 0.4807
Epoch 2/5
62/62 - 12s - 193ms/step - accuracy: 0.8525 - loss: 0.4007 - val_accuracy: 0.8535 - val_loss: 0.3555
Epoch 3/5
62/62 - 12s - 193ms/step - accuracy: 0.8850 - loss: 0.2799 - val_accuracy: 0.8678 - val_loss: 0.4038
Epoch 4/5
62/62 - 12s - 195ms/step - accuracy: 0.9199 - loss: 0.2160 - val_accuracy: 0.8830 - val_loss: 0.4051
Epoch 5/5
62/62 - 12s - 191ms/step - accuracy: 0.9425 - loss: 0.1622 - val_accuracy: 0.8810 - val_loss: 0.4393
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 33ms/step
BiLSTM Results:
Accuracy: 0.8809766022380467
Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.46      0.54        52
           1       0.59      0.36      0.45        99
           2       0.91      0.97      0.94       832

    accuracy                           0.88       983
   macro avg       0.72      0.60      0.

In [10]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics.pairwise import cosine_similarity
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, LSTM, Dense, Bidirectional, SpatialDropout1D
from tensorflow.keras.utils import to_categorical
from imblearn.over_sampling import SMOTE
import re
from googletrans import Translator 
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

# Load NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize Translator
translator = Translator()

# Load the original dataset
df_original = pd.read_csv('5kReviewWithSentimentAmazon.csv')

# Keep 'reviewerName', 'reviewText', and 'Stance' columns
df = df_original[['reviewerName', 'reviewText', 'Stance']]

# Drop rows where any of the required columns are NaN
df = df.dropna(subset=['reviewerName', 'reviewText', 'Stance'])

# Ensure all entries in 'reviewText' are strings
df['reviewText'] = df['reviewText'].astype(str)

# For full review display without truncation
pd.set_option('display.max_colwidth', None)

# Print value counts of 'Stance' column
print(df['Stance'].value_counts())

# Text Preprocessing
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words])  # Lemmatize and remove stopwords
    return text

df['reviewText'] = df['reviewText'].apply(preprocess_text)

# Define back-translation function
def back_translate(text, src_language='en', mid_language='fr'):
    try:
        translated_text = translator.translate(text, src=src_language, dest=mid_language).text
        back_translated_text = translator.translate(translated_text, src=mid_language, dest=src_language).text
        return back_translated_text
    except Exception as e:
        print(f"Error during back-translation: {e}")
        return text

# Define function to augment minority class
def augment_minority_class(df, class_label, src_language='en', mid_language='fr'):
    minority_texts = df[df['Stance'] == class_label]['reviewText'].tolist()
    augmented_texts = [back_translate(text, src_language, mid_language) for text in minority_texts]
    augmented_labels = [class_label] * len(augmented_texts)
    return pd.DataFrame({'reviewText': augmented_texts, 'Stance': augmented_labels})

# Define threshold for minority class
threshold = 100  # Adjust based on your needs

# Identify minority classes
class_counts = df['Stance'].value_counts()
minority_classes = class_counts[class_counts < threshold].index.tolist()

# Apply back-translation for all minority classes
augmented_df_list = [augment_minority_class(df, cls) for cls in minority_classes]
df_augmented = pd.concat([df] + augmented_df_list, ignore_index=True)

# Text Vectorization for ML models
tfidf = TfidfVectorizer(max_df=0.7)
X = tfidf.fit_transform(df_augmented['reviewText'])
y = df_augmented['Stance']

# Handle class imbalance using SMOTE for traditional ML models
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X, y)

# Split dataset for ML models
X_train_ml, X_test_ml, y_train_ml, y_test_ml = train_test_split(X_smote, y_smote, test_size=0.2, random_state=42)

# Train and evaluate ML models
models = {
    'Naive Bayes': MultinomialNB(),
    'SVM': SVC(kernel='linear'),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42)
}

for name, model in models.items():
    model.fit(X_train_ml, y_train_ml)
    y_pred = model.predict(X_test_ml)
    accuracy = accuracy_score(y_test_ml, y_pred)
    report = classification_report(y_test_ml, y_pred)
    print(f"{name} Results:")
    print(f"Accuracy: {accuracy}")
    print("Classification Report:")
    print(report)

# Tokenizer and padding for deep learning models
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(df_augmented['reviewText'])
X_seq = tokenizer.texts_to_sequences(df_augmented['reviewText'])
X_pad = pad_sequences(X_seq, maxlen=100)
y_encoded = pd.get_dummies(df_augmented['Stance']).values

# Split dataset for deep learning models
X_train_dl, X_test_dl, y_train_dl, y_test_dl = train_test_split(X_pad, y_encoded, test_size=0.2, random_state=42)

# Define and train CNN model
cnn_model = Sequential([
    Embedding(5000, 128, input_length=100),
    Conv1D(filters=128, kernel_size=5, padding='same', activation='relu'),
    GlobalMaxPooling1D(),
    Dense(128, activation='relu'),
    Dense(y_encoded.shape[1], activation='softmax')
])
cnn_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
cnn_model.fit(X_train_dl, y_train_dl, epochs=5, batch_size=64, validation_data=(X_test_dl, y_test_dl), verbose=2)

cnn_y_pred = cnn_model.predict(X_test_dl)
cnn_y_pred_labels = cnn_y_pred.argmax(axis=1)
y_test_dl_labels = y_test_dl.argmax(axis=1)
cnn_accuracy = accuracy_score(y_test_dl_labels, cnn_y_pred_labels)
cnn_report = classification_report(y_test_dl_labels, cnn_y_pred_labels)
print("CNN Results:")
print(f"Accuracy: {cnn_accuracy}")
print("Classification Report:")
print(cnn_report)

# Define and train LSTM model
lstm_model = Sequential([
    Embedding(5000, 128, input_length=100),
    SpatialDropout1D(0.2),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(y_encoded.shape[1], activation='softmax')
])
lstm_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
lstm_model.fit(X_train_dl, y_train_dl, epochs=5, batch_size=64, validation_data=(X_test_dl, y_test_dl), verbose=2)

lstm_y_pred = lstm_model.predict(X_test_dl)
lstm_y_pred_labels = lstm_y_pred.argmax(axis=1)
lstm_accuracy = accuracy_score(y_test_dl_labels, lstm_y_pred_labels)
lstm_report = classification_report(y_test_dl_labels, lstm_y_pred_labels)
print("LSTM Results:")
print(f"Accuracy: {lstm_accuracy}")
print("Classification Report:")
print(lstm_report)

# Define and train BiLSTM model
bilstm_model = Sequential([
    Embedding(5000, 128, input_length=100),
    SpatialDropout1D(0.2),
    Bidirectional(LSTM(100, dropout=0.2, recurrent_dropout=0.2)),
    Dense(y_encoded.shape[1], activation='softmax')
])
bilstm_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
bilstm_model.fit(X_train_dl, y_train_dl, epochs=5, batch_size=64, validation_data=(X_test_dl, y_test_dl), verbose=2)

bilstm_y_pred = bilstm_model.predict(X_test_dl)
bilstm_y_pred_labels = bilstm_y_pred.argmax(axis=1)
bilstm_accuracy = accuracy_score(y_test_dl_labels, bilstm_y_pred_labels)
bilstm_report = classification_report(y_test_dl_labels, bilstm_y_pred_labels)
print("BiLSTM Results:")
print(f"Accuracy: {bilstm_accuracy}")
print("Classification Report:")
print(bilstm_report)

# Compare all models
accuracy_scores_all = {
    'Naive Bayes': accuracy_score(y_test_ml, models['Naive Bayes'].predict(X_test_ml)),
    'SVM': accuracy_score(y_test_ml, models['SVM'].predict(X_test_ml)),
    'Random Forest': accuracy_score(y_test_ml, models['Random Forest'].predict(X_test_ml)),
    'CNN': cnn_accuracy,
    'LSTM': lstm_accuracy,
    'BiLSTM': bilstm_accuracy
}

best_model = max(accuracy_scores_all, key=accuracy_scores_all.get)
print(f"\nThe best model based on accuracy is: {best_model} with accuracy {accuracy_scores_all[best_model]}")

# # Calculate similarity between two users
# def get_user_reviews(df, user_name):
#     """Extract reviews for a specific user."""
#     return df[df['reviewerName'] == user_name]['reviewText'].tolist()

# def compute_similarity(user1_reviews, user2_reviews, vectorizer):
#     """Compute cosine similarity between reviews of two users."""
#     # Combine reviews from both users
#     combined_reviews = user1_reviews + user2_reviews
    
#     # Transform reviews to TF-IDF vectors
#     vectors = vectorizer.transform(combined_reviews)
    
#     # Compute cosine similarity between user 1 and user 2
#     similarity_matrix = cosine_similarity(vectors[:len(user1_reviews)], vectors[len(user1_reviews):])
    
#     # Return the average similarity score
#     return similarity_matrix.mean()

# # Define your users
# user1 = '53rdcard'
# user2 = 'Aaron'

# # Get reviews for each user
# user1_reviews = get_user_reviews(df, user1)
# user2_reviews = get_user_reviews(df, user2)

# # Vectorize the reviews using the same TF-IDF vectorizer as before
# tfidf = TfidfVectorizer(max_df=0.7)
# tfidf.fit(df['reviewText'])  # Fit on all reviews in the dataset

# # Compute similarity score
# similarity_score = compute_similarity(user1_reviews, user2_reviews, tfidf)
# print(f"Similarity Score between {user1} and {user2}: {similarity_score}")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Moneykicks\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Moneykicks\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Stance
Supportive    4162
Opposed        469
Neutral        282
Name: count, dtype: int64
Naive Bayes Results:
Accuracy: 0.9187349879903923
Classification Report:
              precision    recall  f1-score   support

     Neutral       0.96      0.98      0.97       826
     Opposed       0.84      0.97      0.90       816
  Supportive       0.97      0.82      0.89       856

    accuracy                           0.92      2498
   macro avg       0.92      0.92      0.92      2498
weighted avg       0.93      0.92      0.92      2498

SVM Results:
Accuracy: 0.9655724579663731
Classification Report:
              precision    recall  f1-score   support

     Neutral       0.96      1.00      0.98       826
     Opposed       0.94      1.00      0.97       816
  Supportive       1.00      0.90      0.95       856

    accuracy                           0.97      2498
   macro avg       0.97      0.97      0.97      2498
weighted avg       0.97      0.97      0.97      2498

Random For



62/62 - 3s - 43ms/step - accuracy: 0.8494 - loss: 0.5253 - val_accuracy: 0.8383 - val_loss: 0.4297
Epoch 2/5
62/62 - 1s - 24ms/step - accuracy: 0.8616 - loss: 0.3376 - val_accuracy: 0.8515 - val_loss: 0.3602
Epoch 3/5
62/62 - 1s - 24ms/step - accuracy: 0.9356 - loss: 0.1903 - val_accuracy: 0.8637 - val_loss: 0.3906
Epoch 4/5
62/62 - 1s - 24ms/step - accuracy: 0.9776 - loss: 0.0726 - val_accuracy: 0.8606 - val_loss: 0.4719
Epoch 5/5
62/62 - 2s - 25ms/step - accuracy: 0.9964 - loss: 0.0193 - val_accuracy: 0.8576 - val_loss: 0.5761
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
CNN Results:
Accuracy: 0.8575788402848423
Classification Report:
              precision    recall  f1-score   support

           0       0.54      0.37      0.44        57
           1       0.55      0.27      0.37       102
           2       0.89      0.96      0.92       824

    accuracy                           0.86       983
   macro avg       0.66      0.54      0.58       983
w



62/62 - 6s - 101ms/step - accuracy: 0.8379 - loss: 0.5935 - val_accuracy: 0.8383 - val_loss: 0.5231
Epoch 2/5
62/62 - 4s - 68ms/step - accuracy: 0.8517 - loss: 0.4606 - val_accuracy: 0.8403 - val_loss: 0.4426
Epoch 3/5
62/62 - 4s - 67ms/step - accuracy: 0.8746 - loss: 0.3288 - val_accuracy: 0.8484 - val_loss: 0.3870
Epoch 4/5
62/62 - 4s - 66ms/step - accuracy: 0.9137 - loss: 0.2352 - val_accuracy: 0.8433 - val_loss: 0.4248
Epoch 5/5
62/62 - 4s - 71ms/step - accuracy: 0.9397 - loss: 0.1665 - val_accuracy: 0.8515 - val_loss: 0.4738
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step
LSTM Results:
Accuracy: 0.8514750762970499
Classification Report:
              precision    recall  f1-score   support

           0       0.54      0.37      0.44        57
           1       0.48      0.27      0.35       102
           2       0.89      0.96      0.92       824

    accuracy                           0.85       983
   macro avg       0.64      0.53      0.57       98



62/62 - 10s - 166ms/step - accuracy: 0.8382 - loss: 0.5494 - val_accuracy: 0.8383 - val_loss: 0.5012
Epoch 2/5
62/62 - 7s - 113ms/step - accuracy: 0.8578 - loss: 0.3941 - val_accuracy: 0.8454 - val_loss: 0.3877
Epoch 3/5
62/62 - 8s - 136ms/step - accuracy: 0.8896 - loss: 0.2748 - val_accuracy: 0.8566 - val_loss: 0.4145
Epoch 4/5
62/62 - 8s - 132ms/step - accuracy: 0.9232 - loss: 0.2036 - val_accuracy: 0.8576 - val_loss: 0.4419
Epoch 5/5
62/62 - 7s - 116ms/step - accuracy: 0.9458 - loss: 0.1483 - val_accuracy: 0.8474 - val_loss: 0.4383
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 28ms/step
BiLSTM Results:
Accuracy: 0.8474059003051883
Classification Report:
              precision    recall  f1-score   support

           0       0.48      0.35      0.40        57
           1       0.47      0.37      0.42       102
           2       0.90      0.94      0.92       824

    accuracy                           0.85       983
   macro avg       0.62      0.55      0.58  

In [11]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

# Load NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')

# Load the original dataset
df_original = pd.read_csv('5kReviewWithSentimentAmazon.csv')

# Keep 'reviewerName' and 'reviewText' columns
df = df_original[['reviewerName', 'reviewText']]

# Drop rows where any of the required columns are NaN
df = df.dropna(subset=['reviewerName', 'reviewText'])

# Ensure all entries in 'reviewText' are strings
df['reviewText'] = df['reviewText'].astype(str)

# For full review display without truncation
pd.set_option('display.max_colwidth', None)

# Text Preprocessing
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words])  # Lemmatize and remove stopwords
    return text

df['reviewText'] = df['reviewText'].apply(preprocess_text)

# Function to get user reviews
def get_user_reviews(df, user_name):
    """Extract reviews for a specific user."""
    return df[df['reviewerName'] == user_name]['reviewText'].tolist()

# Function to compute similarity
def compute_similarity(user1_reviews, user2_reviews, vectorizer):
    """Compute cosine similarity between reviews of two users."""
    # Combine reviews from both users
    combined_reviews = user1_reviews + user2_reviews
    
    # Transform reviews to TF-IDF vectors
    vectors = vectorizer.transform(combined_reviews)
    
    # Compute cosine similarity between user 1 and user 2
    similarity_matrix = cosine_similarity(vectors[:len(user1_reviews)], vectors[len(user1_reviews):])
    
    # Return the average similarity score
    return similarity_matrix.mean()

# Define your users
user1 = '53rdcard'
user2 = 'Aaron'

# Get reviews for each user
user1_reviews = get_user_reviews(df, user1)
user2_reviews = get_user_reviews(df, user2)

# Vectorize the reviews using TF-IDF
tfidf = TfidfVectorizer(max_df=0.7)
tfidf.fit(df['reviewText'])  # Fit on all reviews in the dataset

# Compute similarity score
similarity_score = compute_similarity(user1_reviews, user2_reviews, tfidf)
print(f"Similarity Score between {user1} and {user2}: {similarity_score}")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Moneykicks\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Moneykicks\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Similarity Score between 53rdcard and Aaron: 0.025672492209915476


In [13]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, LSTM, Dense, Bidirectional, SpatialDropout1D
from imblearn.over_sampling import SMOTE
import re
from googletrans import Translator 
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

# Load NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize Translator
translator = Translator()

# Load the original dataset
df_original = pd.read_csv('5kReviewWithSentimentAmazon.csv')

# Keep 'reviewerName', 'reviewText', and 'Stance' columns
df = df_original[['reviewerName', 'reviewText', 'Stance']]

# Drop rows where any of the required columns are NaN
df = df.dropna(subset=['reviewerName', 'reviewText', 'Stance'])

# Ensure all entries in 'reviewText' are strings
df['reviewText'] = df['reviewText'].astype(str)

# For full review display without truncation
pd.set_option('display.max_colwidth', None)

# Print value counts of 'Stance' column
print(df['Stance'].value_counts())

# Text Preprocessing
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words])  # Lemmatize and remove stopwords
    return text

df['reviewText'] = df['reviewText'].apply(preprocess_text)

# Define back-translation function
def back_translate(text, src_language='en', mid_language='fr'):
    try:
        translated_text = translator.translate(text, src=src_language, dest=mid_language).text
        back_translated_text = translator.translate(translated_text, src=mid_language, dest=src_language).text
        return back_translated_text
    except Exception as e:
        print(f"Error during back-translation: {e}")
        return text

# Define function to augment minority class
def augment_minority_class(df, class_label, src_language='en', mid_language='fr'):
    minority_texts = df[df['Stance'] == class_label]['reviewText'].tolist()
    augmented_texts = [back_translate(text, src_language, mid_language) for text in minority_texts]
    augmented_labels = [class_label] * len(augmented_texts)
    return pd.DataFrame({'reviewText': augmented_texts, 'Stance': augmented_labels})

# Define threshold for minority class
threshold = 100  # Adjust based on your needs

# Identify minority classes
class_counts = df['Stance'].value_counts()
minority_classes = class_counts[class_counts < threshold].index.tolist()

# Apply back-translation for all minority classes
augmented_df_list = [augment_minority_class(df, cls) for cls in minority_classes]
df_augmented = pd.concat([df] + augmented_df_list, ignore_index=True)

# Text Vectorization for ML models
tfidf = TfidfVectorizer(max_df=0.7)
X = tfidf.fit_transform(df_augmented['reviewText'])
y = df_augmented['Stance']

# Handle class imbalance using SMOTE for traditional ML models
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X, y)

# Split dataset for ML models
X_train_ml, X_test_ml, y_train_ml, y_test_ml = train_test_split(X_smote, y_smote, test_size=0.2, random_state=42)

# Train and evaluate ML models
models = {
    'Naive Bayes': MultinomialNB(),
    'SVM': SVC(kernel='linear', probability=True),  # Set probability=True for SVM
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42)
}

# Confidence threshold
confidence_threshold = 0.8

def trust_based_accuracy(y_true, y_pred, confidence_scores, threshold):
    mask = confidence_scores >= threshold
    y_true_trust = y_true[mask]
    y_pred_trust = y_pred[mask]
    if len(y_true_trust) == 0:
        return None  # No predictions above the threshold
    return accuracy_score(y_true_trust, y_pred_trust)

for name, model in models.items():
    model.fit(X_train_ml, y_train_ml)
    y_pred = model.predict(X_test_ml)
    y_probs = model.predict_proba(X_test_ml)
    confidence_scores = y_probs.max(axis=1)
    
    # Calculate standard accuracy
    accuracy = accuracy_score(y_test_ml, y_pred)
    report = classification_report(y_test_ml, y_pred)
    print(f"{name} Results:")
    print(f"Accuracy: {accuracy}")
    print("Classification Report:")
    print(report)
    
    # Calculate trust-based accuracy
    trust_accuracy = trust_based_accuracy(y_test_ml, y_pred, confidence_scores, confidence_threshold)
    if trust_accuracy is not None:
        print(f"Trust-Based Accuracy for {name}: {trust_accuracy}")
    else:
        print(f"No predictions above the confidence threshold for {name}")

# Tokenizer and padding for deep learning models
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(df_augmented['reviewText'])
X_seq = tokenizer.texts_to_sequences(df_augmented['reviewText'])
X_pad = pad_sequences(X_seq, maxlen=100)
y_encoded = pd.get_dummies(df_augmented['Stance']).values

# Split dataset for deep learning models
X_train_dl, X_test_dl, y_train_dl, y_test_dl = train_test_split(X_pad, y_encoded, test_size=0.2, random_state=42)

# Define and train CNN model
cnn_model = Sequential([
    Embedding(5000, 128, input_length=100),
    Conv1D(filters=128, kernel_size=5, padding='same', activation='relu'),
    GlobalMaxPooling1D(),
    Dense(128, activation='relu'),
    Dense(y_encoded.shape[1], activation='softmax')
])
cnn_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
cnn_model.fit(X_train_dl, y_train_dl, epochs=5, batch_size=64, validation_data=(X_test_dl, y_test_dl), verbose=2)

cnn_y_pred = cnn_model.predict(X_test_dl)
cnn_y_pred_labels = cnn_y_pred.argmax(axis=1)
y_test_dl_labels = y_test_dl.argmax(axis=1)
cnn_accuracy = accuracy_score(y_test_dl_labels, cnn_y_pred_labels)
cnn_report = classification_report(y_test_dl_labels, cnn_y_pred_labels)
print("CNN Results:")
print(f"Accuracy: {cnn_accuracy}")
print("Classification Report:")
print(cnn_report)

# Calculate confidence scores for CNN
cnn_confidence_scores = cnn_y_pred.max(axis=1)

# Calculate trust-based accuracy for CNN
cnn_trust_accuracy = trust_based_accuracy(y_test_dl_labels, cnn_y_pred_labels, cnn_confidence_scores, confidence_threshold)
if cnn_trust_accuracy is not None:
    print(f"Trust-Based Accuracy for CNN: {cnn_trust_accuracy}")
else:
    print(f"No predictions above the confidence threshold for CNN")

# Define and train LSTM model
lstm_model = Sequential([
    Embedding(5000, 128, input_length=100),
    SpatialDropout1D(0.2),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(y_encoded.shape[1], activation='softmax')
])
lstm_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
lstm_model.fit(X_train_dl, y_train_dl, epochs=5, batch_size=64, validation_data=(X_test_dl, y_test_dl), verbose=2)

lstm_y_pred = lstm_model.predict(X_test_dl)
lstm_y_pred_labels = lstm_y_pred.argmax(axis=1)
lstm_accuracy = accuracy_score(y_test_dl_labels, lstm_y_pred_labels)
lstm_report = classification_report(y_test_dl_labels, lstm_y_pred_labels)
print("LSTM Results:")
print(f"Accuracy: {lstm_accuracy}")
print("Classification Report:")
print(lstm_report)

# Calculate confidence scores for LSTM
lstm_confidence_scores = lstm_y_pred.max(axis=1)

# Calculate trust-based accuracy for LSTM
lstm_trust_accuracy = trust_based_accuracy(y_test_dl_labels, lstm_y_pred_labels, lstm_confidence_scores, confidence_threshold)
if lstm_trust_accuracy is not None:
    print(f"Trust-Based Accuracy for LSTM: {lstm_trust_accuracy}")
else:
    print(f"No predictions above the confidence threshold for LSTM")

# Define and train BiLSTM model
bilstm_model = Sequential([
    Embedding(5000, 128, input_length=100),
    SpatialDropout1D(0.2),
    Bidirectional(LSTM(100, dropout=0.2, recurrent_dropout=0.2)),
    Dense(y_encoded.shape[1], activation='softmax')
])
bilstm_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
bilstm_model.fit(X_train_dl, y_train_dl, epochs=5, batch_size=64, validation_data=(X_test_dl, y_test_dl), verbose=2)

bilstm_y_pred = bilstm_model.predict(X_test_dl)
bilstm_y_pred_labels = bilstm_y_pred.argmax(axis=1)
bilstm_accuracy = accuracy_score(y_test_dl_labels, bilstm_y_pred_labels)
bilstm_report = classification_report(y_test_dl_labels, bilstm_y_pred_labels)
print("BiLSTM Results:")
print(f"Accuracy: {bilstm_accuracy}")
print("Classification Report:")
print(bilstm_report)

# Calculate confidence scores for BiLSTM
bilstm_confidence_scores = bilstm_y_pred.max(axis=1)

# Calculate trust-based accuracy for BiLSTM
bilstm_trust_accuracy = trust_based_accuracy(y_test_dl_labels, bilstm_y_pred_labels, bilstm_confidence_scores, confidence_threshold)
if bilstm_trust_accuracy is not None:
    print(f"Trust-Based Accuracy for BiLSTM: {bilstm_trust_accuracy}")
else:
    print(f"No predictions above the confidence threshold for BiLSTM")

# Compare all models
accuracy_scores_all = {
    'Naive Bayes': accuracy_score(y_test_ml, models['Naive Bayes'].predict(X_test_ml)),
    'SVM': accuracy_score(y_test_ml, models['SVM'].predict(X_test_ml)),
    'Random Forest': accuracy_score(y_test_ml, models['Random Forest'].predict(X_test_ml)),
    'CNN': cnn_accuracy,
    'LSTM': lstm_accuracy,
    'BiLSTM': bilstm_accuracy
}

best_model = max(accuracy_scores_all, key=accuracy_scores_all.get)
print(f"\nThe best model based on accuracy is: {best_model} with accuracy {accuracy_scores_all[best_model]}")

trust_accuracy_scores_all = {
    'Naive Bayes': trust_based_accuracy(y_test_ml, models['Naive Bayes'].predict(X_test_ml), models['Naive Bayes'].predict_proba(X_test_ml).max(axis=1), confidence_threshold),
    'SVM': trust_based_accuracy(y_test_ml, models['SVM'].predict(X_test_ml), models['SVM'].predict_proba(X_test_ml).max(axis=1), confidence_threshold),
    'Random Forest': trust_based_accuracy(y_test_ml, models['Random Forest'].predict(X_test_ml), models['Random Forest'].predict_proba(X_test_ml).max(axis=1), confidence_threshold),
    'CNN': cnn_trust_accuracy,
    'LSTM': lstm_trust_accuracy,
    'BiLSTM': bilstm_trust_accuracy
}

best_trust_model = max(trust_accuracy_scores_all, key=lambda k: trust_accuracy_scores_all[k] if trust_accuracy_scores_all[k] is not None else -1)
print(f"\nThe best model based on trust-based accuracy is: {best_trust_model} with trust-based accuracy {trust_accuracy_scores_all[best_trust_model]}")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Moneykicks\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Moneykicks\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Stance
Supportive    4162
Opposed        469
Neutral        282
Name: count, dtype: int64
Naive Bayes Results:
Accuracy: 0.9187349879903923
Classification Report:
              precision    recall  f1-score   support

     Neutral       0.96      0.98      0.97       826
     Opposed       0.84      0.97      0.90       816
  Supportive       0.97      0.82      0.89       856

    accuracy                           0.92      2498
   macro avg       0.92      0.92      0.92      2498
weighted avg       0.93      0.92      0.92      2498

Trust-Based Accuracy for Naive Bayes: 0.998661311914324
SVM Results:
Accuracy: 0.9655724579663731
Classification Report:
              precision    recall  f1-score   support

     Neutral       0.96      1.00      0.98       826
     Opposed       0.94      1.00      0.97       816
  Supportive       1.00      0.90      0.95       856

    accuracy                           0.97      2498
   macro avg       0.97      0.97      0.97      2498
weighted 



62/62 - 3s - 45ms/step - accuracy: 0.8377 - loss: 0.5426 - val_accuracy: 0.8383 - val_loss: 0.4392
Epoch 2/5
62/62 - 2s - 26ms/step - accuracy: 0.8550 - loss: 0.3507 - val_accuracy: 0.8494 - val_loss: 0.3672
Epoch 3/5
62/62 - 2s - 26ms/step - accuracy: 0.9313 - loss: 0.2035 - val_accuracy: 0.8688 - val_loss: 0.4090
Epoch 4/5
62/62 - 2s - 29ms/step - accuracy: 0.9804 - loss: 0.0724 - val_accuracy: 0.8698 - val_loss: 0.4781
Epoch 5/5
62/62 - 2s - 25ms/step - accuracy: 0.9969 - loss: 0.0181 - val_accuracy: 0.8616 - val_loss: 0.5870
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
CNN Results:
Accuracy: 0.861648016276704
Classification Report:
              precision    recall  f1-score   support

           0       0.56      0.39      0.46        57
           1       0.60      0.26      0.37       102
           2       0.89      0.97      0.93       824

    accuracy                           0.86       983
   macro avg       0.68      0.54      0.58       983
we



62/62 - 8s - 125ms/step - accuracy: 0.8412 - loss: 0.5849 - val_accuracy: 0.8383 - val_loss: 0.5243
Epoch 2/5
62/62 - 6s - 96ms/step - accuracy: 0.8527 - loss: 0.4485 - val_accuracy: 0.8383 - val_loss: 0.4274
Epoch 3/5
62/62 - 6s - 90ms/step - accuracy: 0.8756 - loss: 0.3128 - val_accuracy: 0.8535 - val_loss: 0.3967
Epoch 4/5
62/62 - 6s - 90ms/step - accuracy: 0.9183 - loss: 0.2240 - val_accuracy: 0.8616 - val_loss: 0.4052
Epoch 5/5
62/62 - 6s - 90ms/step - accuracy: 0.9489 - loss: 0.1511 - val_accuracy: 0.8596 - val_loss: 0.5094
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step
LSTM Results:
Accuracy: 0.8596134282807731
Classification Report:
              precision    recall  f1-score   support

           0       0.61      0.35      0.44        57
           1       0.53      0.25      0.34       102
           2       0.89      0.97      0.93       824

    accuracy                           0.86       983
   macro avg       0.67      0.53      0.57       98



62/62 - 14s - 219ms/step - accuracy: 0.8486 - loss: 0.5585 - val_accuracy: 0.8383 - val_loss: 0.4984
Epoch 2/5
62/62 - 10s - 158ms/step - accuracy: 0.8542 - loss: 0.3851 - val_accuracy: 0.8444 - val_loss: 0.3998
Epoch 3/5
62/62 - 10s - 157ms/step - accuracy: 0.8873 - loss: 0.2791 - val_accuracy: 0.8505 - val_loss: 0.4008
Epoch 4/5
62/62 - 10s - 164ms/step - accuracy: 0.9120 - loss: 0.2159 - val_accuracy: 0.8545 - val_loss: 0.4435
Epoch 5/5
62/62 - 10s - 155ms/step - accuracy: 0.9405 - loss: 0.1639 - val_accuracy: 0.8606 - val_loss: 0.5533
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 29ms/step
BiLSTM Results:
Accuracy: 0.8606307222787386
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.26      0.40        57
           1       0.51      0.27      0.36       102
           2       0.88      0.97      0.93       824

    accuracy                           0.86       983
   macro avg       0.74      0.50      0.

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, LSTM, Dense, Bidirectional, SpatialDropout1D
from imblearn.over_sampling import SMOTE
import re
from googletrans import Translator
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
from sklearn.decomposition import LatentDirichletAllocation

# Load NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize Translator
translator = Translator()

# Load the original dataset
df_original = pd.read_csv('5kReviewWithSentimentAmazon.csv')

# Keep 'reviewerName', 'reviewText', and 'Stance' columns
df = df_original[['reviewerName', 'reviewText', 'Stance']]

# Drop rows where any of the required columns are NaN
df = df.dropna(subset=['reviewerName', 'reviewText', 'Stance'])

# Ensure all entries in 'reviewText' are strings
df['reviewText'] = df['reviewText'].astype(str)

# For full review display without truncation
pd.set_option('display.max_colwidth', None)

# Print value counts of 'Stance' column
print(df['Stance'].value_counts())

# Text Preprocessing
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words])  # Lemmatize and remove stopwords
    return text

df['reviewText'] = df['reviewText'].apply(preprocess_text)

# Define back-translation function
def back_translate(text, src_language='en', mid_language='fr'):
    try:
        translated_text = translator.translate(text, src=src_language, dest=mid_language).text
        back_translated_text = translator.translate(translated_text, src=mid_language, dest=src_language).text
        return back_translated_text
    except Exception as e:
        print(f"Error during back-translation: {e}")
        return text

# Define function to augment minority class
def augment_minority_class(df, class_label, src_language='en', mid_language='fr'):
    minority_texts = df[df['Stance'] == class_label]['reviewText'].tolist()
    augmented_texts = [back_translate(text, src_language, mid_language) for text in minority_texts]
    augmented_labels = [class_label] * len(augmented_texts)
    return pd.DataFrame({'reviewText': augmented_texts, 'Stance': augmented_labels})

# Define threshold for minority class
threshold = 100  # Adjust based on your needs

# Identify minority classes
class_counts = df['Stance'].value_counts()
minority_classes = class_counts[class_counts < threshold].index.tolist()

# Apply back-translation for all minority classes
augmented_df_list = [augment_minority_class(df, cls) for cls in minority_classes]
df_augmented = pd.concat([df] + augmented_df_list, ignore_index=True)

# Text Vectorization for ML models
tfidf = TfidfVectorizer(max_df=0.7)
X = tfidf.fit_transform(df_augmented['reviewText'])
y = df_augmented['Stance']

# Handle class imbalance using SMOTE for traditional ML models
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X, y)

# Split dataset for ML models
X_train_ml, X_test_ml, y_train_ml, y_test_ml = train_test_split(X_smote, y_smote, test_size=0.2, random_state=42)

# Train and evaluate ML models
models = {
    'Naive Bayes': MultinomialNB(),
    'SVM': SVC(kernel='linear', probability=True),  # Set probability=True for SVM
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42)
}

# Confidence threshold
confidence_threshold = 0.8

def trust_based_accuracy(y_true, y_pred, confidence_scores, threshold):
    mask = confidence_scores >= threshold
    y_true_trust = y_true[mask]
    y_pred_trust = y_pred[mask]
    if len(y_true_trust) == 0:
        return None  # No predictions above the threshold
    return accuracy_score(y_true_trust, y_pred_trust)

for name, model in models.items():
    model.fit(X_train_ml, y_train_ml)
    y_pred = model.predict(X_test_ml)
    y_probs = model.predict_proba(X_test_ml)
    confidence_scores = y_probs.max(axis=1)
    
    # Calculate standard accuracy
    accuracy = accuracy_score(y_test_ml, y_pred)
    report = classification_report(y_test_ml, y_pred)
    print(f"{name} Results:")
    print(f"Accuracy: {accuracy}")
    print("Classification Report:")
    print(report)
    
    # Calculate trust-based accuracy
    trust_accuracy = trust_based_accuracy(y_test_ml, y_pred, confidence_scores, confidence_threshold)
    if trust_accuracy is not None:
        print(f"Trust-Based Accuracy for {name}: {trust_accuracy}")
    else:
        print(f"No predictions above the confidence threshold for {name}")

# Tokenizer and padding for deep learning models
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(df_augmented['reviewText'])
X_seq = tokenizer.texts_to_sequences(df_augmented['reviewText'])
X_pad = pad_sequences(X_seq, maxlen=100)
y_encoded = pd.get_dummies(df_augmented['Stance']).values

# Split dataset for deep learning models
X_train_dl, X_test_dl, y_train_dl, y_test_dl = train_test_split(X_pad, y_encoded, test_size=0.2, random_state=42)

# Define and train CNN model
cnn_model = Sequential([
    Embedding(5000, 128, input_length=100),
    Conv1D(filters=128, kernel_size=5, padding='same', activation='relu'),
    GlobalMaxPooling1D(),
    Dense(128, activation='relu'),
    Dense(y_encoded.shape[1], activation='softmax')
])
cnn_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
cnn_model.fit(X_train_dl, y_train_dl, epochs=5, batch_size=64, validation_data=(X_test_dl, y_test_dl), verbose=2)

cnn_y_pred = cnn_model.predict(X_test_dl)
cnn_y_pred_labels = cnn_y_pred.argmax(axis=1)
y_test_dl_labels = y_test_dl.argmax(axis=1)
cnn_accuracy = accuracy_score(y_test_dl_labels, cnn_y_pred_labels)
cnn_report = classification_report(y_test_dl_labels, cnn_y_pred_labels)
print("CNN Results:")
print(f"Accuracy: {cnn_accuracy}")
print("Classification Report:")
print(cnn_report)

# Calculate confidence scores for CNN
cnn_confidence_scores = cnn_y_pred.max(axis=1)

# Calculate trust-based accuracy for CNN
cnn_trust_accuracy = trust_based_accuracy(y_test_dl_labels, cnn_y_pred_labels, cnn_confidence_scores, confidence_threshold)
if cnn_trust_accuracy is not None:
    print(f"Trust-Based Accuracy for CNN: {cnn_trust_accuracy}")
else:
    print(f"No predictions above the confidence threshold for CNN")

# Define and train LSTM model
lstm_model = Sequential([
    Embedding(5000, 128, input_length=100),
    SpatialDropout1D(0.2),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(y_encoded.shape[1], activation='softmax')
])
lstm_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
lstm_model.fit(X_train_dl, y_train_dl, epochs=5, batch_size=64, validation_data=(X_test_dl, y_test_dl), verbose=2)

lstm_y_pred = lstm_model.predict(X_test_dl)
lstm_y_pred_labels = lstm_y_pred.argmax(axis=1)
lstm_accuracy = accuracy_score(y_test_dl_labels, lstm_y_pred_labels)
lstm_report = classification_report(y_test_dl_labels, lstm_y_pred_labels)
print("LSTM Results:")
print(f"Accuracy: {lstm_accuracy}")
print("Classification Report:")
print(lstm_report)

# Calculate confidence scores for LSTM
lstm_confidence_scores = lstm_y_pred.max(axis=1)

# Calculate trust-based accuracy for LSTM
lstm_trust_accuracy = trust_based_accuracy(y_test_dl_labels, lstm_y_pred_labels, lstm_confidence_scores, confidence_threshold)
if lstm_trust_accuracy is not None:
    print(f"Trust-Based Accuracy for LSTM: {lstm_trust_accuracy}")
else:
    print(f"No predictions above the confidence threshold for LSTM")

# Define and train BiLSTM model
bilstm_model = Sequential([
    Embedding(5000, 128, input_length=100),
    SpatialDropout1D(0.2),
    Bidirectional(LSTM(100, dropout=0.2, recurrent_dropout=0.2)),
    Dense(y_encoded.shape[1], activation='softmax')
])
bilstm_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
bilstm_model.fit(X_train_dl, y_train_dl, epochs=5, batch_size=64, validation_data=(X_test_dl, y_test_dl), verbose=2)

bilstm_y_pred = bilstm_model.predict(X_test_dl)
bilstm_y_pred_labels = bilstm_y_pred.argmax(axis=1)
bilstm_accuracy = accuracy_score(y_test_dl_labels, bilstm_y_pred_labels)
bilstm_report = classification_report(y_test_dl_labels, bilstm_y_pred_labels)
print("BiLSTM Results:")
print(f"Accuracy: {bilstm_accuracy}")
print("Classification Report:")
print(bilstm_report)

# Calculate confidence scores for BiLSTM
bilstm_confidence_scores = bilstm_y_pred.max(axis=1)

# Calculate trust-based accuracy for BiLSTM
bilstm_trust_accuracy = trust_based_accuracy(y_test_dl_labels, bilstm_y_pred_labels, bilstm_confidence_scores, confidence_threshold)
if bilstm_trust_accuracy is not None:
    print(f"Trust-Based Accuracy for BiLSTM: {bilstm_trust_accuracy}")
else:
    print(f"No predictions above the confidence threshold for BiLSTM")

# # Topic Modeling with LDA
# # Vectorize the text data using TF-IDF for topic modeling
# tfidf_vectorizer = TfidfVectorizer(max_df=0.7, stop_words='english')
# tfidf_matrix = tfidf_vectorizer.fit_transform(df['reviewText'])

# # Number of topics
# num_topics = 5

# # Define the LDA model
# lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)

# # Fit the LDA model
# lda.fit(tfidf_matrix)

# # Get the topics
# def display_topics(model, feature_names, num_top_words):
#     for topic_idx, topic in enumerate(model.components_):
#         print(f"Topic {topic_idx + 1}:")
#         print(" ".join([feature_names[i] for i in topic.argsort()[:-num_top_words - 1:-1]]))

# tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
# display_topics(lda, tfidf_feature_names, 10)

# # Assign topic labels to each review
# topic_assignments = lda.transform(tfidf_matrix)
# df['Topic'] = topic_assignments.argmax(axis=1)

# # Print the first few rows to see the assigned topics
# print(df.head())


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Moneykicks\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Moneykicks\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Stance
Supportive    4162
Opposed        469
Neutral        282
Name: count, dtype: int64
Naive Bayes Results:
Accuracy: 0.9187349879903923
Classification Report:
              precision    recall  f1-score   support

     Neutral       0.96      0.98      0.97       826
     Opposed       0.84      0.97      0.90       816
  Supportive       0.97      0.82      0.89       856

    accuracy                           0.92      2498
   macro avg       0.92      0.92      0.92      2498
weighted avg       0.93      0.92      0.92      2498

Trust-Based Accuracy for Naive Bayes: 0.998661311914324
SVM Results:
Accuracy: 0.9655724579663731
Classification Report:
              precision    recall  f1-score   support

     Neutral       0.96      1.00      0.98       826
     Opposed       0.94      1.00      0.97       816
  Supportive       1.00      0.90      0.95       856

    accuracy                           0.97      2498
   macro avg       0.97      0.97      0.97      2498
weighted 



62/62 - 3s - 52ms/step - accuracy: 0.8494 - loss: 0.5262 - val_accuracy: 0.8383 - val_loss: 0.4330
Epoch 2/5
62/62 - 2s - 27ms/step - accuracy: 0.8567 - loss: 0.3460 - val_accuracy: 0.8515 - val_loss: 0.3579
Epoch 3/5
62/62 - 2s - 27ms/step - accuracy: 0.9336 - loss: 0.2111 - val_accuracy: 0.8606 - val_loss: 0.4053
Epoch 4/5
62/62 - 2s - 28ms/step - accuracy: 0.9784 - loss: 0.0793 - val_accuracy: 0.8616 - val_loss: 0.4791
Epoch 5/5
62/62 - 2s - 27ms/step - accuracy: 0.9964 - loss: 0.0199 - val_accuracy: 0.8637 - val_loss: 0.5578
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
CNN Results:
Accuracy: 0.8636826042726348
Classification Report:
              precision    recall  f1-score   support

           0       0.53      0.35      0.42        57
           1       0.56      0.33      0.42       102
           2       0.90      0.96      0.93       824

    accuracy                           0.86       983
   macro avg       0.66      0.55      0.59       983
w



62/62 - 7s - 120ms/step - accuracy: 0.8405 - loss: 0.5864 - val_accuracy: 0.8383 - val_loss: 0.5198
Epoch 2/5
62/62 - 5s - 77ms/step - accuracy: 0.8501 - loss: 0.4510 - val_accuracy: 0.8423 - val_loss: 0.4230
Epoch 3/5
62/62 - 5s - 80ms/step - accuracy: 0.8753 - loss: 0.3166 - val_accuracy: 0.8535 - val_loss: 0.3836
Epoch 4/5
62/62 - 5s - 75ms/step - accuracy: 0.9265 - loss: 0.2141 - val_accuracy: 0.8566 - val_loss: 0.4821
Epoch 5/5
62/62 - 5s - 75ms/step - accuracy: 0.9458 - loss: 0.1518 - val_accuracy: 0.8555 - val_loss: 0.4522
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step
LSTM Results:
Accuracy: 0.8555442522889115
Classification Report:
              precision    recall  f1-score   support

           0       0.47      0.33      0.39        57
           1       0.53      0.28      0.37       102
           2       0.89      0.96      0.93       824

    accuracy                           0.86       983
   macro avg       0.63      0.53      0.56       98



62/62 - 12s - 193ms/step - accuracy: 0.8394 - loss: 0.5527 - val_accuracy: 0.8393 - val_loss: 0.5067
Epoch 2/5
62/62 - 7s - 120ms/step - accuracy: 0.8529 - loss: 0.3914 - val_accuracy: 0.8474 - val_loss: 0.4270
Epoch 3/5
62/62 - 7s - 111ms/step - accuracy: 0.8885 - loss: 0.2749 - val_accuracy: 0.8606 - val_loss: 0.3841
Epoch 4/5
62/62 - 7s - 117ms/step - accuracy: 0.9198 - loss: 0.2037 - val_accuracy: 0.8616 - val_loss: 0.4488
Epoch 5/5
62/62 - 8s - 128ms/step - accuracy: 0.9481 - loss: 0.1465 - val_accuracy: 0.8545 - val_loss: 0.4815
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 33ms/step
BiLSTM Results:
Accuracy: 0.854526958290946
Classification Report:
              precision    recall  f1-score   support

           0       0.68      0.30      0.41        57
           1       0.46      0.36      0.41       102
           2       0.90      0.95      0.92       824

    accuracy                           0.85       983
   macro avg       0.68      0.54      0.58   