In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, LSTM, Dense, Bidirectional, SpatialDropout1D
from imblearn.over_sampling import SMOTE
import re
from googletrans import Translator 
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
from sklearn.decomposition import LatentDirichletAllocation



In [2]:
# Load NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize Translator
translator = Translator()

# Load the original dataset
df_original = pd.read_csv('171kReviewWithSentiment.csv')

# Keep 'Summary' and 'Stance' columns
df = df_original[['Summary', 'Stance']]

# Drop rows where any of the required columns are NaN
df = df.dropna(subset=['Summary', 'Stance'])

# Ensure all entries in 'Summary' are strings
df['Summary'] = df['Summary'].astype(str)

# Print value counts of 'Stance' column
print(df['Stance'].value_counts())

# Text Preprocessing
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Moneykicks\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Moneykicks\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Stance
supportive    25913
neutral       25790
oppose        25001
Name: count, dtype: int64


In [4]:
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words])  # Lemmatize and remove stopwords
    return text

df['Summary'] = df['Summary'].apply(preprocess_text)

# Define back-translation function
def back_translate(text, src_language='en', mid_language='fr'):
    try:
        translated_text = translator.translate(text, src=src_language, dest=mid_language).text
        back_translated_text = translator.translate(translated_text, src=mid_language, dest=src_language).text
        return back_translated_text
    except Exception as e:
        print(f"Error during back-translation: {e}")
        return text

# Define function to augment minority class
def augment_minority_class(df, class_label, src_language='en', mid_language='fr'):
    minority_texts = df[df['Stance'] == class_label]['Summary'].tolist()
    augmented_texts = [back_translate(text, src_language, mid_language) for text in minority_texts]
    augmented_labels = [class_label] * len(augmented_texts)
    return pd.DataFrame({'Summary': augmented_texts, 'Stance': augmented_labels})

# Define threshold for minority class
threshold = 100  # Adjust based on your needs

# Identify minority classes
class_counts = df['Stance'].value_counts()
minority_classes = class_counts[class_counts < threshold].index.tolist()

# Apply back-translation for all minority classes
augmented_df_list = [augment_minority_class(df, cls) for cls in minority_classes]
df_augmented = pd.concat([df] + augmented_df_list, ignore_index=True)



In [5]:
# Vectorize text data for LDA
count_vectorizer_lda = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
text_data_vectorized_lda = count_vectorizer_lda.fit_transform(df_augmented['Summary'])

# Fit LDA model
lda_model = LatentDirichletAllocation(n_components=10, random_state=42)
lda_model.fit(text_data_vectorized_lda)

# Transform data into topic distributions
topic_distributions_lda = lda_model.transform(text_data_vectorized_lda)

# Add topic distributions to the dataframe
for i in range(10):  # Adjust based on the number of topics (n_components) in LDA
    df_augmented[f'topic_{i}_lda'] = topic_distributions_lda[:, i]

# Text Vectorization for ML models
tfidf = TfidfVectorizer(max_df=0.7)
X = tfidf.fit_transform(df_augmented['Summary'])
y = df_augmented['Stance']

# Handle class imbalance using SMOTE for traditional ML models
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X, y)

# Split dataset for ML models
X_train_ml, X_test_ml, y_train_ml, y_test_ml = train_test_split(X_smote, y_smote, test_size=0.2, random_state=42)

# Train and evaluate ML models
models = {
    'Naive Bayes': MultinomialNB(),
    'SVM': SVC(kernel='linear', probability=True),  # Set probability=True for SVM
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42)
}



In [6]:
# Confidence threshold
confidence_threshold = 0.8

def trust_based_accuracy(y_true, y_pred, confidence_scores, threshold):
    mask = confidence_scores >= threshold
    y_true_trust = y_true[mask]
    y_pred_trust = y_pred[mask]
    if len(y_true_trust) == 0:
        return None  # No predictions above the threshold
    return accuracy_score(y_true_trust, y_pred_trust)

for name, model in models.items():
    model.fit(X_train_ml, y_train_ml)
    y_pred = model.predict(X_test_ml)
    y_probs = model.predict_proba(X_test_ml)
    confidence_scores = y_probs.max(axis=1)
    
    # Calculate standard accuracy
    accuracy = accuracy_score(y_test_ml, y_pred)
    report = classification_report(y_test_ml, y_pred)
    print(f"{name} Results:")
    print(f"Accuracy: {accuracy}")
    print("Classification Report:")
    print(report)
    
    # Calculate trust-based accuracy
    trust_accuracy = trust_based_accuracy(y_test_ml, y_pred, confidence_scores, confidence_threshold)
    if trust_accuracy is not None:
        print(f"Trust-Based Accuracy for {name}: {trust_accuracy}")
    else:
        print(f"No predictions above the confidence threshold for {name}")



Naive Bayes Results:
Accuracy: 0.792191921790584
Classification Report:
              precision    recall  f1-score   support

     neutral       0.81      0.67      0.73      5279
      oppose       0.75      0.82      0.78      5160
  supportive       0.82      0.90      0.86      5109

    accuracy                           0.79     15548
   macro avg       0.79      0.79      0.79     15548
weighted avg       0.79      0.79      0.79     15548

Trust-Based Accuracy for Naive Bayes: 0.9522505562211193
SVM Results:
Accuracy: 0.8040262413172112
Classification Report:
              precision    recall  f1-score   support

     neutral       0.77      0.75      0.76      5279
      oppose       0.78      0.78      0.78      5160
  supportive       0.86      0.88      0.87      5109

    accuracy                           0.80     15548
   macro avg       0.80      0.80      0.80     15548
weighted avg       0.80      0.80      0.80     15548

Trust-Based Accuracy for SVM: 0.909944134078

In [7]:
# Tokenizer and padding for deep learning models
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(df_augmented['Summary'])
X_seq_dl_lda = tokenizer.texts_to_sequences(df_augmented['Summary'])
X_pad_dl_lda = pad_sequences(X_seq_dl_lda, maxlen=100)
y_encoded = pd.get_dummies(df_augmented['Stance']).values

# Combine TF-IDF and LDA topics for deep learning models
X_combined_dl_lda = np.hstack((X_pad_dl_lda, topic_distributions_lda))

# Split dataset for deep learning models
X_train_dl, X_test_dl, y_train_dl, y_test_dl = train_test_split(X_combined_dl_lda, y_encoded, test_size=0.2, random_state=42)



In [8]:
# Define and train CNN model with TF-IDF + LDA topics
cnn_model = Sequential([
    Embedding(5000, 128, input_length=100),
    Conv1D(filters=128, kernel_size=5, padding='same', activation='relu'),
    GlobalMaxPooling1D(),
    Dense(128, activation='relu'),
    Dense(y_encoded.shape[1], activation='softmax')
])
cnn_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
cnn_model.fit(X_train_dl, y_train_dl, epochs=5, batch_size=64, validation_data=(X_test_dl, y_test_dl), verbose=2)

cnn_y_pred = cnn_model.predict(X_test_dl)
cnn_y_pred_labels = cnn_y_pred.argmax(axis=1)
y_test_dl_labels = y_test_dl.argmax(axis=1)
cnn_accuracy = accuracy_score(y_test_dl_labels, cnn_y_pred_labels)
cnn_report = classification_report(y_test_dl_labels, cnn_y_pred_labels)
print("CNN Results:")
print(f"Accuracy: {cnn_accuracy}")
print("Classification Report:")
print(cnn_report)

# Calculate confidence scores for CNN
cnn_confidence_scores = cnn_y_pred.max(axis=1)

# Calculate trust-based accuracy for CNN
cnn_trust_accuracy = trust_based_accuracy(y_test_dl_labels, cnn_y_pred_labels, cnn_confidence_scores, confidence_threshold)
if cnn_trust_accuracy is not None:
    print(f"Trust-Based Accuracy for CNN: {cnn_trust_accuracy}")
else:
    print(f"No predictions above the confidence threshold for CNN")





Epoch 1/5
959/959 - 23s - 24ms/step - accuracy: 0.7918 - loss: 0.5182 - val_accuracy: 0.8188 - val_loss: 0.4608
Epoch 2/5
959/959 - 22s - 23ms/step - accuracy: 0.8456 - loss: 0.4013 - val_accuracy: 0.8236 - val_loss: 0.4500
Epoch 3/5
959/959 - 22s - 23ms/step - accuracy: 0.8727 - loss: 0.3389 - val_accuracy: 0.8219 - val_loss: 0.4713
Epoch 4/5
959/959 - 22s - 23ms/step - accuracy: 0.8910 - loss: 0.2941 - val_accuracy: 0.8175 - val_loss: 0.5015
Epoch 5/5
959/959 - 22s - 23ms/step - accuracy: 0.9006 - loss: 0.2697 - val_accuracy: 0.8224 - val_loss: 0.5235
[1m480/480[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step
CNN Results:
Accuracy: 0.8223714229841601
Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.77      0.78      5257
           1       0.82      0.79      0.80      4976
           2       0.85      0.91      0.88      5108

    accuracy                           0.82     15341
   macro avg       0.82     

In [9]:
# Define and train LSTM model with TF-IDF + LDA topics
lstm_model = Sequential([
    Embedding(5000, 128, input_length=100),
    SpatialDropout1D(0.2),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(y_encoded.shape[1], activation='softmax')
])
lstm_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
lstm_model.fit(X_train_dl, y_train_dl, epochs=5, batch_size=64, validation_data=(X_test_dl, y_test_dl), verbose=2)

lstm_y_pred = lstm_model.predict(X_test_dl)
lstm_y_pred_labels = lstm_y_pred.argmax(axis=1)
lstm_accuracy = accuracy_score(y_test_dl_labels, lstm_y_pred_labels)
lstm_report = classification_report(y_test_dl_labels, lstm_y_pred_labels)
print("LSTM Results:")
print(f"Accuracy: {lstm_accuracy}")
print("Classification Report:")
print(lstm_report)

# Calculate confidence scores for LSTM
lstm_confidence_scores = lstm_y_pred.max(axis=1)

# Calculate trust-based accuracy for LSTM
lstm_trust_accuracy = trust_based_accuracy(y_test_dl_labels, lstm_y_pred_labels, lstm_confidence_scores, confidence_threshold)
if lstm_trust_accuracy is not None:
    print(f"Trust-Based Accuracy for LSTM: {lstm_trust_accuracy}")
else:
    print(f"No predictions above the confidence threshold for LSTM")


Epoch 1/5




959/959 - 68s - 71ms/step - accuracy: 0.7431 - loss: 0.6225 - val_accuracy: 0.7968 - val_loss: 0.5139
Epoch 2/5
959/959 - 66s - 68ms/step - accuracy: 0.8122 - loss: 0.4920 - val_accuracy: 0.8063 - val_loss: 0.4898
Epoch 3/5
959/959 - 66s - 69ms/step - accuracy: 0.8248 - loss: 0.4580 - val_accuracy: 0.8073 - val_loss: 0.4928
Epoch 4/5
959/959 - 66s - 69ms/step - accuracy: 0.8337 - loss: 0.4342 - val_accuracy: 0.8118 - val_loss: 0.4857
Epoch 5/5
959/959 - 67s - 70ms/step - accuracy: 0.8399 - loss: 0.4177 - val_accuracy: 0.8137 - val_loss: 0.4776
[1m480/480[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 12ms/step
LSTM Results:
Accuracy: 0.813701844729809
Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.75      0.77      5257
           1       0.80      0.79      0.79      4976
           2       0.84      0.91      0.87      5108

    accuracy                           0.81     15341
   macro avg       0.81      0.81    

In [10]:

# Define and train BiLSTM model with TF-IDF + LDA topics
bilstm_model = Sequential([
    Embedding(5000, 128, input_length=100),
    SpatialDropout1D(0.2),
    Bidirectional(LSTM(100, dropout=0.2, recurrent_dropout=0.2)),
    Dense(y_encoded.shape[1], activation='softmax')
])
bilstm_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
bilstm_model.fit(X_train_dl, y_train_dl, epochs=5, batch_size=64, validation_data=(X_test_dl, y_test_dl), verbose=2)

bilstm_y_pred = bilstm_model.predict(X_test_dl)
bilstm_y_pred_labels = bilstm_y_pred.argmax(axis=1)
bilstm_accuracy = accuracy_score(y_test_dl_labels, bilstm_y_pred_labels)
bilstm_report = classification_report(y_test_dl_labels, bilstm_y_pred_labels)
print("BiLSTM Results:")
print(f"Accuracy: {bilstm_accuracy}")
print("Classification Report:")
print(bilstm_report)

# Calculate confidence scores for BiLSTM
bilstm_confidence_scores = bilstm_y_pred.max(axis=1)

# Calculate trust-based accuracy for BiLSTM
bilstm_trust_accuracy = trust_based_accuracy(y_test_dl_labels, bilstm_y_pred_labels, bilstm_confidence_scores, confidence_threshold)
if bilstm_trust_accuracy is not None:
    print(f"Trust-Based Accuracy for BiLSTM: {bilstm_trust_accuracy}")
else:
    print(f"No predictions above the confidence threshold for BiLSTM")



Epoch 1/5




959/959 - 112s - 116ms/step - accuracy: 0.7457 - loss: 0.6194 - val_accuracy: 0.7999 - val_loss: 0.5190
Epoch 2/5
959/959 - 107s - 111ms/step - accuracy: 0.8096 - loss: 0.4949 - val_accuracy: 0.8051 - val_loss: 0.4911
Epoch 3/5
959/959 - 105s - 110ms/step - accuracy: 0.8236 - loss: 0.4592 - val_accuracy: 0.7998 - val_loss: 0.4919
Epoch 4/5
959/959 - 106s - 110ms/step - accuracy: 0.8317 - loss: 0.4384 - val_accuracy: 0.8086 - val_loss: 0.4805
Epoch 5/5
959/959 - 106s - 111ms/step - accuracy: 0.8381 - loss: 0.4208 - val_accuracy: 0.8116 - val_loss: 0.4843
[1m480/480[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 16ms/step
BiLSTM Results:
Accuracy: 0.8115507463659475
Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.75      0.77      5257
           1       0.79      0.79      0.79      4976
           2       0.85      0.90      0.87      5108

    accuracy                           0.81     15341
   macro avg       0.81 

In [11]:
# Compare all models
accuracy_scores_all = {
    'Naive Bayes': accuracy_score(y_test_ml, models['Naive Bayes'].predict(X_test_ml)),
    'SVM': accuracy_score(y_test_ml, models['SVM'].predict(X_test_ml)),
    'Random Forest': accuracy_score(y_test_ml, models['Random Forest'].predict(X_test_ml)),
    'CNN': cnn_accuracy,
    'LSTM': lstm_accuracy,
    'BiLSTM': bilstm_accuracy
}

best_model = max(accuracy_scores_all, key=accuracy_scores_all.get)
print(f"\nThe best model based on accuracy is: {best_model} with accuracy {accuracy_scores_all[best_model]}")

trust_accuracy_scores_all = {
    'Naive Bayes': trust_based_accuracy(y_test_ml, models['Naive Bayes'].predict(X_test_ml), models['Naive Bayes'].predict_proba(X_test_ml).max(axis=1), confidence_threshold),
    'SVM': trust_based_accuracy(y_test_ml, models['SVM'].predict(X_test_ml), models['SVM'].predict_proba(X_test_ml).max(axis=1), confidence_threshold),
    'Random Forest': trust_based_accuracy(y_test_ml, models['Random Forest'].predict(X_test_ml), models['Random Forest'].predict_proba(X_test_ml).max(axis=1), confidence_threshold),
    'CNN': cnn_trust_accuracy,
    'LSTM': lstm_trust_accuracy,
    'BiLSTM': bilstm_trust_accuracy
}

best_trust_model = max(trust_accuracy_scores_all, key=lambda k: trust_accuracy_scores_all[k] if trust_accuracy_scores_all[k] is not None else -1)
print(f"\nThe best model based on trust-based accuracy is: {best_trust_model} with trust-based accuracy {trust_accuracy_scores_all[best_trust_model]}")



The best model based on accuracy is: CNN with accuracy 0.8223714229841601

The best model based on trust-based accuracy is: Naive Bayes with trust-based accuracy 0.9522505562211193
