In [22]:
import pandas as pd
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils.class_weight import compute_class_weight
import nltk
from nltk.corpus import stopwords
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, LSTM, Dense, Bidirectional, SpatialDropout1D


In [21]:
# Make sure the NLTK stopword corpus is available. Look for a local copy first
# and only reach out to the network when it is missing, so the cell also runs
# offline (an unconditional download merely logs an error when the host is
# unreachable and re-contacts the server on every run).
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

# Load the reviews, keep only the text and its stance label, drop rows where
# either of the two is missing, and force the text column to plain strings.
# Building the frame in one chain avoids assigning into a sliced frame.
df = (
    pd.read_csv('5kReviewWithSentimentAmazon.csv')[['reviewText', 'Stance']]
    .dropna(subset=['reviewText', 'Stance'])
    .astype({'reviewText': str})
)

# Show full review text without truncation when frames are displayed
pd.set_option('display.max_colwidth', None)

# Class distribution of the target column
print(df['Stance'].value_counts())

Stance
Supportive    4162
Opposed        469
Neutral        283
Name: count, dtype: int64


[nltk_data] Error loading stopwords: <urlopen error [WinError 10065] A
[nltk_data]     socket operation was attempted to an unreachable host>


In [23]:
# Target labels: the raw 'Stance' strings
y = df['Stance']

# TF-IDF features for the classical models. NLTK's English stopwords are
# removed and terms appearing in more than 70% of the reviews are ignored
# (max_df=0.7).
# NOTE(review): the vectorizer is fitted on every review, i.e. before the
# train/test split done in a later cell, so its vocabulary and IDF statistics
# also reflect rows that end up in the test set.
stop_words = [word for word in stopwords.words('english')]
tfidf = TfidfVectorizer(max_df=0.7, stop_words=stop_words)
X = tfidf.fit_transform(df['reviewText'])

In [24]:
# Hold out 20% of the TF-IDF rows for testing (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a multinomial Naive Bayes classifier directly on the string labels
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

# Predict the stances of the held-out reviews
nb_y_pred = nb_model.predict(X_test)

# Evaluate. zero_division=0 reports 0.0 for a class the model never predicts
# instead of emitting an UndefinedMetricWarning for each affected metric;
# the numbers in the report are the same as with the default setting.
nb_accuracy = accuracy_score(y_test, nb_y_pred)
nb_report = classification_report(y_test, nb_y_pred, zero_division=0)
print("Naive Bayes Results:")
print(f"Accuracy: {nb_accuracy}")
print("Classification Report:")
print(nb_report)

Naive Bayes Results:
Accuracy: 0.8474059003051883
Classification Report:
              precision    recall  f1-score   support

     Neutral       1.00      0.02      0.04        52
     Opposed       0.00      0.00      0.00        99
  Supportive       0.85      1.00      0.92       832

    accuracy                           0.85       983
   macro avg       0.62      0.34      0.32       983
weighted avg       0.77      0.85      0.78       983



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [25]:
from sklearn.preprocessing import LabelEncoder

# Map the string stance labels to integer codes. The encoder learns its
# classes from the training labels and is then applied to both splits.
label_encoder = LabelEncoder().fit(y_train)
y_train_encoded, y_test_encoded = (
    label_encoder.transform(labels) for labels in (y_train, y_test)
)

In [26]:
import numpy as np

# 'balanced' class weights computed from the encoded training labels, stored
# as {position: weight} in the order of the sorted unique class codes.
encoded_classes = np.unique(y_train_encoded)
class_weights = compute_class_weight(
    class_weight='balanced', classes=encoded_classes, y=y_train_encoded
)
class_weights_dict = dict(enumerate(class_weights))


In [27]:
# Linear-kernel SVM trained on the integer-encoded labels, with the
# per-class weights computed earlier applied during fitting.
svm_model = SVC(class_weight=class_weights_dict, kernel='linear')
svm_model.fit(X_train, y_train_encoded)

# Predict encoded classes for the held-out rows, then map them back to the
# original stance names so they can be compared with y_test.
svm_y_pred_encoded = svm_model.predict(X_test)
svm_y_pred = label_encoder.inverse_transform(svm_y_pred_encoded)

# Score against the string labels of the test split and print the summary
svm_accuracy = accuracy_score(y_test, svm_y_pred)
svm_report = classification_report(y_test, svm_y_pred)
print(
    "SVM Results:",
    f"Accuracy: {svm_accuracy}",
    "Classification Report:",
    svm_report,
    sep="\n",
)

SVM Results:
Accuracy: 0.8250254323499492
Classification Report:
              precision    recall  f1-score   support

     Neutral       0.36      0.54      0.43        52
     Opposed       0.43      0.47      0.45        99
  Supportive       0.93      0.88      0.90       832

    accuracy                           0.83       983
   macro avg       0.57      0.63      0.60       983
weighted avg       0.85      0.83      0.83       983



In [28]:
# Encode the string stance labels as integers for the Random Forest
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# No explicit weight dictionary is built in this cell: the forest derives its
# own weights from class_weight='balanced'. Leaving `class_weights_dict`
# untouched keeps it keyed by the integer class codes that the SVM cell passes
# alongside y_train_encoded, so that cell can still be re-run afterwards.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
rf_model.fit(X_train, y_train_encoded)

# Predict encoded classes on the test set and decode them to stance names
rf_y_pred_encoded = rf_model.predict(X_test)
rf_y_pred = label_encoder.inverse_transform(rf_y_pred_encoded)

# Evaluate the Random Forest model against the string test labels
rf_accuracy = accuracy_score(y_test, rf_y_pred)
rf_report = classification_report(y_test, rf_y_pred)
print("Random Forest Results:")
print(f"Accuracy: {rf_accuracy}")
print("Classification Report:")
print(rf_report)


Random Forest Results:
Accuracy: 0.8494404883011191
Classification Report:
              precision    recall  f1-score   support

     Neutral       1.00      0.04      0.07        52
     Opposed       1.00      0.01      0.02        99
  Supportive       0.85      1.00      0.92       832

    accuracy                           0.85       983
   macro avg       0.95      0.35      0.34       983
weighted avg       0.87      0.85      0.78       983



In [29]:
# Test accuracy of the three classical models. The keys use the same model
# names as `accuracy_scores_all` and the branches of `predict_stance`
# ('Naive Bayes', not 'Naive'), so a `best_model` chosen here is a name that
# the prediction helper recognises.
accuracy_scores = {
    'Naive Bayes': nb_accuracy,
    'SVM': svm_accuracy,
    'Random Forest': rf_accuracy,
}

# Pick the model with the highest accuracy (first one wins on ties)
best_model = max(accuracy_scores, key=accuracy_scores.get)
print(f"\nThe best model based on accuracy is: {best_model} with accuracy {accuracy_scores[best_model]}")



The best model based on accuracy is: Random Forest with accuracy 0.8494404883011191


In [30]:
# Integer-sequence inputs for the deep learning models: fit a Keras tokenizer
# (num_words=5000) on all review texts, convert each review to a sequence of
# word indices, and pad/truncate every sequence to length 100.
review_texts = df['reviewText']

tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(review_texts)

X_seq = tokenizer.texts_to_sequences(review_texts)
X_pad = pad_sequences(X_seq, maxlen=100)


In [31]:
# One-hot encode the stance labels (one column per class) for the softmax
# output of the deep learning models.
y_encoded = pd.get_dummies(df['Stance']).to_numpy()

# 80/20 split of the padded sequences with the same seed as the TF-IDF split.
# NOTE: this rebinds X_train / X_test / y_train / y_test, which until now held
# the TF-IDF split used by the classical models.
X_train, X_test, y_train, y_test = train_test_split(
    X_pad, y_encoded, random_state=42, test_size=0.2
)

In [32]:
# 'balanced' class weights for the deep learning models. The one-hot training
# targets are collapsed to class indices once and reused for both arguments.
y_train_class_ids = y_train.argmax(axis=1)
class_weights_dl = compute_class_weight(
    'balanced', classes=np.unique(y_train_class_ids), y=y_train_class_ids
)
class_weights_dl_dict = dict(enumerate(class_weights_dl))

In [33]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Class weights for the deep learning models, derived from the one-hot targets
# of the split these models are actually trained on (y_train) rather than from
# label arrays left over from the TF-IDF experiments. Each weight is keyed by
# its real class index, which is the format Keras expects for `class_weight`.
y_train_labels = y_train.argmax(axis=1)
dl_classes = np.unique(y_train_labels)
class_weights = compute_class_weight('balanced', classes=dl_classes, y=y_train_labels)
class_weights_dl = {int(cls): float(weight) for cls, weight in zip(dl_classes, class_weights)}

# CNN Model: embedding -> 1D convolution -> global max pooling -> dense head.
# The Embedding layer takes no `input_length`; that argument is deprecated in
# current Keras and the sequence length is inferred from the padded input.
cnn_model = Sequential([
    Embedding(5000, 128),
    Conv1D(filters=128, kernel_size=5, padding='same', activation='relu'),
    GlobalMaxPooling1D(),
    Dense(128, activation='relu'),
    Dense(3, activation='softmax'),
])
cnn_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# NOTE(review): the test split doubles as validation data here, so the
# per-epoch val_* figures are computed on the same rows as the final report.
cnn_model.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_test, y_test), class_weight=class_weights_dl, verbose=2)

# Convert softmax probabilities and one-hot targets to class indices
cnn_y_pred = cnn_model.predict(X_test)
cnn_y_pred_labels = cnn_y_pred.argmax(axis=1)
y_test_labels = y_test.argmax(axis=1)

cnn_accuracy = accuracy_score(y_test_labels, cnn_y_pred_labels)
cnn_report = classification_report(y_test_labels, cnn_y_pred_labels)

print("CNN Results:")
print(f"Accuracy: {cnn_accuracy}")
print("Classification Report:")
print(cnn_report)


Epoch 1/5




62/62 - 3s - 46ms/step - accuracy: 0.6105 - loss: 1.0421 - val_accuracy: 0.5748 - val_loss: 0.8043
Epoch 2/5
62/62 - 2s - 26ms/step - accuracy: 0.7202 - loss: 0.6897 - val_accuracy: 0.7477 - val_loss: 0.5938
Epoch 3/5
62/62 - 2s - 25ms/step - accuracy: 0.8573 - loss: 0.3400 - val_accuracy: 0.8189 - val_loss: 0.4300
Epoch 4/5
62/62 - 2s - 26ms/step - accuracy: 0.9522 - loss: 0.1239 - val_accuracy: 0.8515 - val_loss: 0.3800
Epoch 5/5
62/62 - 2s - 27ms/step - accuracy: 0.9865 - loss: 0.0372 - val_accuracy: 0.8922 - val_loss: 0.3398
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
CNN Results:
Accuracy: 0.8921668362156663
Classification Report:
              precision    recall  f1-score   support

           0       0.63      0.65      0.64        52
           1       0.64      0.46      0.54        99
           2       0.93      0.96      0.94       832

    accuracy                           0.89       983
   macro avg       0.73      0.69      0.71       983
w

In [34]:
# LSTM Model: embedding -> spatial dropout -> single LSTM layer -> softmax.
# The Embedding layer takes no `input_length`; that argument is deprecated in
# current Keras and the sequence length is inferred from the padded input.
lstm_model = Sequential([
    Embedding(5000, 128),
    SpatialDropout1D(0.2),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(3, activation='softmax'),
])
lstm_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train with the class weights prepared for the deep learning models.
# NOTE(review): the test split doubles as validation data here.
lstm_model.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_test, y_test), class_weight=class_weights_dl, verbose=2)

# Convert softmax probabilities and one-hot targets to class indices
lstm_y_pred = lstm_model.predict(X_test)
lstm_y_pred_labels = lstm_y_pred.argmax(axis=1)
y_test_labels = y_test.argmax(axis=1)

lstm_accuracy = accuracy_score(y_test_labels, lstm_y_pred_labels)
lstm_report = classification_report(y_test_labels, lstm_y_pred_labels)

print("LSTM Results:")
print(f"Accuracy: {lstm_accuracy}")
print("Classification Report:")
print(lstm_report)

Epoch 1/5




62/62 - 7s - 109ms/step - accuracy: 0.4050 - loss: 1.0594 - val_accuracy: 0.5005 - val_loss: 0.9180
Epoch 2/5
62/62 - 4s - 71ms/step - accuracy: 0.6418 - loss: 0.8271 - val_accuracy: 0.7131 - val_loss: 0.6380
Epoch 3/5
62/62 - 4s - 69ms/step - accuracy: 0.7492 - loss: 0.5870 - val_accuracy: 0.5697 - val_loss: 0.8235
Epoch 4/5
62/62 - 5s - 74ms/step - accuracy: 0.8207 - loss: 0.4093 - val_accuracy: 0.7986 - val_loss: 0.5152
Epoch 5/5
62/62 - 4s - 69ms/step - accuracy: 0.8794 - loss: 0.2568 - val_accuracy: 0.7335 - val_loss: 0.6632
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step
LSTM Results:
Accuracy: 0.7334689725330621
Classification Report:
              precision    recall  f1-score   support

           0       0.53      0.65      0.59        52
           1       0.24      0.63      0.35        99
           2       0.94      0.75      0.84       832

    accuracy                           0.73       983
   macro avg       0.57      0.68      0.59       98

In [35]:
# BiLSTM Model: embedding -> spatial dropout -> bidirectional LSTM -> softmax.
# The Embedding layer takes no `input_length`; that argument is deprecated in
# current Keras and the sequence length is inferred from the padded input.
bilstm_model = Sequential([
    Embedding(5000, 128),
    SpatialDropout1D(0.2),
    Bidirectional(LSTM(100, dropout=0.2, recurrent_dropout=0.2)),
    Dense(3, activation='softmax'),
])
bilstm_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train with the class weights prepared for the deep learning models.
# NOTE(review): the test split doubles as validation data here.
bilstm_model.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_test, y_test), class_weight=class_weights_dl, verbose=2)

# Convert softmax probabilities and one-hot targets to class indices
bilstm_y_pred = bilstm_model.predict(X_test)
bilstm_y_pred_labels = bilstm_y_pred.argmax(axis=1)
y_test_labels = y_test.argmax(axis=1)

bilstm_accuracy = accuracy_score(y_test_labels, bilstm_y_pred_labels)
bilstm_report = classification_report(y_test_labels, bilstm_y_pred_labels)

print("BiLSTM Results:")
print(f"Accuracy: {bilstm_accuracy}")
print("Classification Report:")
print(bilstm_report)

Epoch 1/5




62/62 - 10s - 166ms/step - accuracy: 0.3508 - loss: 1.0587 - val_accuracy: 0.5626 - val_loss: 0.9667
Epoch 2/5
62/62 - 7s - 111ms/step - accuracy: 0.6110 - loss: 0.8688 - val_accuracy: 0.3174 - val_loss: 1.1744
Epoch 3/5
62/62 - 7s - 107ms/step - accuracy: 0.6830 - loss: 0.6228 - val_accuracy: 0.6999 - val_loss: 0.5817
Epoch 4/5
62/62 - 7s - 111ms/step - accuracy: 0.7955 - loss: 0.4178 - val_accuracy: 0.7304 - val_loss: 0.5629
Epoch 5/5
62/62 - 7s - 113ms/step - accuracy: 0.8293 - loss: 0.3234 - val_accuracy: 0.8108 - val_loss: 0.4623
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 23ms/step
BiLSTM Results:
Accuracy: 0.8107833163784334
Classification Report:
              precision    recall  f1-score   support

           0       0.57      0.67      0.62        52
           1       0.32      0.53      0.40        99
           2       0.93      0.85      0.89       832

    accuracy                           0.81       983
   macro avg       0.61      0.68      0.64  

In [36]:
# Test accuracy of every trained model, keyed by the model's display name.
# Insertion order: classical models first, then the neural networks.
accuracy_scores_all = dict([
    ('Naive Bayes', nb_accuracy),
    ('SVM', svm_accuracy),
    ('Random Forest', rf_accuracy),
    ('CNN', cnn_accuracy),
    ('LSTM', lstm_accuracy),
    ('BiLSTM', bilstm_accuracy),
])

In [37]:
# Select the (name, accuracy) pair with the highest accuracy across all
# models; on a tie the entry inserted first is kept.
best_model, best_accuracy = max(accuracy_scores_all.items(), key=lambda item: item[1])
print(f"\nThe best model based on accuracy is: {best_model} with accuracy {best_accuracy}")


The best model based on accuracy is: CNN with accuracy 0.8921668362156663


In [41]:
# Function to detect stance of an input review
def predict_stance(review, model_name=None):
    """Predict the stance label of a single review.

    Args:
        review: Raw review text.
        model_name: Name of the model to use: 'Naive Bayes', 'SVM',
            'Random Forest', 'CNN', 'LSTM' or 'BiLSTM'. Defaults to the
            notebook-level ``best_model``.

    Returns:
        The predicted stance label for ``review``.

    Raises:
        ValueError: If the selected model name is not one of the names above.
    """
    chosen = best_model if model_name is None else model_name

    # Model lookups are deferred behind lambdas so that only the selected
    # model has to exist in the kernel when the function is called.
    tfidf_models = {
        'Naive Bayes': lambda: nb_model,
        'SVM': lambda: svm_model,
        'Random Forest': lambda: rf_model,
    }
    sequence_models = {
        'CNN': lambda: cnn_model,
        'LSTM': lambda: lstm_model,
        'BiLSTM': lambda: bilstm_model,
    }

    if chosen in tfidf_models:
        # Classical models consume the TF-IDF representation of the review
        prediction = tfidf_models[chosen]().predict(tfidf.transform([review]))
        if chosen != 'Naive Bayes':
            # SVM and Random Forest were fitted on integer-encoded labels;
            # Naive Bayes was fitted on the label strings directly.
            prediction = label_encoder.inverse_transform(prediction)
    elif chosen in sequence_models:
        # Neural models consume a padded index sequence; maxlen must match the
        # padding length used when the training sequences were built.
        review_pad = pad_sequences(tokenizer.texts_to_sequences([review]), maxlen=100)
        prediction_probs = sequence_models[chosen]().predict(review_pad)
        # NOTE(review): decoding with label_encoder assumes its class order
        # matches the one-hot column order used for training - confirm.
        prediction = label_encoder.inverse_transform(prediction_probs.argmax(axis=1))
    else:
        known = sorted(list(tfidf_models) + list(sequence_models))
        raise ValueError(f"Unknown model {chosen!r}; expected one of: {', '.join(known)}")

    return prediction[0]

# Quick check of the helper on one short, informally written review
input_review = "I dont liked."
detected_stance = predict_stance(review=input_review)
print(f"The detected stance for the input review is: {detected_stance}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
The detected stance for the input review is: Neutral
