In [1]:
import pandas as pd
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils.class_weight import compute_class_weight
import nltk
from nltk.corpus import stopwords
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, LSTM, Dense, Bidirectional, SpatialDropout1D


In [2]:
# Fetch the NLTK English stop-word corpus needed by the TF-IDF step below
# (NLTK simply reports the package as up-to-date when it is already present).
nltk.download(info_or_id='stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Moneykicks\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Read the labelled reviews and, in one pass: keep only the review text and its
# stance label, discard rows missing either value, and coerce every review to str.
df = (
    pd.read_csv('171kReviewWithSentiment.csv')
    .loc[:, ['Review', 'Stance']]
    .dropna(subset=['Review', 'Stance'])
    .assign(Review=lambda frame: frame['Review'].astype(str))
)

# Show complete review text whenever a frame is displayed (no column truncation)
pd.set_option('display.max_colwidth', None)

# Class distribution of the target column
print(df['Stance'].value_counts())

Stance
supportive    25913
neutral       25790
oppose        25001
Name: count, dtype: int64


In [4]:
# Target vector: the raw (string) stance label of every review
y = df['Stance']

# Vectoriser configuration: English stop words are dropped, and so is any term
# that appears in more than 70% of the reviews (max_df=0.7).
stop_words = list(stopwords.words('english'))
tfidf = TfidfVectorizer(max_df=0.7, stop_words=stop_words)

# Feature matrix: learn vocabulary + IDF weights and vectorise all reviews.
# NOTE(review): the vectoriser is fitted on the full corpus, i.e. before the
# train/test split made in the next cell - confirm this is intended.
X = tfidf.fit_transform(df['Review'])

In [5]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Fit a multinomial Naive Bayes classifier on the training portion ...
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

# ... and label the held-out reviews with it
nb_y_pred = nb_model.predict(X_test)

# Score the predictions against the true stance labels
nb_accuracy = accuracy_score(y_test, nb_y_pred)
nb_report = classification_report(y_test, nb_y_pred)
print(
    "Naive Bayes Results:",
    f"Accuracy: {nb_accuracy}",
    "Classification Report:",
    nb_report,
    sep="\n",
)

Naive Bayes Results:
Accuracy: 0.7590769832475067
Classification Report:
              precision    recall  f1-score   support

     neutral       0.68      0.60      0.64      5257
      oppose       0.77      0.81      0.79      4976
  supportive       0.82      0.87      0.84      5108

    accuracy                           0.76     15341
   macro avg       0.76      0.76      0.76     15341
weighted avg       0.75      0.76      0.76     15341



In [9]:
from sklearn.preprocessing import LabelEncoder

# Map the string stance labels to integer ids. The mapping is learned from the
# training labels only and then applied unchanged to the test labels.
label_encoder = LabelEncoder().fit(y_train)
y_train_encoded = label_encoder.transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

In [10]:
import numpy as np

# 'balanced' weights for the encoded training labels, one weight per class id
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train_encoded),
    y=y_train_encoded,
)

# Lookup table {class position -> weight}, the form the SVM cell passes on
class_weights_dict = dict(enumerate(class_weights))


In [13]:
# Linear-kernel SVM fitted on the integer-encoded labels with per-class weights
svm_model = SVC(class_weight=class_weights_dict, kernel='linear')
svm_model.fit(X_train, y_train_encoded)

# Predict encoded labels for the held-out set, then translate them back to the
# original stance names so they can be compared with y_test directly
svm_y_pred_encoded = svm_model.predict(X_test)
svm_y_pred = label_encoder.inverse_transform(svm_y_pred_encoded)

# Score the predictions against the true stance labels
svm_accuracy = accuracy_score(y_test, svm_y_pred)
svm_report = classification_report(y_test, svm_y_pred)
print(
    "SVM Results:",
    f"Accuracy: {svm_accuracy}",
    "Classification Report:",
    svm_report,
    sep="\n",
)

SVM Results:
Accuracy: 0.7658562023336158
Classification Report:
              precision    recall  f1-score   support

     neutral       0.70      0.58      0.64      5257
      oppose       0.77      0.81      0.79      4976
  supportive       0.81      0.91      0.86      5108

    accuracy                           0.77     15341
   macro avg       0.76      0.77      0.76     15341
weighted avg       0.76      0.77      0.76     15341



In [15]:
# Integer-encode the stance labels (mapping learned from the training labels
# only). Re-done here so the cell does not rely on the encoder state left by
# earlier cells; the result is the same mapping.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Class balancing is delegated to scikit-learn through class_weight='balanced',
# so no explicit weight table is needed for this model. `class_weights_dict` is
# deliberately NOT rebound here: the SVM cell passes that dict together with
# integer-encoded labels, and replacing it with a string-keyed dict would make
# a re-run of the SVM cell fail.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
rf_model.fit(X_train, y_train_encoded)

# Predict the stances on the test set using Random Forest
rf_y_pred_encoded = rf_model.predict(X_test)

# Decode the predicted labels back to the original class names
rf_y_pred = label_encoder.inverse_transform(rf_y_pred_encoded)

# Evaluate the Random Forest model against the true (string) stance labels
rf_accuracy = accuracy_score(y_test, rf_y_pred)
rf_report = classification_report(y_test, rf_y_pred)
print("Random Forest Results:")
print(f"Accuracy: {rf_accuracy}")
print("Classification Report:")
print(rf_report)


Random Forest Results:
Accuracy: 0.7665732351215696
Classification Report:
              precision    recall  f1-score   support

     neutral       0.70      0.58      0.64      5257
      oppose       0.77      0.81      0.79      4976
  supportive       0.81      0.92      0.86      5108

    accuracy                           0.77     15341
   macro avg       0.76      0.77      0.76     15341
weighted avg       0.76      0.77      0.76     15341



In [16]:
# Test-set accuracy of each classical model. The keys must be exactly the model
# names that predict_stance() dispatches on ('Naive Bayes', 'SVM',
# 'Random Forest'); otherwise the winner chosen below cannot be used for
# prediction.
accuracy_scores = {
    'Naive Bayes': nb_accuracy,
    'SVM': svm_accuracy,
    'Random Forest': rf_accuracy,
}

# Name of the highest-scoring model (the first one wins on ties)
best_model = max(accuracy_scores, key=accuracy_scores.get)
print(f"\nThe best model based on accuracy is: {best_model} with accuracy {accuracy_scores[best_model]}")



The best model based on accuracy is: Random Forest with accuracy 0.7665732351215696


In [22]:
# Build the word index from every review; num_words=5000 caps how many of the
# most frequent words survive when texts are converted to integer sequences.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(df['Review'])

# Integer-encode each review, then pad/truncate every sequence to length 100 so
# the deep learning models receive fixed-size input.
X_seq = tokenizer.texts_to_sequences(df['Review'])
X_pad = pad_sequences(sequences=X_seq, maxlen=100)


In [23]:
# One-hot encode the target labels for the deep learning models.
# The target is the 'Stance' column (one column per stance class, in sorted
# label order). Encoding 'Review' instead would create one column per distinct
# review text, which is both meaningless as a target and enormous in memory.
# float32 keeps the matrix directly usable as a Keras training target.
y_encoded = pd.get_dummies(df['Stance'], dtype='float32').values

# Split the padded sequences and one-hot targets into training and testing sets.
# NOTE: this rebinds X_train/X_test/y_train/y_test, which until now held the
# TF-IDF split used by the classical models.
X_train, X_test, y_train, y_test = train_test_split(X_pad, y_encoded, test_size=0.2, random_state=42)

In [35]:
from sklearn.utils.class_weight import compute_class_weight
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# One-hot targets rebuilt from the integer labels produced by the LabelEncoder.
# NOTE(review): y_train_encoded / y_test_encoded come from the TF-IDF split;
# they line up with the padded-sequence X_train / X_test only because both
# splits use the same rows, test_size and random_state - confirm before
# changing either split.
y_train = to_categorical(y_train_encoded)
y_test = to_categorical(y_test_encoded)

# 'balanced' class weights, keyed by class index as Keras expects
class_weights_dl = compute_class_weight(
    'balanced',
    classes=np.unique(np.argmax(y_train, axis=1)),
    y=np.argmax(y_train, axis=1),
)
class_weights_dl_dict = dict(enumerate(class_weights_dl))

# CNN: embedding -> 1D convolution -> global max pooling -> dense -> softmax
cnn_model = Sequential([
    Embedding(5000, 128, input_length=100),
    Conv1D(filters=128, kernel_size=5, padding='same', activation='relu'),
    GlobalMaxPooling1D(),
    Dense(128, activation='relu'),
    Dense(3, activation='softmax'),  # Assuming 3 classes
])
cnn_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train for 5 epochs; the held-out set doubles as the per-epoch validation data
cnn_model.fit(
    X_train,
    y_train,
    epochs=5,
    batch_size=64,
    validation_data=(X_test, y_test),
    class_weight=class_weights_dl_dict,
    verbose=2,
)

# Turn predicted probabilities and one-hot targets back into class indices
cnn_y_pred = cnn_model.predict(X_test)
cnn_y_pred_labels = np.argmax(cnn_y_pred, axis=1)
y_test_labels = np.argmax(y_test, axis=1)

cnn_accuracy = accuracy_score(y_test_labels, cnn_y_pred_labels)
cnn_report = classification_report(y_test_labels, cnn_y_pred_labels)

print(
    "CNN Results:",
    f"Accuracy: {cnn_accuracy}",
    "Classification Report:",
    cnn_report,
    sep="\n",
)


Epoch 1/5




959/959 - 26s - 27ms/step - accuracy: 0.7689 - loss: 0.6015 - val_accuracy: 0.7779 - val_loss: 0.5819
Epoch 2/5
959/959 - 25s - 26ms/step - accuracy: 0.7764 - loss: 0.5807 - val_accuracy: 0.7779 - val_loss: 0.5811
Epoch 3/5
959/959 - 25s - 26ms/step - accuracy: 0.7770 - loss: 0.5791 - val_accuracy: 0.7777 - val_loss: 0.5849
Epoch 4/5
959/959 - 22s - 23ms/step - accuracy: 0.7777 - loss: 0.5772 - val_accuracy: 0.7778 - val_loss: 0.5803
Epoch 5/5
959/959 - 21s - 22ms/step - accuracy: 0.7777 - loss: 0.5765 - val_accuracy: 0.7779 - val_loss: 0.5806
[1m480/480[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step
CNN Results:
Accuracy: 0.7779153901310214
Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.55      0.64      5257
           1       0.77      0.85      0.81      4976
           2       0.81      0.94      0.87      5108

    accuracy                           0.78     15341
   macro avg       0.77      0.78     

In [36]:
from sklearn.utils.class_weight import compute_class_weight
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Embedding, SpatialDropout1D, LSTM, Dense
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# One-hot targets rebuilt from the integer labels produced by the LabelEncoder.
# NOTE(review): y_train_encoded / y_test_encoded come from the TF-IDF split;
# they line up with the padded-sequence X_train / X_test only because both
# splits use the same rows, test_size and random_state - confirm before
# changing either split.
y_train = to_categorical(y_train_encoded)
y_test = to_categorical(y_test_encoded)

# 'balanced' class weights, keyed by class index as Keras expects
class_weights_dl = compute_class_weight(
    'balanced',
    classes=np.unique(np.argmax(y_train, axis=1)),
    y=np.argmax(y_train, axis=1),
)
class_weights_dl_dict = dict(enumerate(class_weights_dl))

# LSTM: embedding -> spatial dropout -> single LSTM layer -> softmax
lstm_model = Sequential([
    Embedding(5000, 128, input_length=100),
    SpatialDropout1D(0.2),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(3, activation='softmax'),  # Assuming 3 classes
])
lstm_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train for 5 epochs; the held-out set doubles as the per-epoch validation data
lstm_model.fit(
    X_train,
    y_train,
    epochs=5,
    batch_size=64,
    validation_data=(X_test, y_test),
    class_weight=class_weights_dl_dict,
    verbose=2,
)

# Turn predicted probabilities and one-hot targets back into class indices
lstm_y_pred = lstm_model.predict(X_test)
lstm_y_pred_labels = np.argmax(lstm_y_pred, axis=1)
y_test_labels = np.argmax(y_test, axis=1)

lstm_accuracy = accuracy_score(y_test_labels, lstm_y_pred_labels)
lstm_report = classification_report(y_test_labels, lstm_y_pred_labels)

print(
    "LSTM Results:",
    f"Accuracy: {lstm_accuracy}",
    "Classification Report:",
    lstm_report,
    sep="\n",
)


Epoch 1/5




959/959 - 71s - 74ms/step - accuracy: 0.7582 - loss: 0.6186 - val_accuracy: 0.7776 - val_loss: 0.5813
Epoch 2/5
959/959 - 68s - 71ms/step - accuracy: 0.7760 - loss: 0.5827 - val_accuracy: 0.7775 - val_loss: 0.5798
Epoch 3/5
959/959 - 68s - 70ms/step - accuracy: 0.7769 - loss: 0.5804 - val_accuracy: 0.7777 - val_loss: 0.5805
Epoch 4/5
959/959 - 69s - 72ms/step - accuracy: 0.7768 - loss: 0.5788 - val_accuracy: 0.7776 - val_loss: 0.5794
Epoch 5/5
959/959 - 69s - 72ms/step - accuracy: 0.7773 - loss: 0.5779 - val_accuracy: 0.7777 - val_loss: 0.5793
[1m480/480[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 11ms/step
LSTM Results:
Accuracy: 0.7777198357343068
Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.55      0.64      5257
           1       0.77      0.85      0.81      4976
           2       0.80      0.94      0.87      5108

    accuracy                           0.78     15341
   macro avg       0.77      0.78   

In [None]:
# BiLSTM Model: embedding -> spatial dropout -> bidirectional LSTM -> softmax.
# Uses the same padded sequences, one-hot targets and class weights as the CNN
# and LSTM cells above, so those cells must have been run first.
bilstm_model = Sequential()
bilstm_model.add(Embedding(5000, 128, input_length=100))
bilstm_model.add(SpatialDropout1D(0.2))
bilstm_model.add(Bidirectional(LSTM(100, dropout=0.2, recurrent_dropout=0.2)))
bilstm_model.add(Dense(3, activation='softmax'))  # 3 stance classes
bilstm_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Keras expects class_weight as a dict {class index -> weight}; pass the dict
# form (class_weights_dl_dict), not the raw NumPy array (class_weights_dl).
bilstm_model.fit(
    X_train,
    y_train,
    epochs=5,
    batch_size=64,
    validation_data=(X_test, y_test),
    class_weight=class_weights_dl_dict,
    verbose=2,
)

# Turn predicted probabilities and one-hot targets back into class indices
bilstm_y_pred = bilstm_model.predict(X_test)
bilstm_y_pred_labels = bilstm_y_pred.argmax(axis=1)
y_test_labels = y_test.argmax(axis=1)

bilstm_accuracy = accuracy_score(y_test_labels, bilstm_y_pred_labels)
bilstm_report = classification_report(y_test_labels, bilstm_y_pred_labels)

print("BiLSTM Results:")
print(f"Accuracy: {bilstm_accuracy}")
print("Classification Report:")
print(bilstm_report)

Epoch 1/5




959/959 - 101s - 106ms/step - accuracy: 0.8136 - loss: 0.4610 - val_accuracy: 0.8632 - val_loss: 0.3658
Epoch 2/5
959/959 - 102s - 106ms/step - accuracy: 0.8711 - loss: 0.3459 - val_accuracy: 0.8645 - val_loss: 0.3526
Epoch 3/5
959/959 - 97s - 101ms/step - accuracy: 0.8813 - loss: 0.3194 - val_accuracy: 0.8652 - val_loss: 0.3514
Epoch 4/5
959/959 - 111s - 116ms/step - accuracy: 0.8886 - loss: 0.3020 - val_accuracy: 0.8676 - val_loss: 0.3457
Epoch 5/5
959/959 - 182s - 189ms/step - accuracy: 0.8936 - loss: 0.2884 - val_accuracy: 0.8655 - val_loss: 0.3577
[1m480/480[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 36ms/step
BiLSTM Results:
Accuracy: 0.8654585750602959
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.80      0.81      5257
           1       0.84      0.90      0.87      4976
           2       0.94      0.90      0.92      5108

    accuracy                           0.87     15341
   macro avg       0.87 

In [None]:
# Test-set accuracy of every trained model, keyed by the model name that
# predict_stance() later dispatches on.
accuracy_scores_all = dict([
    ('Naive Bayes', nb_accuracy),
    ('SVM', svm_accuracy),
    ('Random Forest', rf_accuracy),
    ('CNN', cnn_accuracy),
    ('LSTM', lstm_accuracy),
    ('BiLSTM', bilstm_accuracy),
])

In [None]:
# Pick the model name with the highest recorded accuracy (first one wins on ties)
best_model = max(accuracy_scores_all, key=lambda model_name: accuracy_scores_all[model_name])
print(f"\nThe best model based on accuracy is: {best_model} with accuracy {accuracy_scores_all[best_model]}")


The best model based on accuracy is: CNN with accuracy 0.8687829998044456


In [33]:
def _predict_with_sequence_model(model, review):
    """Return the encoded class id(s) a Keras sequence model predicts for ``review``.

    The review is tokenised with the notebook's fitted ``tokenizer`` and padded
    to the same length (100) the deep learning models were trained with.
    """
    review_seq = tokenizer.texts_to_sequences([review])
    review_pad = pad_sequences(review_seq, maxlen=100)
    prediction_probs = model.predict(review_pad)
    # One row of class probabilities per input review -> index of the largest
    return np.argmax(prediction_probs, axis=1)


# Function to detect stance of an input review
def predict_stance(review, model_name=None):
    """Predict the stance label of a single review.

    Parameters
    ----------
    review : str
        Raw review text.
    model_name : str, optional
        Which trained model to use: 'Naive Bayes' (or the legacy key 'Naive'),
        'SVM', 'Random Forest', 'CNN', 'LSTM' or 'BiLSTM'. Defaults to the
        notebook-level ``best_model``.

    Returns
    -------
    The predicted stance label as produced by the chosen model (decoded through
    ``label_encoder`` for every model except Naive Bayes, which was trained on
    the raw string labels).

    Raises
    ------
    ValueError
        If the selected model name is not one of the supported names.
    """
    chosen = best_model if model_name is None else model_name

    # Naive Bayes was fitted on the original string labels: no decoding needed
    if chosen in ('Naive Bayes', 'Naive'):
        return nb_model.predict(tfidf.transform([review]))[0]

    # Models are looked up lazily inside each branch so that only the selected
    # model has to exist in the kernel.
    if chosen == 'SVM':
        prediction_encoded = svm_model.predict(tfidf.transform([review]))
    elif chosen == 'Random Forest':
        prediction_encoded = rf_model.predict(tfidf.transform([review]))
    elif chosen == 'CNN':
        prediction_encoded = _predict_with_sequence_model(cnn_model, review)
    elif chosen == 'LSTM':
        prediction_encoded = _predict_with_sequence_model(lstm_model, review)
    elif chosen == 'BiLSTM':
        prediction_encoded = _predict_with_sequence_model(bilstm_model, review)
    else:
        # Fail loudly instead of hitting an unbound local variable further down
        raise ValueError(f"Unknown model name: {chosen!r}")

    # All remaining models predict integer class ids; map back to stance names
    return label_encoder.inverse_transform(prediction_encoded)[0]

# Demo: classify one hand-written review with the currently selected best model
input_review = "I love some features of the product overall."
detected_stance = predict_stance(review=input_review)
print(f"The detected stance for the input review is: {detected_stance}")

The detected stance for the input review is: supportive
