In [1]:
pip install tensorflow




In [2]:
import pandas as pd
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import nltk
from nltk.corpus import stopwords
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, LSTM, Dense, Bidirectional, SpatialDropout1D

In [3]:
# Ensure NLTK stopwords are downloaded
nltk.download('stopwords')

# Load data from CSV file
df = pd.read_csv('171kReviewWithSentiment.csv') 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Moneykicks\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:

# Keep only 'Summary' and 'Stance' columns
df = df[['Summary', 'Stance']]  

# Drop rows where 'Summary' or 'Stance' is NaN
df = df.dropna(subset=['Summary', 'Stance'])

# Ensure all entries in 'Summary' are strings
df['Summary'] = df['Summary'].astype(str)

# for full review display without truncation
pd.set_option('display.max_colwidth', None)

# Print value counts of 'Stance' column
print(df['Stance'].value_counts())

Stance
supportive    25913
neutral       25790
oppose        25001
Name: count, dtype: int64


In [7]:
# Preprocess the text data
stop_words = list(stopwords.words('english')) 
tfidf = TfidfVectorizer(stop_words=stop_words, max_df=0.7)

In [8]:
# Transform the text data into TF-IDF features
X = tfidf.fit_transform(df['Summary'])

# Encode the target labels
y = df['Stance']

In [9]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
# Initialize and train the Naive Bayes classifier
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

In [11]:
# Predict the stances on the test set using Naive Bayes
nb_y_pred = nb_model.predict(X_test)
# Evaluate the Naive Bayes model
nb_accuracy = accuracy_score(y_test, nb_y_pred)
nb_report = classification_report(y_test, nb_y_pred)

In [12]:
print("Naive Bayes Results:")
print(f"Accuracy: {nb_accuracy}")
print("Classification Report:")
print(nb_report)

Naive Bayes Results:
Accuracy: 0.7942115898572453
Classification Report:
              precision    recall  f1-score   support

     neutral       0.81      0.67      0.73      5257
      oppose       0.75      0.82      0.78      4976
  supportive       0.82      0.90      0.86      5108

    accuracy                           0.79     15341
   macro avg       0.80      0.80      0.79     15341
weighted avg       0.80      0.79      0.79     15341



In [14]:
# Initialize and train the SVM classifier
svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)

# Predict the stances on the test set using SVM
svm_y_pred = svm_model.predict(X_test)

In [15]:
# Evaluate the SVM model
svm_accuracy = accuracy_score(y_test, svm_y_pred)
svm_report = classification_report(y_test, svm_y_pred)
print("SVM Results:")
print(f"Accuracy: {svm_accuracy}")
print("Classification Report:")
print(svm_report)

SVM Results:
Accuracy: 0.8063359624535559
Classification Report:
              precision    recall  f1-score   support

     neutral       0.78      0.75      0.76      5257
      oppose       0.78      0.78      0.78      4976
  supportive       0.86      0.89      0.87      5108

    accuracy                           0.81     15341
   macro avg       0.81      0.81      0.81     15341
weighted avg       0.81      0.81      0.81     15341



In [16]:
# Initialize and train the Random Forest classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predict the stances on the test set using Random Forest
rf_y_pred = rf_model.predict(X_test)

# Evaluate the Random Forest model
rf_accuracy = accuracy_score(y_test, rf_y_pred)
rf_report = classification_report(y_test, rf_y_pred)
print("Random Forest Results:")
print(f"Accuracy: {rf_accuracy}")
print("Classification Report:")
print(rf_report)


Random Forest Results:
Accuracy: 0.8202855094192034
Classification Report:
              precision    recall  f1-score   support

     neutral       0.81      0.76      0.79      5257
      oppose       0.80      0.80      0.80      4976
  supportive       0.85      0.90      0.87      5108

    accuracy                           0.82     15341
   macro avg       0.82      0.82      0.82     15341
weighted avg       0.82      0.82      0.82     15341



In [17]:
# Initialize the accuracy_scores dictionary
accuracy_scores = {}

# Add Naive, SVM and Random Forest accuracy scores to the comparison
accuracy_scores['Naive']=nb_accuracy
accuracy_scores['SVM'] = svm_accuracy
accuracy_scores['Random Forest'] = rf_accuracy

best_model = max(accuracy_scores, key=accuracy_scores.get)
print(f"\nThe best model based on accuracy is: {best_model} with accuracy {accuracy_scores[best_model]}")


The best model based on accuracy is: Random Forest with accuracy 0.8202855094192034


In [18]:
# Tokenizer and padding for deep learning models
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(df['Summary'])
X_seq = tokenizer.texts_to_sequences(df['Summary'])
X_pad = pad_sequences(X_seq, maxlen=100)

In [19]:
# Encode the target labels for deep learning models
y_encoded = pd.get_dummies(df['Stance']).values

In [20]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_pad, y_encoded, test_size=0.2, random_state=42)

In [21]:
# CNN Model
cnn_model = Sequential()
cnn_model.add(Embedding(5000, 128, input_length=100))
cnn_model.add(Conv1D(filters=128, kernel_size=5, padding='same', activation='relu'))
cnn_model.add(GlobalMaxPooling1D())
cnn_model.add(Dense(128, activation='relu'))
cnn_model.add(Dense(3, activation='softmax'))



In [22]:
cnn_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
cnn_model.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_test, y_test), verbose=2)

Epoch 1/5
959/959 - 28s - 29ms/step - accuracy: 0.8413 - loss: 0.4043 - val_accuracy: 0.8715 - val_loss: 0.3434
Epoch 2/5
959/959 - 27s - 28ms/step - accuracy: 0.8899 - loss: 0.2982 - val_accuracy: 0.8711 - val_loss: 0.3408
Epoch 3/5
959/959 - 25s - 26ms/step - accuracy: 0.9123 - loss: 0.2454 - val_accuracy: 0.8759 - val_loss: 0.3431
Epoch 4/5
959/959 - 25s - 26ms/step - accuracy: 0.9297 - loss: 0.2023 - val_accuracy: 0.8713 - val_loss: 0.3818
Epoch 5/5
959/959 - 26s - 27ms/step - accuracy: 0.9410 - loss: 0.1756 - val_accuracy: 0.8673 - val_loss: 0.4047


<keras.src.callbacks.history.History at 0x11182259eb0>

In [23]:
cnn_y_pred = cnn_model.predict(X_test)
cnn_y_pred_labels = cnn_y_pred.argmax(axis=1)
y_test_labels = y_test.argmax(axis=1)

cnn_accuracy = accuracy_score(y_test_labels, cnn_y_pred_labels)
cnn_report = classification_report(y_test_labels, cnn_y_pred_labels)

[1m480/480[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step


In [24]:
print("CNN Results:")
print(f"Accuracy: {cnn_accuracy}")
print("Classification Report:")
print(cnn_report)

CNN Results:
Accuracy: 0.867283749429633
Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.86      0.82      5257
           1       0.88      0.84      0.86      4976
           2       0.94      0.90      0.92      5108

    accuracy                           0.87     15341
   macro avg       0.87      0.87      0.87     15341
weighted avg       0.87      0.87      0.87     15341



In [25]:
# LSTM Model
lstm_model = Sequential()
lstm_model.add(Embedding(5000, 128, input_length=100))
lstm_model.add(SpatialDropout1D(0.2))
lstm_model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
lstm_model.add(Dense(3, activation='softmax'))




In [26]:
lstm_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
lstm_model.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_test, y_test), verbose=2)

Epoch 1/5
959/959 - 79s - 82ms/step - accuracy: 0.8143 - loss: 0.4599 - val_accuracy: 0.8620 - val_loss: 0.3609
Epoch 2/5
959/959 - 74s - 78ms/step - accuracy: 0.8703 - loss: 0.3456 - val_accuracy: 0.8636 - val_loss: 0.3549
Epoch 3/5
959/959 - 74s - 77ms/step - accuracy: 0.8821 - loss: 0.3169 - val_accuracy: 0.8669 - val_loss: 0.3518
Epoch 4/5
959/959 - 73s - 76ms/step - accuracy: 0.8907 - loss: 0.2980 - val_accuracy: 0.8703 - val_loss: 0.3478
Epoch 5/5
959/959 - 72s - 75ms/step - accuracy: 0.8969 - loss: 0.2814 - val_accuracy: 0.8677 - val_loss: 0.3526


<keras.src.callbacks.history.History at 0x111aa6a5910>

In [27]:
lstm_y_pred = lstm_model.predict(X_test)
lstm_y_pred_labels = lstm_y_pred.argmax(axis=1)
y_test_labels = y_test.argmax(axis=1)

[1m480/480[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 12ms/step


In [28]:
lstm_accuracy = accuracy_score(y_test_labels, lstm_y_pred_labels)
lstm_report = classification_report(y_test_labels, lstm_y_pred_labels)

print("LSTM Results:")
print(f"Accuracy: {lstm_accuracy}")
print("Classification Report:")
print(lstm_report)

LSTM Results:
Accuracy: 0.8677400430219673
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.83      0.82      5257
           1       0.86      0.87      0.87      4976
           2       0.94      0.91      0.92      5108

    accuracy                           0.87     15341
   macro avg       0.87      0.87      0.87     15341
weighted avg       0.87      0.87      0.87     15341



In [29]:
# BiLSTM Model
bilstm_model = Sequential()
bilstm_model.add(Embedding(5000, 128, input_length=100))
bilstm_model.add(SpatialDropout1D(0.2))
bilstm_model.add(Bidirectional(LSTM(100, dropout=0.2, recurrent_dropout=0.2)))
bilstm_model.add(Dense(3, activation='softmax'))



In [30]:
bilstm_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
bilstm_model.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_test, y_test), verbose=2)


Epoch 1/5
959/959 - 106s - 110ms/step - accuracy: 0.8134 - loss: 0.4619 - val_accuracy: 0.8638 - val_loss: 0.3617
Epoch 2/5
959/959 - 103s - 107ms/step - accuracy: 0.8694 - loss: 0.3467 - val_accuracy: 0.8640 - val_loss: 0.3515
Epoch 3/5
959/959 - 109s - 113ms/step - accuracy: 0.8817 - loss: 0.3196 - val_accuracy: 0.8669 - val_loss: 0.3497
Epoch 4/5
959/959 - 103s - 107ms/step - accuracy: 0.8890 - loss: 0.3007 - val_accuracy: 0.8664 - val_loss: 0.3500
Epoch 5/5
959/959 - 110s - 114ms/step - accuracy: 0.8951 - loss: 0.2861 - val_accuracy: 0.8685 - val_loss: 0.3462


<keras.src.callbacks.history.History at 0x111a90e5340>

In [31]:
bilstm_y_pred = bilstm_model.predict(X_test)
bilstm_y_pred_labels = bilstm_y_pred.argmax(axis=1)
y_test_labels = y_test.argmax(axis=1)


[1m480/480[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 18ms/step


In [32]:
bilstm_accuracy = accuracy_score(y_test_labels, bilstm_y_pred_labels)
bilstm_report = classification_report(y_test_labels, bilstm_y_pred_labels)

In [33]:
print("BiLSTM Results:")
print(f"Accuracy: {bilstm_accuracy}")
print("Classification Report:")
print(bilstm_report)

BiLSTM Results:
Accuracy: 0.868522260608826
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.81      0.81      5257
           1       0.85      0.89      0.87      4976
           2       0.93      0.92      0.92      5108

    accuracy                           0.87     15341
   macro avg       0.87      0.87      0.87     15341
weighted avg       0.87      0.87      0.87     15341



In [35]:
# Compare the accuracy of all models
accuracy_scores_all = {
    'Naive Bayes': nb_accuracy,
    'SVM': svm_accuracy,
    'Random Forest': rf_accuracy,
    'CNN': cnn_accuracy,
    'LSTM': lstm_accuracy,
    'BiLSTM': bilstm_accuracy
}



In [36]:
# Find the best model among all based on accuracy
best_model = max(accuracy_scores_all, key=accuracy_scores_all.get)
print(f"\nThe best model based on accuracy is: {best_model} with accuracy {accuracy_scores_all[best_model]}")


The best model based on accuracy is: BiLSTM with accuracy 0.868522260608826
