In [40]:
! pip install tensorflow imbalanced-learn





In [42]:
import pandas as pd
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
import nltk
from nltk.corpus import stopwords
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, LSTM, Dense, Bidirectional, SpatialDropout1D
from tensorflow.keras.preprocessing.text import text_to_word_sequence
import numpy as np

# Ensure NLTK stopwords are downloaded
nltk.download('stopwords')

# Load data from CSV file
df = pd.read_csv('balanced_200kReviewWithSentimentFlipkart.csv')

# # Function to map sentiment to stance
# def sentiment_to_stance(sentiment):
#     if sentiment == 'positive':
#         return 'supportive'
#     elif sentiment == 'negative':
#         return 'oppose'
#     else:
#         return 'neutral'
# # Apply the function to create the stance column
# df['Stance'] = df['Sentiment'].apply(sentiment_to_stance)
# # Save the DataFrame to a new CSV file
# df.to_csv('reviews_with_stance.csv', index=False)

# Keep only 'Summary' and 'Stance' columns
df = df[['Summary', 'Stance']]

# Drop rows where 'Summary' or 'Stance' is NaN
df = df.dropna(subset=['Summary', 'Stance'])

# Ensure all entries in 'Summary' are strings
df['Summary'] = df['Summary'].astype(str)

# for full review display without truncation
pd.set_option('display.max_colwidth', None)

# Print value counts of 'Stance' column
print(df['Stance'].value_counts())

# Preprocess the text data
stop_words = list(stopwords.words('english'))
tfidf = TfidfVectorizer(stop_words=stop_words, max_df=0.7)

# Transform the text data into TF-IDF features
X = tfidf.fit_transform(df['Summary'])

# Encode the target labels
y = df['Stance']

# Apply SMOTE to balance the dataset
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)

# Initialize and train the Naive Bayes classifier
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

# Predict the stances on the test set using Naive Bayes
nb_y_pred = nb_model.predict(X_test)

# Evaluate the Naive Bayes model
nb_accuracy = accuracy_score(y_test, nb_y_pred)
nb_report = classification_report(y_test, nb_y_pred)
print("Naive Bayes Results:")
print(f"Accuracy: {nb_accuracy}")
print("Classification Report:")
print(nb_report)

# Initialize and train the SVM classifier
svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)

# Predict the stances on the test set using SVM
svm_y_pred = svm_model.predict(X_test)

# Evaluate the SVM model
svm_accuracy = accuracy_score(y_test, svm_y_pred)
svm_report = classification_report(y_test, svm_y_pred)
print("SVM Results:")
print(f"Accuracy: {svm_accuracy}")
print("Classification Report:")
print(svm_report)

# Initialize and train the Random Forest classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predict the stances on the test set using Random Forest
rf_y_pred = rf_model.predict(X_test)

# Evaluate the Random Forest model
rf_accuracy = accuracy_score(y_test, rf_y_pred)
rf_report = classification_report(y_test, rf_y_pred)
print("Random Forest Results:")
print(f"Accuracy: {rf_accuracy}")
print("Classification Report:")
print(rf_report)

# Initialize the accuracy_scores dictionary
accuracy_scores = {
    'Naive Bayes': nb_accuracy,
    'SVM': svm_accuracy,
    'Random Forest': rf_accuracy
}

# Find the best model based on accuracy
best_model = max(accuracy_scores, key=accuracy_scores.get)
print(f"\nThe best model based on accuracy is: {best_model} with accuracy {accuracy_scores[best_model]}")

# Tokenizer and padding for deep learning models
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(df['Summary'])
X_seq = tokenizer.texts_to_sequences(df['Summary'])
X_pad = pad_sequences(X_seq, maxlen=100)

# Encode the target labels for deep learning models
y_encoded = pd.get_dummies(df['Stance']).values

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_pad, y_encoded, test_size=0.2, random_state=42)

# Data augmentation for deep learning models
def augment_text(text):
    words = text_to_word_sequence(text)
    augmented_texts = [text]
    if len(words) > 1:
        augmented_texts.append(' '.join(np.random.permutation(words)))
    return augmented_texts

augmented_summaries = []
augmented_stances = []

for summary, stance in zip(df['Summary'], df['Stance']):
    augmented_texts = augment_text(summary)
    for text in augmented_texts:
        augmented_summaries.append(text)
        augmented_stances.append(stance)

# Tokenizer and padding for augmented data
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(augmented_summaries)
X_aug_seq = tokenizer.texts_to_sequences(augmented_summaries)
X_aug_pad = pad_sequences(X_aug_seq, maxlen=100)

# Encode the target labels for augmented data
y_aug_encoded = pd.get_dummies(augmented_stances).values

# Split the augmented dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_aug_pad, y_aug_encoded, test_size=0.2, random_state=42)

# CNN Model
cnn_model = Sequential()
cnn_model.add(Embedding(5000, 128, input_length=100))
cnn_model.add(Conv1D(filters=128, kernel_size=5, padding='same', activation='relu'))
cnn_model.add(GlobalMaxPooling1D())
cnn_model.add(Dense(128, activation='relu'))
cnn_model.add(Dense(3, activation='softmax'))
cnn_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
cnn_model.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_test, y_test), verbose=2)

cnn_y_pred = cnn_model.predict(X_test)
cnn_y_pred_labels = cnn_y_pred.argmax(axis=1)
y_test_labels = y_test.argmax(axis=1)

cnn_accuracy = accuracy_score(y_test_labels, cnn_y_pred_labels)
cnn_report = classification_report(y_test_labels, cnn_y_pred_labels)
print("CNN Results:")
print(f"Accuracy: {cnn_accuracy}")
print("Classification Report:")
print(cnn_report)

# LSTM Model
lstm_model = Sequential()
lstm_model.add(Embedding(5000, 128, input_length=100))
lstm_model.add(SpatialDropout1D(0.2))
lstm_model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
lstm_model.add(Dense(3, activation='softmax'))

lstm_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
lstm_model.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_test, y_test), verbose=2)

lstm_y_pred = lstm_model.predict(X_test)
lstm_y_pred_labels = lstm_y_pred.argmax(axis=1)
y_test_labels = y_test.argmax(axis=1)

lstm_accuracy = accuracy_score(y_test_labels, lstm_y_pred_labels)
lstm_report = classification_report(y_test_labels, lstm_y_pred_labels)
print("LSTM Results:")
print(f"Accuracy: {lstm_accuracy}")
print("Classification Report:")
print(lstm_report)

# BiLSTM Model
bilstm_model = Sequential()
bilstm_model.add(Embedding(5000, 128, input_length=100))
bilstm_model.add(SpatialDropout1D(0.2))
bilstm_model.add(Bidirectional(LSTM(100, dropout=0.2, recurrent_dropout=0.2)))
bilstm_model.add(Dense(3, activation='softmax'))

bilstm_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
bilstm_model.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_test, y_test), verbose=2)

bilstm_y_pred = bilstm_model.predict(X_test)
bilstm_y_pred_labels = bilstm_y_pred.argmax(axis=1)
y_test_labels = y_test.argmax(axis=1)

bilstm_accuracy = accuracy_score(y_test_labels, bilstm_y_pred_labels)
bilstm_report = classification_report(y_test_labels, bilstm_y_pred_labels)
print("BiLSTM Results:")
print(f"Accuracy: {bilstm_accuracy}")
print("Classification Report:")
print(bilstm_report)

# Compare the accuracy of all models
accuracy_scores_all = {
    'Naive Bayes': nb_accuracy,
    'SVM': svm_accuracy,
    'Random Forest': rf_accuracy,
    'CNN': cnn_accuracy,
    'LSTM': lstm_accuracy,
    'BiLSTM': bilstm_accuracy
}

# Find the best model among all based on accuracy
best_model = max(accuracy_scores_all, key=accuracy_scores_all.get)
print(f"\nThe best model based on accuracy is: {best_model} with accuracy {accuracy_scores_all[best_model]}")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Moneykicks\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Stance
neutral       10234
supportive    10234
oppose        10234
Name: count, dtype: int64
Naive Bayes Results:
Accuracy: 0.7637192639635239
Classification Report:
              precision    recall  f1-score   support

     neutral       0.81      0.64      0.71      2076
      oppose       0.72      0.79      0.76      2038
  supportive       0.77      0.86      0.81      2027

    accuracy                           0.76      6141
   macro avg       0.77      0.76      0.76      6141
weighted avg       0.77      0.76      0.76      6141

SVM Results:
Accuracy: 0.7726754600227976
Classification Report:
              precision    recall  f1-score   support

     neutral       0.74      0.75      0.74      2076
      oppose       0.78      0.73      0.75      2038
  supportive       0.80      0.84      0.82      2027

    accuracy                           0.77      6141
   macro avg       0.77      0.77      0.77      6141
weighted avg       0.77      0.77      0.77      6141

Random 



695/695 - 16s - 23ms/step - accuracy: 0.8180 - loss: 0.4602 - val_accuracy: 0.8494 - val_loss: 0.3940
Epoch 2/5
695/695 - 15s - 21ms/step - accuracy: 0.8800 - loss: 0.3202 - val_accuracy: 0.8545 - val_loss: 0.3755
Epoch 3/5
695/695 - 15s - 21ms/step - accuracy: 0.9118 - loss: 0.2462 - val_accuracy: 0.8589 - val_loss: 0.3814
Epoch 4/5
695/695 - 15s - 22ms/step - accuracy: 0.9328 - loss: 0.1939 - val_accuracy: 0.8611 - val_loss: 0.3997
Epoch 5/5
695/695 - 15s - 21ms/step - accuracy: 0.9455 - loss: 0.1597 - val_accuracy: 0.8584 - val_loss: 0.4327
[1m348/348[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
CNN Results:
Accuracy: 0.8583513318934485
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.77      0.79      3486
           1       0.87      0.87      0.87      4016
           2       0.88      0.93      0.90      3610

    accuracy                           0.86     11112
   macro avg       0.86      0.86     



695/695 - 45s - 65ms/step - accuracy: 0.7941 - loss: 0.5136 - val_accuracy: 0.8419 - val_loss: 0.4138
Epoch 2/5
695/695 - 47s - 67ms/step - accuracy: 0.8612 - loss: 0.3681 - val_accuracy: 0.8569 - val_loss: 0.3730
Epoch 3/5
695/695 - 46s - 66ms/step - accuracy: 0.8761 - loss: 0.3311 - val_accuracy: 0.8616 - val_loss: 0.3590
Epoch 4/5
695/695 - 45s - 65ms/step - accuracy: 0.8863 - loss: 0.3051 - val_accuracy: 0.8625 - val_loss: 0.3531
Epoch 5/5
695/695 - 46s - 66ms/step - accuracy: 0.8950 - loss: 0.2851 - val_accuracy: 0.8663 - val_loss: 0.3528
[1m348/348[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 18ms/step
LSTM Results:
Accuracy: 0.8662706983441325
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.79      0.80      3486
           1       0.87      0.90      0.89      4016
           2       0.91      0.91      0.91      3610

    accuracy                           0.87     11112
   macro avg       0.86      0.86   



695/695 - 74s - 107ms/step - accuracy: 0.7905 - loss: 0.5154 - val_accuracy: 0.8403 - val_loss: 0.4095
Epoch 2/5
695/695 - 68s - 97ms/step - accuracy: 0.8600 - loss: 0.3690 - val_accuracy: 0.8539 - val_loss: 0.3808
Epoch 3/5
695/695 - 68s - 97ms/step - accuracy: 0.8783 - loss: 0.3296 - val_accuracy: 0.8633 - val_loss: 0.3607
Epoch 4/5
695/695 - 68s - 98ms/step - accuracy: 0.8861 - loss: 0.3075 - val_accuracy: 0.8629 - val_loss: 0.3558
Epoch 5/5
695/695 - 68s - 98ms/step - accuracy: 0.8947 - loss: 0.2876 - val_accuracy: 0.8687 - val_loss: 0.3496
[1m348/348[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 15ms/step
BiLSTM Results:
Accuracy: 0.8687005039596832
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.80      0.80      3486
           1       0.89      0.88      0.88      4016
           2       0.91      0.92      0.91      3610

    accuracy                           0.87     11112
   macro avg       0.87      0.87