In [61]:
pip install tensorflow

Note: you may need to restart the kernel to use updated packages.


In [31]:
import pandas as pd
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import nltk
from nltk.corpus import stopwords
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, LSTM, Dense, Bidirectional, SpatialDropout1D

In [62]:
# Ensure NLTK stopwords are downloaded
nltk.download('stopwords')

# Load data from CSV file
df = pd.read_csv('train_stances.csv') 


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Moneykicks\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [63]:

# Keep only 'Headline' and 'Stance' columns
df = df[['Headline', 'Stance']]  

# Drop rows where 'Headline' or 'Stance' is NaN
df = df.dropna(subset=['Headline', 'Stance'])

# Ensure all entries in 'Headline' are strings
df['Headline'] = df['Headline'].astype(str)

# for full review display without truncation
pd.set_option('display.max_colwidth', None)

# Print value counts of 'Stance' column
print(df['Stance'].value_counts())

Stance
unrelated    36545
discuss       8909
agree         3678
disagree       840
Name: count, dtype: int64


In [34]:
# Preprocess the text data
stop_words = list(stopwords.words('english')) 
tfidf = TfidfVectorizer(stop_words=stop_words, max_df=0.7)

In [35]:
# Transform the text data into TF-IDF features
X = tfidf.fit_transform(df['Headline'])

# Encode the target labels
y = df['Stance']

In [67]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [68]:
# Initialize and train the Naive Bayes classifier
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

In [69]:
# Predict the stances on the test set using Naive Bayes
nb_y_pred = nb_model.predict(X_test)
# Evaluate the Naive Bayes model
nb_accuracy = accuracy_score(y_test, nb_y_pred)
nb_report = classification_report(y_test, nb_y_pred)

In [70]:
print("Naive Bayes Results:")
print(f"Accuracy: {nb_accuracy}")
print("Classification Report:")
print(nb_report)

Naive Bayes Results:
Accuracy: 0.7169584792396199
Classification Report:
              precision    recall  f1-score   support

       agree       0.24      0.06      0.10       703
    disagree       0.13      0.01      0.02       180
     discuss       0.38      0.07      0.12      1779
   unrelated       0.74      0.95      0.83      7333

    accuracy                           0.72      9995
   macro avg       0.37      0.27      0.27      9995
weighted avg       0.63      0.72      0.64      9995



In [71]:
# Initialize and train the SVM classifier
svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)

# Predict the stances on the test set using SVM
svm_y_pred = svm_model.predict(X_test)

In [72]:
# Evaluate the SVM model
svm_accuracy = accuracy_score(y_test, svm_y_pred)
svm_report = classification_report(y_test, svm_y_pred)
print("SVM Results:")
print(f"Accuracy: {svm_accuracy}")
print("Classification Report:")
print(svm_report)

SVM Results:
Accuracy: 0.7334667333666833
Classification Report:
              precision    recall  f1-score   support

       agree       0.00      0.00      0.00       703
    disagree       0.00      0.00      0.00       180
     discuss       0.00      0.00      0.00      1779
   unrelated       0.73      1.00      0.85      7333

    accuracy                           0.73      9995
   macro avg       0.18      0.25      0.21      9995
weighted avg       0.54      0.73      0.62      9995



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [73]:
# Initialize and train the Random Forest classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predict the stances on the test set using Random Forest
rf_y_pred = rf_model.predict(X_test)

# Evaluate the Random Forest model
rf_accuracy = accuracy_score(y_test, rf_y_pred)
rf_report = classification_report(y_test, rf_y_pred)
print("Random Forest Results:")
print(f"Accuracy: {rf_accuracy}")
print("Classification Report:")
print(rf_report)


Random Forest Results:
Accuracy: 0.7234617308654328
Classification Report:
              precision    recall  f1-score   support

       agree       0.21      0.01      0.02       703
    disagree       0.33      0.01      0.01       180
     discuss       0.24      0.02      0.04      1779
   unrelated       0.73      0.98      0.84      7333

    accuracy                           0.72      9995
   macro avg       0.38      0.25      0.23      9995
weighted avg       0.60      0.72      0.62      9995



In [74]:
# Initialize the accuracy_scores dictionary
accuracy_scores = {}

# Add Naive, SVM and Random Forest accuracy scores to the comparison
accuracy_scores['Naive']=nb_accuracy
accuracy_scores['SVM'] = svm_accuracy
accuracy_scores['Random Forest'] = rf_accuracy

best_model = max(accuracy_scores, key=accuracy_scores.get)
print(f"\nThe best model based on accuracy is: {best_model} with accuracy {accuracy_scores[best_model]}")


The best model based on accuracy is: SVM with accuracy 0.7334667333666833


In [124]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D, LSTM, Bidirectional, Dense, Dropout

# Load the dataset
df = pd.read_csv('train_stances.csv')

# Parameters
max_num_words = 10000
max_sequence_length = 100
embedding_dim = 100

# Extract the features and labels
X = df['Headline'].values
y = df['Stance'].values

# Encode the labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Tokenize the text
tokenizer = Tokenizer(num_words=max_num_words)
tokenizer.fit_on_texts(X)
X_sequences = tokenizer.texts_to_sequences(X)
X_padded = pad_sequences(X_sequences, maxlen=max_sequence_length)

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_padded, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)

# Define the CNN model
def create_cnn_model():
    model = Sequential()
    model.add(Embedding(max_num_words, embedding_dim, input_length=max_sequence_length))
    model.add(Conv1D(128, 5, activation='relu'))
    model.add(MaxPooling1D(5))
    model.add(Conv1D(128, 5, activation='relu'))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(4, activation='softmax'))
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

# Define the LSTM model
def create_lstm_model():
    model = Sequential()
    model.add(Embedding(max_num_words, embedding_dim, input_length=max_sequence_length))
    model.add(LSTM(128))
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(4, activation='softmax'))
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

# Define the BiLSTM model
def create_bilstm_model():
    model = Sequential()
    model.add(Embedding(max_num_words, embedding_dim, input_length=max_sequence_length))
    model.add(Bidirectional(LSTM(128)))
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(4, activation='softmax'))
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

# Create and train the CNN model
cnn_model = create_cnn_model()
cnn_model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val))

# Create and train the LSTM model
lstm_model = create_lstm_model()
lstm_model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val))

# Create and train the BiLSTM model
bilstm_model = create_bilstm_model()
bilstm_model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val))


Epoch 1/10




[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 23ms/step - accuracy: 0.7259 - loss: 0.8157 - val_accuracy: 0.7313 - val_loss: 0.7181
Epoch 2/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 25ms/step - accuracy: 0.7288 - loss: 0.7103 - val_accuracy: 0.7313 - val_loss: 0.7086
Epoch 3/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 25ms/step - accuracy: 0.7321 - loss: 0.6880 - val_accuracy: 0.7313 - val_loss: 0.7068
Epoch 4/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 23ms/step - accuracy: 0.7292 - loss: 0.6780 - val_accuracy: 0.7313 - val_loss: 0.7026
Epoch 5/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 22ms/step - accuracy: 0.7301 - loss: 0.6666 - val_accuracy: 0.7313 - val_loss: 0.7011
Epoch 6/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 22ms/step - accuracy: 0.7307 - loss: 0.6584 - val_accuracy: 0.7313 - val_loss: 0.7032
Epoch 7/10
[1m

<keras.src.callbacks.history.History at 0x2af311645f0>

In [127]:
# Function to evaluate and print the classification report and accuracy
def evaluate_model(model, X_val, y_val, model_name):
    y_pred = model.predict(X_val)
    y_pred_classes = np.argmax(y_pred, axis=1)
    report = classification_report(y_val, y_pred_classes, target_names=label_encoder.classes_)
    accuracy = accuracy_score(y_val, y_pred_classes)
    print(f"Classification Report for {model_name}:\n")
    print(report)
    print(f"Overall Accuracy for {model_name}: {accuracy:.4f}\n")

# Evaluate and print the report and accuracy for the CNN model
evaluate_model(cnn_model, X_val, y_val, "CNN")

# Evaluate and print the report and accuracy for the LSTM model
evaluate_model(lstm_model, X_val, y_val, "LSTM")

# Evaluate and print the report and accuracy for the BiLSTM model
evaluate_model(bilstm_model, X_val, y_val, "BiLSTM")

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step
Classification Report for CNN:

              precision    recall  f1-score   support

       agree       0.00      0.00      0.00       736
    disagree       0.00      0.00      0.00       168
     discuss       0.00      0.00      0.00      1782
   unrelated       0.73      1.00      0.84      7309

    accuracy                           0.73      9995
   macro avg       0.18      0.25      0.21      9995
weighted avg       0.53      0.73      0.62      9995

Overall Accuracy for CNN: 0.7313

[1m  8/313[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m4s[0m 16ms/step

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 16ms/step
Classification Report for LSTM:

              precision    recall  f1-score   support

       agree       0.00      0.00      0.00       736
    disagree       0.00      0.00      0.00       168
     discuss       0.00      0.00      0.00      1782
   unrelated       0.73      1.00      0.84      7309

    accuracy                           0.73      9995
   macro avg       0.18      0.25      0.21      9995
weighted avg       0.53      0.73      0.62      9995

Overall Accuracy for LSTM: 0.7313

[1m  7/313[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m5s[0m 19ms/step

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 19ms/step
Classification Report for BiLSTM:

              precision    recall  f1-score   support

       agree       0.00      0.00      0.00       736
    disagree       0.00      0.00      0.00       168
     discuss       0.10      0.00      0.00      1782
   unrelated       0.73      1.00      0.84      7309

    accuracy                           0.73      9995
   macro avg       0.21      0.25      0.21      9995
weighted avg       0.55      0.73      0.62      9995

Overall Accuracy for BiLSTM: 0.7305



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
