In [2]:
from nltk.tokenize import word_tokenize
from gensim.models import KeyedVectors
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.model_selection import cross_val_score
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from gensim.models import Word2Vec
from textblob import TextBlob
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Load the dataset
train_data = pd.read_csv('/kaggle/input/lemma-balanced/lemmatized_dataset_final_balanced_train.csv')
test_data = pd.read_csv('/kaggle/input/lemma-balanced/lemmatized_dataset_final_balanced_test.csv')

# Display the first few rows of the dataset
train_data.head()

Unnamed: 0,Headline,Body ID,Stance,articleBody,stance_cat
0,dna test confirm lebanon is holding isi leader...,2042,unrelated,there is a story currently making the round ab...,3
1,somalia shebab chief ahmed abdi godane likely ...,1610,discuss,ahmed abdi godane the leader of al shabab the ...,2
2,dna test prove lebanon is holding isi chief al...,1468,disagree,an iraqi official denied that a woman detained...,1
3,the pumpkinspice condom is just a figment of y...,1253,unrelated,the united state department of defense said on...,3
4,u probing claim isi fighter seized airdropped ...,465,discuss,the pentagon admitted on wednesday that isi di...,2


In [3]:
X_train, X_test = train_data[['Headline', 'articleBody']], test_data[['Headline', 'articleBody']]
y_train, y_test = train_data['stance_cat'], test_data['stance_cat']

In [4]:
# Count feature generator
def count_feature_generator(X_train, X_test):
    count_vectorizer = CountVectorizer()
    X_train_count = count_vectorizer.fit_transform(X_train['Headline'] + ' ' + X_train['articleBody'])
    X_test_count = count_vectorizer.transform(X_test['Headline'] + ' ' + X_test['articleBody'])
    return X_train_count, X_test_count

# TF-IDF feature generator
def tfidf_feature_generator(X_train, X_test):
    tfidf_vectorizer = TfidfVectorizer()
    X_train_tfidf = tfidf_vectorizer.fit_transform(X_train['Headline'] + ' ' + X_train['articleBody'])
    X_test_tfidf = tfidf_vectorizer.transform(X_test['Headline'] + ' ' + X_test['articleBody'])
    return X_train_tfidf, X_test_tfidf

# SVD feature generator
def svd_feature_generator(X_train, X_test, n_components=100):
    svd = TruncatedSVD(n_components=n_components)
    X_train_svd = svd.fit_transform(X_train)
    X_test_svd = svd.transform(X_test)
    return X_train_svd, X_test_svd

# Word2Vec feature generator
def word2vec_feature_generator(X_train, X_test):
    word2vec_model = Word2Vec(sentences=X_train['Headline'] + ' ' + X_train['articleBody'], vector_size=100, window=5, min_count=1, workers=4)
    X_train_word2vec = np.array([np.mean([word2vec_model.wv[word] for word in sentence.split() if word in word2vec_model.wv] or [np.zeros(100)], axis=0) for sentence in X_train['Headline'] + ' ' + X_train['articleBody']])
    X_test_word2vec = np.array([np.mean([word2vec_model.wv[word] for word in sentence.split() if word in word2vec_model.wv] or [np.zeros(100)], axis=0) for sentence in X_test['Headline'] + ' ' + X_test['articleBody']])
    return X_train_word2vec, X_test_word2vec

# Sentiment feature generator
def sentiment_feature_generator(X_train, X_test):
    X_train_sentiment = np.array([TextBlob(sentence).sentiment for sentence in X_train['Headline'] + ' ' + X_train['articleBody']])
    X_test_sentiment = np.array([TextBlob(sentence).sentiment for sentence in X_test['Headline'] + ' ' + X_test['articleBody']])
    return X_train_sentiment, X_test_sentiment


In [5]:
# Generate features
X_train_count, X_test_count = count_feature_generator(X_train, X_test)
X_train_tfidf, X_test_tfidf = tfidf_feature_generator(X_train, X_test)
X_train_svd, X_test_svd = svd_feature_generator(X_train_tfidf, X_test_tfidf)
X_train_word2vec, X_test_word2vec = word2vec_feature_generator(X_train, X_test)
X_train_sentiment, X_test_sentiment = sentiment_feature_generator(X_train, X_test)


In [10]:
# XGBoost model
xgb = XGBClassifier()

In [12]:
# Train XGBoost model with Count features
xgb_model_count = xgb
xgb_model_count.fit(X_train_count, y_train)
xgb_pred_count_test = xgb_model_count.predict(X_test_count)
xgb_accuracy_count_test = accuracy_score(y_test, xgb_pred_count_test)
print("XGBoost Accuracy (CountVectorizer - Test):", xgb_accuracy_count_test)

# Train XGBoost model with TF-IDF features
xgb_model_tfidf = xgb
xgb_model_tfidf.fit(X_train_tfidf, y_train)
xgb_pred_tfidf_test = xgb_model_tfidf.predict(X_test_tfidf)
xgb_accuracy_tfidf_test = accuracy_score(y_test, xgb_pred_tfidf_test)
print("XGBoost Accuracy (TF-IDF - Test):", xgb_accuracy_tfidf_test)

# Train XGBoost model with SVD features
xgb_model_svd = xgb
xgb_model_svd.fit(X_train_svd, y_train)
xgb_pred_svd_test = xgb_model_svd.predict(X_test_svd)
xgb_accuracy_svd_test = accuracy_score(y_test, xgb_pred_svd_test)
print("XGBoost Accuracy (SVD - Test):", xgb_accuracy_svd_test)

# Train XGBoost model with Word2Vec features
xgb_model_word2vec = xgb
xgb_model_word2vec.fit(X_train_word2vec, y_train)
xgb_pred_word2vec_test = xgb_model_word2vec.predict(X_test_word2vec)
xgb_accuracy_word2vec_test = accuracy_score(y_test, xgb_pred_word2vec_test)
print("XGBoost Accuracy (Word2Vec - Test):", xgb_accuracy_word2vec_test)

# Train XGBoost model with Sentiment features
xgb_model_sentiment = xgb
xgb_model_sentiment.fit(X_train_sentiment, y_train)
xgb_pred_sentiment_test = xgb_model_sentiment.predict(X_test_sentiment)
xgb_accuracy_sentiment_test = accuracy_score(y_test, xgb_pred_sentiment_test)
print("XGBoost Accuracy (Sentiment - Test):", xgb_accuracy_sentiment_test)

XGBoost Accuracy (CountVectorizer - Test): 0.8741154562383613
XGBoost Accuracy (TF-IDF - Test): 0.870391061452514
XGBoost Accuracy (SVD - Test): 0.8499068901303538
XGBoost Accuracy (Word2Vec - Test): 0.623463687150838
XGBoost Accuracy (Sentiment - Test): 0.6033519553072626


In [13]:
from sklearn.metrics import precision_recall_fscore_support

def evaluate_xgboost_model(X_train, X_test, y_train, y_test, feature_name):
    # Train XGBoost model
    xgb_model.fit(X_train, y_train)
    
    # Predictions
    xgb_pred_test = xgb_model.predict(X_test)
    
    # Accuracy for test set
    accuracy_test = accuracy_score(y_test, xgb_pred_test)
    print("XGBoost Accuracy ({0} - Test): {1}".format(feature_name, accuracy_test))
    
    # Calculate precision, recall, F1-score for test set
    precision_test, recall_test, f1_score_test, _ = precision_recall_fscore_support(y_test, xgb_pred_test, average=None)
    
    # Print class-wise F1 scores for test set
    for i in range(len(precision_test)):
        print(f"Class {i} - Precision (Test): {precision_test[i]}, Recall (Test): {recall_test[i]}, F1-score (Test): {f1_score_test[i]}")
    
    # Calculate overall metrics for test set
    macro_precision_test = precision_test.mean()
    macro_recall_test = recall_test.mean()
    macro_f1_score_test = f1_score_test.mean()
    
    # Print overall metrics for test set
    print("Macro Precision (Test):", macro_precision_test)
    print("Macro Recall (Test):", macro_recall_test)
    print("Macro F1 Score (Test):", macro_f1_score_test)

# Evaluate XGBoost models with different feature engineering types
evaluate_xgboost_model(X_train_count, X_test_count, y_train, y_test, "CountVectorizer")
evaluate_xgboost_model(X_train_tfidf, X_test_tfidf, y_train, y_test, "TF-IDF")
evaluate_xgboost_model(X_train_svd, X_test_svd, y_train, y_test, "SVD")
evaluate_xgboost_model(X_train_word2vec, X_test_word2vec, y_train, y_test, "Word2Vec")
evaluate_xgboost_model(X_train_sentiment, X_test_sentiment, y_train, y_test, "Sentiment")

XGBoost Accuracy (CountVectorizer - Test): 0.8741154562383613
Class 0 - Precision (Test): 0.7763578274760383, Recall (Test): 0.6603260869565217, F1-score (Test): 0.7136563876651982
Class 1 - Precision (Test): 0.6545454545454545, Recall (Test): 0.42857142857142855, F1-score (Test): 0.5179856115107914
Class 2 - Precision (Test): 0.8274111675126904, Recall (Test): 0.9147025813692481, F1-score (Test): 0.8688699360341151
Class 3 - Precision (Test): 0.9406906906906907, Recall (Test): 0.9336810730253353, F1-score (Test): 0.9371727748691099
Macro Precision (Test): 0.7997512850562185
Macro Recall (Test): 0.7343202924806334
Macro F1 Score (Test): 0.7594211775198036
XGBoost Accuracy (TF-IDF - Test): 0.870391061452514
Class 0 - Precision (Test): 0.7531645569620253, Recall (Test): 0.6467391304347826, F1-score (Test): 0.6959064327485379
Class 1 - Precision (Test): 0.6071428571428571, Recall (Test): 0.40476190476190477, F1-score (Test): 0.4857142857142857
Class 2 - Precision (Test): 0.834375, Recall 

In [14]:
# Comparing accuracies
test_accuracies = {
    "XGBoost (CountVectorizer)": xgb_accuracy_count_test,
    "XGBoost (TF-IDF)": xgb_accuracy_tfidf_test,
    "XGBoost (SVD)": xgb_accuracy_svd_test,
    "XGBoost (Word2Vec)": xgb_accuracy_word2vec_test,
    "XGBoost (Sentiment)": xgb_accuracy_sentiment_test,
}

best_model_test = max(test_accuracies, key=test_accuracies.get)
print("Best model based on Test Accuracy:", best_model_test)
print("Test Accuracy of the best model:", test_accuracies[best_model_test])


Best model based on Test Accuracy: XGBoost (CountVectorizer)
Test Accuracy of the best model: 0.8741154562383613
