In [75]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve
import numpy as np
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import hstack
import matplotlib.pyplot as plt

In [76]:
file_path = "../data/train.jsonl"

In [77]:
data = []
with open(file_path, "r", encoding="utf-8") as f:
    for line in f:
        data.append(json.loads(line))
df = pd.DataFrame(data)

In [78]:
df_filtered = df[(df["verifiable"] == "VERIFIABLE") & (df["label"].isin(["SUPPORTS", "REFUTES"]))].copy()
df_filtered = df_filtered[["claim", "label"]]

In [79]:
excel_filename = "data/filtered_fever_data.xlsx"
df_filtered.to_excel(excel_filename, index=False)

OSError: Cannot save file into a non-existent directory: 'data'

In [None]:
# Train-test split (70-30 ratio)
X_train, X_test, y_train, y_test = train_test_split(df_filtered["claim"], df_filtered["label"], test_size=0.3, random_state=42)

### 1. TF-IDF Vectorization ###

In [None]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000, ngram_range = (1,3))
# tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [None]:
# Train and evaluate Naive Bayes model
model_tfidf = MultinomialNB(alpha = 1)
model_tfidf.fit(X_train_tfidf, y_train)
y_pred_tfidf = model_tfidf.predict(X_test_tfidf)

# print("\n=== TF-IDF Model Performance ===")
# print("Accuracy:", accuracy_score(y_test, y_pred_tfidf))
# print(classification_report(y_test, y_pred_tfidf))



=== TF-IDF Model Performance ===
Accuracy: 0.7740946483319673
              precision    recall  f1-score   support

     REFUTES       0.86      0.21      0.33      8987
    SUPPORTS       0.77      0.99      0.86     23956

    accuracy                           0.77     32943
   macro avg       0.81      0.60      0.60     32943
weighted avg       0.79      0.77      0.72     32943



In [None]:
# Evaluate the model
print("\n=== TF-IDF Model Performance ===")
accuracy = round(accuracy_score(y_test, y_pred_tfidf), 5)
print("Accuracy:", accuracy)

print("Classification Report:")
print(classification_report(y_test, y_pred_tfidf, digits=5))

print("EDA and Fake News Classification completed successfully!")

In [None]:
y_proba = model_tfidf.predict_proba(X_test_tfidf)[:, 1]
roc_auc = round(roc_auc_score(y_test, y_proba), 5)
print("ROC AUC Score:", roc_auc)

In [None]:
conf_matrix = confusion_matrix(y_test, y_pred_tfidf)
plt.figure()
plt.matshow(conf_matrix, cmap='Blues')
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.colorbar()
plt.xticks([0, 1])
plt.yticks([0, 1])
plt.grid(False)
for (i, j), value in np.ndenumerate(conf_matrix):
    plt.text(j, i, f'{value}', ha='center', va='center')
plt.show()

In [None]:
label_map = {'REFUTES': 0, 'SUPPORTS': 1}
y_test_bin = y_test.map(label_map)
fpr, tpr, _ = roc_curve(y_test_bin, y_proba)
roc_auc = round(roc_auc_score(y_test_bin, y_proba), 5)

plt.figure()
plt.plot(fpr, tpr, label=f"ROC Curve (AUC = {roc_auc})")
plt.plot([0, 1], [0, 1], linestyle="--")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend()
plt.grid(True)
plt.show()