In [282]:
import pandas as pd
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.neural_network import MLPClassifier

In [292]:
df = pd.read_csv("../data/processed/clean_fake_news_dataset.csv")

In [294]:
df.head()

Unnamed: 0,title,text,subject,date,label,clean_text
0,Ben Stein Calls Out 9th Circuit Court: Committ...,"21st Century Wire says Ben Stein, reputable pr...",US_News,"February 13, 2017",0,st century wire says ben stein reputable profe...
1,Trump drops Steve Bannon from National Securit...,WASHINGTON (Reuters) - U.S. President Donald T...,politicsNews,"April 5, 2017",1,washington reuters u s president donald trump ...
2,Puerto Rico expects U.S. to lift Jones Act shi...,(Reuters) - Puerto Rico Governor Ricardo Rosse...,politicsNews,"September 27, 2017",1,reuters puerto rico governor ricardo rossello ...
3,OOPS: Trump Just Accidentally Confirmed He Le...,"On Monday, Donald Trump once again embarrassed...",News,"May 22, 2017",0,on monday donald trump once again embarrassed ...
4,Donald Trump heads for Scotland to reopen a go...,"GLASGOW, Scotland (Reuters) - Most U.S. presid...",politicsNews,"June 24, 2016",1,glasgow scotland reuters most u s presidential...


In [298]:
df["clean_text"] = df["clean_text"].fillna("")

In [300]:
df["clean_text"].isna().sum()

0

In [143]:
df = df[df["clean_text"] != ""].reset_index(drop=True)

In [145]:
X = df["clean_text"]

In [147]:
y = df["label"]

In [151]:
tfidf = TfidfVectorizer(
    max_features = 5000,
    ngram_range = (1, 2),
    stop_words = "english"
)

In [153]:
X_tfidf = tfidf.fit_transform(X)

In [155]:
pd.DataFrame(
    X_tfidf.todense(),
    columns=tfidf.get_feature_names_out()
)

Unnamed: 0,abadi,abandon,abandoned,abc,abc news,abdullah,abe,abedin,ability,able,...,youth,youtube,zealand,zero,zika,zimbabwe,zone,zones,zor,zuma
0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44178,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
44179,0.0,0.0,0.0,0.0,0.0,0.0,0.332326,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
44180,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
44181,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.033866,0.0,0.0,0.0


In [157]:
(X_tfidf != 0).sum(axis=1).mean()

128.75323088065545

In [167]:
joblib.dump(tfidf, "../models/tfidf_vectorizer.pkl")

['../models/tfidf_vectorizer.pkl']

In [179]:
"Feature Shape:", X_tfidf.shape

('Feature Shape:', (44183, 5000))

In [184]:
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, y, test_size=0.2, random_state=42, stratify=y
)

In [186]:
X_train.shape, X_test.shape

((35346, 5000), (8837, 5000))

In [190]:
svm_model = SVC(kernel='linear', probability=True, random_state=42)

In [192]:
svm_model.fit(X_train, y_train)

In [193]:
joblib.dump(svm_model, "../models/svm_model.pkl")

['../models/svm_model.pkl']

In [240]:
y_pred = svm_model.predict(X_test)

In [241]:
acc = accuracy_score(y_test, y_pred)

In [242]:
f1 = f1_score(y_test, y_pred)

In [243]:
report = classification_report(y_test, y_pred)

In [244]:
cm = confusion_matrix(y_test, y_pred)

In [245]:
print("Accuracy:", acc)
print("F1 Score:", f1)
print("Classification Report:", report)
print("Confusion Matrix:", cm)

Accuracy: 0.9956998981554827
F1 Score: 0.9955679962677864
Classification Report:               precision    recall  f1-score   support

           0       1.00      0.99      1.00      4554
           1       0.99      1.00      1.00      4283

    accuracy                           1.00      8837
   macro avg       1.00      1.00      1.00      8837
weighted avg       1.00      1.00      1.00      8837

Confusion Matrix: [[4531   23]
 [  15 4268]]


In [218]:
mlp_model = MLPClassifier(hidden_layer_sizes=(128,64), max_iter=300, random_state=42)

In [220]:
mlp_model.fit(X_train, y_train)

In [222]:
joblib.dump(mlp_model, "../models/mlp_model.pkl")

['../models/mlp_model.pkl']

In [264]:
y_pred = mlp_model.predict(X_test)

In [266]:
acc = accuracy_score(y_test, y_pred)

In [268]:
f1 = f1_score(y_test, y_pred)

In [270]:
report = classification_report(y_test, y_pred)

In [272]:
cm = confusion_matrix(y_test, y_pred)

In [274]:
print("Accuracy:", acc)
print("F1 Score:", f1)
print("Classification Report:", report)
print("Confusion Matrix:", cm)

Accuracy: 0.990720832861831
F1 Score: 0.9904026217228464
Classification Report:               precision    recall  f1-score   support

           0       0.99      0.99      0.99      4554
           1       0.99      0.99      0.99      4283

    accuracy                           0.99      8837
   macro avg       0.99      0.99      0.99      8837
weighted avg       0.99      0.99      0.99      8837

Confusion Matrix: [[4524   30]
 [  52 4231]]


In [256]:
metrics_SVC = pd.DataFrame([{
    "accuracy" : acc,
    "f1_score" : f1
}])

In [260]:
metrics_SVC

Unnamed: 0,accuracy,f1_score
0,0.9957,0.995568


In [262]:
metrics_SVC.to_csv("../results/metrics_SVC.csv", index=False)

In [276]:
metrics_MLP = pd.DataFrame([{
    "accuracy" : acc,
    "f1_score" : f1
}])

In [278]:
metrics_MLP

Unnamed: 0,accuracy,f1_score
0,0.990721,0.990403


In [280]:
metrics_MLP.to_csv("../results/metrics_MLP.csv", index=False)