In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from tqdm import tqdm
import joblib
import os 
import re

In [10]:
# Read the cleaned fake-news CSV from the data folder.
DATA_FOLDER = "../data/fake_news/"
full_df = pd.read_csv(DATA_FOLDER + "clean_fake_news.csv")

# Hold out 20% of the rows for validation. Stratifying on the label column
# keeps the class proportions the same in both partitions, and the fixed
# random_state makes the split repeatable.
train_df, valid_df = train_test_split(
    full_df,
    test_size=0.2,
    random_state=42,
    stratify=full_df["label"],
)
train_df.head()

Unnamed: 0,title,text,subject,date,label
2649,Majority of people in France now dissatisfied ...,paris reuters most french voters are now diss...,worldnews,"August 26, 2017",REAL
29808,White SC Cops Sexually Assault Black Couple D...,a video published by the washington post shows...,News,"April 2, 2016",FAKE
29981,Organizers name TV journalists to moderate U.S...,washington reuters journalists from nbc abc c...,politicsNews,"September 2, 2016",REAL
17278,Brother of Marseille attacker arrested in Ital...,rome reuters this october 8 story has been re...,worldnews,"October 8, 2017",REAL
21329,Cameroon orders Anglophone region total lockdo...,yaounde reuters cameroon authorities on frida...,worldnews,"September 29, 2017",REAL


In [11]:

# Pull the article text and its label out of each split as plain Python lists.
train_texts, train_labels = train_df["text"].tolist(), train_df["label"].tolist()
val_texts, val_labels = valid_df["text"].tolist(), valid_df["label"].tolist()

In [12]:
# Map the string labels to integer ids. The encoder is fitted once on the
# known class names so the label -> id mapping is fixed up front instead of
# depending on which labels happen to occur in the training split.
label_encoder = LabelEncoder()
label_encoder.fit(['FAKE','REAL'])
# Use transform (not fit_transform): re-fitting on train_labels would discard
# the explicit class list above, whereas transform raises on any label that
# is not one of the fitted classes.
train_labels_enc = label_encoder.transform(train_labels)
val_labels_enc = label_encoder.transform(val_labels)
print("Enoded classes: ", label_encoder.classes_)





Enoded classes:  ['FAKE' 'REAL']


In [None]:
# Sentence-embedding model: passed to encode_in_batches() below to featurize
# the train/validation texts, and dumped to disk at the end of this cell.
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')
def encode_in_batches(sentences, model, batch_size=32):
    """Embed ``sentences`` with ``model`` one slice at a time, with a progress bar.

    Args:
        sentences: Sliceable sequence of inputs (e.g. a list of strings).
        model: Object exposing ``encode(batch, show_progress_bar=False)`` that
            returns one embedding per element of ``batch``.
        batch_size: Number of sentences handed to ``model.encode`` per call.
            Must be positive.

    Returns:
        ``np.ndarray`` built from the per-sentence embeddings, in input order.
        An empty ``sentences`` yields an empty array.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    # A negative step would make the range below empty and silently return an
    # empty array; reject it (and zero) explicitly instead.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    embeddings = []
    for start in tqdm(range(0, len(sentences), batch_size), desc="Encoding in batches"):
        batch = sentences[start:start + batch_size]
        # The model's own progress bar is disabled; tqdm above tracks batches.
        embeddings.extend(model.encode(batch, show_progress_bar=False))
    return np.array(embeddings)

# Turn the raw texts of both splits into embedding matrices.
X_train = encode_in_batches(train_texts, sbert_model)
X_val = encode_in_batches(val_texts, sbert_model)


# Train classifier
# - `use_label_encoder` is no longer passed: the recorded run of this cell
#   shows xgboost warning that the parameter is not used.
# - The target has two classes (FAKE / REAL), so the binary `logloss` metric
#   is used rather than the multiclass `mlogloss`.
clf = XGBClassifier(eval_metric='logloss')
clf.fit(X_train, train_labels_enc)

# Score the held-out split: overall accuracy first, then per-class metrics.
val_preds = clf.predict(X_val)
acc = accuracy_score(val_labels_enc, val_preds)
print("Validation Accuracy:", acc)

# Human-readable class names, in the order of the encoder's integer ids.
target_names = list(map(str, label_encoder.classes_))
report = classification_report(
    val_labels_enc,
    val_preds,
    labels=list(range(len(target_names))),
    target_names=target_names,
)
print("\nClassification Report:\n", report)
 


# Persist the fitted artifacts into the sibling `src/` folder (relative to
# the notebook's working directory).
save_path = os.path.join("..", "src")
os.makedirs(save_path, exist_ok=True)  # no error if the folder already exists

classifier_file = os.path.join(save_path, "classifier_model.joblib")
encoder_file = os.path.join(save_path, "label_encoder.joblib")
sbert_file = os.path.join(save_path, "sbert_model.joblib")

# One joblib file per artifact; the last dump stays the cell's final
# expression so its return value is what the cell displays.
joblib.dump(clf, classifier_file)
joblib.dump(label_encoder, encoder_file)
joblib.dump(sbert_model, sbert_file)


Encoding in batches: 100%|██████████| 965/965 [24:08<00:00,  1.50s/it]
Encoding in batches: 100%|██████████| 242/242 [04:46<00:00,  1.18s/it]
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Validation Accuracy: 0.9424646883503952

Classification Report:
               precision    recall  f1-score   support

        FAKE       0.94      0.94      0.94      3479
        REAL       0.95      0.95      0.95      4238

    accuracy                           0.94      7717
   macro avg       0.94      0.94      0.94      7717
weighted avg       0.94      0.94      0.94      7717



['..\\src\\sbert_model.joblib']