In [43]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from tqdm import tqdm
import joblib
import os 
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize



In [44]:
nltk.download('punkt', download_dir='nltk_data') 
nltk.data.path.append('./nltk_data') 
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

[nltk_data] Downloading package punkt to nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [45]:


def preprocess_text(text):
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    words = word_tokenize(str(text))
    cleaned = [lemmatizer.lemmatize(word.lower()) for word in words if word.isalpha() and word.lower() not in stop_words]
    return ' '.join(cleaned)

In [None]:

DATA_FOLDER = "../data/fake_news/"
full_df = pd.read_csv(DATA_FOLDER + "clean_fake_news.csv")

full_df['text'] = full_df['text'].apply(preprocess_text)






In [50]:

train_df, valid_df = train_test_split(full_df, test_size=0.2, stratify=full_df['label'], random_state=42)

train_df.head()

Unnamed: 0,title,text,subject,date,label
2649,Majority of people in France now dissatisfied ...,paris reuters french voter dissatisfied emmanu...,worldnews,"August 26, 2017",REAL
29808,White SC Cops Sexually Assault Black Couple D...,video published washington post show white pol...,News,"April 2, 2016",FAKE
29981,Organizers name TV journalists to moderate U.S...,washington reuters journalist nbc abc cnn fox ...,politicsNews,"September 2, 2016",REAL
17278,Brother of Marseille attacker arrested in Ital...,rome reuters october story refiled correct nam...,worldnews,"October 8, 2017",REAL
21329,Cameroon orders Anglophone region total lockdo...,yaounde reuters cameroon authority friday bann...,worldnews,"September 29, 2017",REAL


In [51]:
train_texts=train_df['text'].tolist()
train_labels = train_df['label'].tolist()


val_texts = valid_df['text'].tolist()
val_labels = valid_df['label'].tolist()

In [52]:
label_encoder = LabelEncoder()
label_encoder.fit(['FAKE','REAL'])
train_labels_enc = label_encoder.fit_transform(train_labels)
val_labels_enc = label_encoder.transform(val_labels)
print("Enoded classes: ", label_encoder.classes_)





Enoded classes:  ['FAKE' 'REAL']


In [49]:
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')
def encode_in_batches(sentences, model, batch_size=32):
    embeddings = []
    for i in tqdm(range(0, len(sentences), batch_size), desc="Encoding in batches"):
        batch = sentences[i:i+batch_size]
        emb = model.encode(batch, show_progress_bar=False)
        embeddings.extend(emb)
    return np.array(embeddings)

X_train = encode_in_batches(train_texts, sbert_model)
X_val = encode_in_batches(val_texts, sbert_model)

param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0]
}


xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    verbose=2,
    n_jobs=-1
)

grid_search.fit(X_train, train_labels_enc)

clf = grid_search.best_estimator_

val_preds = clf.predict(X_val)
acc = accuracy_score(val_labels_enc, val_preds)
print("Validation Accuracy:", acc)

target_names = [str(cls) for cls in label_encoder.classes_]

print("\nClassification Report:\n", classification_report(
    val_labels_enc, val_preds,
    labels=list(range(len(target_names))),
    target_names=target_names
))
 



save_path = os.path.join("..", "src")

os.makedirs(save_path, exist_ok=True)

joblib.dump(clf, os.path.join(save_path, "classifier_model.joblib"))
joblib.dump(label_encoder, os.path.join(save_path, "label_encoder.joblib"))
joblib.dump(sbert_model, os.path.join(save_path, "sbert_model.joblib"))


Encoding in batches: 100%|██████████| 965/965 [51:00<00:00,  3.17s/it]  
Encoding in batches: 100%|██████████| 242/242 [11:53<00:00,  2.95s/it]


Fitting 3 folds for each of 36 candidates, totalling 108 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Validation Accuracy: 0.9596993650382273

Classification Report:
               precision    recall  f1-score   support

        FAKE       0.96      0.95      0.95      3479
        REAL       0.96      0.97      0.96      4238

    accuracy                           0.96      7717
   macro avg       0.96      0.96      0.96      7717
weighted avg       0.96      0.96      0.96      7717



['..\\src\\sbert_model.joblib']