In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
from transformers import BertTokenizer, BertModel
from sentence_transformers import SentenceTransformer

In [2]:
df = pd.read_csv('train.csv')

In [9]:
# Remove NaN or non-string entries
df = df.dropna(subset=["text"])        # drop rows with null text
df["text"] = df["text"].astype(str)    # ensure all are strings

In [10]:
def split_text_into_chunks(text, max_words=400):
    """
    Splits a long text into smaller chunks of max_words each.
    """
    words = text.split()
    chunks = []
    for i in range(0, len(words), max_words):
        chunk = " ".join(words[i:i+max_words])
        chunks.append(chunk)
    return chunks

In [11]:
expanded_texts = []
expanded_labels = []

for text, label in zip(df["text"], df["label"]):
    chunks = split_text_into_chunks(text, max_words=400)
    expanded_texts.extend(chunks)
    expanded_labels.extend([label] * len(chunks))

# Convert back to DataFrame
expanded_df = pd.DataFrame({"text": expanded_texts, "label": expanded_labels})
print("Original size:", len(df))
print("Expanded size:", len(expanded_df))

Original size: 20761
Expanded size: 50262


In [12]:
X_train, X_test, y_train, y_test = train_test_split(
    expanded_df["text"], expanded_df["label"], 
    test_size=0.2, random_state=42, stratify=expanded_df["label"]
)

In [13]:
sbert_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

In [14]:
# Encode with SBERT
X_train_embeddings = sbert_model.encode(X_train.tolist(), convert_to_numpy=True, show_progress_bar=True)
print("Embedding shape:", X_train_embeddings.shape)

Batches: 100%|██████████| 1257/1257 [02:50<00:00,  7.36it/s]


Embedding shape: (40209, 384)


In [15]:
X_test_embeddings  = sbert_model.encode(X_test.tolist(), convert_to_numpy=True, show_progress_bar=True)
print(X_test_embeddings.shape)

Batches: 100%|██████████| 315/315 [00:42<00:00,  7.33it/s]

(10053, 384)





In [16]:
# Save the embeddings
np.save("X_train_embeddings.npy", X_train_embeddings)
np.save("X_val_embeddings.npy", X_test_embeddings)

# Load back
X_train_embeddings = np.load("X_train_embeddings.npy")
X_test_embeddings = np.load("X_val_embeddings.npy")

# Classify using Logistic Regression

In [17]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(max_iter=500)
clf.fit(X_train_embeddings, y_train)

In [18]:
from sklearn.metrics import classification_report, accuracy_score

y_pred = clf.predict(X_test_embeddings)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.8338804337013827
              precision    recall  f1-score   support

           0       0.84      0.87      0.85      5596
           1       0.83      0.79      0.81      4457

    accuracy                           0.83     10053
   macro avg       0.83      0.83      0.83     10053
weighted avg       0.83      0.83      0.83     10053



# Use XGboost

In [27]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

In [28]:
xgb_clf = XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    tree_method="hist",
    device="cuda",
    eval_metric="logloss"
)

In [29]:
# Train
xgb_clf.fit(X_train_embeddings, y_train)

# Predict
y_pred_xgb = xgb_clf.predict(X_test_embeddings)

print("XGBoost Accuracy:", accuracy_score(y_test, y_pred_xgb))

XGBoost Accuracy: 0.864319108723764


In [32]:
import joblib

# Save
joblib.dump(xgb_clf, "xgb_fake_news.pkl")

['xgb_fake_news.pkl']