<a href="https://colab.research.google.com/github/kuheli31/Fake-News-Detector/blob/main/True_Fake_News_Algorithm_checking.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# --- Import Libraries ---
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# --- Load Datasets ---
data_fake = pd.read_csv('Fake.csv', quoting=3, encoding='utf-8', on_bad_lines='skip')
data_true = pd.read_csv('True.csv', quoting=3, encoding='utf-8', on_bad_lines='skip')

# --- Label Data ---
data_fake['class'] = 0
data_true['class'] = 1

# --- Drop unnecessary columns ---
data_fake = data_fake.drop(['title', 'subject', 'date'], axis=1, errors='ignore')
data_true = data_true.drop(['title', 'subject', 'date'], axis=1, errors='ignore')

# --- Merge and shuffle ---
data = pd.concat([data_fake, data_true], axis=0)
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

# --- Text Preprocessing ---
def wordopt(text):
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', ' ', text)
    # Keep numbers as they may help in classification
    text = re.sub('\s+', ' ', text).strip()
    return text

data = data.dropna(subset=['text'])
data['text'] = data['text'].apply(wordopt)

# --- Features and Labels ---
x = data['text']
y = data['class']

# --- Train/Test Split ---
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=42, stratify=y
)

# --- TF-IDF Vectorization ---
vectorization = TfidfVectorizer(max_features=5000)  # limit vocab to top 5000 words
xv_train = vectorization.fit_transform(x_train)
xv_test = vectorization.transform(x_test)

# --- Model Training ---
# Logistic Regression
LR = LogisticRegression(class_weight='balanced', max_iter=1000)
LR.fit(xv_train, y_train)
pred_lr = LR.predict(xv_test)
print("Logistic Regression Accuracy:", accuracy_score(y_test, pred_lr))
print(classification_report(y_test, pred_lr))

# Decision Tree
DT = DecisionTreeClassifier()
DT.fit(xv_train, y_train)
pred_dt = DT.predict(xv_test)
print("Decision Tree Accuracy:", accuracy_score(y_test, pred_dt))
print(classification_report(y_test, pred_dt))

# Gradient Boosting
GB = GradientBoostingClassifier(random_state=42)
GB.fit(xv_train, y_train)
pred_gb = GB.predict(xv_test)
print("Gradient Boosting Accuracy:", accuracy_score(y_test, pred_gb))
print(classification_report(y_test, pred_gb))

# Random Forest
RF = RandomForestClassifier(class_weight='balanced', random_state=42)
RF.fit(xv_train, y_train)
pred_rf = RF.predict(xv_test)
print("Random Forest Accuracy:", accuracy_score(y_test, pred_rf))
print(classification_report(y_test, pred_rf))

# --- Manual Testing Function ---
def manual_testing(news):
    news_df = pd.DataFrame({"text": [news]})
    news_df['text'] = news_df['text'].apply(wordopt)
    new_x_test = vectorization.transform(news_df['text'])

    pred_LR = LR.predict(new_x_test)
    pred_DT = DT.predict(new_x_test)
    pred_GB = GB.predict(new_x_test)
    pred_RF = RF.predict(new_x_test)

    print("\nManual Testing Results:")
    print("Logistic Regression Prediction:", "Real" if pred_LR[0]==1 else "Fake")
    print("Decision Tree Prediction:", "Real" if pred_DT[0]==1 else "Fake")
    print("Gradient Boosting Prediction:", "Real" if pred_GB[0]==1 else "Fake")
    print("Random Forest Prediction:", "Real" if pred_RF[0]==1 else "Fake")

# --- Example Manual Test ---
news = str(input("Enter news text: "))
manual_testing(news)

  text = re.sub('\[.*?\]', '', text)
  text = re.sub('https?://\S+|www\.\S+', '', text)
  text = re.sub('\s+', ' ', text).strip()


Logistic Regression Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         4
           1       1.00      1.00      1.00         1

    accuracy                           1.00         5
   macro avg       1.00      1.00      1.00         5
weighted avg       1.00      1.00      1.00         5

Decision Tree Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         4
           1       1.00      1.00      1.00         1

    accuracy                           1.00         5
   macro avg       1.00      1.00      1.00         5
weighted avg       1.00      1.00      1.00         5

Gradient Boosting Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         4
           1       1.00      1.00      1.00         1

    accuracy                           1.00         5
   macro avg       1.00      1.00 