In [None]:
!pip install --upgrade scikit-learn==1.2.2
!pip install numpy pandas nltk matplotlib ipywidgets imbalanced-learn shap requests


import pandas as pd
import numpy as np
import re
import nltk
import joblib
import requests
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, balanced_accuracy_score, ConfusionMatrixDisplay
from sklearn.calibration import CalibrationDisplay, CalibratedClassifierCV
from imblearn.under_sampling import RandomUnderSampler
import ipywidgets as widgets
from IPython.display import display, clear_output
import shap

# 2. NLTK Resources
# Download each NLTK data package in turn; the tokenizer, stop-word list and
# WordNet lemmatizer used further down read from these.
for resource_name in ('punkt', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource_name)

# 3. Data Loading and Validation
def validate_data(df):
    """Check that a news DataFrame has a fully populated 'text' column.

    Args:
        df: DataFrame to inspect.

    Returns:
        The same DataFrame object, unchanged, when both checks pass.

    Raises:
        ValueError: if there is no 'text' column, or if any entry in it is null.
    """
    column_present = 'text' in df.columns
    if not column_present:
        raise ValueError("Dataset missing 'text' column")

    has_missing_text = df['text'].isnull().any()
    if has_missing_text:
        raise ValueError("Dataset contains missing text values")

    return df

# Read and validate both corpora, then tag them: 0 = fake, 1 = real.
fake_frame = pd.read_csv("Fake.csv")
df_fake = validate_data(fake_frame)
real_frame = pd.read_csv("True.csv")
df_real = validate_data(real_frame)
df_fake["label"] = 0  # Fake
df_real["label"] = 1  # Real

# Stack the two corpora and shuffle the rows reproducibly.
combined = pd.concat([df_fake, df_real], ignore_index=True)
df = combined.sample(frac=1, random_state=42)

# Unparseable dates are coerced to NaT; NaT fails the comparison below, so
# those rows are dropped along with anything dated on or before 2010-01-01.
df['date'] = pd.to_datetime(df['date'], errors='coerce')
is_recent = df['date'] > '2010-01-01'
df = df[is_recent]

# 4. Enhanced Preprocessing
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    """Normalise raw article text into a space-joined string of lemmas.

    Steps: coerce to ``str``, blank out every character that is not an ASCII
    letter, digit, space or one of ``.,;:!?'"``, lower-case, tokenise with
    ``nltk.word_tokenize``, drop English stop words and tokens of two
    characters or fewer, then lemmatise what is left.

    Args:
        text: Raw text; any value is accepted and passed through ``str()``.

    Returns:
        A single string of the surviving lemmas separated by one space
        (empty string when nothing survives).
    """
    # Substitute a space (not the empty string) for disallowed characters so
    # that words separated only by a newline, tab or dash are not fused into
    # one bogus token, e.g. "end\nstart" must not become "endstart".
    text = re.sub(r'[^a-zA-Z0-9.,;:!?\'" ]', ' ', str(text)).lower()
    tokens = nltk.word_tokenize(text)
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in tokens
        if word not in stop_words and len(word) > 2
    )

df["cleaned_text"] = df["text"].apply(preprocess)

# 5. Feature Engineering and Train/Test Split
# Split the cleaned text FIRST and fit TF-IDF on the training portion only.
# Fitting the vectorizer on the whole corpus would let the held-out documents
# influence the vocabulary and IDF weights, which leaks test information into
# training and inflates the reported scores.
y = df["label"]
text_train, text_test, y_train, y_test = train_test_split(
    df["cleaned_text"], y, test_size=0.2, random_state=42, stratify=y
)

tfidf = TfidfVectorizer(
    max_features=10000,
    ngram_range=(1, 2)
)
X_train = tfidf.fit_transform(text_train)
X_test = tfidf.transform(text_test)

# Full-corpus matrix, kept only so the name `X` stays defined for any code
# that still refers to it; it is built with the train-fitted vectorizer and
# can be removed if nothing else uses it.
X = tfidf.transform(df["cleaned_text"])

# 6. Class Balancing on Training Data
# Undersample only the training split so the test set keeps its natural
# class distribution.
undersampler = RandomUnderSampler(random_state=42)
X_train_bal, y_train_bal = undersampler.fit_resample(X_train, y_train)

# 7. Custom Logistic Regression
class BalancedLogisticRegression(LogisticRegression):
    """LogisticRegression that defaults to inverse-class-frequency weights.

    When ``fit`` is called without ``sample_weight``, every sample is given
    the weight ``n_samples / count(its class)``. Works for any set of class
    labels, not only 0 and 1.
    """

    def fit(self, X, y, sample_weight=None):
        """Fit the model, deriving per-sample weights when none are given.

        Args:
            X: Training features, forwarded unchanged to the parent ``fit``.
            y: Class labels, one per row of ``X``.
            sample_weight: Optional explicit per-sample weights. When
                provided they are used as-is and no balancing is applied.

        Returns:
            Whatever ``LogisticRegression.fit`` returns (the fitted estimator).
        """
        if sample_weight is None:
            # np.unique yields, for every sample, the index of its class plus
            # the per-class counts, so the weights are computed in one
            # vectorised step without assuming which labels exist.
            _, class_index, class_counts = np.unique(
                np.asarray(y).ravel(), return_inverse=True, return_counts=True
            )
            sample_weight = (len(class_index) / class_counts)[class_index]
        return super().fit(X, y, sample_weight=sample_weight)

# 8. Hyperparameter Tuning
# Reserve part of the balanced training data for calibration. With
# cv='prefit' the calibrator must be fitted on samples the classifier has not
# been trained on; calibrating on the training samples themselves learns the
# mapping from over-confident scores and yields biased probabilities.
X_fit, X_calib, y_fit, y_calib = train_test_split(
    X_train_bal, y_train_bal, test_size=0.2, random_state=42, stratify=y_train_bal
)

lr_params = {'C': [0.1, 1, 10], 'class_weight': [None, 'balanced']}
lr = GridSearchCV(
    BalancedLogisticRegression(max_iter=1000),
    lr_params,
    cv=3,
    scoring='accuracy'
)
lr.fit(X_fit, y_fit)
best_model = lr.best_estimator_

# 9. Model Calibration
# best_model is already fitted (refit by GridSearchCV), so only the
# calibration map is learned here, on the disjoint hold-out split.
calibrated_model = CalibratedClassifierCV(best_model, cv='prefit')
calibrated_model.fit(X_calib, y_calib)

# 10. Save Models
# Persist the calibrated classifier and the fitted vectorizer side by side.
for artifact, filename in (
    (calibrated_model, "news_model.pkl"),
    (tfidf, "tfidf_vectorizer.pkl"),
):
    joblib.dump(artifact, filename)

# 11. Evaluation
print("=== Model Performance ===")
print(f"Best Parameters: {lr.best_params_}")

# Score the held-out split once and reuse the predictions for both metrics.
test_predictions = calibrated_model.predict(X_test)
test_accuracy = accuracy_score(y_test, test_predictions)
print(f"Test Accuracy: {test_accuracy:.2%}")
test_balanced_accuracy = balanced_accuracy_score(y_test, test_predictions)
print(f"Balanced Accuracy: {test_balanced_accuracy:.2%}")

# Left panel: confusion matrix. Right panel: reliability (calibration) curve.
fig, axes = plt.subplots(1, 2, figsize=(18, 6))
ax1, ax2 = axes

ConfusionMatrixDisplay.from_estimator(calibrated_model, X_test, y_test, ax=ax1)
ax1.set_title("Confusion Matrix")

CalibrationDisplay.from_estimator(calibrated_model, X_test, y_test, n_bins=10, ax=ax2)
ax2.set_title("Calibration Curve")

plt.show()

# 12. Fact-Checking API Integration
API_KEY = 'add your google api'  # <-- Replace with your actual API key
API_ENDPOINT = "https://factchecktools.googleapis.com/v1alpha1/claims:search"

def check_fact_claims(text, timeout=10):
    """Query the fact-check claims search endpoint for ``text``.

    Args:
        text: Query string sent as the ``query`` parameter.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled connection cannot hang the caller indefinitely.

    Returns:
        * a non-empty list of claim dicts when the service returns matches,
        * ``None`` when the response contains no claims,
        * a ``str`` starting with ``"API Error: "`` when the request fails.

    The error string deliberately omits the exception message: messages from
    ``requests`` embed the full request URL, which includes the ``key``
    query parameter, and would leak the API key into displayed output.
    """
    params = {'query': text, 'key': API_KEY}
    try:
        response = requests.get(API_ENDPOINT, params=params, timeout=timeout)
        response.raise_for_status()
        payload = response.json()
    except requests.HTTPError as e:
        status = e.response.status_code if e.response is not None else "unknown"
        return f"API Error: HTTP {status}"
    except requests.RequestException as e:
        # Connection failures, timeouts, and similar transport errors.
        return f"API Error: {type(e).__name__}"
    except ValueError:
        # Body was not valid JSON (older requests versions raise ValueError).
        return "API Error: response was not valid JSON"

    # Guard against an unexpected non-object JSON body instead of crashing.
    claims = payload.get('claims', []) if isinstance(payload, dict) else []
    return claims if claims else None

def get_source_credibility(text):
    """Label text by whether it mentions one of a fixed set of news agencies.

    Performs a case-insensitive substring search for 'reuters',
    'associated press' or 'bbc'.

    Returns:
        "Verified Source" if any of the names occurs in ``text``,
        otherwise "Unverified Source".
    """
    lowered = text.lower()
    for outlet in ('reuters', 'associated press', 'bbc'):
        if outlet in lowered:
            return "Verified Source"
    return "Unverified Source"

# 13. Interactive Interface
# Free-text box for the article to be analysed.
input_text = widgets.Textarea(
    placeholder='Paste news article here...',
    layout=widgets.Layout(width='90%', height='200px'),
)

# Button that triggers the analysis callback.
analyze_btn = widgets.Button(
    description="Analyze Article",
    button_style='success',
    layout=widgets.Layout(width='200px'),
)

# Output area that captures everything the callback prints or plots.
output = widgets.Output()

def _show_shap_waterfall(linear_model, vector, feature_names):
    """Render a SHAP waterfall plot for one TF-IDF row under ``linear_model``."""
    explainer = shap.LinearExplainer(linear_model, X_train_bal)
    row_values = np.asarray(explainer.shap_values(vector))[0]
    # shap.plots.waterfall requires an Explanation object, not a raw array of
    # SHAP values, so wrap the single row together with its base value.
    explanation = shap.Explanation(
        values=row_values,
        base_values=float(np.ravel(explainer.expected_value)[0]),
        data=vector.toarray()[0],
        feature_names=list(feature_names),
    )
    # Let SHAP call plt.show() itself (show defaults to True) so the figure is
    # actually rendered into the active output area.
    shap.plots.waterfall(explanation, max_display=10)


def _print_fact_check_findings(fact_check):
    """Print up to three claims from ``check_fact_claims``, or its error/empty result."""
    if isinstance(fact_check, str):
        # A string result is an error message from the API helper.
        print(fact_check)
        return
    if not fact_check:
        print("No related fact-checks found")
        return
    for claim in fact_check[:3]:
        print(f"- Claim: {claim.get('text', 'N/A')}")
        print(f"  By: {claim.get('claimant', 'Unknown')}")
        if claim.get('claimReview'):
            review = claim['claimReview'][0]
            print(f"  Verdict: {review.get('textualRating', 'N/A')}")
            print(f"  Source: {review.get('publisher', {}).get('name', 'N/A')}")
            print(f"  URL: {review.get('url', 'N/A')}\n")


def on_analyze_click(b):
    """Button callback: classify the pasted article and print a report.

    Reads the text area, runs source heuristics, the calibrated classifier,
    the fact-check lookup and a SHAP explanation, and writes everything into
    the ``output`` widget.

    Args:
        b: The object passed by the button's click event; not used.
    """
    with output:
        clear_output()
        text = input_text.value.strip()
        if not text:
            print("Error: Please enter a news article!")
            return

        source_status = get_source_credibility(text)
        clean_text = preprocess(text)
        vector = tfidf.transform([clean_text])
        pred = calibrated_model.predict(vector)[0]
        # Confidence is the probability of the PREDICTED class. Reporting
        # P(real) unconditionally would show e.g. "3%" next to a confident
        # FAKE verdict.
        confidence = calibrated_model.predict_proba(vector)[0].max()

        fact_check = check_fact_claims(text)
        print(f"\n{'═'*60}")
        print(f"📰 Source: {source_status}")
        print(f"🔍 Prediction: {'FAKE NEWS 🔴' if pred == 0 else 'REAL NEWS 🟢'}")
        print(f"📊 Confidence: {confidence*100:.2f}%")
        print(f"{'═'*60}\n")

        # The wrapped, fitted classifier lives on `.estimator`; in the
        # scikit-learn version pinned by this notebook `.base_estimator` is
        # only a deprecation placeholder and has no `coef_`.
        linear_model = calibrated_model.estimator

        print("🧠 Top Predictive Features:")
        feature_names = tfidf.get_feature_names_out()
        coefs = linear_model.coef_[0]
        # Global ranking by absolute coefficient (same for every article).
        top_features = sorted(zip(feature_names, coefs), key=lambda x: abs(x[1]), reverse=True)[:5]
        for feat, weight in top_features:
            print(f" - {'🚩' if weight < 0 else '✅'} {feat}: {abs(weight):.2f}")

        print("\n🔍 Prediction Explanation:")
        _show_shap_waterfall(linear_model, vector, feature_names)

        print("\n🔎 Fact-Check Findings:")
        _print_fact_check_findings(fact_check)

# Wire the button to its handler, then assemble and show the full layout.
analyze_btn.on_click(on_analyze_click)

title_banner = widgets.HTML("""
        <h1 style='text-align:center'>🕵️♂️ Fake News Detector</h1>
        <h3 style='text-align:center'>AI-Powered Verification with Fact-Checking</h3>
    """)
input_caption = widgets.HTML("<b>Enter News Article:</b>")
button_row = widgets.HBox([analyze_btn], layout={'justify_content': 'center'})

app_layout = widgets.VBox([
    title_banner,
    input_caption,
    input_text,
    button_row,
    output,
])
display(app_layout)


SyntaxError: invalid syntax (1540792444.py, line 2)