In [3]:
import pandas as pd

# Load datasets
fake = pd.read_csv('Fake.csv')
true = pd.read_csv('True.csv')

# Add labels
fake['label'] = 0
true['label'] = 1

# Merge datasets
combined = pd.concat([fake, true], axis=0)

# Handle missing values
combined.dropna(inplace=True)

# Combine title and text
combined['content'] = combined['title'] + ' ' + combined['text']

# Analyze class distribution
print(combined['label'].value_counts())

label
0    23481
1    21417
Name: count, dtype: int64


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2

# Text preprocessing
X = combined['content']
y = combined['label']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# TF-IDF Vectorization and Feature Selection
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=10000, ngram_range=(1, 2))),
    ('select', SelectKBest(chi2, k=5000))
])

X_train_transformed = pipeline.fit_transform(X_train, y_train)
X_test_transformed = pipeline.transform(X_test)

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix

# Train models
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Naive Bayes': MultinomialNB(),
    'Random Forest': RandomForestClassifier(n_estimators=100)
}

predictions = []
for name, model in models.items():
    model.fit(X_train_transformed, y_train)
    preds = model.predict_proba(X_test_transformed)[:, 1]  # Probability of class 1
    predictions.append(preds)

# Ensemble (average predictions)
ensemble_preds = sum(predictions) / len(predictions)
ensemble_preds_class = (ensemble_preds > 0.5).astype(int)

# Evaluate ensemble
ensemble_report = classification_report(y_test, ensemble_preds_class, output_dict=True)
ensemble_accuracy = accuracy_score(y_test, ensemble_preds_class)
print(ensemble_report, ensemble_accuracy)

{'0': {'precision': 0.9879670677644079, 'recall': 0.9888020283118529, 'f1-score': 0.9883843717001056, 'support': 4733.0}, '1': {'precision': 0.9875088380862598, 'recall': 0.9865787614786908, 'f1-score': 0.9870435806831567, 'support': 4247.0}, 'accuracy': 0.987750556792873, 'macro avg': {'precision': 0.9877379529253338, 'recall': 0.9876903948952719, 'f1-score': 0.9877139761916311, 'support': 8980.0}, 'weighted avg': {'precision': 0.987750352681658, 'recall': 0.987750556792873, 'f1-score': 0.9877502581757199, 'support': 8980.0}} 0.987750556792873


In [13]:
# Save the ensemble model
ensemble_results = pd.DataFrame({
    'True Label': y_test,
    'Ensemble Prediction': ensemble_preds_class,
    'Confidence': ensemble_preds
})

In [14]:
import joblib
import os
import numpy as np

# Create Ensemble Model Class
class EnsembleModel:
    def __init__(self, models):
        self.models = models  # Dictionary of models
    
    def fit(self, X, y):
        for model in self.models.values():
            model.fit(X, y)
    
    def predict_proba(self, X):
        predictions = np.zeros((X.shape[0], 2))  # Assuming binary classification
        for model in self.models.values():
            predictions += model.predict_proba(X)
        return predictions / len(self.models)
    
    def predict(self, X):
        proba = self.predict_proba(X)
        return (proba[:, 1] > 0.5).astype(int)
    def print_models(self):
        for name, model in self.models.items():
            print(f"{name}: {model}")

# Train the ensemble model
ensemble_model = EnsembleModel(models)
ensemble_model.fit(X_train_transformed, y_train)

# Directory to save model
output_dir = 'saved_models'
os.makedirs(output_dir, exist_ok=True)

# Save the ensemble model
joblib.dump(ensemble_model, f'{output_dir}/ensemble_model.pkl')

print("Ensemble model saved successfully.")


Ensemble model saved successfully.


In [15]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
import joblib
import os

# 1. Train the pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=10000, ngram_range=(1, 2))),
    ('select', SelectKBest(chi2, k=5000))
])
pipeline.fit(X_train, y_train)

# Transform the data using the pipeline
X_train_transformed = pipeline.transform(X_train)
X_test_transformed = pipeline.transform(X_test)

# 2. Train individual models (already in your script)
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Naive Bayes': MultinomialNB(),
    'Random Forest': RandomForestClassifier(n_estimators=100)
}

# Train models and save the ensemble
for name, model in models.items():
    model.fit(X_train_transformed, y_train)

# Create and save the ensemble model
ensemble_model = EnsembleModel(models)
ensemble_model.fit(X_train_transformed, y_train)

# Ensure the directory exists
os.makedirs('saved_models', exist_ok=True)

# Save the ensemble model
joblib.dump(ensemble_model, 'saved_models/ensemble_model.pkl')

# 3. Save the pipeline
joblib.dump(pipeline, 'saved_models/pipeline.pkl')
print("Pipeline and ensemble model saved successfully.")


Pipeline and ensemble model saved successfully.


In [11]:
!pip install shap


Defaulting to user installation because normal site-packages is not writeable


In [12]:
import shap
import numpy as np
import matplotlib.pyplot as plt

# Convert to dense if sparse
background = X_train_transformed.toarray()  # Use a small sample as background
test_sample = X_test_transformed.toarray()   # Take a subset of test data

shap_values_ensemble = np.zeros(test_sample.shape)

for name, model in models.items():
    if isinstance(model, MultinomialNB):
        explainer = shap.KernelExplainer(model.predict_proba, background)
        shap_values = explainer.shap_values(test_sample)
        shap_values_model = np.array(shap_values[1])  # Class 1 SHAP values
    else:
        explainer = shap.PermutationExplainer(model.predict, background)
        shap_values = explainer(test_sample)

        # Handle dimensional mismatch
        shap_values_model = shap_values.values if shap_values.values.ndim == 2 else shap_values.values[:, :, 1]
    
    # Aggregate SHAP values for ensemble
    shap_values_ensemble += shap_values_model

shap_values_ensemble /= len(models)

plt.title("SHAP Summary Plot - Ensemble Model")
shap.summary_plot(shap_values_ensemble, test_sample)


ValueError: max_evals=500 is too low for the Permutation explainer, it must be at least 2 * num_features + 1 = 7439!

In [None]:
import os

output_dir = 'output'
os.makedirs(output_dir, exist_ok=True)

ensemble_results.to_csv(f'{output_dir}/ensemble_predictions.csv', index=False)


In [None]:
from flask import Flask, request, jsonify
from web3 import Web3

app = Flask(__name__)

w3 = Web3(Web3.HTTPProvider('https://rinkeby.infura.io/v3/YOUR_INFURA_PROJECT_ID'))
contract_address = '0xYourContractAddress'
contract_abi = [...]  # ABI of the smart contract
contract = w3.eth.contract(address=contract_address, abi=contract_abi)

@app.route('/predict', methods=['POST'])
def predict():
    data = request.json['text']
    transformed_text = pipeline.transform([data])
    prediction = models['Logistic Regression'].predict(transformed_text)
    tx_hash = contract.functions.logResult(int(prediction[0])).transact({'from': w3.eth.accounts[0]})
    receipt = w3.eth.wait_for_transaction_receipt(tx_hash)
    return jsonify({'prediction': int(prediction[0]), 'transaction_hash': receipt.transactionHash.hex()})

if __name__ == '__main__':
    app.run(debug=True)

In [None]:
import streamlit as st

st.title('TruthGuard Dashboard')
user_input = st.text_area("Enter news content to verify:")
if user_input:
    transformed_text = pipeline.transform([user_input])
    prediction = models['Logistic Regression'].predict(transformed_text)
    st.write('Prediction: ', 'Fake News' if prediction == 0 else 'True News')

In [10]:
#Code to load models:
import joblib

# Load models
loaded_models = {}
for name in models.keys():
    model_path = f'{output_dir}/{name.replace(" ", "_")}_model.pkl'
    loaded_models[name] = joblib.load(model_path)

# Load pipeline
loaded_pipeline = joblib.load(f'{output_dir}/text_pipeline.pkl')

print("Models and pipeline loaded successfully.")

# Load ensemble model
loaded_ensemble = joblib.load(f'{output_dir}/ensemble_model.pkl')

# Predict on new data
y_pred_ensemble = loaded_ensemble.predict(X_test_transformed)

# Evaluate the loaded ensemble model
print("Loaded Ensemble Model Results:")
print(classification_report(y_test, y_pred_ensemble))
print(f"Accuracy: {accuracy_score(y_test, y_pred_ensemble):.4f}")



Models and pipeline loaded successfully.
Loaded Ensemble Model Results:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      4733
           1       0.99      0.99      0.99      4247

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980

Accuracy: 0.9879


In [None]:
# Load ensemble model
loaded_ensemble = joblib.load(f'{output_dir}/ensemble_model.pkl')

# Predict on new data
y_pred_ensemble = loaded_ensemble.predict(X_test_transformed)

# Evaluate the loaded ensemble model
print("Loaded Ensemble Model Results:")
print(classification_report(y_test, y_pred_ensemble))
print(f"Accuracy: {accuracy_score(y_test, y_pred_ensemble):.4f}")
