In [1]:
# Import the required libraries
from flask import Flask, render_template, request,jsonify
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from bs4 import BeautifulSoup
import requests
import spacy
nlp = spacy.load('en_core_web_sm')
nlp.max_length = 2000000

In [4]:
# Load the fake news dataset
df = pd.read_csv('/content/drive/MyDrive/train.csv')
df = df.dropna()
vectorized = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorized.fit_transform(df['text'])

# Initialize Flask application
#app = Flask(__name__)

In [5]:
# Function to cluster and summarize input news
def cluster_and_summarize(news_text, k, vectorizer):
  try:
    # Perform clustering
    kmeans = KMeans(n_clusters=k, random_state=0, n_init=10)
    tfidf_matrix = vectorizer.transform(df['text'])
    kmeans.fit(tfidf_matrix)

    # Assign input news to clusters
    news_vectorized = vectorizer.transform([news_text])
    cluster_label = kmeans.predict(news_vectorized)[0]

    # Summarize headlines of the assigned cluster
    cluster_headlines = df[kmeans.labels_ == cluster_label]['title']
    headlines = ' '.join(cluster_headlines)

    print(headlines)

    return headlines

  except Exception as e:
    print("An error occurred during clustering and summarization:", str(e))
    return None

In [6]:
# Function to perform Google search
def search_google(query, num_results):
    try:
        search_results = []
        api_key = "AIzaSyAVtMUqMQ-yRengeU2HbMXYB3uLVNPRmTM"  # Replace with your Google Custom Search API key
        cx = "973a14399ba9a4289"  # Replace with your Google Custom Search Engine ID
        url = f"https://www.googleapis.com/customsearch/v1?key={api_key}&cx={cx}&q={query}&num={num_results}"
        response = requests.get(url)
        data = response.json()
        items = data.get('items', [])
        for item in items:
            search_results.append(item['link'])
        print('Top 5 Websites Extracted by Google Search:', search_results)
        return search_results

    except Exception as e:
        print("An error occurred during Google search:", str(e))
        return []

In [7]:
# Function to calculate cosine similarity
def calculate_cosine_similarity(news_text, summaries):
    try:
        doc1 = nlp(news_text)
        scores = []
        for summary in summaries:
            doc2 = nlp(summary)
            cos_sim = doc1.similarity(doc2)
            scores.append(cos_sim)
        print('Cosine Similarity Scores:', scores)
        return scores

    except Exception as e:
        print("An error occurred during cosine similarity calculation:", str(e))
        return []

In [8]:
# Function to calculate Jaccard similarity
def calculate_jaccard_similarity(news_text, summaries):
    try:
        set1 = set(news_text.lower().split())
        scores = []
        for summary in summaries:
            set2 = set(summary.lower().split())
            intersection = len(set1.intersection(set2))
            union = len(set1) + len(set2) - intersection
            similarity = intersection / union
            scores.append(similarity)
        print('Jaccard Similarity Scores:', scores)
        return scores

    except Exception as e:
        print("An error occurred during Jaccard similarity calculation:", str(e))
        return []


In [9]:
# Train the DNN model
def train_model():
    # Prepare the features and labels
    X = df['text']
    y = df['label']
    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # Vectorize the text data
    X_train_vectorized = vectorized.fit_transform(X_train)
    # Train the DNN model
    model = MLPClassifier(hidden_layer_sizes=(100,), max_iter=500)
    model.fit(X_train_vectorized, y_train)
    # Evaluate the model
    X_test_vectorized = vectorized.transform(X_test)
    y_pred = model.predict(X_test_vectorized)
    accuracy = accuracy_score(y_test, y_pred)
    confusion = confusion_matrix(y_test, y_pred)
    print('Model Accuracy :',accuracy)
    print('Confusion Metrix :',confusion)
    return model

In [10]:
# Function to scrape and summarize webpages
def scrape_webpages(urls):
    contents = []
    for url in urls:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser',from_encoding='utf-8')
        text = soup.get_text(separator=' ')
        contents.append(text)
    return contents


In [11]:
# Function for extractive summarization
def extractive_summarization(texts):
    summaries = []
    for text in texts:
        doc = nlp(text)
        sentences = [sent.text for sent in doc.sents]
        summary = ' '.join(sentences[:3])  # Extract first 3 sentences as a summary
        summaries.append(summary)
    return summaries

In [12]:
# Perform K-means clustering and determine optimal clusters
def determine_optimal_clusters(tfidf_matrix):
    try:
        sse = []
        silhouette_scores = []

        for k in range(2, 15):
            kmeans = KMeans(n_clusters=k, random_state=0, n_init=10)
            kmeans.fit(tfidf_matrix)
            sse.append(kmeans.inertia_)
            silhouette_scores.append(silhouette_score(tfidf_matrix, kmeans.labels_))

        print('SSE Values:', sse)
        print('Silhouette Score:', silhouette_scores)

        return sse, silhouette_scores

    except Exception as e:
        print("An error occurred during cluster determination:", str(e))
        return None,None

In [13]:
# Function to generate final article
def generate_final_article(summaries, cosine_similarities):
    try:
        final_article = []
        for summary, similarity in zip(summaries, cosine_similarities):
            if any(score <= 0.5 for score in similarity):
                final_article.append(summary)
        return ' '.join(final_article)

    except Exception as e:
        print("An error occurred during final article generation:", str(e))
        return " "


In [14]:
news_text = input()

# Perform K-means clustering
sse, silhouette_scores = determine_optimal_clusters(tfidf_matrix)
if sse is None or silhouette_scores is None:
  raise ValueError("Cluster determination failed.")

# Choose optimal clusters
optimal_k = silhouette_scores.index(max(silhouette_scores)) + 2

print('Optimal Number of Clusters : ',optimal_k)

# Split input news into headlines and summarize using TFIDF vectorization
news_vectorized = vectorized.transform([news_text])
headlines = cluster_and_summarize(news_vectorized, optimal_k, vectorized)

# Perform Google search and retrieve top 5 search results
search_results = search_google(headlines, 5)

# Web scraping and summarization of top 5 webpages
content = scrape_webpages(search_results[:5])
summaries = extractive_summarization(content)

cosine_similarity_scores = [calculate_cosine_similarity(news_text, summary) for summary in summaries]
jaccard_similarity_scores = [calculate_jaccard_similarity(news_text, summary) for summary in summaries]
# Generate final article based on similarity scores
final_article = generate_final_article(summaries, cosine_similarity_scores)
# Check if the news is classified as fake
if any(max(cosine_similarity_scores) )>= 0.5  or any(max(jaccard_similarity_scores) )>= 0.5:
  classification = '100% Real News'
  print(classification)
else:
  # Train the DNN model
  model = train_model()
  if model is None:
    raise ValueError("Model training failed.")
    news_vectorized = vectorized.transform([news_text])
    prediction = model.predict(news_vectorized)
    classification = 'Fake' if prediction[0] == 0 else "Maybe True"
    print(classification)




Hollywood Would Love Trump if He Bombed North Korea over Lack of Trans Bathrooms (Exclusive Video) - Breitbart Obama’s Organizing for Action Partners with Soros-Linked ‘Indivisible’ to Disrupt Trump’s Agenda BBC Comedy Sketch "Real Housewives of ISIS" Causes Outrage Russian Researchers Discover Secret Nazi Military Base ‘Treasure Hunter’ in the Arctic [Photos] Re
SSE Values: [17610.611367128786, 17505.83768410731, 17440.74118046695, 17386.687918194235, 17338.478657321713, 17295.813897519154, 17267.919246085527, 17229.12928651338, 17191.11993937224, 17170.34446830526, 17135.758440310834, 17110.426282277884, 17086.697894397]
Silhouette Score: [0.0001811737586264834, 0.0014109348119309582, 0.0021934119247078534, 0.00284065975738246, 0.0012230143006315168, 0.0024577335383799254, 0.003064407879740654, 0.002124962637923431, 0.0034530970917729586, 0.0038153853166182808, 0.00471408627308861, 0.006764822429542123, 0.0056765177043851065]
Optimal Number of Clusters :  13
An error occurred during 

  cos_sim = doc1.similarity(doc2)


Cosine Similarity Scores: [0.027863216251242665, 0.030352828058009526, 0.027863216251242665, 0.030352828058009526, 0.027863216251242665, 0.030352828058009526, 0.027863216251242665, 0.030352828058009526, 0.027863216251242665, 0.030352828058009526, 0.027863216251242665, 0.030352828058009526, 0.027863216251242665, 0.030352828058009526, 0.027863216251242665, 0.030352828058009526, 0.027863216251242665, 0.030352828058009526, 0.027863216251242665, 0.030352828058009526, 0.027863216251242665, 0.030352828058009526, 0.027863216251242665, 0.030352828058009526, 0.1732492150507988, 0.031011132745603215, -0.12495274321092045, 0.06895175409249053, 0.030352828058009526, 0.1944131174783823, 0.06895175409249053, -0.10855590876175882, 0.07351253985276124, -0.12495274321092045, 0.07351253985276124, 0.19813967996903023, 0.07351253985276124, 0.031011132745603215, -0.12495274321092045, 0.030352828058009526, -0.1341625206388886, 0.030352828058009526, 0.3718253434327076, 0.06895175409249053, -0.0343995138432810