In [None]:
import pandas as pd
import numpy as np
import re
import string
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from google.colab import files


# Open the Colab upload dialog; the CSV files read below are expected to be
# among the files the user picks.
uploaded = files.upload()

# Read both source datasets by their relative file names.
fd_fake, fd_true = (pd.read_csv(csv_name) for csv_name in ('Fake.csv', 'True.csv'))

In [None]:
# Tag every row with its origin before merging: 0 for the fake-news frame,
# 1 for the true-news frame.
fd_fake["class"], fd_true["class"] = 0, 1

# Stack the two labelled frames row-wise into one dataset.
fd_margin = pd.concat((fd_fake, fd_true), axis="index")

# Quick sanity check of the merged frame.
print("Columns in the DataFrame:", fd_margin.columns)
print("First few rows of the DataFrame:", fd_margin.head())

In [None]:
# Shuffle all rows so fake and true articles are interleaved, then rebuild a
# clean 0..n-1 index. The fixed random_state makes the row order identical on
# every run, so the clustering that follows is reproducible after a kernel
# restart.
fd = fd_margin.sample(frac=1, random_state=0).reset_index(drop=True)

In [None]:
def wordopt(text):
    """Normalise raw news text before TF-IDF vectorisation.

    Steps, in order: lowercase; drop ``[...]`` bracketed fragments; drop URLs
    (``http(s)://...`` and ``www....``); drop ``<...>`` tags; turn every
    remaining non-word character (punctuation, newlines) into a space; drop
    underscores; drop any token that contains a digit.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (e.g. ``None`` or a NaN coming out of
        a pandas column) is treated as missing.

    Returns
    -------
    str
        The cleaned text. Whitespace is not collapsed, so runs of spaces may
        remain where characters were removed. ``""`` for non-string input.
    """
    if not isinstance(text, str):
        return ""

    text = text.lower()
    # Structured fragments are removed first. Their patterns rely on the
    # delimiters "[", "]", "://", "." , "<" and ">", all of which the \W
    # substitution below would otherwise erase before they could match.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    # Every remaining non-word character becomes a space; this also covers
    # newlines, so no separate newline pass is needed.
    text = re.sub(r'\W', ' ', text)
    # "_" is a word character and survives the step above; it is the only
    # punctuation mark still present at this point.
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Remove whole tokens that contain at least one digit.
    text = re.sub(r'\w*\d\w*', '', text)
    return text

In [None]:
# Run the text cleaner over every article body, replacing the column in place.
fd["text"] = fd["text"].map(wordopt)

In [None]:
# TF-IDF over the cleaned article bodies: English stop words removed,
# vocabulary capped at 1000 terms.
vec = TfidfVectorizer(stop_words='english', max_features=1000)
x_text = vec.fit_transform(fd["text"])

In [None]:
# Titles get their own TF-IDF vocabulary (at most 500 terms). The column is
# vectorised as-is, without the wordopt cleaning applied to the body text.
vec_title = TfidfVectorizer(stop_words='english', max_features=500)
x_title = vec_title.fit_transform(fd["title"])

# Same for the subject column, with a smaller cap of 100 terms.
vec_subject = TfidfVectorizer(stop_words='english', max_features=100)
x_subject = vec_subject.fit_transform(fd["subject"])

In [None]:
from scipy.sparse import hstack

# Place the body, title and subject TF-IDF blocks side by side so each row
# is one article described by all three feature groups.
x_combined = hstack((x_text, x_title, x_subject))

In [None]:
# Partition the articles into two clusters; the seed is fixed at 0.
kmeans = KMeans(n_clusters=2, random_state=0).fit(x_combined)

# Keep each row's cluster id both as a standalone array and on the frame.
clusters = kmeans.labels_
fd["cluster"] = clusters

In [None]:
def predict_news(article, title, subject):
    """Label a news item 'Fake' or 'True' from the KMeans cluster it falls in.

    The item is vectorised with the fitted TF-IDF vectorisers, assigned to a
    cluster by the fitted ``kmeans`` model, and given the majority ``class``
    of the rows in ``fd`` that belong to that cluster (0 -> 'Fake',
    anything else -> 'True').

    Parameters
    ----------
    article : str
        Body text of the news item; cleaned with ``wordopt`` before
        vectorising, as the ``text`` column was before ``vec`` was fitted.
    title : str
        Title of the news item, vectorised as given.
    subject : str
        Subject of the news item, vectorised as given.

    Returns
    -------
    str or None
        ``'Fake'`` or ``'True'``; ``None`` when ``fd`` has no ``class``
        column (an error message is printed) or when no row of ``fd``
        belongs to the predicted cluster.
    """
    cleaned_article = wordopt(article)

    # vec_title and vec_subject were fitted on the raw title/subject columns,
    # so the same raw strings must be fed to them here. Running wordopt on
    # them would rewrite tokens (e.g. strip "_" or bracketed tags) into forms
    # that are absent from the fitted vocabularies.
    combined_vector = hstack([
        vec.transform([cleaned_article]),
        vec_title.transform([title]),
        vec_subject.transform([subject]),
    ])

    cluster_label = kmeans.predict(combined_vector)[0]

    if 'class' not in fd.columns:
        print("Error: 'class' column not found in the DataFrame.")
        return None

    member_classes = fd.loc[fd['cluster'] == cluster_label, 'class']
    majority = member_classes.mode()
    if majority.empty:
        # No labelled rows in this cluster, so there is nothing to vote with.
        return None
    return 'Fake' if majority.iloc[0] == 0 else 'True'

In [None]:
# Ask for the three fields of the item to classify, in this order:
# body text, title, subject.
new_article, new_title, new_subject = (
    input("Enter the news article to check if it is True or Fake: "),
    input("Enter the title of the news article: "),
    input("Enter the subject of the news article: "),
)

In [None]:
# Classify the entered item; a falsy result (None) means no verdict was
# produced, in which case nothing is printed here.
result = predict_news(article=new_article, title=new_title, subject=new_subject)
if result:
    print(f"The news article is: {result}")

In [None]:
# Project the combined TF-IDF features to 2-D for plotting. The seed is
# fixed so the projection is the same on every run even if PCA selects its
# randomized SVD solver for a matrix of this size.
pca = PCA(n_components=2, random_state=0)
# PCA is given a dense array here, so the sparse matrix is densified first.
x_reduced = pca.fit_transform(x_combined.toarray())

# Scatter of the two principal components, coloured by KMeans cluster id.
plt.figure(figsize=(10, 6))
plt.scatter(x_reduced[:, 0], x_reduced[:, 1], c=clusters, cmap='viridis', marker='o')
plt.title("KMeans Clustering of News Data")
plt.xlabel("PCA Component 1")
plt.ylabel("PCA Component 2")
plt.colorbar(label="Cluster")
plt.show()

# Silhouette score of the clustering, computed on the full (unreduced)
# feature matrix rather than on the 2-D projection.
sil_score = silhouette_score(x_combined, clusters)
print(f"Silhouette Score: {sil_score}")