In [48]:
import pandas as pd
df_true = pd.read_csv("datasets/ISOT/True.csv")
df_false = pd.read_csv("datasets/ISOT/Fake.csv")

import numpy as np
import pandas as pd
import re
import string

In [49]:
df_true['label'] = 1
df_false['label'] = 0
label_dict={'fake':0,'true':1}
dataframe = pd.concat([df_true, df_false])

In [50]:
dataframe.to_csv("datasets/ISOT/merged/dataset.csv")

In [51]:
merged = pd.read_csv("datasets/ISOT/merged/dataset.csv")

In [52]:
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'\[.*?\]', ' ', text)  # Remove text in square brackets
    text = re.sub(r'https?://\S+|www\.\S+', ' ', text)  # Remove links
    text = re.sub(r'<.*?>+', ' ', text)  # Remove HTML tags
    text = re.sub(r'[{}]+'.format(re.escape(string.punctuation)), ' ', text)  # Remove punctuation
    text = re.sub(r'\n', ' ', text)  # Remove newline characters
    text = re.sub(r'[.]', ' ', text)  # Remove period
    return text

In [53]:
merged.text = merged.text.map(clean_text)

In [54]:
def add_cls_token(df, content_column='content', subject_column='subject', title_column='title', text_column='text', date_column='date'):
    text_sep = "content is"
    title_sep = "title is"
    additional_text = " [SEP], "

    df[content_column] = "[CLS] at " +df[date_column]+additional_text+ df[subject_column] + additional_text+ title_sep + df[title_column] + additional_text+ text_sep + " " + df[text_column]+ additional_text+" ."

    return df


In [55]:
merged = add_cls_token(merged)

In [57]:
import tensorflow as tf
import tensorflow_hub as hub

def setup_embedding_models(dataframe):
    # Check GPU availability
    if tf.test.is_gpu_available():
        print("GPU is available. Using GPU for computation.")
        tf.config.experimental.set_memory_growth(tf.config.list_physical_devices('GPU')[0], True)
    else:
        print("GPU is not available. Using CPU for computation.")

    print("Loading embedding models...")
    use_model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
    print("Models loaded successfully!")

    def get_embeddings(text):
        return use_model(text).numpy()

    # Get the embeddings for each non-empty text string in the series
    embeddings = dataframe.content.apply(lambda x: get_embeddings([x]) if x.strip() else [])

    # Convert the embeddings to NumPy arrays
    embeddings = embeddings.apply(lambda x: np.array(x[0]) if len(x) > 0 else np.array([]))

    dataframe['embedding'] = embeddings
    return dataframe

In [58]:
merged = setup_embedding_models(merged)

GPU is not available. Using CPU for computation.
Loading embedding models...
Models loaded successfully!
0        [0.022326546, -0.054701515, 0.020800635, -0.04...
1        [-0.054021433, -0.053802226, -0.0062716617, -0...
2        [0.042473506, 0.010865708, -0.054122575, -0.04...
3        [-0.0043542148, 0.03477461, -0.04886739, -0.05...
4        [-0.05212041, -0.052163288, -0.045429505, 0.04...
                               ...                        
44893    [-0.035157423, 0.018418824, -0.05020842, -0.02...
44894    [-0.053530693, -0.059889995, -0.045325857, 0.0...
44895    [0.041855942, -0.049123228, 0.026592048, -0.02...
44896    [-0.030299284, -0.059425127, -0.04813366, 0.03...
44897    [-0.01798755, -0.031110464, -0.052893, -0.0175...
Name: embedding, Length: 44898, dtype: object


In [59]:
data = np.array(merged.embedding.apply(pd.Series).astype(float))

In [61]:
from sklearn.metrics import accuracy_score

In [62]:
from sklearn.cluster import KMeans

# K-Means
km = KMeans(n_clusters=2, init='k-means++', max_iter=300, tol=1e-4, random_state=42)
km.fit(data)

In [63]:
merged['K_means'] = np.abs(km.predict(data) - 1)

In [64]:
kmeans_accuracy = accuracy_score(merged.label, merged.K_means)

In [None]:
from sklearn.cluster import Birch

model = Birch(branching_factor = 50, n_clusters = 2, threshold = 0.5)
model.fit(data)
merged['Birch']= np.abs(model.predict(data) - 1)

In [None]:
birch_accuracy = accuracy_score(merged.label, merged.Birch)

In [69]:
from sklearn.mixture import GaussianMixture
nclusters=2
gmm = GaussianMixture(n_components=nclusters)
gmm.fit(data)
merged['GMM']=gmm.predict(data)

In [70]:
gmm_accuracy = accuracy_score(merged.label, merged.GMM)

In [72]:
def majority_voting(predictions):
    return 0 if predictions.count(0) > predictions.count(1) else 1

In [73]:
merged['majority_voting'] = merged.apply(
    lambda row: majority_voting([row['Birch'], row['K_means'], row['GMM']]),
    axis=1
)

In [75]:
merged_accuracy= accuracy_score(merged.label, merged.majority_voting)

In [77]:
merged.to_csv("datasets/ISOT/merged/dataset_labeled.csv")