In [48]:
import pandas as pd
df_true = pd.read_csv("datasets/ISOT/True.csv")
df_false = pd.read_csv("datasets/ISOT/Fake.csv")

import numpy as np
import pandas as pd
import re
import string

In [49]:
df_true['label'] = 1
df_false['label'] = 0
label_dict={'fake':0,'true':1}
dataframe = pd.concat([df_true, df_false])

In [50]:
dataframe.to_csv("datasets/ISOT/merged/dataset.csv")

In [51]:
merged = pd.read_csv("datasets/ISOT/merged/dataset.csv")

In [52]:
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'\[.*?\]', ' ', text)  # Remove text in square brackets
    text = re.sub(r'https?://\S+|www\.\S+', ' ', text)  # Remove links
    text = re.sub(r'<.*?>+', ' ', text)  # Remove HTML tags
    text = re.sub(r'[{}]+'.format(re.escape(string.punctuation)), ' ', text)  # Remove punctuation
    text = re.sub(r'\n', ' ', text)  # Remove newline characters
    text = re.sub(r'[.]', ' ', text)  # Remove period
    return text

In [53]:
merged.text = merged.text.map(clean_text)

In [54]:
def add_cls_token(df, content_column='content', subject_column='subject', title_column='title', text_column='text', date_column='date'):
    text_sep = "content is"
    title_sep = "title is"
    additional_text = " [SEP], "

    df[content_column] = "[CLS] at " +df[date_column]+additional_text+ df[subject_column] + additional_text+ title_sep + df[title_column] + additional_text+ text_sep + " " + df[text_column]+ additional_text+" ."

    return df


In [55]:
merged = add_cls_token(merged)

In [56]:
merged.head()

Unnamed: 0.1,Unnamed: 0,title,text,subject,date,label,content
0,0,"As U.S. budget fight looms, Republicans flip t...",washington reuters the head of a conservat...,politicsNews,"December 31, 2017",1,"[CLS] at December 31, 2017 [SEP], politicsNew..."
1,1,U.S. military to accept transgender recruits o...,washington reuters transgender people will...,politicsNews,"December 29, 2017",1,"[CLS] at December 29, 2017 [SEP], politicsNew..."
2,2,Senior U.S. Republican senator: 'Let Mr. Muell...,washington reuters the special counsel inv...,politicsNews,"December 31, 2017",1,"[CLS] at December 31, 2017 [SEP], politicsNew..."
3,3,FBI Russia probe helped by Australian diplomat...,washington reuters trump campaign adviser ...,politicsNews,"December 30, 2017",1,"[CLS] at December 30, 2017 [SEP], politicsNew..."
4,4,Trump wants Postal Service to charge 'much mor...,seattle washington reuters president donal...,politicsNews,"December 29, 2017",1,"[CLS] at December 29, 2017 [SEP], politicsNew..."


In [57]:
import tensorflow as tf
import tensorflow_hub as hub

def setup_embedding_models(dataframe):
    # Check GPU availability
    if tf.test.is_gpu_available():
        print("GPU is available. Using GPU for computation.")
        tf.config.experimental.set_memory_growth(tf.config.list_physical_devices('GPU')[0], True)
    else:
        print("GPU is not available. Using CPU for computation.")

    print("Loading embedding models...")
    use_model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
    print("Models loaded successfully!")

    def get_embeddings(text):
        return use_model(text).numpy()

    # Get the embeddings for each non-empty text string in the series
    embeddings = dataframe.content.apply(lambda x: get_embeddings([x]) if x.strip() else [])

    # Convert the embeddings to NumPy arrays
    embeddings = embeddings.apply(lambda x: np.array(x[0]) if len(x) > 0 else np.array([]))

    dataframe['embedding'] = embeddings
    return dataframe

In [58]:
merged = setup_embedding_models(merged)
print(merged['embedding'])

GPU is not available. Using CPU for computation.
Loading embedding models...
Models loaded successfully!
0        [0.022326546, -0.054701515, 0.020800635, -0.04...
1        [-0.054021433, -0.053802226, -0.0062716617, -0...
2        [0.042473506, 0.010865708, -0.054122575, -0.04...
3        [-0.0043542148, 0.03477461, -0.04886739, -0.05...
4        [-0.05212041, -0.052163288, -0.045429505, 0.04...
                               ...                        
44893    [-0.035157423, 0.018418824, -0.05020842, -0.02...
44894    [-0.053530693, -0.059889995, -0.045325857, 0.0...
44895    [0.041855942, -0.049123228, 0.026592048, -0.02...
44896    [-0.030299284, -0.059425127, -0.04813366, 0.03...
44897    [-0.01798755, -0.031110464, -0.052893, -0.0175...
Name: embedding, Length: 44898, dtype: object


In [59]:
data = np.array(merged.embedding.apply(pd.Series).astype(float))

In [60]:
data.shape

(44898, 512)

In [61]:
from sklearn.metrics import accuracy_score

In [62]:
from sklearn.cluster import KMeans

# K-Means
km = KMeans(n_clusters=2, init='k-means++', max_iter=300, tol=1e-4, random_state=42)
km.fit(data)

In [63]:
merged['K_means'] = np.abs(km.predict(data) - 1)

In [64]:
kmeans_accuracy = accuracy_score(merged.label, merged.K_means)

In [None]:
kmeans_accuracy

In [None]:
from sklearn.cluster import Birch

model = Birch(branching_factor = 50, n_clusters = 2, threshold = 0.5)
model.fit(data)
merged['Birch']= np.abs(model.predict(data) - 1)

In [None]:
birch_accuracy = accuracy_score(merged.label, merged.Birch)

In [68]:
birch_accuracy

0.6545503140451691

In [69]:
from sklearn.mixture import GaussianMixture
nclusters=2
gmm = GaussianMixture(n_components=nclusters)
gmm.fit(data)
merged['GMM']=gmm.predict(data)

In [70]:
gmm_accuracy = accuracy_score(merged.label, merged.GMM)

In [71]:
gmm_accuracy

0.4107087175375295

In [72]:
def majority_voting(predictions):
    return 0 if predictions.count(0) > predictions.count(1) else 1

In [73]:
merged['majority_voting'] = merged.apply(
    lambda row: majority_voting([row['Birch'], row['K_means'], row['GMM']]),
    axis=1
)

In [74]:
merged

Unnamed: 0.1,Unnamed: 0,title,text,subject,date,label,content,embedding,K_means,Birch,GMM,majority_voting
0,0,"As U.S. budget fight looms, Republicans flip t...",washington reuters the head of a conservat...,politicsNews,"December 31, 2017",1,"[CLS] at December 31, 2017 [SEP], politicsNew...","[0.022326546, -0.054701515, 0.020800635, -0.04...",0,1,1,1
1,1,U.S. military to accept transgender recruits o...,washington reuters transgender people will...,politicsNews,"December 29, 2017",1,"[CLS] at December 29, 2017 [SEP], politicsNew...","[-0.054021433, -0.053802226, -0.0062716617, -0...",0,1,1,1
2,2,Senior U.S. Republican senator: 'Let Mr. Muell...,washington reuters the special counsel inv...,politicsNews,"December 31, 2017",1,"[CLS] at December 31, 2017 [SEP], politicsNew...","[0.042473506, 0.010865708, -0.054122575, -0.04...",0,0,1,0
3,3,FBI Russia probe helped by Australian diplomat...,washington reuters trump campaign adviser ...,politicsNews,"December 30, 2017",1,"[CLS] at December 30, 2017 [SEP], politicsNew...","[-0.0043542148, 0.03477461, -0.04886739, -0.05...",0,0,1,0
4,4,Trump wants Postal Service to charge 'much mor...,seattle washington reuters president donal...,politicsNews,"December 29, 2017",1,"[CLS] at December 29, 2017 [SEP], politicsNew...","[-0.05212041, -0.052163288, -0.045429505, 0.04...",1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
44893,23476,McPain: John McCain Furious That Iran Treated ...,21st century wire says as 21wire reported earl...,Middle-east,"January 16, 2016",0,"[CLS] at January 16, 2016 [SEP], Middle-east [...","[-0.035157423, 0.018418824, -0.05020842, -0.02...",1,1,1,1
44894,23477,JUSTICE? Yahoo Settles E-mail Privacy Class-ac...,21st century wire says it s a familiar theme ...,Middle-east,"January 16, 2016",0,"[CLS] at January 16, 2016 [SEP], Middle-east [...","[-0.053530693, -0.059889995, -0.045325857, 0.0...",1,1,0,1
44895,23478,Sunnistan: US and Allied ‘Safe Zone’ Plan to T...,patrick henningsen 21st century wireremember ...,Middle-east,"January 15, 2016",0,"[CLS] at January 15, 2016 [SEP], Middle-east [...","[0.041855942, -0.049123228, 0.026592048, -0.02...",1,1,0,1
44896,23479,How to Blow $700 Million: Al Jazeera America F...,21st century wire says al jazeera america will...,Middle-east,"January 14, 2016",0,"[CLS] at January 14, 2016 [SEP], Middle-east [...","[-0.030299284, -0.059425127, -0.04813366, 0.03...",1,1,0,1


In [75]:
merged_accuracy= accuracy_score(merged.label, merged.majority_voting)

In [76]:
merged_accuracy

0.6818121074435387

In [77]:
merged.to_csv("datasets/ISOT/merged/dataset_labeled.csv")