In [63]:
#Import all the necessary libraries

import pandas as pd
import numpy as np

from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import nltk
import string
from nltk import word_tokenize
from nltk.corpus import stopwords

from sklearn.metrics import classification_report

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from keras.callbacks import EarlyStopping

import gensim.downloader

# Load Datasets

## Main Dataset

In [128]:
def load_target_concat():
    
    #load data
    pol_real=pd.read_json("/home/kathrin/code/kbank1/fake_news/raw_data/politifact_hr.json", orient='index')
    pol_fake=pd.read_json("/home/kathrin/code/kbank1/fake_news/raw_data/politifact_hf.json", orient='index')
    gossip_fake=pd.read_json("/home/kathrin/code/kbank1/fake_news/raw_data/gossipcop_hf.json", orient='index')
    gossip_real=pd.read_json("/home/kathrin/code/kbank1/fake_news/raw_data/gossipcop_hr.json", orient='index')
    
    #define target
    pol_real["fake"]=0
    pol_fake["fake"]=1
    gossip_real["fake"]=0
    gossip_fake["fake"]=1
    
    #concat
    data=pd.concat((pol_fake, pol_real, gossip_fake, gossip_real),axis=0, ignore_index=True)
    
    print("✅ data loaded, target defined, data concatenated")
    
    return data

In [129]:
data=load_target_concat()

✅ data loaded, target defined, data concatenated


## Test Datasets

In [130]:
#politifact=pd.read_json("/home/kathrin/code/kbank1/fake_news/raw_data/politifact/politifact_factcheck_data.json", lines=True)
#guardian=pd.read_csv("/home/kathrin/code/kbank1/fake_news/raw_data/guardian/the_guardian_full.csv")
#corpus=pd.read_csv("/home/kathrin/code/kbank1/fake_news/raw_data/fakenewscorpus/news_1.csv", nrows=20000)

In [131]:
#load data, filter, roughly cleaned and saved
politifact=pd.read_csv("/home/kathrin/code/kbank1/fake_news/raw_data/test/politifact_test.csv")
guardian=pd.read_csv("/home/kathrin/code/kbank1/fake_news/raw_data/test/guardian_test.csv")
corpus=pd.read_csv("/home/kathrin/code/kbank1/fake_news/raw_data/test/corpus_test.csv")
guard_corp=pd.read_csv("/home/kathrin/code/kbank1/fake_news/raw_data/test/guard_corp_test.csv")

# Cleaning

## main dataset

In [132]:
def remove_duplicates_errors(data: pd.DataFrame) -> pd.DataFrame:
    """
    Clean raw data by
    - removing duplicates within fake-category (keep 1)
    - removing duplicates across fake-categories (delete both)
    - deleting texts that are shorter than their title (error messages, headers etc.)
    """
    # Remove duplicates within fake-category
    data =data.drop_duplicates(subset=("text", "fake"), keep='first', ignore_index=True)

    # Remove duplicates across fake-category
    data=data.drop_duplicates(subset=("text"), keep=False, ignore_index=True)

    # Delete false texts
    data["text_len"] = data['text'].str.len()
    data["title_len"]=data['title'].str.len()
    data=data[data["text_len"]>=data["title_len"]]

    print("✅ duplicates and errors removed")

    return data

In [133]:
data=remove_duplicates_errors(data)

✅ duplicates and errors removed


## test datasets

In [134]:
#politifact
#extract year
#politifact['statement_date'] = pd.to_datetime(politifact['statement_date'])
#politifact["year"]=politifact["statement_date"].dt.year
#select obs after 2019
#politifact=politifact[politifact["year"]>2019]
#only keep true, false and pants on fire
#politifact= politifact[politifact["verdict"].isin(["false", "pants-fire", "true"])]
#create variable fake that is 0 if real, 1 if fake
#politifact["fake"] = 0  # Default to 0
#politifact.loc[politifact["verdict"].isin(["false", "pants-fire"]), "fake"] = 1

#politifact["text"]=politifact["statement"]

#save
#politifact.to_csv("/home/kathrin/code/kbank1/fake_news/raw_data/test/politifact_test.csv")

In [135]:
#guardian
#guardian= guardian.tail(10000)
#guardian=guardian.dropna()
#guardian["fake"]=0
#guardian["text"]=guardian["Content"]

#t1="skip past newsletter promotionSign up to The BreakdownFree weekly newsletterThe latest rugby union news and analysis, plus all the week's action reviewed"
#t2="Privacy Notice: Newsletters may contain info about charities, online ads, and content funded by outside parties. For more information see our Privacy Policy. We use Google reCaptcha to protect our website and the Google Privacy Policy and Terms of Service apply"
#t3="skip past newsletter promotionSign"
#t4=".after newsletter promotion"
#t5="newsletter"

#tlist=[t1, t2, t3, t4, t5]

#def clean_guardian(text):
#    for t in tlist:
#        text = text.replace(t, '')
#    return text

#guardian["text"]=guardian.text.apply(clean_guardian)

#save
#guardian.to_csv("/home/kathrin/code/kbank1/fake_news/raw_data/test/guardian_test.csv")

In [9]:
#error_count=guardian['text'].str.contains(text).sum()
#if error_count>0:
#    print ("There are {m} occurrences".format(m=error_count))

In [10]:
#corpus
#keep only fakes
#corpus=corpus[corpus["type"]=="fake"]
#corpus["fake"]=1
#corpus["text"]=corpus["content"]
#save
#corpus.to_csv("/home/kathrin/code/kbank1/fake_news/raw_data/test/corpus_test.csv")

In [11]:
#guard_corp
#concat
#guard_corp= pd.concat((corpus, guardian),axis=0, ignore_index=True)
#guard_corp=guard_corp.sample(frac=1).reset_index(drop=True)
#save
#guard_corp.to_csv("/home/kathrin/code/kbank1/fake_news/raw_data/test/guard_corp_test.csv")

# Preprocessing

In [269]:
def preprocessing(text):

    # Removing whitespaces
    text = text.strip()
    # Lowercasing
    text = text.lower()
    # Removing numbers
    text = ''.join(char for char in text if not char.isdigit())
    # Removing punctuation
    #for punctuation in string.punctuation:
        #text = text.replace(punctuation, '')
    
   # symbols=["—", "“", "”", "’", "‘"]
    #for symbol in symbols:
      #  text=text.replace(symbol, '')
    
    # Tokenizing
    tokenized = word_tokenize(text)
    
    # Removing stopwords
    stop_words = set(stopwords.words('english')) 
    without_stopwords = [word for word in tokenized if not word in stop_words]
    
    cleaned_sentence = " ".join(without_stopwords)
    
    return cleaned_sentence
    #return text


In [137]:
data['text'] = data.text.apply(preprocessing)

In [138]:
politifact["text"]=politifact.text.apply(preprocessing)

In [139]:
guardian["text"]=guardian.text.apply(preprocessing)

In [140]:
corpus["text"]=corpus.text.apply(preprocessing)

In [270]:
guard_corp["text"]=guard_corp.text.apply(preprocessing)

# Balancing, train test split

In [166]:
def balancing(data):
    
    #split data in true and false and sample
    true = data[data['fake'] == 0].sample(n=3500)
    #true=pd.read_csv('/home/kathrin/code/kbank1/fake_news/raw_data/sample/sample_true.csv')
    
    false = data[data['fake'] == 1].sample(n=3500)
    #false=pd.read_csv('/home/kathrin/code/kbank1/fake_news/raw_data/sample/sample_false.csv')
    
    #concat
    files = [true, false]
    data_concat = pd.concat(files, ignore_index=True)
    data_concat.shape
    
    #define X and Y
    X = data_concat['text']
    y = data_concat['fake']
    
    return X, y

In [262]:
#X, y= balancing(data)
X=pd.read_csv('/home/kathrin/code/kbank1/fake_news/raw_data/sample/X_punc.csv')
y=pd.read_csv('/home/kathrin/code/kbank1/fake_news/raw_data/sample/y_punc.csv')

In [263]:
X["text"]=X.text.apply(preprocessing)

In [264]:
X=X["text"]
y=y["fake"]

In [265]:
# Create a train/test split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state=0)

# Test score functions

In [79]:
def test_score(pipe, dataset_name):
    """
    input a pipe (preprocessing and modeling)
    output a classification report on the test set
    """
    #load data
    df=dataset_name
    
    #apply preprocessing
    #df['text'] = df.text.apply(preprocessing)
    
    #create X and y
    X_test_new=df["text"]
    y_test_new=df["fake"]
    #predict the label
    y_pred_new=pipe.predict(X_test_new)
    #make a classification report
    report=classification_report(y_test_new, y_pred_new, output_dict=True)
    
    return report

In [80]:
#create test function for rnn with keras embedding
def test_score_RNN(model, dataset_name):
    """
    input a pipe (preprocessing and modeling)
    output a classification report on the test set
    """
    
    #load data
    df=dataset_name
    
    #apply preprocessing
    #df['text'] = df.text.apply(preprocessing)
    
    #create X and y
    X_test_new=df["text"]
    y_test_new=df["fake"]
 
    #tokenize
    X_test_token_new=tk.texts_to_sequences(X_test_new)
    
    #pad
    X_test_pad_new=pad_sequences(X_test_token_new,  padding='pre', maxlen=max_len)
    
    #evaluate
    report=model.evaluate(X_test_pad_new, y_test_new, return_dict=True)
    
    return report

In [81]:
#create test function for rnn with glove embedding
def test_score_RNN_2(model, dataset_name):
    """
    input a pipe (preprocessing and modeling)
    output a classification report on the test set
    """
    
    #load data
    df=dataset_name
    
    #apply preprocessing
    #df['text'] = df.text.apply(preprocessing)
    
    #create X and y
    X_test_new=df["text"]
    y_test_new=df["fake"]
 
    #tokenize
    X_test_token_new = X_test_new.apply(word_tokenize)
    
    #embed
    X_test_embed_new = X_test_token_new.apply(embed_sentence)
    
    #pad
    X_test_pad_new=pad_sequences(X_test_embed_new,  padding='pre', maxlen=max_len)
    
    
    #evaluate
    report=model.evaluate(X_test_pad_new, y_test_new, return_dict=True)
    
    return report

# Models

## Baseline model

In [32]:
pipe = make_pipeline(CountVectorizer(), MultinomialNB())
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
classification_report(y_test, y_pred, output_dict=True)

{'0': {'precision': 0.7431192660550459,
  'recall': 0.7074235807860262,
  'f1-score': 0.7248322147651007,
  'support': 687.0},
 '1': {'precision': 0.7305630026809652,
  'recall': 0.7643758765778401,
  'f1-score': 0.7470870459218644,
  'support': 713.0},
 'accuracy': 0.7364285714285714,
 'macro avg': {'precision': 0.7368411343680055,
  'recall': 0.7358997286819331,
  'f1-score': 0.7359596303434826,
  'support': 1400.0},
 'weighted avg': {'precision': 0.7367245404938176,
  'recall': 0.7364285714285714,
  'f1-score': 0.736166282347081,
  'support': 1400.0}}

## Naive Bayes tuned

In [18]:
pipe2 = make_pipeline(TfidfVectorizer(max_features=300), MultinomialNB(alpha=0.1))
pipe2.fit(X_train, y_train)
y_pred = pipe2.predict(X_test)
classification_report(y_test, y_pred, output_dict=True)

{'0': {'precision': 0.7111111111111111,
  'recall': 0.8384279475982532,
  'f1-score': 0.7695390781563126,
  'support': 687.0},
 '1': {'precision': 0.811864406779661,
  'recall': 0.6718092566619915,
  'f1-score': 0.7352264006139677,
  'support': 713.0},
 'accuracy': 0.7535714285714286,
 'macro avg': {'precision': 0.7614877589453861,
  'recall': 0.7551186021301224,
  'f1-score': 0.7523827393851401,
  'support': 1400.0},
 'weighted avg': {'precision': 0.7624233252623083,
  'recall': 0.7535714285714286,
  'f1-score': 0.7520641216651042,
  'support': 1400.0}}

## SVM model

In [19]:
pipe3 = make_pipeline(TfidfVectorizer(), SVC(kernel="rbf"))
pipe3.fit(X_train, y_train)
y_pred = pipe3.predict(X_test)
classification_report(y_test, y_pred, output_dict=True)

{'0': {'precision': 0.749379652605459,
  'recall': 0.8791848617176128,
  'f1-score': 0.8091091761553918,
  'support': 687.0},
 '1': {'precision': 0.8602693602693603,
  'recall': 0.7166900420757363,
  'f1-score': 0.7819433817903596,
  'support': 713.0},
 'accuracy': 0.7964285714285714,
 'macro avg': {'precision': 0.8048245064374097,
  'recall': 0.7979374518966745,
  'f1-score': 0.7955262789728756,
  'support': 1400.0},
 'weighted avg': {'precision': 0.805854196580003,
  'recall': 0.7964285714285714,
  'f1-score': 0.7952740251680575,
  'support': 1400.0}}

## SVM tuned 1

In [267]:
pipe4 = make_pipeline(TfidfVectorizer(max_features=300), SVC(kernel="rbf"))
pipe4.fit(X_train, y_train)
y_pred = pipe4.predict(X_test)
classification_report(y_test, y_pred, output_dict=True)

{'0': {'precision': 0.7171464330413017,
  'recall': 0.834061135371179,
  'f1-score': 0.7711978465679677,
  'support': 687.0},
 '1': {'precision': 0.8103161397670549,
  'recall': 0.6830294530154277,
  'f1-score': 0.7412480974124809,
  'support': 713.0},
 'accuracy': 0.7571428571428571,
 'macro avg': {'precision': 0.7637312864041783,
  'recall': 0.7585452941933033,
  'f1-score': 0.7562229719902243,
  'support': 1400.0},
 'weighted avg': {'precision': 0.7645964336809175,
  'recall': 0.7571428571428571,
  'f1-score': 0.7559448671766377,
  'support': 1400.0}}

In [146]:
test_score(pipe4, guardian)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'0': {'precision': 1.0,
  'recall': 0.9157884171434307,
  'f1-score': 0.9560433803112066,
  'support': 9963.0},
 '1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 0.0},
 'accuracy': 0.9157884171434307,
 'macro avg': {'precision': 0.5,
  'recall': 0.45789420857171537,
  'f1-score': 0.4780216901556033,
  'support': 9963.0},
 'weighted avg': {'precision': 1.0,
  'recall': 0.9157884171434307,
  'f1-score': 0.9560433803112066,
  'support': 9963.0}}

In [152]:
test_score(pipe4, corpus)

KeyboardInterrupt: 

In [272]:
test_score(pipe4, guard_corp)

{'0': {'precision': 0.6935813109548651,
  'recall': 0.9208069858476362,
  'f1-score': 0.791203104786546,
  'support': 9963.0},
 '1': {'precision': 0.8909919867366676,
  'recall': 0.6140735098076557,
  'f1-score': 0.7270574971815107,
  'support': 10502.0},
 'accuracy': 0.763400928414366,
 'macro avg': {'precision': 0.7922866488457663,
  'recall': 0.767440247827646,
  'f1-score': 0.7591303009840283,
  'support': 20465.0},
 'weighted avg': {'precision': 0.7948863154533009,
  'recall': 0.763400928414366,
  'f1-score': 0.7582855787143211,
  'support': 20465.0}}

## LSTM with Keras embedding

In [51]:
### Let's tokenize the vocabulary 
tk = Tokenizer(num_words=20000)
tk.fit_on_texts(X_train)
vocab_size = len(tk.word_index)
print(f'There are {vocab_size} different words in your corpus')

There are 70779 different words in your corpus


In [52]:
# Set parameters
max_features = 20000  # Maximum number of words to get out of our data
max_len = 300  # Maximum sequence length
embedding_dim = 50  # Dimensionality of word embeddings

In [53]:
#tokenization
X_train_token = tk.texts_to_sequences(X_train)
X_test_token=tk.texts_to_sequences(X_test)

# Pad the inputs to a fixed length
X_train_pad = pad_sequences(X_train_token,  padding='pre', maxlen=max_len)
X_test_pad=pad_sequences(X_test_token,  padding='pre', maxlen=max_len)

print("Max index in training data:", np.max(X_train_pad))
print("Max index in test data:", np.max(X_test_pad))

Max index in training data: 19999
Max index in test data: 19999


In [54]:
# Build the model
model = Sequential()
model.add(Embedding(max_features, embedding_dim, input_length=max_len))
model.add(LSTM(16))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
metrics=['accuracy', 'Precision', 'Recall' ]
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=metrics)
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 300, 50)           1000000   
                                                                 
 lstm_3 (LSTM)               (None, 16)                4288      
                                                                 
 dense_3 (Dense)             (None, 1)                 17        
                                                                 
Total params: 1,004,305
Trainable params: 1,004,305
Non-trainable params: 0
_________________________________________________________________


In [55]:
# Define Early Stopping
early_stopping = EarlyStopping(
    monitor='val_recall',  # Stop when validation loss stops improving
    patience=5,          # Wait for 3 epochs without improvement before stopping
    mode="max",
    restore_best_weights=True  # Restore the best model weights after stopping
)

# Train the model
model.fit(X_train_pad, y_train, batch_size=128, epochs=20, validation_data=(X_test_pad, y_test), callbacks=[early_stopping])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20


<keras.callbacks.History at 0x7fc64a730580>

In [56]:
# Evaluate the model
model.evaluate(X_test_pad, y_test, return_dict=True)



{'loss': 0.5717882513999939,
 'accuracy': 0.7442857027053833,
 'precision': 0.7326343655586243,
 'recall': 0.7840112447738647}

In [57]:
test_score_RNN(model, guardian)



{'loss': 0.6149290800094604,
 'accuracy': 0.699086606502533,
 'precision': 0.0,
 'recall': 0.0}

In [58]:
test_score_RNN(model, corpus)



{'loss': 0.9589491486549377,
 'accuracy': 0.49704816937446594,
 'precision': 1.0,
 'recall': 0.49704816937446594}

## LSTM with GloVe embedding

In [59]:
model_wiki = gensim.downloader.load('glove-wiki-gigaword-50')

In [60]:
# Set parameters
max_len = 300  # Maximum sequence length
embedding_dim = 50  # Dimensionality of word embeddings

In [61]:
# Function to embed a sentence
def embed_sentence(sentence, model=model_wiki, embedding_dim=50):
    embedded_sentence = []
    for word in sentence:
        if word in model:  # Directly check in model (no `.wv` needed)
            embedded_sentence.append(model[word])  # Get vector
    return np.array(embedded_sentence)

In [62]:
#tokenization
X_train_token = X_train.apply(word_tokenize)
X_test_token=X_test.apply(word_tokenize)

In [63]:
X_train_embed = X_train_token.apply(embed_sentence)
X_test_embed = X_test_token.apply(embed_sentence)

In [64]:
# Pad the inputs to a fixed length
X_train_pad = pad_sequences(X_train_embed,  padding='pre', maxlen=max_len)
X_test_pad=pad_sequences(X_test_embed,  padding='pre', maxlen=max_len)

In [65]:
# Build the model

model2 = Sequential()

# No Embedding layer needed since inputs are already embedded (X_train_pad)
model2.add(LSTM(16, input_shape=(300, 50)))  # Add LSTM layer with 16 units
model2.add(Dense(1, activation='sigmoid'))  # Output layer for binary classification

# Compile the model
metrics = ['accuracy', 'Precision', 'Recall']
model2.compile(loss='binary_crossentropy', optimizer='adam', metrics=metrics)

# Check model summary
model2.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_4 (LSTM)               (None, 16)                4288      
                                                                 
 dense_4 (Dense)             (None, 1)                 17        
                                                                 
Total params: 4,305
Trainable params: 4,305
Non-trainable params: 0
_________________________________________________________________


In [66]:
# Define Early Stopping
early_stopping = EarlyStopping(
    monitor='val_recall',  # Stop when validation loss stops improving
    patience=5,          # Wait for 3 epochs without improvement before stopping
    mode="max",
    restore_best_weights=True  # Restore the best model weights after stopping
)
# Train the model
model2.fit(X_train_pad, y_train, batch_size=200, epochs=20, validation_data=(X_test_pad, y_test), callbacks=[early_stopping])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20


<keras.callbacks.History at 0x7fc6911d3610>

In [67]:
model2.evaluate(X_test_pad, y_test, return_dict=True)



{'loss': 0.6502742767333984,
 'accuracy': 0.6214285492897034,
 'precision': 0.6060255169868469,
 'recall': 0.7335203289985657}

In [68]:
test_score_RNN_2(model2, guardian)

2025-01-31 16:47:01.764959: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 597780000 exceeds 10% of free system memory.




{'loss': 0.5927222371101379,
 'accuracy': 0.6563284397125244,
 'precision': 0.0,
 'recall': 0.0}

In [69]:
test_score_RNN_2(model2, corpus)

  1/329 [..............................] - ETA: 20s - loss: 0.6938 - accuracy: 0.6562 - precision: 1.0000 - recall: 0.6562

2025-01-31 16:47:36.283638: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 630120000 exceeds 10% of free system memory.




{'loss': 0.6776471734046936,
 'accuracy': 0.6254999041557312,
 'precision': 1.0,
 'recall': 0.6254999041557312}