In [1]:
import pandas as pd
import sklearn
import numpy as np
import os
from pathlib import Path
from sklearn.model_selection import train_test_split
import torch
from datasets import load_dataset
import tensorflow as tf
import tensorflow_addons as tfa
import gc

from keras import optimizers
import matplotlib.pyplot as plt
from keras import initializers
from keras.utils import np_utils
from keras import regularizers
from keras.models import Sequential, Model
from keras.callbacks import ModelCheckpoint
from keras.layers import concatenate
from keras.layers import Dense, Flatten, Dropout, Input, BatchNormalization, ReLU
from keras.layers import LSTM, Bidirectional
import pickle
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report, accuracy_score


import keras

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the pre-computed embedding frames for the train / validation / test splits.
# NOTE: unpickling executes code stored in the file; only load pickles you produced or trust.
df_goss_train, df_goss_val, df_goss_test = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        '../../Dissertation - Fake News/Embeddings/df_train_bert_simple.pkl',
        '../../Dissertation - Fake News/Embeddings/df_val_bert_simple.pkl',
        '../../Dissertation - Fake News/Embeddings/df_test_bert_simple.pkl',
    )
)

In [3]:
# Preview the first rows of the training frame to check its columns.
df_goss_train.head()

Unnamed: 0,hidden_state,label,idx
0,"[[-0.6351673, 0.014559049, 0.29377505, 0.37676...",0,1934
1,"[[-0.3496328, -0.1398266, 0.20144267, 0.455405...",0,1408
2,"[[-0.45250654, 0.11762404, 0.14361618, -0.6483...",0,2589
3,"[[0.11294373, -0.20785874, 0.2868215, -0.27205...",1,5778
4,"[[-0.07389187, 0.09317138, 0.5003885, -0.26259...",0,2922


In [4]:
def label_change(df):
    """Return the flipped binary target for one row.

    `df` is a single row (anything indexable by ``'label'``). A label equal
    to 0 maps to 1; every other label maps to 0.
    """
    return 1 if df['label'] == 0 else 0

In [5]:
# Flip the binary label (0 -> 1, anything else -> 0) with one vectorised
# comparison. This yields the same values as applying `label_change` row by
# row, without building a Series per row, and it also works on an empty frame.
df_goss_train['target'] = (df_goss_train['label'] == 0).astype('int64')
df_goss_train.head()

Unnamed: 0,hidden_state,label,idx,target
0,"[[-0.6351673, 0.014559049, 0.29377505, 0.37676...",0,1934,1
1,"[[-0.3496328, -0.1398266, 0.20144267, 0.455405...",0,1408,1
2,"[[-0.45250654, 0.11762404, 0.14361618, -0.6483...",0,2589,1
3,"[[0.11294373, -0.20785874, 0.2868215, -0.27205...",1,5778,0
4,"[[-0.07389187, 0.09317138, 0.5003885, -0.26259...",0,2922,1


In [6]:
# Same label flip as for the training split (0 -> 1, anything else -> 0),
# done as a vectorised comparison instead of a per-row Python call.
df_goss_val['target'] = (df_goss_val['label'] == 0).astype('int64')
df_goss_test['target'] = (df_goss_test['label'] == 0).astype('int64')

In [7]:
# Count samples per class: 0-real, 1-fake
classes_zero = df_goss_train[df_goss_train['target'] == 0]
classes_one = df_goss_train[df_goss_train['target'] == 1]

# Convert parts into NumPy arrays for weight computation
zero_numpy = classes_zero['target'].to_numpy()
one_numpy = classes_one['target'].to_numpy()
all_together = np.concatenate((zero_numpy, one_numpy))
unique_classes = np.unique(all_together)

# Compute weights. `classes` and `y` are passed by keyword: they are
# keyword-only in current scikit-learn releases, and keywords are also
# accepted by the older releases that allowed positional use.
weights = sklearn.utils.class_weight.compute_class_weight(
    class_weight='balanced', classes=unique_classes, y=all_together
)
print(weights)

[0.64400494 2.2360515 ]




In [8]:
# Keras expects class weights as a {class_index: weight} mapping.
weights_ker = {0: weights[0], 1: weights[1]}
weights_ker

{0: 0.6440049443757726, 1: 2.236051502145923}

In [9]:
# Pair each embedding with its target in one array (column 0 = hidden_state,
# column 1 = target), then stack the per-row embeddings into a single array.
array_train = df_goss_train[["hidden_state","target"]].to_numpy() 
X_train = np.array(array_train[:,0].tolist())
X_train.shape

(10420, 1, 768)

In [10]:
# Drop the singleton axis 1 so every sample is a flat embedding vector.
# NOTE: this cell overwrites X_train, so it is not re-runnable on its own:
# a second run fails because axis 1 no longer has size 1.
X_train = np.squeeze(X_train, 1)
X_train.shape

(10420, 768)

In [11]:
# Pull the target column (column 1 of the paired array) out as a 1-D label vector.
y_label = np.array(array_train[:,1].tolist())
y_label.shape

(10420,)

In [12]:
def _features_and_labels(frame):
    """Split a frame into (paired array, stacked embeddings, label vector).

    Column 0 of the paired array holds `hidden_state`, column 1 holds
    `target`. The embeddings are stacked and their singleton axis 1 removed.
    """
    paired = frame[["hidden_state", "target"]].to_numpy()
    features = np.squeeze(np.array(paired[:, 0].tolist()), 1)
    labels = np.array(paired[:, 1].tolist())
    return paired, features, labels


array_val, X_val, y_val_label = _features_and_labels(df_goss_val)
array_test, X_test, y_test_label = _features_and_labels(df_goss_test)

In [13]:
def reset_tensorflow_keras_backend():
    """Clear the Keras session, reset the TF1-compat default graph, then run a GC pass."""
    keras.backend.clear_session()
    tf.compat.v1.reset_default_graph()
    gc.collect()

In [14]:
def reinitialize(model):
    """Re-draw the weights of every layer in `model` from the layer's own initializers.

    For each layer, the kernel, bias and recurrent kernel are re-assigned in
    place from `kernel_initializer`, `bias_initializer` and
    `recurrent_initializer` respectively, when the layer exposes that
    initializer. Layers without any of these attributes are left untouched.

    A weight that exists as an attribute but is ``None`` (e.g. the bias of a
    layer created with ``use_bias=False``) is skipped instead of raising.
    A layer that exposes an initializer but not the matching weight attribute
    still raises ``AttributeError``, so unsupported layers are not silently
    left with their trained weights.
    """
    initializer_to_weight = (
        ("kernel_initializer", "kernel"),
        ("bias_initializer", "bias"),
        ("recurrent_initializer", "recurrent_kernel"),
    )
    for layer in model.layers:
        for initializer_attr, weight_attr in initializer_to_weight:
            if not hasattr(layer, initializer_attr):
                continue
            weight = getattr(layer, weight_attr)
            if weight is None:
                # Initializer is configured but the weight was never created.
                continue
            initializer = getattr(layer, initializer_attr)
            # The variable's static shape is enough for the initializer.
            weight.assign(initializer(weight.shape))

In [15]:
def run_model(X_train, train_label, X_val, val_label, X_test, test_label, batch_size=32, epochs=100, verbose=1,
              class_weight=weights_ker, itr=30, shuffle=True):
    """Train the dense classifier `itr` times and summarise test-set metrics.

    One model is built up front. For every run the Keras backend is reset,
    the layer weights are re-drawn with `reinitialize`, the model is
    recompiled with a fresh Adam optimizer, and it is fitted with early
    stopping on `val_loss`. Test predictions are thresholded at 0.5 and
    scored with `classification_report`.

    Parameters
    ----------
    X_train, train_label : training features and labels passed to `fit`.
        The input width of the network is taken from `X_train.shape[1:]`.
    X_val, val_label : validation data used for early stopping.
    X_test, test_label : held-out data the reported metrics are computed on.
    batch_size, epochs, verbose, shuffle : forwarded to `model.fit`.
    class_weight : forwarded to `model.fit`. The default is the value the
        module-level `weights_ker` had when this function was defined.
    itr : number of independent training runs.

    Returns
    -------
    tuple of ten lists, one value per run, in this order: real precision,
    real recall, real f1, fake precision, fake recall, fake f1, macro
    precision, macro recall, macro f1, accuracy.

    The mean and standard deviation of every series, plus the mean number of
    epochs, are printed before returning.
    """
    callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

    # One entry per tracked series, in return order:
    # (classification_report section, field in that section, summary label).
    tracked = [
        ('real', 'precision', "real precision : "),
        ('real', 'recall', "real recall: "),
        ('real', 'f1-score', "real f1: "),
        ('fake', 'precision', "fake precision : "),
        ('fake', 'recall', "fake recall: "),
        ('fake', 'f1-score', "fake f1: "),
        ('macro avg', 'precision', "macro precision: "),
        ('macro avg', 'recall', "macro recall: "),
        ('macro avg', 'f1-score', "macro_f1: "),
    ]
    per_run = [[] for _ in tracked]
    accuracy_list = []
    num_epochs = []

    # Define model. The input shape is a proper tuple taken from the data
    # rather than a hard-coded width.
    input_text = Input(shape=tuple(X_train.shape[1:]))
    dense_text = Dense(1024, activation='relu', kernel_regularizer=regularizers.l2(0.01),
                       kernel_initializer=initializers.he_normal(seed=0))(input_text)
    dropout = Dropout(0.2)(dense_text)
    dense_text = Dense(512, activation='relu', kernel_regularizer=regularizers.l2(0.01),
                       kernel_initializer=initializers.he_normal(seed=0))(dropout)
    dense_text = Dense(128, activation='relu', kernel_regularizer=regularizers.l2(0.01),
                       kernel_initializer=initializers.he_normal(seed=0))(dense_text)
    output = Dense(1, activation='sigmoid')(dense_text)
    model = Model(inputs=input_text, outputs=output)

    for i in range(0, itr):
        print("\nTraining for iteration number: ", i)
        reset_tensorflow_keras_backend()
        reinitialize(model)
        # Recompile every run so the optimizer starts from scratch. A single
        # shared Adam instance would carry its moment estimates and step
        # count from one run into the next.
        model.compile(loss='binary_crossentropy', optimizer=optimizers.Adam(learning_rate=1e-5),
                      metrics='accuracy')
        history = model.fit(X_train, train_label, validation_data=(X_val, val_label), batch_size=batch_size,
                            epochs=epochs, callbacks=[callback], verbose=verbose, class_weight=class_weight,
                            shuffle=shuffle)

        y_pred = model.predict(X_test)
        predictions = [0 if score < 0.5 else 1 for score in y_pred]

        # Score against the labels passed in, not a notebook-level global.
        report = classification_report(test_label, predictions, target_names=['real', 'fake'], output_dict=True)
        for values, (section, field, _) in zip(per_run, tracked):
            values.append(report[section][field])
        accuracy_list.append(report['accuracy'])
        num_epochs.append(len(history.history['loss']))

    # Every statistic is taken over the full per-run list.
    summary = [(label, values) for (_, _, label), values in zip(tracked, per_run)]
    summary.append(("macro accuracy: ", accuracy_list))

    for label, values in summary:
        print("The average " + label, np.mean(values))
    for label, values in summary:
        print("The standard deviation on " + label, np.std(values))
    print("Average number of epochs", np.mean(num_epochs))

    return (*per_run, accuracy_list)

In [18]:
# Run the 30-repetition experiment silently and keep the per-run metric lists.
(
    real_precision, real_recall, real_f_score,
    fake_precision, fake_recall, fake_f1,
    macro_precision, macro_recall, macro_f1,
    accuracy,
) = run_model(
    X_train, y_label,
    X_val, y_val_label,
    X_test, y_test_label,
    verbose=0, itr=30,
)


Training for iteration number:  0

Training for iteration number:  1

Training for iteration number:  2

Training for iteration number:  3

Training for iteration number:  4

Training for iteration number:  5

Training for iteration number:  6

Training for iteration number:  7

Training for iteration number:  8

Training for iteration number:  9

Training for iteration number:  10

Training for iteration number:  11

Training for iteration number:  12

Training for iteration number:  13

Training for iteration number:  14

Training for iteration number:  15

Training for iteration number:  16

Training for iteration number:  17

Training for iteration number:  18

Training for iteration number:  19

Training for iteration number:  20

Training for iteration number:  21

Training for iteration number:  22

Training for iteration number:  23

Training for iteration number:  24

Training for iteration number:  25

Training for iteration number:  26

Training for iteration number:  27

T

In [19]:
# Per-run scores for the "real" class.
for heading, scores in (
    ("Real Precision:\n ", real_precision),
    ("Real Recall:\n ", real_recall),
    ("Real F1-Score:\n ", real_f_score),
):
    print(heading, scores)

Real Precision:
  [0.8858858858858859, 0.8864321608040201, 0.8847290640394089, 0.8883248730964467, 0.8936605316973415, 0.897196261682243, 0.8943514644351465, 0.8920863309352518, 0.8944618599791013, 0.8916155419222904, 0.8858585858585859, 0.8899176954732511, 0.8873096446700508, 0.8885511651469098, 0.890927624872579, 0.8918367346938776, 0.8782112274024738, 0.8913705583756345, 0.8892276422764228, 0.8987206823027718, 0.8860887096774194, 0.8842412451361867, 0.8941798941798942, 0.9, 0.8928939237899073, 0.8991596638655462, 0.8940329218106996, 0.8905622489959839, 0.8994708994708994, 0.8902439024390244]
Real Recall:
  [0.8684985279685966, 0.8655544651619235, 0.8812561334641805, 0.858684985279686, 0.8577036310107949, 0.8478900883218842, 0.8390578999018645, 0.8518155053974484, 0.8400392541707556, 0.8557409224730128, 0.8606476938174681, 0.8488714425907753, 0.8577036310107949, 0.8606476938174681, 0.8577036310107949, 0.8577036310107949, 0.9057899901864573, 0.8616290480863592, 0.858684985279686, 0.82

In [20]:
# Per-run scores for the "fake" class.
for heading, scores in (
    ("Fake Precision:\n ", fake_precision),
    ("Fake Recall:\n ", fake_recall),
    ("Fake F1-Score:\n ", fake_f1),
):
    print(heading, scores)

Fake Precision:
  [0.5592105263157895, 0.5551948051948052, 0.5798611111111112, 0.5471698113207547, 0.5538461538461539, 0.5441176470588235, 0.5273775216138329, 0.5424242424242425, 0.5289017341040463, 0.5476923076923077, 0.5463258785942492, 0.5347432024169184, 0.5440251572327044, 0.5506329113924051, 0.5496894409937888, 0.5510835913312694, 0.6190476190476191, 0.5566037735849056, 0.54858934169279, 0.5178082191780822, 0.5498392282958199, 0.6, 0.5139664804469274, 0.5234159779614325, 0.5421686746987951, 0.5356125356125356, 0.5468277945619335, 0.5700325732899023, 0.5279329608938548, 0.5517241379310345]
Fake Recall:
  [0.5985915492957746, 0.602112676056338, 0.5880281690140845, 0.6126760563380281, 0.6338028169014085, 0.6514084507042254, 0.6443661971830986, 0.6302816901408451, 0.6443661971830986, 0.6267605633802817, 0.602112676056338, 0.6232394366197183, 0.6091549295774648, 0.6126760563380281, 0.6232394366197183, 0.6267605633802817, 0.5492957746478874, 0.6232394366197183, 0.6161971830985915, 0.66

In [21]:
# Per-run macro-averaged scores and overall accuracy.
for heading, scores in (
    ("Macro Precision:\n ", macro_precision),
    ("Macro Recall:\n ", macro_recall),
    ("Macro F1-Score:\n ", macro_f1),
    ("Accuracy Score:\n ", accuracy),
):
    print(heading, scores)

Macro Precision:
  [0.7225482061008377, 0.7208134829994126, 0.7322950875752601, 0.7177473422086007, 0.7237533427717477, 0.7206569543705332, 0.7108644930244896, 0.7172552866797471, 0.7116817970415739, 0.7196539248072991, 0.7160922322264176, 0.7123304489450848, 0.7156674009513776, 0.7195920382696575, 0.720308532933184, 0.7214601630125734, 0.7486294232250464, 0.7239871659802701, 0.7189084919846064, 0.708264450740427, 0.7179639689866196, 0.7421206225680934, 0.7040731873134107, 0.7117079889807163, 0.7175312992443512, 0.7173860997390409, 0.7204303581863165, 0.7302974111429431, 0.713701930182377, 0.7209840201850295]
Macro Recall:
  [0.7335450386321856, 0.7338335706091308, 0.7346421512391326, 0.735680520808857, 0.7457532239561018, 0.7496492695130548, 0.7417120485424815, 0.7410485977691468, 0.7422027256769271, 0.7412507429266473, 0.731380184936903, 0.7360554396052468, 0.7334292802941298, 0.736661875077748, 0.7404715338152565, 0.7422320971955383, 0.7275428824171724, 0.7424342423530388, 0.7374410