Importing the dataset

In [None]:
import pandas as pd

# Read the CSV, discard the 'Unnamed: 0' column, then drop every row holding a
# missing value; ignore_index=True renumbers the surviving rows from 0.
dataset = (
    pd.read_csv('WELFake_Dataset.csv')
    .drop(columns=['Unnamed: 0'])
    .dropna(axis=0, ignore_index=True)
)
num_rows, num_cols = dataset.shape

Concatenate the title and text columns of each article

In [None]:
# Build one "content" string per row: the title, a single space, then the text.
dataset["content"] = dataset["title"].add(" ").add(dataset["text"])

Initialize the tf-idf vectorizer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizer settings kept in one place; max_features caps the vocabulary size
# (and therefore the number of feature columns) at 1500.
tfidf_options = {"max_features": 1500}
tfidf = TfidfVectorizer(**tfidf_options)

Import NLTK tools for text pre-processing

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Make sure the stop-word corpus is available locally before it is read below.
nltk.download('stopwords')

# English stop words held in a set for fast membership checks.
stop_words = set(stopwords.words('english'))

# Shared stemmer instance used by the pre-processing function.
ps = PorterStemmer()

Create a function that applies the pre-processing steps to a chunk of the dataset

In [None]:
# Any character that is not an ASCII letter is treated as a word separator.
_NON_LETTER_RE = re.compile('[^a-zA-Z]')


def preprocess_chunk(chunk, stemmer=None, stopword_set=None):
    """Clean, tokenize, filter and stem every text in ``chunk``.

    For each text: non-letter characters are replaced by spaces, the text is
    lower-cased and split on whitespace, stop words are removed, the remaining
    words are stemmed, and the stems are re-joined with single spaces.

    Args:
        chunk: Iterable of strings to process.
        stemmer: Object exposing ``stem(word) -> str``. Defaults to the
            notebook-level ``ps`` stemmer when omitted.
        stopword_set: Container of words to discard (checked before stemming).
            Defaults to the notebook-level ``stop_words`` when omitted.

    Returns:
        list[str]: One processed string per input text, in input order.
    """
    # Fall back to the notebook globals only when no explicit tool is supplied,
    # so existing single-argument calls behave exactly as before.
    stemmer = ps if stemmer is None else stemmer
    stopword_set = stop_words if stopword_set is None else stopword_set

    processed_chunk = []
    for text in chunk:
        words = _NON_LETTER_RE.sub(' ', text).lower().split()
        stems = [stemmer.stem(word) for word in words if word not in stopword_set]
        processed_chunk.append(' '.join(stems))
    return processed_chunk

Split the dataset into chunks and pre-process each chunk separately

In [None]:
chunk_size = 1000
# One more than the number of full chunks; when num_rows is an exact multiple
# of chunk_size the final chunk is simply empty and contributes nothing.
num_chunks = (num_rows // chunk_size) + 1

# Pre-process the "content" column chunk by chunk, keeping row order, so that
# corpus[i] is the processed text of dataset row i.
corpus = []
for chunk_num in range(num_chunks):
    start_idx = chunk_num * chunk_size
    end_idx = min(start_idx + chunk_size, num_rows)
    # .iloc makes the positional (not label-based) slicing explicit.
    chunk = dataset['content'].iloc[start_idx:end_idx]
    corpus.extend(preprocess_chunk(chunk))

# Every dataset row must have produced exactly one processed document.
assert len(corpus) == num_rows

Perform feature extraction on the pre-processed dataset

In [None]:
import numpy as np

# Fit the vectorizer ONCE on the whole corpus so that every row of x shares the
# same vocabulary and IDF weights. Calling fit_transform separately on each
# chunk would learn a different vocabulary per chunk, making the feature
# columns mean different things in different rows (and breaking concatenation
# whenever a chunk yields fewer than max_features terms).
# NOTE(review): the IDF statistics are still computed on all rows, i.e. before
# the train/test split below; fit on the training rows only if that leakage
# matters for the reported scores.
x = tfidf.fit_transform(corpus).toarray()

# One feature row per document; the column count is capped by max_features.
assert x.shape[0] == len(corpus)

Perform train-test split

In [None]:
from sklearn.model_selection import train_test_split

# Target is taken by position (third column of the frame).
# NOTE(review): presumably this is the label column — confirm against the CSV header.
target_position = 2
y = dataset.iloc[:, target_position]

# Hold out 20% of the rows for testing; fixed seed keeps the split repeatable.
split_parts = train_test_split(x, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split_parts

Create CNN sequential model

In [None]:
from tensorflow import keras
from keras import Sequential, utils
from keras.layers import Conv1D, MaxPool1D, Flatten, Dense, Input

def create_model(filters, kernel_size, pool_size, n_features=1500):
    """Build and compile a 1-D CNN binary classifier.

    Architecture: Input(n_features, 1) -> Conv1D(relu) -> MaxPool1D -> Flatten
    -> Dense(64, relu) -> Dense(1, sigmoid), compiled with the Adam optimizer,
    binary cross-entropy loss and accuracy as the tracked metric.

    Args:
        filters: Number of convolution filters.
        kernel_size: Length of the 1-D convolution window.
        pool_size: Window size of the max-pooling layer.
        n_features: Length of each input sequence (number of feature columns).
            Defaults to 1500, the value previously hard-coded here.

    Returns:
        The compiled Sequential model.
    """
    # Drop state left over from previously built models (this function is
    # called once per hyper-parameter combination / fold).
    keras.backend.clear_session()

    classifier = Sequential([
        Input(shape=(n_features, 1)),
        Conv1D(filters=filters, kernel_size=kernel_size, activation='relu'),
        MaxPool1D(pool_size=pool_size),
        Flatten(),
        Dense(units=64, activation='relu'),
        Dense(units=1, activation='sigmoid'),
    ])
    classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # summary() prints the table itself and returns None, so it must not be
    # wrapped in print() (that would emit a stray "None" line).
    classifier.summary()
    return classifier

Create the param grid for the hyper-parameters to be tuned

In [None]:
# Hyper-parameter candidates explored by the grid search: three filter counts,
# two kernel sizes, two pool sizes and two batch sizes (24 combinations).
param_grid = dict(
    filters=[32, 64, 128],
    kernel_size=[5, 7],
    pool_size=[2, 3],
    batch_size=[128, 64],
)

Setup the hyper-parameter tuning process

In [None]:
from scikeras.wrappers import KerasClassifier
from sklearn.model_selection import GridSearchCV

# The network's Input layer expects (samples, features, 1), so add a trailing
# channel axis to the 2-D training matrix. Defined here so the notebook runs
# from a fresh kernel instead of relying on a variable from a deleted cell.
X_train_reshaped = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)

# `model=` is the current scikeras argument; `build_fn=` is its deprecated alias.
# filters / kernel_size / pool_size are forwarded to create_model and are the
# defaults that the grid search overrides.
model = KerasClassifier(model=create_model, batch_size=64, epochs=12, filters=32, kernel_size=7, pool_size=2)

# Exhaustive 5-fold search; error_score='raise' surfaces failures immediately
# instead of silently scoring a failed fit.
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=1, error_score='raise', verbose=4)
grid_search.fit(X_train_reshaped, y_train)

View hyper-parameter tuning results and test the best model on the test set

In [None]:
print(grid_search.cv_results_)
print(grid_search.best_params_)
print(grid_search.best_score_)

# Give the test matrix the same (samples, features, 1) layout the network's
# Input layer expects; defined here so the cell works on a fresh kernel.
X_test_reshaped = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

# Predict with the refitted best estimator found by the grid search.
y_pred = grid_search.predict(X_test_reshaped)

Display the hyper-parameter tuning results as a LaTeX table

In [None]:
from IPython.display import display

# Keep only the columns needed for the report, order the rows from best to
# worst rank, and emit the table as LaTeX source.
report_columns = ['params', 'mean_test_score', 'rank_test_score']
gridres = pd.DataFrame(grid_search.cv_results_)[report_columns]
ranked_results = gridres.sort_values(by=['rank_test_score'])
print(ranked_results.to_latex(index=False))

Evaluate the model performance and plot its confusion matrix

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, ConfusionMatrixDisplay, f1_score, recall_score, precision_score
import matplotlib.pyplot as plt

# Score the test-set predictions produced by the best grid-search model.
# (The predictions are stored in `y_pred`; the previously referenced `y_try`
# is never defined in this notebook.)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print(f"Accuracy = {accuracy:.2f}\nPrecision = {precision:.2f}\nF1 Score = {f1:.2f}\nRecall = {recall:.2f}")

# Confusion matrix labelled with the classes seen by the fitted search.
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=grid_search.classes_)

disp.plot()
plt.show()