# Fake News project 
### Data science course 2022/2023

**Authors**:
 - Tove Eggert Olsen, kxd956
 - Isak Erkam Kilic, htp748 
 - Ellen Hørlyck Ebdrup, hxk874


# Importing packages
Some of the packages are not used in the final report, but were used during exploration of the data and when testing different models.

In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab

from sklearn.model_selection import train_test_split
from sklearn import model_selection, metrics
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, confusion_matrix, f1_score, r2_score
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neural_network import MLPRegressor, MLPClassifier

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import *

from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import *

import time
import re
import csv
import string
import itertools

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

from joblib import dump, load

# Download the NLTK 'punkt' and 'stopwords' resources; clean_text() reads the
# English stopword list via stopwords.words('english').
nltk.download('punkt')
nltk.download('stopwords')
# Single shared English Snowball stemmer, used by clean_text() to stem every token.
stemmer = SnowballStemmer("english")

# Importing files

You can find files including the cleaned data, saved models and raw data in the GoogleDrive: [Click here](https://drive.google.com/drive/folders/1IlzLLjC91VWXk47Z4oNbRCpK-oPjfYsV?usp=sharing)

If you insert the files into the same folder as the notebook, you can run the code. Make sure not to run all cells at once, as some cells take a long time to run.

In [None]:
# A dataframe containing all websites and their labels.
# Read from 'websites.csv', which must be in the working directory.
websites_df = pd.read_csv('websites.csv')

# A dataframe containing 568.360 data points after cleaning 2 million data points.
# Only the columns listed in usecols are loaded from '600K_cleaned.csv'.
df_600K_cleaned = pd.read_csv('600K_cleaned.csv', usecols=['domain', 'type', 'content', 'type_binary'])

# Preprocessing data
The function sorting_df() is used to filter the data in a dataframe. Noise is removed from the data and each label is mapped to 0 or 1, where 1 denotes a fake news article.

The function clean_text() is used to clean the text. It removes punctuation and stopwords, inserts placeholder tokens (for URLs, e-mail addresses, dates, times and numbers) and stems the text. The function is used on the data after it is sorted.

The function process() calls sorting_df() and then clean_text() for a dataframe.

In [None]:
def sorting_df(df):
    """Filter a raw article dataframe in place and add a binary label column.

    Removes rows that contain any missing value, rows whose 'type' is
    'unknown' or 'rumor', and rows whose 'content' repeats an earlier row
    (the first occurrence is kept). Then adds 'type_binary', which is 1 when
    'type' belongs to the fake group and 0 otherwise.

    The dataframe passed in is modified in place and is also returned.
    """
    noise_labels = ['unknown', 'rumor']
    fake_group = ['fake', 'satire', 'bias', 'conspiracy', 'junksci', 'hate', 'unreliable']

    df.dropna(axis=0, inplace=True)

    # Drop both noise labels with a single index lookup.
    noisy_rows = df[df['type'].isin(noise_labels)].index
    df.drop(noisy_rows, inplace=True)

    df.drop_duplicates(subset=['content'], keep='first', inplace=True)

    # Boolean membership cast straight to 0/1.
    df['type_binary'] = df['type'].isin(fake_group).astype(int)
    return df

In [None]:
_ENGLISH_STOPWORDS = None  # lazily-built cache, filled by _english_stopwords()


def _english_stopwords():
    """Return the NLTK English stopwords as a frozenset, built only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def clean_text(text):
    """Normalise one article text and return its stemmed tokens.

    Steps, in order:
      1. Delete newlines and double spaces, then lowercase.
      2. Delete quote characters.
      3. Substitute placeholders for URLs, e-mail addresses, dates, times
         and numbers.
      4. Delete hyphens/dashes and every character in string.punctuation
         (this also strips the angle brackets of the '<URL>' placeholder).
      5. Split on whitespace, discard English stopwords and stem what is left.

    Parameters:
        text: the raw text as a str.

    Returns:
        A list of stemmed tokens.
    """
    # NOTE(review): newlines and double spaces are deleted rather than replaced
    # by a single space, so the words on either side get glued together. Left
    # unchanged because the saved models were trained on this output - confirm
    # before altering.
    text = text.replace('\n', '')
    text = text.replace('  ', '')
    text = text.lower()

    text = re.sub(r'(\´)|(\`)|(\')|(\")|(\“)|(\”)', '', text)

    # Raw string: '\S' in a normal string literal is an invalid escape sequence.
    replace = re.sub(r"http\S+|www\S+", '<URL>', text)
    replace = re.sub(r'\S+@+\S+\.+\S', 'EMAIL', replace)
    replace = re.sub(r'\S+\.com\S', 'URL', replace)
    replace = re.sub(r'(\d{4}/\d{2}/\d{2} \d{2}\:\d{2}\:\d{2}\.\d)|(\d{4}-\d{2}-\d{2} \d{2}\:\d{2}\:\d{2}\.\d)', 'DATE', replace)
    replace = re.sub(r'\d{2}\:\d{2}\:\d{2}\.\d', 'TIME', replace)
    replace = re.sub(r'\d+,?\.?\d*\.?\d*', 'NUM', replace)
    replace = re.sub(r'(\-)|(\—)', '', replace)

    # One pass over the string instead of a per-character Python loop.
    nopunc = replace.translate(str.maketrans('', '', string.punctuation))

    # The stopword list used to be rebuilt for every single word; fetch it once
    # and test membership against a set.
    stop_set = _english_stopwords()
    clean_words = [word for word in nopunc.split() if word not in stop_set]
    cleaned = [stemmer.stem(word) for word in clean_words]
    return cleaned

In [None]:
def process(dataframe):
    """Run the full preprocessing pipeline on a raw dataframe.

    The rows are first filtered and labelled by sorting_df(); afterwards every
    'content' entry is replaced by the output of clean_text().

    Returns the dataframe produced by sorting_df() with the cleaned content.
    """
    prepared = sorting_df(dataframe)
    cleaned_content = prepared['content'].apply(clean_text)
    prepared['content'] = cleaned_content
    return prepared

# Simple model comparison

In this section we explore different models on a subset of 10.000 data points from our processed data.

First we split the data into a training, validation and test set. We use the function train_test_split() from sklearn.model_selection. We use 80% of the data for training and 10% each for validation and testing.

In [None]:
# Load only the first 10,000 rows of the cleaned dataset for the quick model comparison.
df_10k = pd.read_csv('600K_cleaned.csv', nrows=10000)
# Bare expression: shows the dataframe as the notebook cell's output.
df_10k

In [None]:
# Features are the cleaned article text; the target is the binary label.
df = df_10k
X, y = df['content'], df['type_binary']

# Desired share of each partition.
train_ratio, validation_ratio, test_ratio = 0.80, 0.10, 0.10

# Step 1: set aside everything that is not training data.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=1 - train_ratio,
    random_state=0,
)

# Step 2: divide that remainder between the test and validation sets.
X_test, X_val, y_test, y_val = train_test_split(
    X_test,
    y_test,
    test_size=test_ratio / (test_ratio + validation_ratio),
    random_state=0,
)

print("x split into train, validation and test sets: ", X_train.shape, X_val.shape, X_test.shape)
print("y split into train, validation and test sets: ", y_train.shape, y_val.shape, y_test.shape)


In [None]:
def _tfidf_pipeline(classifier):
    """Wrap *classifier* in a pipeline that TF-IDF-vectorizes the text first."""
    return Pipeline([
        ('vectorizer', TfidfVectorizer()),
        ('classifier', classifier)])


# Every candidate gets the same default TF-IDF front end so that only the
# classifier differs between the rows of the comparison table.
models = {
    'Logistic Regression': _tfidf_pipeline(LogisticRegression()),
    'Support Vector Machines': _tfidf_pipeline(LinearSVC()),
    'Decision Trees': _tfidf_pipeline(DecisionTreeClassifier()),
    'Random Forest': _tfidf_pipeline(RandomForestClassifier()),
    'K-Nearest Neighbor': _tfidf_pipeline(KNeighborsClassifier()),
    'Multi-Layer Perceptron': _tfidf_pipeline(MLPClassifier(verbose=True)),
}

accuracy, precision, recall = {}, {}, {}

# Train each model on the training split and score it on the validation split.
for key, model in models.items():
    model.fit(X_train, y_train)

    predictions = model.predict(X_val)

    # scikit-learn metrics expect (y_true, y_pred). Passing them the other way
    # round leaves accuracy unchanged but swaps precision and recall.
    accuracy[key] = accuracy_score(y_val, predictions)
    precision[key] = precision_score(y_val, predictions)
    recall[key] = recall_score(y_val, predictions)

# One row per model, in the order the models were defined.
df_model = pd.DataFrame(
    {
        'Accuracy': list(accuracy.values()),
        'Precision': list(precision.values()),
        'Recall': list(recall.values()),
    },
    index=list(models),
)
# F1 is the harmonic mean of precision and recall.
df_model['f1 score'] = 2 * (df_model['Precision'] * df_model['Recall']) / (df_model['Precision'] + df_model['Recall'])

df_model

# Working with a larger dataset
From processing the 2 million data points, we ended up with a sample of around 600.000 data points.

### Splitting data 
Define the dataframe in which the data is stored in the variable "df". The data is split into a training set (80 %), a validation set (10 %) and a test set (10 %). The training set is used to train the model and the validation set is used to tune the hyperparameters and test the models. We will only use the test set at the end of the project to test the final model.

In [None]:
# Switch to the full cleaned dataset; features are the cleaned text and the
# target is the binary label.
df = df_600K_cleaned
X, y = df['content'], df['type_binary']

# Desired share of each partition.
train_ratio, validation_ratio, test_ratio = 0.80, 0.10, 0.10

# Step 1: set aside everything that is not training data.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=1 - train_ratio,
    random_state=0,
)

# Step 2: divide that remainder between the test and validation sets.
X_test, X_val, y_test, y_val = train_test_split(
    X_test,
    y_test,
    test_size=test_ratio / (test_ratio + validation_ratio),
    random_state=0,
)

print("X split into train, validation and test sets: ", X_train.shape, X_val.shape, X_test.shape)
print("y split into train, validation and test sets: ", y_train.shape, y_val.shape, y_test.shape)

# Baseline model
### Using Logistic regression, TfidfVec

We will visualize the result from the model using a confusion matrix

In [None]:
# Baseline: TF-IDF features fed into a logistic-regression classifier.
pipe = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer()),
    ('classifier', LogisticRegression(
        max_iter=1000,
        n_jobs=-1,
        class_weight={0: 1, 1: 1},
        random_state=0,
    )),
])

# Train on the training split, then predict the validation split.
logreg_trained = pipe.fit(X_train, y_train)
predictions = pipe.predict(X_val)

# Flatten the 2x2 confusion matrix into its four cells.
TN, FP, FN, TP = confusion_matrix(y_val, predictions).ravel()

for caption, cell in (
    ('True Positive(TP)  = ', TP),
    ('False Positive(FP) = ', FP),
    ('True Negative(TN)  = ', TN),
    ('False Negative(FN) = ', FN),
):
    print(caption, cell)

report = classification_report(y_val, predictions)
print(report)

# Persist the trained pipeline so it can be reloaded without retraining.
dump(logreg_trained, 'LogReg_600K.joblib')

# Advanced model
### MLP Classifier, Neural Network 

We will visualize the result from the model using a confusion matrix.

In [None]:
# Advanced model: a small multi-layer perceptron on top of TF-IDF features.
MLP = MLPClassifier(
    solver='adam',
    alpha=1e-5,
    hidden_layer_sizes=(5, 2),
    random_state=0,
)

clf_pipe = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer()),
    ('classifier', MLP),
])

# Train on the training split, then predict the validation split.
mlp_trained = clf_pipe.fit(X_train, y_train)
y_pred_MLP = clf_pipe.predict(X_val)

report = classification_report(y_val, y_pred_MLP)
print(report)

# Flatten the 2x2 confusion matrix into its four cells.
TN_MLP, FP_MLP, FN_MLP, TP_MLP = confusion_matrix(y_val, y_pred_MLP).ravel()

for caption, cell in (
    ('True Positive(TP)  = ', TP_MLP),
    ('False Positive(FP) = ', FP_MLP),
    ('True Negative(TN)  = ', TN_MLP),
    ('False Negative(FN) = ', FN_MLP),
):
    print(caption, cell)

# Each cell as a percentage of all validation predictions.
total_MLP = TP_MLP + FP_MLP + FN_MLP + TN_MLP
TP_MLP_pro = TP_MLP / total_MLP * 100
FP_MLP_pro = FP_MLP / total_MLP * 100
TN_MLP_pro = TN_MLP / total_MLP * 100
FN_MLP_pro = FN_MLP / total_MLP * 100

# Persist the trained pipeline so it can be reloaded without retraining.
dump(mlp_trained, 'MLP_600K.joblib')

# Evaluation on unseen data
### Using the LIAR dataset

We will start by importing the data and processing it. We will then use the simple and advanced models to predict the labels of the data. <br>
Make sure to have downloaded the files "LogReg_600K.joblib" and "MLP_600K.joblib" from the GoogleDrive and placed them in the same folder as the notebook.
We will use these models to predict the labels of the data.

We will visualize the result from the model using a confusion matrix.

In [None]:
# Names assigned to the columns of the header-less LIAR .tsv files. Only
# 'type' and 'content' are kept once the splits have been merged.
columns = [
    'json', 'type', 'content', 'keywords', 'idk', 'idk2', 'state',
    'politics', 'num1', 'num2', 'num3', 'num4', 'num5', 'category',
]
liar_read_options = dict(sep='\t', header=None, names=columns)

df_liar_train = pd.read_csv('train.tsv', **liar_read_options)
df_liar_test = pd.read_csv('test.tsv', **liar_read_options)
df_liar_val = pd.read_csv('valid.tsv', **liar_read_options)

# Stack the three LIAR splits into one dataframe with a fresh 0..n-1 index.
liar_parts = [df_liar_train, df_liar_test, df_liar_val]
df_LIAR = pd.concat(liar_parts, ignore_index=True)

# Keep only the label and the statement text.
unused_columns = [name for name in columns if name not in ('type', 'content')]
df_LIAR = df_LIAR.drop(columns=unused_columns)


In [None]:
def sorting_liar(df):
    """Add a binary 'type_binary' column to a LIAR dataframe, in place.

    A row gets 1 when its 'type' is 'false', 'pants-fire', 'barely-true' or
    'half-true', and 0 for any other label. The same dataframe object is
    returned.
    """
    liar_fake_group = ['false', 'pants-fire', 'barely-true', 'half-true']
    is_fake = df['type'].isin(liar_fake_group)
    df['type_binary'] = is_fake.astype(int)  # booleans -> 0/1
    return df

# Preprocess dataframe
def preprocess_liar(df):
    """Label and clean a LIAR dataframe.

    Adds the binary label through sorting_liar(), runs clean_text() on every
    'content' entry and finally casts the resulting token lists to str, so
    each row holds the string representation of its token list.

    Returns the dataframe produced by sorting_liar() with the new content.
    """
    labelled = sorting_liar(df)
    token_lists = labelled['content'].apply(clean_text)
    labelled['content'] = token_lists.astype(str)
    return labelled


# Label and clean the combined LIAR dataframe.
df_LIAR = preprocess_liar(df_LIAR)

# Model inputs (stringified cleaned statements) and binary targets used when
# the saved models are evaluated on LIAR.
X_liar = df_LIAR['content']
y_liar = df_LIAR['type_binary']

### Testing with the baseline model

In [None]:
# Testing Logistic Regression on LIAR dataset
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code - only load model files that come from a trusted source.
logreg_model = load('LogReg_600K.joblib')
predictions = logreg_model.predict(X_liar)

# Flatten the 2x2 confusion matrix into its four cells.
TN, FP, FN, TP = confusion_matrix(y_liar, predictions).ravel()

print('True Positive(TP)  = ', TP)
print('False Positive(FP) = ', FP)
print('True Negative(TN)  = ', TN)
print('False Negative(FN) = ', FN)

report_dict_LR_liar = classification_report(y_liar, predictions, output_dict=True) # save classification report as a dictionary

report = classification_report(y_liar, predictions, output_dict=False)
print(report)

# Each cell as a percentage of the LIAR samples. All four values must be
# divided by len(y_liar); FP/TN/FN used to be divided by len(y_val), the size
# of an unrelated validation split, so the percentages did not add up to 100.
df_logreg_LIAR = pd.DataFrame({
    'lab': ['TP', 'FP', 'TN', 'FN'],
    'val': [cell / len(y_liar) * 100 for cell in (TP, FP, TN, FN)],
})

### Testing with the advanced model

In [None]:
# Testing the MLP model on the LIAR dataset
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code - only load model files that come from a trusted source.
mlp_model = load('MLP_600K.joblib')

predictions = mlp_model.predict(X_liar)

# Flatten the 2x2 confusion matrix into its four cells.
TN, FP, FN, TP = confusion_matrix(y_liar, predictions).ravel()

print('True Positive(TP)  = ', TP)
print('False Positive(FP) = ', FP)
print('True Negative(TN)  = ', TN)
print('False Negative(FN) = ', FN)

report_dict_MLP_liar = classification_report(y_liar, predictions, output_dict=True) # save classification report as a dictionary

report = classification_report(y_liar, predictions)
print(report)

# Each cell as a percentage of the LIAR samples. All four values must be
# divided by len(y_liar); FP/TN/FN used to be divided by len(y_val), the size
# of an unrelated validation split, so the percentages did not add up to 100.
df_MLP_LIAR = pd.DataFrame({
    'lab': ['TP', 'FP', 'TN', 'FN'],
    'val': [cell / len(y_liar) * 100 for cell in (TP, FP, TN, FN)],
})