In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import re
import string
import requests

In [None]:
## Build the Google Drive direct-download links for the two CSV files

# File IDs copied from each file's Google Drive share link
fake_id = "1sGqcMtY9W67vBMj1lnMQXOAfkyIN8k_a"
true_id = "1ggpakEFE6J2UIUjaD0Kfu9eAeUmAlLYu"

# Both links share one URL shape; only the file ID differs
fake_csv_download_url = "https://drive.google.com/uc?id={}&export=download".format(fake_id)
true_csv_download_url = "https://drive.google.com/uc?id={}&export=download".format(true_id)

def download_dataset(url, filename, timeout = 30):
    """Download ``url`` and write the response body to ``filename``.

    Prints a success or failure message instead of raising, so the notebook
    keeps running; check the message before loading the file.

    Args:
        url: Address fetched with an HTTP GET.
        filename: Local path the response body is written to (binary mode).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.
    """
    try:
        response = requests.get(url, timeout = timeout)
    except requests.RequestException as error:
        # Connection errors and timeouts are reported the same way as a bad status
        print(f"Failed to download the file. ({error})")
        return

    if response.status_code != 200:
        print("Failed to download the file.")
        return

    # A CSV export should never arrive as HTML; an HTML body with status 200
    # is presumably an interstitial or sign-in page rather than the data.
    # NOTE(review): confirm against the responses Google Drive actually sends.
    if "text/html" in response.headers.get("Content-Type", ""):
        print("Failed to download the file. (received an HTML page instead of the file)")
        return

    with open(filename, "wb") as file:
        file.write(response.content)
    print("File downloaded successfully!")
# Fetch both datasets into the working directory as Fake.csv and True.csv
download_dataset(fake_csv_download_url, "Fake.csv")
download_dataset(true_csv_download_url, "True.csv")

In [None]:
# Load the two downloaded CSV files into pandas dataframes:
# data_fake holds the contents of Fake.csv, data_true the contents of True.csv
data_fake = pd.read_csv('Fake.csv')
data_true = pd.read_csv('True.csv')

In [None]:
# Preview the first rows of the fake-news dataframe
data_fake.head()

In [None]:
# Preview the last rows of the true-news dataframe
data_true.tail()

In [None]:
# (rows, columns) of each dataframe as loaded
data_fake.shape, data_true.shape

In [None]:
# Add the target column: 0 marks a fake article, 1 marks a true article
data_fake['class'] = 0
data_true['class'] = 1

In [None]:
# Shapes after labelling: each frame should now have one more column ('class')
data_fake.shape, data_true.shape

In [None]:
# Hold out the last 10 rows of each dataset for manual testing, then remove
# those rows from the frames that go on to training.
# .copy() gives each hold-out frame its own data, so later column assignments
# on it do not write into a slice of the original frame.
data_fake_manual_testing = data_fake.tail(10).copy()
# Drop by the held-out rows' own index labels instead of hard-coded row
# numbers, so the cell still works if the CSV has a different number of rows.
data_fake.drop(data_fake_manual_testing.index, axis = 0, inplace = True)


data_true_manual_testing = data_true.tail(10).copy()
data_true.drop(data_true_manual_testing.index, axis = 0, inplace = True)

In [None]:
# Shapes after the hold-out: each frame should now have 10 fewer rows
data_fake.shape, data_true.shape

In [None]:
# Make the labels of the held-out rows explicit (0 = fake, 1 = true).
# The frames come from tail() on already-labelled data, so the 'class' column
# is present; assign() builds a new frame rather than writing into the
# tail() slice, which avoids pandas' SettingWithCopyWarning.
# 'class' is a Python keyword, hence the ** dict form.
data_fake_manual_testing = data_fake_manual_testing.assign(**{'class': 0})
data_true_manual_testing = data_true_manual_testing.assign(**{'class': 1})

In [None]:
# Show the held-out fake articles (the frame was built from the last 10 rows)
data_fake_manual_testing.head(10)

In [None]:
# Show the held-out true articles (the frame was built from the last 10 rows)
data_true_manual_testing.head(10)

In [None]:
# Stack the fake and true rows into a single dataframe (fake rows first).
# Each frame keeps its own index labels here, so labels repeat until the
# index is rebuilt later.
data_merge = pd.concat([data_fake, data_true], axis = "index", ignore_index = False)
data_merge.head(20)

In [None]:
# List the columns available in the merged dataframe
data_merge.columns

In [None]:
# Keep only the columns left after removing title, subject and date
data = data_merge.drop(columns = ['title', 'subject', 'date'])
data

In [None]:
# Count missing values per column
data.isnull().sum()

In [None]:
# Randomly shuffle the rows so fake and true articles are interleaved instead
# of sitting in two contiguous groups.
# random_state pins the shuffle so a fresh Restart & Run All gives the same order.
data = data.sample(frac = 1, random_state = 0)

In [None]:
# Preview the shuffled rows (index labels are still the pre-shuffle ones)
data.head()

In [None]:
# Replace the shuffled index labels with a fresh 0..n-1 index, discarding the old labels
data.reset_index(drop = True, inplace = True)

In [None]:
# Confirm which columns remain after the index reset
data.columns

In [None]:
# Preview the rows with the rebuilt index
data.head()

In [None]:
# Function used to normalise article text before it is vectorised for training
def wordopt(text):
    """Lower-case ``text`` and strip content that should not become a feature.

    Steps, in order:
      1. remove ``[...]`` spans,
      2. remove URLs (``http://``, ``https://`` or ``www.`` prefixes),
      3. remove HTML-like ``<...>`` tags,
      4. replace every remaining non-word character with a space,
      5. delete underscores (the only punctuation that counts as a word character),
      6. delete every token that contains a digit.

    The URL and tag patterns match on punctuation, so they must run before
    step 4 turns that punctuation into spaces; otherwise they can never match.
    Newlines are non-word characters and are therefore handled by step 4.

    Args:
        text: Raw article text.

    Returns:
        The cleaned string. Non-string input (for example None or NaN from a
        missing cell) yields an empty string.
    """
    if not isinstance(text, str):
        return ""

    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub(r'\W', ' ', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

In [None]:
# Clean every article with wordopt. This overwrites the 'text' column, so
# re-running the cell feeds already-cleaned text through wordopt again.
data['text'] = data['text'].apply(wordopt)

In [None]:
# Features (cleaned article text) and target (0 = fake, 1 = true)
x, y = data['text'], data['class']

In [None]:
# Split the dataset into training (75%) and testing (25%) parts.
# random_state pins the split so the reported scores are reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state = 0)
x_test

In [None]:
# Transform the article text into TF-IDF feature matrices for the models
from sklearn.feature_extraction.text import TfidfVectorizer

# The vectorizer is fitted on the training text only; the test text is
# transformed with that already-fitted vectorizer
vectorization = TfidfVectorizer()
xv_train = vectorization.fit_transform(x_train)
xv_test = vectorization.transform(x_test)



In [None]:
# Train a Logistic Regression model (default settings) on the TF-IDF training features
from sklearn.linear_model import  LogisticRegression

LR = LogisticRegression()
LR.fit(xv_train, y_train)

In [None]:
# Logistic Regression predictions for the test split
pred_lr = LR.predict(xv_test)

In [None]:
# Accuracy of Logistic Regression on the test split
LR.score(xv_test, y_test)

In [None]:
# Per-class metrics for the Logistic Regression predictions
print(classification_report(y_test, pred_lr))

In [None]:
# Train a Decision Tree classifier on the TF-IDF training features
from sklearn.tree import DecisionTreeClassifier

# random_state = 0 matches the other seeded models in this notebook and makes
# the fitted tree reproducible across runs
DT = DecisionTreeClassifier(random_state = 0)
DT.fit(xv_train, y_train)


In [None]:
# Decision Tree predictions for the test split
pred_dt = DT.predict(xv_test)

In [None]:
# Accuracy of the Decision Tree on the test split
DT.score(xv_test, y_test)

In [None]:
# Per-class metrics for the Decision Tree predictions
print(classification_report(y_test, pred_dt))

In [None]:
# Train a Gradient Boosting classifier on the TF-IDF training features

from sklearn.ensemble import GradientBoostingClassifier

# random_state is fixed so repeated runs build the same ensemble
GB = GradientBoostingClassifier(random_state = 0)
GB.fit(xv_train, y_train)

In [None]:
# Gradient Boosting predictions for the test split
pred_gb = GB.predict(xv_test)

In [None]:
# Accuracy of Gradient Boosting on the test split
GB.score(xv_test, y_test)

In [None]:
# Per-class metrics for the Gradient Boosting predictions
print(classification_report(y_test, pred_gb))

In [None]:
# Train a Random Forest classifier on the TF-IDF training features

from sklearn.ensemble import RandomForestClassifier

# random_state is fixed so repeated runs build the same forest
RF = RandomForestClassifier(random_state = 0)
RF.fit(xv_train, y_train)

In [None]:
# Random Forest predictions for the test split
pred_rf = RF.predict(xv_test)

In [None]:
# Accuracy of the Random Forest on the test split
RF.score(xv_test, y_test)

In [None]:
# Per-class metrics for the Random Forest predictions
print(classification_report(y_test, pred_rf))

In [None]:
def output_label(n):
    """Translate a predicted class value into a human-readable label.

    Returns "Fake News" for 0, "Not Fake News" for 1, and None for any
    other value.
    """
    if n == 0:
        return "Fake News"
    return "Not Fake News" if n == 1 else None

# Classify one article with every trained model and print each model's verdict.
# Takes the raw article text as its only argument.
def manual_testing(news):
    """Print the LR, DT, GB and RF predictions for a single article.

    The text is cleaned with ``wordopt`` and converted with the already-fitted
    ``vectorization`` object before being passed to each model's ``predict``.
    Nothing is returned; the result is printed.
    """
    cleaned_news = wordopt(news)
    news_features = vectorization.transform([cleaned_news])

    # Same order as the placeholders in the message below: LR, DT, GB, RF
    verdicts = [output_label(model.predict(news_features)[0]) for model in (LR, DT, GB, RF)]

    print("\n\nLR Prediction: {}\nDT Prediction: {} \nGB Prediction: {} \nRF Prediction: {}".format(*verdicts))

In [None]:
# Read an article from the prompt (input() already returns a string) and classify it
news = input()
manual_testing(news)