In [10]:
# Install Packages
# !pip install gdown

!pip install numpy
!pip install pandas
!pip install matplotlib

!pip install sklearn

!pip install bs4



In [11]:
# Import Packages
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

# import gdown
from zipfile import ZipFile
from joblib import dump, load

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import plot_confusion_matrix

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

from bs4 import BeautifulSoup
import requests
import re

seed = 0
np.random.seed(seed)

In [12]:
# Download Data
gdown.download('https://drive.google.com/uc?id=1O5-lPev_Z0XysVdtlC-RDjlanyCjDy3x', 'data.zip', quiet=False)
with ZipFile('data.zip', 'r') as zipObj:
   zipObj.extractall()

NameError: name 'gdown' is not defined

In [2]:
# Read Data and Extract Columns
train_data = pd.read_csv('train.csv')
train_data = train_data.fillna(' ')
train_data['combined'] = train_data['title'] + ' ' + train_data['author'] + ' '  + train_data['text']

test_data = pd.read_csv('test.csv')
test_labels = pd.read_csv('submit.csv')
test_data['label'] = test_labels['label']
test_data = test_data.fillna(' ')
test_data['combined'] = test_data['title'] + ' ' + test_data['author'] + ' '  + test_data['text']

train_text = train_data['combined'].to_numpy().astype(str)
y_train = train_data['label'].to_numpy()

test_text = test_data['combined'].to_numpy().astype(str)
y_test = test_data['label'].to_numpy()

In [1]:
# Vectorize Data for 'Article Text'

# fyi, running TfidfVectorizer() can take a long time, so just 
# load the joblib file if you can

# text_tf_idf_vectorizer = TfidfVectorizer(ngram_range=(1, 2),stop_words='english')
# text_tf_idf_vectorizer.fit(train_text)
# dump(text_tf_idf_vectorizer,'text_tf_idf_vectorizer.joblib')

text_tf_idf_vectorizer = load('text_tf_idf_vectorizer.joblib')

X_train = text_tf_idf_vectorizer.transform(train_text)
X_test = text_tf_idf_vectorizer.transform(test_text)

NameError: name 'load' is not defined

In [9]:
# Perform Hyperparameter Random Search
random_grid = {
                'n_estimators': [100, 500, 1000, 1500, 2000],
                'max_depth': [1, 2, 4, 8, 10, 16, 20, None],
                'max_features': ['sqrt', 'log2'],
               }

text_clf_rs = RandomizedSearchCV(estimator = RandomForestClassifier(random_state=seed),
                                param_distributions=random_grid, n_iter = 20, 
                                cv=3, verbose=10, random_state=seed, n_jobs = -1)

text_clf_rs.fit(X_train, y_train)

print(text_clf_rs.best_params_)

NameError: name 'RandomizedSearchCV' is not defined

In [None]:
# Train & Evaluate Classifier on 'Article Text'
text_clf = load('fake_news_classifier.joblib')

# text_clf = text_clf_rs.best_estimator_
# text_clf = RandomForestClassifier(n_estimators = 1000, max_depth=4, random_state=seed, n_jobs=-1, verbose=1)

# text_clf.fit(X_train, y_train)
# dump(text_clf,'fake_news_classifier.joblib')

text_clf_acc = text_clf.score(X_test, y_test)

print('Article Text Classifier Accuracy: ' + str(text_clf_acc))

plot_confusion_matrix(text_clf, X_test, y_test, cmap='Blues', values_format = 'd', display_labels=['Fake','True'])  
plt.show()  

In [8]:
from sklearn.metrics import precision_recall_fscore_support

y_pred = text_clf.predict(X_test)
y_true = y_test

print(precision_recall_fscore_support(y_true, y_pred, average='binary'))

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix

print(accuracy_score(y_true, y_pred))
print(precision_score(y_true, y_pred))
print(recall_score(y_true, y_pred))

[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.7s
[Parallel(n_jobs=16)]: Done 168 tasks      | elapsed:    4.4s
[Parallel(n_jobs=16)]: Done 418 tasks      | elapsed:   11.0s
[Parallel(n_jobs=16)]: Done 768 tasks      | elapsed:   19.9s
(0.7984803435744962, 0.8448095071653268, 0.8209918478260869, None)
0.7984803435744962
0.8448095071653268
[Parallel(n_jobs=16)]: Done 1000 out of 1000 | elapsed:   25.6s finished


### Extract Text From Webpage Link

In [6]:
post_text = ' Woah, check out https://www.thegatewaypundit.com/2021/02/ignored-media-dirtbag-joe-biden-says-us-veterans-former-police-officers-fueling-white-supremacism-america/ here'

links_in_post = re.findall(r'(https?://\S+)', post_text)

for link in links_in_post:
    raw_html = requests.get(link).text
    webpage_text = BeautifulSoup(raw_html).text

    webpage_vector = text_tf_idf_vectorizer.transform([webpage_text])

    print(text_clf.predict(webpage_vector)[0])
    print(text_clf.predict_proba(webpage_vector)[0])

NameError: name 're' is not defined

In [46]:
post_text = ' Woah, check out https://www.breitbart.com/politics/2015/08/08/climate-change-the-hoax-that-costs-us-4-billion-a-day/ here'

links_in_post = re.findall(r'(https?://\S+)', post_text)

for link in links_in_post:
    raw_html = requests.get(link).text
    webpage_text = BeautifulSoup(raw_html).text

    webpage_vector = text_tf_idf_vectorizer.transform([webpage_text])

    print(text_clf.predict(webpage_vector)[0])
    print(text_clf.predict_proba(webpage_vector)[0])

[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 168 tasks      | elapsed:    0.5s
[Parallel(n_jobs=16)]: Done 418 tasks      | elapsed:    1.6s
[Parallel(n_jobs=16)]: Done 768 tasks      | elapsed:    3.0s
[Parallel(n_jobs=16)]: Done 1000 out of 1000 | elapsed:    3.9s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
0
[Parallel(n_jobs=16)]: Done 168 tasks      | elapsed:    0.6s
[Parallel(n_jobs=16)]: Done 418 tasks      | elapsed:    1.6s
[Parallel(n_jobs=16)]: Done 768 tasks      | elapsed:    3.1s
[0.50874747 0.49125253]
[Parallel(n_jobs=16)]: Done 1000 out of 1000 | elapsed:    4.1s finished


In [43]:
post_text = 'This is not very cash money: https://money.cnn.com/2018/05/10/technology/russian-facebook-ads-targeted-mexican-americans/index.html'

links_in_post = re.findall(r'(https?://\S+)', post_text)

for link in links_in_post:
    raw_html = requests.get(link).text
    webpage_text = BeautifulSoup(raw_html).text

    webpage_vector = text_tf_idf_vectorizer.transform([webpage_text])

    print(text_clf.predict(webpage_vector)[0])
    print(text_clf.predict_proba(webpage_vector)[0])

[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 168 tasks      | elapsed:    0.4s
[Parallel(n_jobs=16)]: Done 418 tasks      | elapsed:    1.2s
[Parallel(n_jobs=16)]: Done 768 tasks      | elapsed:    2.3s
[Parallel(n_jobs=16)]: Done 1000 out of 1000 | elapsed:    3.0s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
0
[Parallel(n_jobs=16)]: Done 168 tasks      | elapsed:    0.5s
[Parallel(n_jobs=16)]: Done 418 tasks      | elapsed:    1.2s
[Parallel(n_jobs=16)]: Done 768 tasks      | elapsed:    2.4s
[0.51139215 0.48860785]
[Parallel(n_jobs=16)]: Done 1000 out of 1000 | elapsed:    3.1s finished
