In [1]:
import os
import json
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import pandas as pd
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentPoolEmbeddings, Sentence, DocumentRNNEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
from flair.datasets import CSVClassificationCorpus
import os
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score

from collections import Counter

In [2]:
# Dataset folder (relative to the notebook) and the sub-dataset analysed here.
DATA_PATH = '../fakenewsnet_dataset'
DATASET_NAME = 'politifact'

# Derived locations: <DATA_PATH>/<DATASET_NAME>/real and .../fake
DATASET_PATH = '/'.join([DATA_PATH, DATASET_NAME])
REAL_DATA_PATH = DATASET_PATH + '/real'
FAKE_DATA_PATH = DATASET_PATH + '/fake'

In [3]:
def load_json_from_file(path):
    """Read the file at ``path`` and return its parsed JSON content.

    The file is decoded explicitly as UTF-8 instead of the platform's default
    encoding, so non-ASCII text in the data is read the same way on every
    machine.

    Raises whatever ``open`` / ``json.load`` raise for a missing or malformed file.
    """
    with open(path, encoding='utf-8') as json_file:
        return json.load(json_file)

In [4]:
class Article:
    """One news article stored in its own directory on disk.

    Attributes:
        name: identifier given by the caller.
        path: directory that holds the article's files.
        content: parsed ``news content.json``; ``None`` until loaded or when
            the file does not exist.
        tweets: parsed files from the ``tweets`` sub-directory (empty until
            loaded or when that directory does not exist).
    """

    def __init__(self, name, path):
        self.name = name
        self.path = path
        self.tweets = []
        self.content = None

    def load_content(self):
        """Fill ``content`` from ``<path>/news content.json`` if that file exists."""
        content_file = f"{self.path}/news content.json"
        if not os.path.isfile(content_file):
            return
        self.content = load_json_from_file(content_file)

    def load_tweets(self):
        """Fill ``tweets`` with every file parsed from ``<path>/tweets``, if present."""
        tweets_dir = f"{self.path}/tweets"
        if not os.path.isdir(tweets_dir):
            return
        parsed = []
        for entry in os.listdir(tweets_dir):
            parsed.append(load_json_from_file(f"{tweets_dir}/{entry}"))
        # Assign only after every file parsed, so a failure leaves ``tweets`` untouched.
        self.tweets = parsed

In [5]:
def load_single_article(name, path):
    """Build the ``Article`` for ``name`` at ``path`` and load its news content.

    Only ``load_content`` is called; tweets are not loaded here.
    """
    article = Article(name, path)
    article.load_content()
    return article

def load_all_articles(path):
    """Load one article per entry of the directory ``path``.

    Entries are visited in sorted name order: ``os.listdir`` makes no ordering
    guarantee, so sorting keeps the returned list identical from run to run.

    Returns an empty list when ``path`` is not an existing directory.
    """
    if not os.path.isdir(path):
        return []
    return [
        load_single_article(entry, "{}/{}".format(path, entry))
        for entry in sorted(os.listdir(path))
    ]

In [6]:
# Read the article directories of both classes (fake first, then real).
fake_arts, real_arts = [
    load_all_articles(folder) for folder in (FAKE_DATA_PATH, REAL_DATA_PATH)
]

In [7]:
def _has_content(art):
    """True when the article's content was actually loaded."""
    return art.content is not None


# Drop the articles without loaded content.
fake_arts_with_content = list(filter(_has_content, fake_arts))
real_arts_with_content = list(filter(_has_content, real_arts))

In [8]:
# Attach the class label to every article: lists of (article, label) tuples.
fake_data = list(zip(fake_arts_with_content, ['fake'] * len(fake_arts_with_content)))
real_data = list(zip(real_arts_with_content, ['real'] * len(real_arts_with_content)))

In [9]:
# Seed NumPy's global RNG first so these in-place shuffles (and the train/test
# split built from them) come out the same on every Restart & Run All.
np.random.seed(42)
np.random.shuffle(fake_data)
np.random.shuffle(real_data)

In [10]:
# Per-class split: the first 80% of each class goes to train, the rest to test.
TRAIN_FRACTION = 0.8
fake_cut = int(len(fake_data) * TRAIN_FRACTION)
real_cut = int(len(real_data) * TRAIN_FRACTION)

train_data = fake_data[:fake_cut] + real_data[:real_cut]
test_data = fake_data[fake_cut:] + real_data[real_cut:]

# Mix the two classes inside each split (train first, then test).
for split in (train_data, test_data):
    np.random.shuffle(split)

In [11]:
# Sizes of both splits, their sum, and the number of labelled articles;
# the last two lines should match.
split_total = len(train_data) + len(test_data)
labelled_total = len(fake_data) + len(real_data)
print(len(train_data), len(test_data), split_total, labelled_total, sep='\n')

643
162
805
805


In [12]:
def _content_pairs(labelled_articles):
    """Turn (article, label) pairs into (article.content, label) pairs."""
    return [(article.content, label) for article, label in labelled_articles]


train_content = _content_pairs(train_data)
test_content = _content_pairs(test_data)

In [13]:
# Every (content, label) pair: train entries first, then test entries.
all_content = [*train_content, *test_content]

In [14]:
# Field names available in an article's content (taken from the first record).
first_content = all_content[0][0]
first_content.keys()

dict_keys(['url', 'text', 'images', 'top_img', 'keywords', 'authors', 'canonical_link', 'title', 'meta_data', 'movies', 'publish_date', 'source', 'summary'])

In [35]:
# Ten most frequent article titles, one row per title:
# "<title cut to 40 chars> & <count>" followed by a single backslash
# (presumably meant for pasting into a table -- confirm).
title = Counter(content['title'] for content, _label in all_content)

# most_common(10) yields the same rows as sorting everything and slicing.
a = title.most_common(10)
for title_text, count in a:
    print(title_text[:40] + ' & ' + str(count) + '\\')

 & 41\
CQ.com & 14\
- The Washington Post & 13\
Wiadomości, Pogoda, Outlook, Hotmail, Sk & 11\
YouTube & 10\
Transcripts & 6\
Political TV Ad Archive » PolAd & 4\
Time & 4\
LexisNexis(R) Publisher & 3\
MoveOn.org Political Action: 10 things t & 3\


In [16]:
# Look at one of the records whose title is the empty string.
search = ''
title_search = [
    (content, label)
    for content, label in all_content
    if content['title'] == search
]
title_search[2]

({'url': 'http://reflectionofmind.org/nasa-will-pay-18000-usd-stay-bed-smoke-weed-70-straight-days/',
  'text': '',
  'images': [],
  'top_img': '',
  'keywords': [],
  'authors': [],
  'canonical_link': '',
  'title': '',
  'meta_data': {'viewport': 'width=device-width, initial-scale=1',
   'description': 'See related links to what you are looking for.'},
  'movies': [],
  'publish_date': None,
  'source': 'http://reflectionofmind.org',
  'summary': ''},
 'fake')

In [17]:
# Look at the first record whose title is exactly 'YouTube'.
search = 'YouTube'
title_search = [
    (content, label)
    for content, label in all_content
    if content['title'] == search
]
title_search[0]

({'url': 'http://www.youtube.com/watch?v=jOVEHGnwGhA',
  'text': 'Język, w którym oglądasz YouTube, to. Możesz zmienić to ustawienie poniżej',
  'images': ['https://s.ytimg.com/yts/img/favicon-vfl8qSV2F.ico'],
  'top_img': 'https://s.ytimg.com/yts/img/favicon-vfl8qSV2F.ico',
  'keywords': [],
  'authors': [],
  'canonical_link': '',
  'title': 'YouTube',
  'meta_data': {'robots': 'noindex',
   'referrer': 'origin',
   'theme-color': '#ff0000'},
  'movies': [],
  'publish_date': None,
  'source': 'http://www.youtube.com',
  'summary': ''},
 'real')

In [34]:
# Ten most frequent article bodies, one row per body:
# "<first 40 chars>... & <count>" followed by a single backslash.
text = Counter(content['text'] for content, _label in all_content)

# most_common(10) yields the same rows as sorting everything and slicing.
a = text.most_common(10)
for body, count in a:
    # Bodies contain newlines; collapse all whitespace runs in the preview so
    # every row stays on one printed line.
    preview = ' '.join(body[:40].split())
    print(preview + '...' + ' & ' + str(count) + '\\')

... & 73\
Need help? Contact the CQ Hotline at (80... & 14\
Please enable cookies on your web browse... & 13\
About Trendolizer™

Trendolizer™ (patent... & 13\
Język, w którym oglądasz YouTube, to. Mo... & 11\
Autoodtwarzanie Jeśli masz włączone auto... & 10\
Pomiń wszystko

Witamy! Na osi czas spęd... & 7\
Używamy plików cookie, aby pomóc w perso... & 5\
About Your Privacy on this Site

Welcome... & 4\
Use this guide to help you find the full... & 3\


In [19]:
# URLs (with labels) of every article whose text starts with this passage.
search = 'Giant Squid Washes Ashore on Lake Michigan Michigan DNR were called to Leland'
text_search = [
    (content['url'], label)
    for content, label in all_content
    if content['text'].startswith(search)
]
text_search

[('http://www.breakingnews365.net/59f509292adf6/alabama-football-crimson-tide-disqualified-from-national-championship.html',
  'fake'),
 ('http://www.breakingnews365.net/59c36cdd7b326/washington-state-legislature-votes-to-change-its-name-because-george-washington-owned-slaves.html',
  'fake'),
 ('http://www.breakingnews365.net/5a00d7972338f/snapchat-is-shutting-down.html',
  'fake')]

In [20]:
# How often each value of the 'summary' field occurs across all articles.
text = Counter(content['summary'] for content, _label in all_content)
text.most_common()

[('', 805)]

In [21]:
# How often each authors list occurs; lists are stringified so they can be counted.
text = Counter(str(content['authors']) for content, _label in all_content)
text.most_common()

[('[]', 566),
 ("['Abc News']", 16),
 ("['Trending Story Found']", 13),
 ("['W Odpowiedzi Do']", 5),
 ("['Alex Stevan']", 4),
 ("['Jim Hoft']", 4),
 ("['Please Enter Your Name Here']", 4),
 ("['About Flagg Eagleton', 'Flagg Eagleton Is The Son Of An American Potato Farmer', 'A Patriot. After Spending Years In The Navy', 'On Welfare Picking Himself Up The Bootstraps', 'Flagg Finally Got His Hvac Certificate', 'Is Hard At Work Keeping The Mobile Homes Of Tallahassee At A Comfy Degrees.']",
  3),
 ("['Sean Adl-Tabatabai']", 3),
 ("['Yotvat Kariti']", 3),
 ("['Paul Krugman']", 3),
 ("['George Swenson']", 2),
 ("['Pbs Newshour']", 2),
 ('[\'About Stryker\', "Stryker Is A Constitutional Conservative Who Can\'T Stand The Lazy", \'Cryba\', \'Fantasy World Liberals Live In. Knowledge Is Power. The Truth Will Set You Free.\', \'May The Good Lord Bless\', \'Keep The United States Of America.\', \'C. King On\', \'K. Freed On\']',
  2),
 ("['Bob The Empire News Potato']", 2),
 ("['Jay Greenberg']",

In [22]:
# How often each URL occurs; a count above 1 means the same URL appears in several records.
text = Counter(content['url'] for content, _label in all_content)
text.most_common()

[('https://web.archive.org/web/20050322064340/http://www6.lexisnexis.com:80/publisher/EndUser?Action=UserDisplayFullDocument',
  3),
 ('https://web.archive.org/web/20080506120114/http://pol.moveon.org:80/mccain10/email.html?',
  3),
 ('http://transcripts.cnn.com/TRANSCRIPTS/0706/05/se.01.html', 2),
 ('http://politicaladarchive.org/ad/polad_donaldtrump_k1mkc/', 2),
 ('http://www.msnbc.msn.com/id/3080247/', 2),
 ('http://www.desmoinesregister.com/article/20100324/OPINION01/3250323/1036',
  2),
 ('http://frwebgate.access.gpo.gov/cgi-bin/getdoc.cgi?dbname=111_cong_bills&docid=f:h3200ih.txt.pdf',
  2),
 ('http://www.politifact.com/ohio/statements/2016/mar/14/rob-portman/most-heroin-us-comes-over-mexican-border/',
  2),
 ('http://www.bls.gov/ces/', 2),
 ('http://www.nytimes.com/2008/06/03/us/politics/03text-obama.html?_r=1&oref=slogin',
  2),
 ('http://www.eia.gov/cfapps/ipdbproject/IEDIndex3.cfm?tid=50&pid=53&aid=1',
  2),
 ('https://web.archive.org/web/20090425195727/http://republicanleade

In [23]:
# All records (with labels) that share this exact URL.
search = 'https://web.archive.org/web/20050322064340/http://www6.lexisnexis.com:80/publisher/EndUser?Action=UserDisplayFullDocument'
url_search = [
    (content, label)
    for content, label in all_content
    if content['url'] == search
]
url_search

[({'url': 'https://web.archive.org/web/20050322064340/http://www6.lexisnexis.com:80/publisher/EndUser?Action=UserDisplayFullDocument',
   'text': 'COPYRIGHT © 2005 LexisNexis, a division of Reed Elsevier Inc. All rights reserved.',
   'images': ['https://web.archive.org/web/20050322064340im_/http://www6.lexisnexis.com/publisher/images/Logo_LNPublisher.gif',
    'https://web.archive.org/web/20050322064340im_/http://www6.lexisnexis.com/publisher/images/blank.gif',
    'https://web.archive.org/web/20050322064340im_/http://www6.lexisnexis.com/publisher/images/upper_left_corner_red.gif',
    'https://web.archive.org/web/20050322064340im_/http://www6.lexisnexis.com/publisher/images/upper_right_corner_red.gif'],
   'top_img': '',
   'keywords': [],
   'authors': [],
   'canonical_link': '',
   'title': 'LexisNexis(R) Publisher',
   'meta_data': {},
   'movies': [],
   'publish_date': None,
   'source': 'https://web.archive.org',
   'summary': ''},
  'real'),
 ({'url': 'https://web.archive.org