# Fake news tweet analysis on FakeNewsNet dataset

In [24]:
import os
import re

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

from utils import collect_tweets, collect_news

In [2]:
# Root directory of the downloaded FakeNewsNet dataset.
# Set the FAKENEWSNET_ROOT environment variable to use a different location;
# when it is unset or empty, the original hard-coded path is used.
# os.path.join(path, "") appends a trailing separator only if it is missing,
# which the `root_dir + datasets[i]` concatenations in later cells rely on.
root_dir = os.path.join(
    os.environ.get("FAKENEWSNET_ROOT")
    or "/media/jakob/DATA/Projects/FakeNewsNet/code/fakenewsnet_dataset/",
    "",
)

# Dataset names; later cells append datasets[0] to root_dir when loading.
datasets = [
    "politifact",
    "gossipcop"
]

random_seed = 42    # For TSNE

In [35]:
# Read whatever collect_news returns for the "politifact" dataset into a
# DataFrame and preview its first five rows.
politifact_news = pd.read_json(collect_news(f"{root_dir}{datasets[0]}"))
politifact_news.head()

Unnamed: 0,url,text,images,top_img,keywords,authors,canonical_link,title,meta_data,movies,publish_date,source,summary,truth
0,http://www.senate.gov/legislative/LIS/roll_cal...,Roll Call Vote 111th Congress - 1st Session\n\...,[http://www.senate.gov/resources/images/usFlag...,http://www.senate.gov/resources/images/us_sen.ico,[],[],,U.S. Senate: U.S. Senate Roll Call Votes 111th...,"{'viewport': 'width=device-width, initial-scal...",[],,http://www.senate.gov,,real
1,http://www.politico.com/news/stories/0509/2241...,Barack Obama ended up in the middle of an unli...,"[data:image/gif;base64,R0lGODlhAQABAAAAACH5BAE...",http://s3-origin-images.politico.com/news/0905...,[],[Alexander Burns],https://www.politico.com/story/2009/05/trump-o...,Trump on Prejean: Same as Obama,"{'article': {'opinion': 'false'}, 'og': {'titl...",[],1242120000.0,http://www.politico.com,,real
2,http://pollingreport.com/CongJob.htm,Follow @pollreport CONGRESS – Job Rating See a...,[http://pollingreport.com/images/logo3.gif],,[],[],,Congress: Job Ratings,"{'GENERATOR': 'Microsoft FrontPage 5.0', 'Micr...",[],,http://pollingreport.com,,real
3,https://web.archive.org/web/20090514023215/htt...,House GOP Introduces Keep Terrorists Out of Am...,[https://web.archive.org/web/20090514023215im_...,https://web.archive.org/web/20090514023215im_/...,[],[],,House GOP Introduces Keep Terrorists Out of Am...,{},[],,https://web.archive.org,,real
4,https://web.archive.org/web/20061116002411/htt...,'); //--> E-mail | Print | Comments | Request ...,[https://web.archive.org/web/20061116002411im_...,https://web.archive.org/web/20061116002411im_/...,[],[Robert Lenzner],,Is The End In Sight For Supermax?,{'description': 'Americas most secure and most...,[],1145052000.0,https://web.archive.org,,real


In [39]:
# Narrow the news frame down to the article text, its title, the source site
# and the real/fake label.
text_data = politifact_news.loc[:, ["text", "title", "source", "truth"]]
text_data

Unnamed: 0,text,title,source,truth
0,Roll Call Vote 111th Congress - 1st Session\n\...,U.S. Senate: U.S. Senate Roll Call Votes 111th...,http://www.senate.gov,real
1,Barack Obama ended up in the middle of an unli...,Trump on Prejean: Same as Obama,http://www.politico.com,real
2,Follow @pollreport CONGRESS – Job Rating See a...,Congress: Job Ratings,http://pollingreport.com,real
3,House GOP Introduces Keep Terrorists Out of Am...,House GOP Introduces Keep Terrorists Out of Am...,https://web.archive.org,real
4,'); //--> E-mail | Print | Comments | Request ...,Is The End In Sight For Supermax?,https://web.archive.org,real
...,...,...,...,...
926,10:30 pm last night police received a call abo...,Manager killed 10 employees at checkers on 183...,http://channel24news.com,fake
927,"WASHINGTON, D.C. (AP) — At a press conference ...",Paul Ryan: “Donald Trump Plans To Resign From ...,https://web.archive.org,fake
928,For full functionality of this site it is nece...,,http://politicot.com,fake
929,"As we promised, Freedom Crossroads has just le...",BREAKING: The Democrat James Comey Has Been Ha...,https://web.archive.org,fake


In [43]:
# Cast every column to str, then lower-case it column by column.
text_data = text_data.astype(str).apply(lambda column: column.str.lower())
text_data


Unnamed: 0,text,title,source,truth
0,roll call vote 111th congress - 1st session\n\...,u.s. senate: u.s. senate roll call votes 111th...,http://www.senate.gov,real
1,barack obama ended up in the middle of an unli...,trump on prejean: same as obama,http://www.politico.com,real
2,follow @pollreport congress – job rating see a...,congress: job ratings,http://pollingreport.com,real
3,house gop introduces keep terrorists out of am...,house gop introduces keep terrorists out of am...,https://web.archive.org,real
4,'); //--> e-mail | print | comments | request ...,is the end in sight for supermax?,https://web.archive.org,real
...,...,...,...,...
926,10:30 pm last night police received a call abo...,manager killed 10 employees at checkers on 183...,http://channel24news.com,fake
927,"washington, d.c. (ap) — at a press conference ...",paul ryan: “donald trump plans to resign from ...,https://web.archive.org,fake
928,for full functionality of this site it is nece...,,http://politicot.com,fake
929,"as we promised, freedom crossroads has just le...",breaking: the democrat james comey has been ha...,https://web.archive.org,fake


In [None]:
# Characters treated as punctuation; note that "#" is not part of the set.
punctuation = '!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~•@'

# Bound `stem` method of the Porter stemmer from NLTK's snowball module.
stemmer = nltk.stem.snowball.PorterStemmer(ignore_stopwords=False).stem

# Download the NLTK stopwords corpus, then load the English stopword list.
# NOTE: this rebinds `stopwords`, shadowing the module of the same name that
# the imports cell pulls in from nltk.corpus. The lookup below therefore goes
# through `nltk.corpus.stopwords`, which keeps the cell safe to re-run.
nltk.download("stopwords")
stopwords = nltk.corpus.stopwords.words("english")



First, we load the tweets from our dataset into a pandas DataFrame for easier usage. We use a helper function ``collect_tweets`` that also adds a ``truth`` value to each tweet, depending on what kind of news it is talking about. Let's see what the tweets look like.

In [None]:
# Read whatever collect_tweets returns for the "politifact" dataset into a
# DataFrame and show only the first row.
politifact_tweets = pd.read_json(collect_tweets(f"{root_dir}{datasets[0]}"))
politifact_tweets.iloc[:1]

We can see that the most important attributes of a tweet are ``text`` and ``user``. The ``place`` attribute is also very interesting, but is unfortunately not present in all tweets. Now, let's take a closer look at the ``user`` objects.

In [None]:
# Peek at the `user` entry of the first tweet.
politifact_tweets["user"].iloc[:1]


There are a number of interesting fields in the user object:
- ``verified`` (tells us whether the account is verified or not)
- ``followers_count``
- ``friends_count``
- ``favourites_count`` (the number of tweets the user has liked in their lifetime)
- ``statuses_count`` (the number of tweets and retweets the user has issued in their lifetime)

The above fields all have integer or boolean values, which makes them easy to work with in the context of data analysis. We will use these fields to try to visualize the users who tweeted real or fake news. Because we have five features, we will need some kind of dimensionality reduction to visualize them in 2D.

In [None]:
# Fields of the tweet's `user` object that are used as features.
features = ["verified", "followers_count", "friends_count", "favourites_count", "statuses_count"]

# Keep only the "user" and "truth" columns, dropping everything else in place.
# The columns are named with the `columns=` keyword: passing the axis
# positionally (`drop(labels, 1)`) was deprecated in pandas 1.3 and raises a
# TypeError in pandas 2.0 and later.
politifact_tweets.drop(
    columns=politifact_tweets.columns.difference(["user", "truth"]),
    inplace=True,
)


