In [None]:
"""
In this notebook, we use the News API to get links to current news articles. We use those links to
scrape the article text, which we then label using our logistic regression model from
adfontesmedia.ipynb. We take the articles that our model labels as fake news and find their topic clusters.
"""

In [8]:
import os
import pickle
import re
import string
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from newsapi import NewsApiClient
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [9]:
# The News API key is read from the NEWS_API_KEY environment variable so the
# secret is never stored in (or committed with) the notebook. A missing
# variable raises KeyError naming the variable.
# NOTE(review): any key that was previously committed in this notebook should
# be treated as leaked and rotated.
newsapi = NewsApiClient(api_key=os.environ['NEWS_API_KEY'])

In [26]:
# Lists every English-language source that the News API exposes
sources_response = newsapi.get_sources(language='en')
sources_response['sources']

[{'id': 'abc-news',
  'name': 'ABC News',
  'description': 'Your trusted source for breaking news, analysis, exclusive interviews, headlines, and videos at ABCNews.com.',
  'url': 'https://abcnews.go.com',
  'category': 'general',
  'language': 'en',
  'country': 'us'},
 {'id': 'abc-news-au',
  'name': 'ABC News (AU)',
  'description': "Australia's most trusted source of local, national and world news. Comprehensive, independent, in-depth analysis, the latest business, sport, weather and more.",
  'url': 'http://www.abc.net.au/news',
  'category': 'general',
  'language': 'en',
  'country': 'au'},
 {'id': 'al-jazeera-english',
  'name': 'Al Jazeera English',
  'description': 'News, analysis from the Middle East and worldwide, multimedia and interactives, opinions, documentaries, podcasts, long reads and broadcast schedule.',
  'url': 'http://www.aljazeera.com',
  'category': 'general',
  'language': 'en',
  'country': 'us'},
 {'id': 'ars-technica',
  'name': 'Ars Technica',
  'descript

In [98]:
# Gets all the articles from fox news
#all_articles = newsapi.get_everything(sources = 'fox-news', page=1)

In [11]:
# Fetches the first five result pages for Breitbart News and gathers every
# returned article record into one list. With 20 articles per page (presumably
# the API default - TODO confirm) this yields the 100 most recent articles.
articles_dict = []
for page_number in range(1, 6):
    response = newsapi.get_everything(sources='breitbart-news', page=page_number)
    articles_dict.extend(response['articles'])

In [12]:
# Builds a dataframe with one row per fetched article record
df = pd.DataFrame(data=articles_dict)

In [103]:
# Previews the first rows of the article dataframe as a sanity check
df.head()

Unnamed: 0,source,author,title,description,url,urlToImage,publishedAt,content
0,"{'id': 'breitbart-news', 'name': 'Breitbart Ne...",Jack Montgomery,Only Eight Migrants on Deportation Flight Afte...,Yet another attempted deportation flight has e...,https://www.breitbart.com/europe/2020/11/13/on...,https://media.breitbart.com/media/2020/11/prit...,2020-11-13T09:46:11Z,Yet another attempted deportation flight has e...
1,"{'id': 'breitbart-news', 'name': 'Breitbart Ne...",Victoria Friedman,Migrants Complain Taxpayer-Funded Housing Is L...,Migrants have complained that their tax-payer ...,https://www.breitbart.com/europe/2020/11/13/mi...,https://media.breitbart.com/media/2020/11/UK-m...,2020-11-13T09:27:55Z,Migrants who came to Britain seeking asylum ha...
2,"{'id': 'breitbart-news', 'name': 'Breitbart Ne...",Jack Montgomery,"Report: Vote Leave Boss Out as BoJo Adviser, P...",Dominic Cummings will be out as the chief advi...,https://www.breitbart.com/europe/2020/11/13/re...,https://media.breitbart.com/media/2020/11/cumm...,2020-11-13T09:12:47Z,Vote Leave mastermind Dominic Cummings will be...
3,"{'id': 'breitbart-news', 'name': 'Breitbart Ne...",Ian Hanchett,Klain: Biden Will Take Executive Action ‘Fixin...,On Thursday’s broadcast of MSNBC’s “The Last W...,https://www.breitbart.com/clips/2020/11/12/kla...,https://media.breitbart.com/media/2020/03/Gett...,2020-11-13T04:27:04Z,On Thursday’s broadcast of MSNBC’s “The Last W...
4,"{'id': 'breitbart-news', 'name': 'Breitbart Ne...","Ian Hanchett, \nIan Hanchett",Klain: Harris Is 'Going to Be a Very Influenti...,On Thursday’s broadcast of MSNBC’s “The Last W...,https://www.breitbart.com/clips/2020/11/12/kla...,https://media.breitbart.com/media/2020/03/Gett...,2020-11-13T04:24:32Z,On Thursday’s broadcast of MSNBC’s “The Last W...


In [13]:
# Downloads every article URL and stores the text found directly inside <p>
# and <a> tags. Articles that cannot be fetched are reported and skipped, so
# `current` can end up shorter than `df`.
REQUEST_TIMEOUT_SECONDS = 10  # do not hang forever on an unresponsive server

current = []
for index, row in df.iterrows():
    url = row['url']
    if not isinstance(url, str) or not url:
        # Missing URLs (e.g. NaN) cannot be requested at all.
        print('Not a valid URL', url)
        continue
    if not url.startswith(('https://', 'http://')):
        url = 'https://' + url
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
        # Treat HTTP error pages as failures instead of scraping them as if
        # they were article text.
        response.raise_for_status()
    except requests.RequestException as error:
        # Only network/HTTP problems are tolerated; programming errors and
        # parser problems still surface instead of being silently swallowed.
        print('Not a valid URL', url, error)
    else:
        soup = BeautifulSoup(response.text, 'html5lib')
        # Every kept text node is prefixed with a space so that neighbouring
        # nodes do not run together.
        text = ''.join(
            ' ' + node
            for node in soup.find_all(text=True)
            if node.parent.name in ('p', 'a')
        )
        current.append(text)
        print(index)
    # Pause between requests to stay polite towards the scraped site.
    time.sleep(1)



0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


In [14]:
# Wraps the scraped article texts in a single-column dataframe
df_current = pd.DataFrame(data=current, columns=['text'])

In [15]:
# Cleans up the scraped text before it is handed to the tf-idf vectorizer.
# NOTE(review): presumably the pickled vectorizer was fitted on text cleaned
# the same way - confirm against the training notebook before changing steps.

# Words stripped from the cleaned text; the reason for removing them is not
# documented in this notebook.
REMOVED_WORDS = ('katie', 'hobbs')

# Matches any single ASCII punctuation character.
PUNCTUATION_PATTERN = re.compile('[%s]' % re.escape(string.punctuation))


def clean_text(text):
    """
    Normalises one scraped article.

    Steps, in order: rewrite "U.S" as "us", delete newlines and tabs, delete
    apostrophes, replace every token containing a digit with a space,
    lower-case the text, replace punctuation with spaces, and finally delete
    the words listed in REMOVED_WORDS.

    Parameters
    ----------
    text : str
        Raw article text.

    Returns
    -------
    str
        The cleaned text.
    """
    # The dot is escaped so that only a literal "U.S" matches, not any
    # character between "U" and "S" (e.g. "UPS").
    text = re.sub(r'U\.S', 'us', text)
    # NOTE(review): newlines/tabs are deleted rather than replaced by a space,
    # which can fuse adjacent words; kept to stay in line with the existing
    # preprocessing.
    text = re.sub(r'[\n\t]', '', text)
    text = text.replace("'", '')
    text = re.sub(r'\w*\d\w*', ' ', text)
    text = PUNCTUATION_PATTERN.sub(' ', text.lower())
    for word in REMOVED_WORDS:
        text = text.replace(word, '')
    return text


df_current['text'] = df_current['text'].map(clean_text)

In [17]:
# Loads the logistic regression model from a pickle file. The context manager
# closes the file handle as soon as the model has been read.
# NOTE(review): pickle.load can execute arbitrary code - only load files that
# come from a trusted source.
with open('log_reg', 'rb') as model_file:
    lr = pickle.load(model_file)

In [18]:
# Loads the tf-idf vectorizer from a pickle file. The context manager closes
# the file handle as soon as the vectorizer has been read.
# NOTE(review): pickle.load can execute arbitrary code - only load files that
# come from a trusted source.
with open('tfidf1.pickle', 'rb') as vectorizer_file:
    tfidf1 = pickle.load(vectorizer_file)

In [19]:
# Transforms the cleaned, scraped article text into features using the
# previously loaded tf-idf vectorizer (transform only - nothing is re-fitted here)
test = tfidf1.transform(df_current['text'])

In [20]:
# Predicts whether the articles are fake or real using our logistic regression model.
# NOTE(review): which class value (0 or 1) stands for "fake" is not visible in
# this notebook - confirm against the training notebook.
y_pred_tfidf1_lr = lr.predict(test)

In [21]:
# Displays the predicted label of every scraped article
y_pred_tfidf1_lr

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0])

In [93]:
# Collects the cleaned text of every article the model assigned to class 0.
# NOTE(review): this treats class 0 as "fake" - confirm that this matches the
# label encoding used when the model was trained.
# Each text loses its first 1000 and its last 216 characters (presumably site
# boilerplate around the article body - TODO confirm these offsets).
fake_news = [
    article[1000:-216]
    for article, label in zip(df_current['text'], y_pred_tfidf1_lr)
    if label == 0
]
        

  and should_run_async(code)


In [23]:
import pyLDAvis
import pyLDAvis.sklearn
# Switches pyLDAvis to notebook mode so its visualizations are shown in cell output
pyLDAvis.enable_notebook()

In [24]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

  and should_run_async(code)


In [95]:
# Builds a bag-of-words (raw term count, not tf-idf) document-term matrix from
# the selected articles; accents are stripped and English stop words dropped.
tf_vectorizer = CountVectorizer(stop_words='english', strip_accents='unicode')
dtm_tf = tf_vectorizer.fit_transform(fake_news)

  and should_run_async(code)


In [102]:
# Fits an 8-topic LDA model on the term-count matrix; the fixed random_state
# keeps the discovered topics reproducible between runs.
lda_tf = LatentDirichletAllocation(
    n_components=8,
    random_state=0,
)
lda_tf.fit(dtm_tf)

  and should_run_async(code)


LatentDirichletAllocation(n_components=8, random_state=0)

In [103]:
# Generates the pyLDAvis visualization from the fitted LDA model, the
# document-term matrix it was fitted on, and the vectorizer that produced it.
# NOTE(review): newer pyLDAvis releases appear to expose this helper as
# `pyLDAvis.lda_model` instead of `pyLDAvis.sklearn` - confirm against the
# installed version.
pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer)

  and should_run_async(code)


In [122]:
"""
Method takes in a model, count vectorizer, and n_top_words and prints
the top n_top_words number of words for each topic
"""
def print_topics(model, count_vectorizer, n_top_words):
    words = count_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic #%d:" % topic_idx)
        print(" ".join([words[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))

  and should_run_async(code)


In [133]:
# Prints the top 10 words for each topic of the fitted LDA model
print_topics(lda_tf, tf_vectorizer, 10)

  and should_run_async(code)


[['Topic #1:',
  'trump',
  'votes',
  'election',
  'arizona',
  'state',
  'august',
  'claims',
  'said',
  'nazis',
  'https'],
 ['Topic #2:',
  'percent',
  'christians',
  'new',
  'trump',
  'inflation',
  'prices',
  'harassment',
  'october',
  'countries',
  'persecution'],
 ['Topic #3:',
  'trump',
  'election',
  'news',
  'president',
  'biden',
  'said',
  'fox',
  'going',
  'politics',
  'state'],
 ['Topic #4:',
  'percent',
  'https',
  'iran',
  'year',
  'november',
  'nuclear',
  'migrants',
  'prices',
  'said',
  'win'],
 ['Topic #5:',
  'black',
  'whitlock',
  'white',
  'people',
  'america',
  'culture',
  'olympics',
  'cummings',
  'politics',
  'leaving'],
 ['Topic #6:',
  'biden',
  'election',
  'project',
  'cortez',
  'ocasio',
  'politics',
  'senate',
  'trump',
  'november',
  'president'],
 ['Topic #7:',
  'hezbollah',
  'said',
  'county',
  'group',
  'recount',
  'vote',
  'statues',
  'lawsuit',
  'major',
  'israel'],
 ['Topic #8:',
  'migrants

In [139]:
"""
Method takes in a model, count vectorizer, and n_top_words and saves
the top n_top_words number of words for each topic in a dictionary
"""
def dict_topics(model, count_vectorizer, n_top_words):
    words = count_vectorizer.get_feature_names()
    dict_words = dict()
    for topic_idx, topic in enumerate(model.components_):
        topic_name = ("Topic #%d" % (topic_idx+1))
        dict_words[topic_name] = []
        for i in topic.argsort()[:-n_top_words - 1:-1]:
            dict_words[topic_name].append(words[i])
    return dict_words

  and should_run_async(code)


In [140]:
# Builds a dataframe with one column per topic holding that topic's top 10 words
topic_words = dict_topics(lda_tf, tf_vectorizer, 10)
df_topics = pd.DataFrame(topic_words)

  and should_run_async(code)


In [141]:
# Previews the first rows of the topic/word table
df_topics.head()

  and should_run_async(code)


Unnamed: 0,Topic #1,Topic #2,Topic #3,Topic #4,Topic #5,Topic #6,Topic #7,Topic #8
0,trump,percent,trump,percent,black,biden,hezbollah,migrants
1,votes,christians,election,https,whitlock,election,said,voight
2,election,new,news,iran,white,project,county,said
3,arizona,trump,president,year,people,cortez,group,political
4,state,inflation,biden,november,america,ocasio,recount,war
