# Scraper

In [1]:
# Scraping and cleaning
import praw
import pandas as pd
from numpy import arange
from datetime import datetime

In [2]:
# NLPre-processing
import sys
sys.path.append('../')
import nlp_helper_funcs
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import TreebankWordTokenizer
from nltk.corpus import stopwords

In [3]:
# Topic Extraction using SVD
from sklearn.decomposition import TruncatedSVD

In [4]:
# Interactive Visualizations with Bokeh
from bokeh.plotting import figure, show, output_file
from bokeh.embed import components
from bokeh.models import CategoricalColorMapper, HoverTool
from bokeh.models.sources import ColumnDataSource
import bokeh.palettes

# Scrape from ```subreddit```

 Note that you will need a ```praw.ini``` file with valid authentication in order to scrape reddit

In [5]:
reddit = praw.Reddit("bot1")

In [6]:
subreddit = 'worldnews'
limit = 100

In [7]:
raw_posts = reddit.subreddit(subreddit).hot(limit=limit)

In [8]:
posts = { "title":[],
        "url":[],
        "id":[], 
        "created_utc":[], 
        "num_comments": [], 
        "upvote_ratio": [], 
        "score":[], 
        "edited":[]
       }

for post in raw_posts:
    posts["title"].append(post.title)
    posts['url'].append(post.url)
    posts["id"].append(post.id)
    posts["created_utc"].append(post.created_utc)
    posts["num_comments"].append(post.num_comments)
    posts["upvote_ratio"].append(post.upvote_ratio)
    posts["score"].append(post.score)
    posts["edited"].append(post.edited)
posts

{'title': ['Livethread 12: Global COVID-19 Pandemic',
  'Boris Johnson drops NHS fee for migrant healthcare workers in humiliating U-turn 24 hours after defending policy',
  "A woman must delete photographs of her grandchildren that she posted on Facebook and Pinterest without their parents' permission, a court in the Netherlands has ruled.",
  'Scotland bans companies based in tax havens from accessing coronavirus bailout money',
  "China doesn't seem to understand independence of Canada's judiciary: Trudeau",
  'Beijing to introduce national security law for Hong Kong',
  'Wuhan bans eating wild animals as coronavirus drives a crackdown in China',
  '70% of Dubai companies expect to go out of business within six months due to coronavirus pandemic, survey says',
  'Finland’s state epidemiologist says it would be a risk for his country to accept Swedish tourists',
  'Mount Everest is Visible From Kathmandu, Nepal for First Time in Living Memory',
  "Sweden is still nowhere near 'herd i

# Put posts in pandas, do a little cleaning

In [9]:
df = pd.DataFrame(posts)
del posts
print('First five rows of reddit posts')
# convert unix epoch utc to datetime objects, get log of score
df['Datetime'] = pd.to_datetime(df.created_utc, unit='s')
df['Datetime'] = df['Datetime'].apply(lambda x: x.strftime('%Y-%m-%d %H:%M:%S'))
# df = df.drop()
df.head(5)

First five rows of reddit posts


Unnamed: 0,title,url,id,created_utc,num_comments,upvote_ratio,score,edited,Datetime
0,Livethread 12: Global COVID-19 Pandemic,https://www.reddit.com/live/14d816ty1ylvo/,gfeei0,1588883000.0,2451,0.95,737,False,2020-05-07 20:23:17
1,Boris Johnson drops NHS fee for migrant health...,https://www.independent.co.uk/news/uk/politics...,gnzcql,1590076000.0,826,0.92,18076,False,2020-05-21 15:52:03
2,A woman must delete photographs of her grandch...,https://www.bbc.co.uk/news/technology-52758787,go05wg,1590079000.0,706,0.97,10168,False,2020-05-21 16:35:19
3,Scotland bans companies based in tax havens fr...,https://www.independent.co.uk/news/business/ne...,gnw528,1590065000.0,2116,0.95,95293,False,2020-05-21 12:41:26
4,China doesn't seem to understand independence ...,https://www.reuters.com/article/us-canada-chin...,go0j26,1590080000.0,394,0.96,2939,False,2020-05-21 16:54:44


In [10]:
# get list of all titles

titles = [title for title in df['title']]

# NLPre-process

In [11]:
vectorizer = TfidfVectorizer(ngram_range=(2,3),stop_words=stopwords.words('english'))
tokenizer = TreebankWordTokenizer().tokenize
stemmer = None # PorterStemmer()

In [12]:
nlp = nlp_helper_funcs.nlp_preprocessor(vectorizer=vectorizer, cleaning_function=None, 
                       tokenizer=tokenizer, stemmer=stemmer)
nlp.fit(titles)
doc_word = nlp.transform(titles).toarray()

doc_word.shape

(100, 1761)

In [13]:
doc_word[:3]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

# NLP

In [14]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '", topic_names[ix], "'")
        print(", ".join([feature_names[i]
                         for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [15]:
num_topics = 10
num_words_per_topic = 10

In [16]:
lsa = TruncatedSVD(num_topics)
doc_topic = lsa.fit_transform(doc_word)
lsa.explained_variance_ratio_

array([0.02174696, 0.01984089, 0.01804848, 0.01727393, 0.01533355,
       0.01164771, 0.0115345 , 0.01089194, 0.01037557, 0.01027984])

In [17]:
display_topics(lsa, vectorizer.get_feature_names(), num_words_per_topic)


Topic  0
security law, hong kong, law could end, could end hong, end hong, china security law, law could, china security, security law could, could end

Topic  1
professor release shows, professor release, shows beijing, shows beijing forces, beijing forces loyalty, release shows beijing, release shows, imprisoned uighur professor, imprisoned uighur, beijing forces

Topic  2
law could end, end hong, law could, could end hong, could end, china security law, china security, security law could, end hong kong, magnetic field rapidly

Topic  3
chechen leader kadyrov, chechen leader, hospitalized coronavirus, hospitalized coronavirus moscow, leader kadyrov, leader kadyrov hospitalized, kadyrov hospitalized coronavirus, kadyrov hospitalized, coronavirus moscow, moscow times

Topic  4
hong kong security, kong security, china moves impose, kong security law, moves impose, new hong, new hong kong, moves impose new, china moves, impose new hong

Topic  5
improvement hong kong, improvement hong, 

___

In [18]:
# import pyLDAvis
# import pyLDAvis.sklearn
# pyLDAvis.enable_notebook()
# pyLDAvis.sklearn.prepare(lda, doc_word, tfidf)

# Merge topic grams into semi-coherent sentences

In [19]:
from collections import deque

In [20]:
def display_topics4(model, feature_names, no_top_words, topic_names=None, verbose=False):
    topics = {}
    for ix, topic in enumerate(model.components_):
        gram_list = [feature_names[i].split()
                         for i in topic.argsort()[:-no_top_words - 1:-1]]

        #instantiate variables
        deques = {}
        for i in range(0,len(gram_list)):
            deques[i] = deque(gram_list[i])

        final_text = []
        clauses =  []

        if verbose: print("DEQUES:",deques)

        while len(deques) > 0:
            # initialize clause as first deque in dictionary
            clauses.append(deques[list(deques)[0]])
            del deques[list(deques)[0]]

            for clause in clauses:
                while len(deques) > 0:
                    for i in deques.copy():
                        gram = deques[i]
                        overlap=False
                        # check for overlapping words and append
                        for word in gram:
                            if word in clause:
                                overlap=True

                            elif word not in clause and overlap==True:
                                clause.append(word)
                        overlap=False
                        # reverse words in gram to prepend
                        for word in deque(reversed(gram)):
                            if word in clause:
                                overlap=True

                            elif word not in clause and overlap==True:
                                clause.appendleft(word)

                        if overlap==True:
                            del deques[i]
                        if verbose: print("OVERLAP=",overlap, "for gram",i)
                    if verbose:
                        print("CLAUSE:",clauses)
                        print("DEQUES:",deques)
                        print('end of while loop')
                    if overlap == False:
                        break
            #convert clause from deque to list of strings
            final_text.append(list(clause))


            topics[ix+1] = final_text
    return topics

In [21]:
display_topics4(lsa, vectorizer.get_feature_names(), num_words_per_topic)

{1: [['china', 'security', 'law', 'could', 'end', 'hong', 'kong']],
 2: [['imprisoned',
   'uighur',
   'professor',
   'release',
   'shows',
   'beijing',
   'forces',
   'loyalty']],
 3: [['china', 'security', 'law', 'could', 'end', 'hong', 'kong'],
  ['magnetic', 'field', 'rapidly']],
 4: [['chechen',
   'leader',
   'kadyrov',
   'hospitalized',
   'coronavirus',
   'moscow',
   'times']],
 5: [['china', 'moves', 'impose', 'new', 'hong', 'kong', 'security', 'law']],
 6: [['china',
   'supports',
   'improvement',
   'hong',
   'kong',
   'political',
   'system']],
 7: [['supports', 'improvement', 'hong', 'kong', 'political', 'system'],
  ['new', 'coronavirus', 'cases']],
 8: [['000', 'new', 'coronavirus', 'cases', 'across', 'world'],
  ['covid', '19'],
  ['jump', 'ever', 'single', 'day']],
 9: [['gold', 'coast', 'man', 'cut', 'shark', 'nets', 'save'],
  ['shark', 'nets']],
 10: [['boris', 'johnson'],
  ['police', 'watchdog', 'trudeau', 'says'],
  ['palestinians', 'reject'],
  ['r

# Bokeh

In [22]:
# Add column 'topic' with integer representing topic
df_topics = pd.DataFrame(doc_topic, columns=[str(i) for i in arange(0, num_topics)])
df_topics['topic'] = df_topics.idxmax(axis=1)

In [23]:
# The goal here is to join the doc_topic dataframe on the right side of the reddit data
df = df.join(df_topics)

In [24]:
df.head()

Unnamed: 0,title,url,id,created_utc,num_comments,upvote_ratio,score,edited,Datetime,0,1,2,3,4,5,6,7,8,9,topic
0,Livethread 12: Global COVID-19 Pandemic,https://www.reddit.com/live/14d816ty1ylvo/,gfeei0,1588883000.0,2451,0.95,737,False,2020-05-07 20:23:17,0.002432,-0.008569,0.010717,-0.008527,0.067468,0.153673,-0.303665,0.001945,0.021736,0.025406,5
1,Boris Johnson drops NHS fee for migrant health...,https://www.independent.co.uk/news/uk/politics...,gnzcql,1590076000.0,826,0.92,18076,False,2020-05-21 15:52:03,0.001076,-0.014969,0.014852,0.013728,0.032229,0.097934,0.013853,-0.077124,-0.06554,0.16974,9
2,A woman must delete photographs of her grandch...,https://www.bbc.co.uk/news/technology-52758787,go05wg,1590079000.0,706,0.97,10168,False,2020-05-21 16:35:19,-0.003396,0.005966,0.000386,-0.00259,0.005593,-0.023501,-0.047993,-0.038171,0.01941,-0.118596,8
3,Scotland bans companies based in tax havens fr...,https://www.independent.co.uk/news/business/ne...,gnw528,1590065000.0,2116,0.95,95293,False,2020-05-21 12:41:26,-0.001664,-0.005141,0.001626,0.004625,0.009382,0.084369,-0.061466,-0.127933,-0.063299,-0.067257,5
4,China doesn't seem to understand independence ...,https://www.reuters.com/article/us-canada-chin...,go0j26,1590080000.0,394,0.96,2939,False,2020-05-21 16:54:44,0.001121,-0.003336,-0.003149,0.002009,0.018097,0.039674,0.046604,-0.064405,0.100436,-0.030169,8


___

In [25]:
def bokeh_viz(source, x, y, labels:tuple, num_topics=10):

    # simple plot
    simpleScatterPlot = figure(plot_width=800, plot_height=400)

    topics = [str(i) for i in arange(0, num_topics)]

    color_mapper = CategoricalColorMapper(factors=topics,
                                        palette=bokeh.palettes.d3['Category10'][10])

    for topic in topics:
        simpleScatterPlot.circle(x, y, source=source[source.topic == topic], size=10,
                                legend_label=topic,
                                color={
            'field': 'topic',
            'transform': color_mapper
        },
            alpha=0.7)

    simpleScatterPlot.xaxis.axis_label = labels[0]
    simpleScatterPlot.yaxis.axis_label = labels[1]
    simpleScatterPlot.legend.location = "top_left"
    simpleScatterPlot.legend.click_policy = "hide"

    tooltips = [
        ("", "@title"),
        ("Date", "@Datetime"),
        ("URL", "@url"),
        ("Topic", "@topic")
    ]

    hover = HoverTool(tooltips=tooltips)
    simpleScatterPlot.add_tools(hover)

    #return components(simpleScatterPlot)
    show(simpleScatterPlot)

In [26]:
x = 'upvote_ratio'
y = 'num_comments'
bokeh_labels = ('Upvote Ratio', 'Number of Comments')
bokeh_viz(df, x, y, labels=bokeh_labels, num_topics=num_topics)
# The line below should only be used if line 35 in bokeh_viz is uncommented
# It is used to directly embed a bokeh plot as a div, with corresponding javascript
# bokeh_js1, bokeh_div1 = bokeh_viz(df, x, y, labels=bokeh_labels, num_topics=num_topics)
