In [1]:
import pandas as pd
import numpy as np
import re
import requests
from bs4 import BeautifulSoup
import signal
from contextlib import contextmanager
import matplotlib.pyplot as plt


# Data Scraping

- Import the URLs from a text file
- Use `requests` to check that each URL returns status code 200
- For all good URLs, scrape the text using Beautiful Soup
- Use basic NLP techniques to clean, tokenize, and stem the words

In [2]:
# Gather the URLs listed in the text file of article sources.
def collect_urls(text_file):
    """Return one URL for every line of ``text_file`` that contains "http".

    Only the text after the last "http" on the line is kept. The final two
    characters of the line are dropped (presumably the newline plus one
    trailing delimiter -- confirm against the file format) and every space
    is removed before "http" is put back on the front.
    """
    with open(text_file) as handle:
        return [
            'http' + row.rpartition('http')[2][:-2].replace(' ', '')
            for row in handle
            if 'http' in row
        ]

# Some URLs do not exist or take too long to load and would halt the program,
# so we implement a timer that lets callers skip slow URLs and keep only the
# ones that return a 200 code.
@contextmanager
def timeout(time):
    """Abort the enclosed block if it runs longer than ``time`` seconds.

    A SIGALRM is scheduled ``time`` seconds ahead; its handler raises
    ``TimeoutError`` inside the block, which is swallowed here so execution
    simply continues after the ``with`` statement. Any other exception raised
    by the block propagates unchanged.

    On exit the pending alarm is cancelled and the SIGALRM handler that was
    installed before entry is restored, so no stray alarm can fire later and
    unrelated code that relies on SIGALRM is not left with its handler
    replaced.

    Parameters
    ----------
    time : int
        Number of whole seconds to allow (passed to ``signal.alarm``).
    """
    # signal.signal returns the handler it replaces; keep it for restoration.
    previous_handler = signal.signal(signal.SIGALRM, raise_timeout)

    try:
        # Schedule the signal to be sent after ``time`` seconds.
        signal.alarm(time)
        yield
    except TimeoutError:
        pass
    finally:
        # Cancel the alarm first so it cannot fire once the handler is swapped.
        signal.alarm(0)
        # getsignal/signal report None for handlers not installed from Python;
        # fall back to the default disposition in that case.
        signal.signal(
            signal.SIGALRM,
            signal.SIG_DFL if previous_handler is None else previous_handler,
        )


def raise_timeout(signum, frame):
    """SIGALRM handler used by ``timeout``: raise ``TimeoutError``."""
    raise TimeoutError


def check_status(url, max_seconds=5):
    """Return ``url`` if a GET request for it answers with HTTP 200, else ``None``.

    The request is best-effort: any ordinary error (connection failure,
    malformed URL, timeout) yields ``None`` rather than an exception, so the
    caller can loop over many URLs. ``KeyboardInterrupt`` and ``SystemExit``
    are deliberately NOT swallowed, so the loop can still be interrupted.

    Parameters
    ----------
    url : str
        Address to probe.
    max_seconds : int, optional
        Wall-clock limit enforced by the ``timeout`` block, and also passed
        to ``requests.get`` as its own timeout as a second line of defence.

    Returns
    -------
    str or None
        ``url`` when the status code is 200 (the URL is also printed),
        otherwise ``None``.
    """
    with timeout(max_seconds):
        try:
            html_page = requests.get(url, timeout=max_seconds)
        except Exception:
            # Covers request failures and the TimeoutError raised by the alarm.
            return None
        if html_page.status_code == 200:
            print(url)
            return url
    return None

In [3]:
# Collect every candidate URL from the source list; the path is relative to
# the notebook's working directory.
urls = collect_urls('list_of_articles.txt')


In [None]:
# Keep only the URLs for which check_status returns a truthy value.
urls_to_clean = [candidate for candidate in urls if check_status(candidate)]


http://www.theaustralian.com.au/in-depth/terror/failed-bids-for-living-safe-together-funding-awarded-money/news-story/8f2998c8d5257b70d307f015f2188b58
https://www.aclu.org/cases/aclu-v-department-homeland-security-foia-lawsuit-seeking-records-countering-violent-extremism
https://www.aclu.org/coalition-letter-obama-administration-countering-violent-extremism-cve-program
http://www.strokeassociation.org/STROKEORG/LifeAfterStroke/HealthyLivingAfterStroke/ManagingMedicines/Anti-Clotting-Agents-Explained_UCM_310452_Article.jsp#.Vj_Cxyvl3L8
https://www.adl.org/news/press-releases/adl-report-white-supremacist-murders-more-than-doubled-in-2017
https://www.adl.org/sites/default/files/documents/assets/pdf/combating-hate/Aryan-Circle-Report.pdf
https://www.adl.org/sites/default/files/documents/assets/pdf/combating-hate/CR_4499_WhiteSupremacist-Report_web_vff.pdf
http://www.aaiusa.org/countering_violent_extremism_cve
http://www.aph.gov.au/About_Parliament/Parliamentary_Departments/Parliamentary_Li

In [None]:
# Write the list of good URLs to a CSV file (pandas' default index and header
# are included in the output).
urls_df = pd.DataFrame(data=urls_to_clean)
urls_df.to_csv(path_or_buf='good_urls_df.csv')

We are interested in analyzing the topics of each article, so we represent every article as a list of stemmed word tokens, using the `NLTK` Porter stemmer. We remove stop words with `spaCy`'s stop-word collection.

In [None]:
from spacy.lang.en.stop_words import STOP_WORDS
from nltk.stem import PorterStemmer

# Upper bound (seconds) for each download so one slow server cannot stall the run.
REQUEST_TIMEOUT_SECONDS = 10
# Anything that is not a letter or digit is replaced by a space before splitting.
NON_ALPHANUMERIC = re.compile(r"[^a-zA-Z0-9]")
# One stemmer is shared by every word instead of building a new one per word.
stemmer = PorterStemmer()


def tokenize_paragraph(text):
    """Return the stemmed, stop-word-free tokens of one paragraph of text.

    The text is lower-cased, non-alphanumeric characters become spaces, the
    result is split on whitespace, words found in ``STOP_WORDS`` are dropped,
    and the remaining words are Porter-stemmed.
    """
    cleaned = NON_ALPHANUMERIC.sub(" ", text.lower())
    # STOP_WORDS is tested directly rather than being copied to a list per word.
    return [stemmer.stem(word) for word in cleaned.split() if word not in STOP_WORDS]


# Each entry of ``articles`` is the flat token list of one successfully parsed URL.
# URLs that fail are reported and skipped, so ``articles`` may be shorter than
# ``urls_to_clean``.
articles = []
for url_index, url in enumerate(urls_to_clean):
    try:
        html_page = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
        soup = BeautifulSoup(html_page.text, 'html.parser')
        article_text = []
        for paragraph in soup.find_all('p'):
            article_text.extend(tokenize_paragraph(paragraph.get_text()))
        articles.append(article_text)
    except Exception:
        # Best-effort scraping: report the failure but let Ctrl-C through.
        print('{} was a bad article'.format(url))
    if url_index % 10 == 0:
        print(url_index, 'th url parsed')

In [None]:
# Drop articles that have fewer than MIN_TOKENS tokens. A new list is built
# instead of calling articles.remove() inside a loop over ``articles``:
# removing while iterating skips the element that follows each removal, which
# leaves some short articles behind and under-counts the removals.
MIN_TOKENS = 4
articles_before = len(articles)
articles = [article for article in articles if len(article) >= MIN_TOKENS]
removed_count = articles_before - len(articles)
print('Removed {} redundant articles, we now have {} articles to model.'.format(removed_count, len(articles)))

## Using Bigrams for Topic Modeling

Phrases and bigrams are helpful NLP tools offered in the `gensim` library that aid in topic modeling. Given a list of tokenized articles, we can feed it to a Phraser, which joins words that frequently occur together into a single token (for example, the words "law" and "enforcement" are typically part of a phrase, so the phraser will return a new word "law_enforcement").

In [None]:
from gensim.models.phrases import Phrases, Phraser
# Learn phrase statistics from the tokenized articles.
# NOTE(review): min_count=1 and threshold=2 are presumably set low so that
# more word pairs are merged into phrases -- confirm against the gensim docs.
phrases = Phrases(articles, min_count=1, threshold=2)
bigram = Phraser(phrases)
# Here is an example of the bigrams captured in the 4th article (index 3).
bigram[articles[3]]

In [None]:
# Apply the bigram model to every article and gather the results in a list,
# one entry per article.
bigram_articles = list(bigram[articles])