### Initialization:

- python: 3.7.0
- environment: recommendations

### Install packages

In [1]:
# !pip3 install pandas
# !pip3 install feedparser
# !pip3 install beautifulsoup4
# !pip3 install Scrapy
# !pip3 install newspaper3k
# !pip3 install lxml

### Detailed Steps:

- Step 1: Include all the packages here.
- Step 2: Initialise variables to hold all article links, raw and scrubbed contents.
- Step 3: Retrieve all the article links from the blog provided.
- Step 4: Crawl every article link to retrieve its title, content and link.
- Step 5: Save each article's raw HTML in the `Machine Learning Mastery articles/` folder and the scrubbed content in `Scrubbed MLM articles.json` (the cells labelled Step 6 and Step 7 below).

In [2]:
### Step 1: Import necessary packages here ###

import csv
import json
import os
import re
from collections import Counter

import numpy
import pandas
from pandas import read_csv

# feedparser helps to xml to hash
# Install: conda install feedparser
import feedparser

# BeautifulSoup helps to grab text out of html
# Install: conda install beautifulsoup4
from bs4 import BeautifulSoup

import newspaper
import requests
import scrapy
import urllib3
from lxml import html
from newspaper import Article

In [3]:
### Step 2: Initialise variables to store raw article, scrubbed article & article links.

# Three independent, initially empty lists:
#   raw_json      - one entry per article with its raw content
#   scrubbed_json - one entry per article with its scrubbed content
#   article_links - every article link that has been collected
raw_json, scrubbed_json, article_links = [], [], []

In [4]:
### Step 3: Feed in the blog URL to aggregate content ###
# 3.1 Select the blog that we want to crawl.
# 3.2 Grab all article links from the current page.
# 3.3 Crawl pages until we reach a page that returns no results.

feed_url = "http://machinelearningmastery.com/blog/"
urllib3.disable_warnings()
http = urllib3.PoolManager()


def parse(page):
    """Collect the article links found on one blog listing page.

    Fetches `page` through the shared urllib3 pool, looks for anchors marked
    rel="bookmark" and appends each anchor's href to the module-level
    `article_links` list.

    Args:
        page: URL of the listing page to fetch.

    Returns:
        True when the page contains at least one bookmark anchor.
        False when the response status is not 200 or no bookmark anchor is
        present; the pagination loop treats False as "stop".
    """
    print("Running for: ", page)
    response = http.request('GET', page)

    # Do not harvest links from an error page: if the site's error template
    # happened to list posts, pagination would never terminate.
    if response.status != 200:
        return False

    soup = BeautifulSoup(response.data, 'lxml')
    page_links = soup.find_all('a', attrs={'rel': "bookmark"})
    if not page_links:
        return False

    for link in page_links:
        href = link.get('href')
        # An anchor without an href would store None and break the crawl later.
        if href:
            article_links.append(href)
    return True

In [5]:
# Start on the blog's landing page; later pages are feed_url + "page/<n>".
current_page = feed_url

# maintain a counter for pagination
count = 1

# Begin with an empty list so that re-running this cell does not append
# every link a second time.
article_links.clear()

# Keep requesting the next page until parse() reports a page without results.
while parse(current_page):
    count += 1
    current_page = feed_url + "page/" + str(count)

print("I'm done!")

Running for:  http://machinelearningmastery.com/blog/
Running for:  http://machinelearningmastery.com/blog/page/2
Running for:  http://machinelearningmastery.com/blog/page/3
Running for:  http://machinelearningmastery.com/blog/page/4
Running for:  http://machinelearningmastery.com/blog/page/5
Running for:  http://machinelearningmastery.com/blog/page/6
Running for:  http://machinelearningmastery.com/blog/page/7
Running for:  http://machinelearningmastery.com/blog/page/8
Running for:  http://machinelearningmastery.com/blog/page/9
Running for:  http://machinelearningmastery.com/blog/page/10
Running for:  http://machinelearningmastery.com/blog/page/11
Running for:  http://machinelearningmastery.com/blog/page/12
Running for:  http://machinelearningmastery.com/blog/page/13
Running for:  http://machinelearningmastery.com/blog/page/14
Running for:  http://machinelearningmastery.com/blog/page/15
Running for:  http://machinelearningmastery.com/blog/page/16
Running for:  http://machinelearningmas

In [6]:
# Report how many article links the pagination pass collected.
print("Total number of links to crawl: ", len(article_links), sep=" ")

Total number of links to crawl:  684


In [7]:
### Step 4: Crawl every article to pull in title, content and URL ###

# machinelearningmastery.com expects User-Agent in the request header.
# else, it throws 403 Forbidden error.
headers = { "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36" }

# Upper bound (seconds) for a single request, so one stalled connection
# cannot hang the whole crawl.
REQUEST_TIMEOUT_SECONDS = 30

# Begin with empty result lists so that re-running this cell does not
# duplicate every entry.
raw_json.clear()
scrubbed_json.clear()

for link in article_links:
    print("Parsing URL: ", link)

    # Fetch the article; skip it (instead of aborting the crawl) on network
    # errors, timeouts and non-2xx responses.
    try:
        article_response = requests.get(link, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS)
        article_response.raise_for_status()
    except requests.RequestException as error:
        print("Skipping URL (request failed): ", link, "-", error)
        continue

    # pass in the html content to BeautifulSoup
    tree = BeautifulSoup(article_response.content, "lxml")

    # The title lives in <h1 class="entry-title">; a page without it is not
    # an article we can store.
    title_tag = None
    if tree.body is not None:
        title_tag = tree.body.find('h1', attrs={'class': 'entry-title'})
    # The raw content is the whole <html> element of the parsed page.
    raw_content = tree.html
    if title_tag is None or raw_content is None:
        print("Skipping URL (no article title found): ", link)
        continue
    title = title_tag.text

    # scrub all the html tags, lowercase all the letters, get rid of \t and \n.
    # NOTE(review): .text also returns the bodies of <script> elements, and
    # removing '\n' outright joins words that were only separated by a line
    # break - confirm whether downstream processing wants that.
    scrubbed_content = raw_content.text.replace('\t', '').replace('\n', '').lower()

    # add raw data to `raw_json`
    raw_json.append({ 'title': title, 'link': link, 'content': raw_content })
    # add scrubbed data to `scrubbed_json`
    scrubbed_json.append({ 'title': title, 'link': link, 'content': scrubbed_content })

Parsing URL:  https://machinelearningmastery.com/early-stopping-to-avoid-overtraining-neural-network-models/
Parsing URL:  https://machinelearningmastery.com/how-to-reduce-overfitting-with-dropout-regularization-in-keras/
Parsing URL:  https://machinelearningmastery.com/dropout-for-regularizing-deep-neural-networks/
Parsing URL:  https://machinelearningmastery.com/how-to-reduce-generalization-error-in-deep-neural-networks-with-activity-regularization-in-keras/
Parsing URL:  https://machinelearningmastery.com/activation-regularization-for-reducing-generalization-error-in-deep-learning-neural-networks/
Parsing URL:  https://machinelearningmastery.com/how-to-reduce-overfitting-in-deep-neural-networks-with-weight-constraints-in-keras/
Parsing URL:  https://machinelearningmastery.com/introduction-to-weight-constraints-to-reduce-generalization-error-in-deep-learning/
Parsing URL:  https://machinelearningmastery.com/how-to-reduce-overfitting-in-deep-learning-with-weight-regularization/
Parsin

Parsing URL:  https://machinelearningmastery.com/statistical-methods-in-an-applied-machine-learning-project/
Parsing URL:  https://machinelearningmastery.com/controlled-experiments-in-machine-learning/
Parsing URL:  https://machinelearningmastery.com/statistical-significance-tests-for-comparing-machine-learning-algorithms/
Parsing URL:  https://machinelearningmastery.com/chi-squared-test-for-machine-learning/
Parsing URL:  https://machinelearningmastery.com/how-to-calculate-the-5-number-summary-for-your-data-in-python/
Parsing URL:  https://machinelearningmastery.com/statistical-sampling-and-resampling/
Parsing URL:  https://machinelearningmastery.com/critical-values-for-statistical-hypothesis-testing/
Parsing URL:  https://machinelearningmastery.com/statistical-data-distributions/
Parsing URL:  https://machinelearningmastery.com/data-visualization-methods-in-python/
Parsing URL:  https://machinelearningmastery.com/estimation-statistics-for-machine-learning/
Parsing URL:  https://machi

Parsing URL:  https://machinelearningmastery.com/encoder-decoder-models-text-summarization-keras/
Parsing URL:  https://machinelearningmastery.com/teacher-forcing-for-recurrent-neural-networks/
Parsing URL:  https://machinelearningmastery.com/prepare-news-articles-text-summarization/
Parsing URL:  https://machinelearningmastery.com/encoder-decoder-deep-learning-models-text-summarization/
Parsing URL:  https://machinelearningmastery.com/gentle-introduction-text-summarization/
Parsing URL:  https://machinelearningmastery.com/develop-a-deep-learning-caption-generation-model-in-python/
Parsing URL:  https://machinelearningmastery.com/develop-a-caption-generation-model-in-keras/
Parsing URL:  https://machinelearningmastery.com/deep-learning-caption-generation-models/
Parsing URL:  https://machinelearningmastery.com/calculate-bleu-score-for-text-python/
Parsing URL:  https://machinelearningmastery.com/prepare-univariate-time-series-data-long-short-term-memory-networks/
Parsing URL:  https://

Parsing URL:  https://machinelearningmastery.com/reproducible-results-neural-networks-keras/
Parsing URL:  https://machinelearningmastery.com/how-to-use-an-encoder-decoder-lstm-to-echo-sequences-of-random-integers/
Parsing URL:  https://machinelearningmastery.com/learn-echo-random-integers-long-short-term-memory-recurrent-neural-networks/
Parsing URL:  https://machinelearningmastery.com/5-step-life-cycle-long-short-term-memory-models-keras/
Parsing URL:  https://machinelearningmastery.com/calculate-bootstrap-confidence-intervals-machine-learning-results-python/
Parsing URL:  https://machinelearningmastery.com/report-classifier-performance-confidence-intervals/
Parsing URL:  https://machinelearningmastery.com/evaluate-skill-deep-learning-models/
Parsing URL:  https://machinelearningmastery.com/large-data-files-machine-learning/
Parsing URL:  https://machinelearningmastery.com/suitability-long-short-term-memory-networks-time-series-forecasting/
Parsing URL:  https://machinelearningmaster

Parsing URL:  https://machinelearningmastery.com/how-to-go-from-working-in-a-bank-to-hired-as-senior-data-scientist-at-target/
Parsing URL:  https://machinelearningmastery.com/time-series-forecasting-supervised-learning/
Parsing URL:  https://machinelearningmastery.com/time-series-forecasting/
Parsing URL:  https://machinelearningmastery.com/time-series-datasets-for-machine-learning/
Parsing URL:  https://machinelearningmastery.com/machine-learning-podcasts/
Parsing URL:  https://machinelearningmastery.com/standard-machine-learning-datasets/
Parsing URL:  https://machinelearningmastery.com/machine-learning-performance-improvement-cheat-sheet/
Parsing URL:  https://machinelearningmastery.com/books-on-time-series-forecasting-with-r/
Parsing URL:  https://machinelearningmastery.com/confusion-matrix-machine-learning/
Parsing URL:  https://machinelearningmastery.com/implementing-stacking-scratch-python/
Parsing URL:  https://machinelearningmastery.com/implement-random-forest-scratch-python/

Parsing URL:  https://machinelearningmastery.com/image-augmentation-deep-learning-keras/
Parsing URL:  https://machinelearningmastery.com/standard-machine-learning-datasets-used-practice-weka/
Parsing URL:  https://machinelearningmastery.com/handwritten-digit-recognition-using-convolutional-neural-networks-python-keras/
Parsing URL:  https://machinelearningmastery.com/crash-course-convolutional-neural-networks/
Parsing URL:  https://machinelearningmastery.com/load-csv-machine-learning-data-weka/
Parsing URL:  https://machinelearningmastery.com/using-learning-rate-schedules-deep-learning-models-python-keras/
Parsing URL:  https://machinelearningmastery.com/tour-weka-machine-learning-workbench/
Parsing URL:  https://machinelearningmastery.com/dropout-regularization-deep-learning-models-keras/
Parsing URL:  https://machinelearningmastery.com/display-deep-learning-model-training-history-in-keras/
Parsing URL:  https://machinelearningmastery.com/download-install-weka-machine-learning-workbe

Parsing URL:  https://machinelearningmastery.com/data-visualization-in-r/
Parsing URL:  https://machinelearningmastery.com/descriptive-statistics-examples-with-r/
Parsing URL:  https://machinelearningmastery.com/r-crash-course-for-developers/
Parsing URL:  https://machinelearningmastery.com/steps-to-the-best-machine-learning-algorithm/
Parsing URL:  https://machinelearningmastery.com/find-machine-learning-landmarks/
Parsing URL:  https://machinelearningmastery.com/how-to-use-r-for-machine-learning/
Parsing URL:  https://machinelearningmastery.com/use-r-for-machine-learning/
Parsing URL:  https://machinelearningmastery.com/why-implement-a-machine-learning-algorithm-from-scratch/
Parsing URL:  https://machinelearningmastery.com/extend-machine-learning-tools/
Parsing URL:  https://machinelearningmastery.com/investigate-machine-learning-tools/
Parsing URL:  https://machinelearningmastery.com/proceduralize-machine-learning-tools/
Parsing URL:  https://machinelearningmastery.com/machine-lear

Parsing URL:  https://machinelearningmastery.com/master-kaggle-by-competing-consistently/
Parsing URL:  https://machinelearningmastery.com/going-beyond-predictions/
Parsing URL:  https://machinelearningmastery.com/how-to-kick-ass-in-competitive-machine-learning/
Parsing URL:  https://machinelearningmastery.com/how-to-get-better-at-machine-learning/
Parsing URL:  https://machinelearningmastery.com/non-linear-classification-in-r/
Parsing URL:  https://machinelearningmastery.com/linear-classification-in-r/
Parsing URL:  https://machinelearningmastery.com/clever-application-of-a-predictive-model/
Parsing URL:  https://machinelearningmastery.com/improve-model-accuracy-with-data-pre-processing/
Parsing URL:  https://machinelearningmastery.com/model-prediction-versus-interpretation-in-machine-learning/
Parsing URL:  https://machinelearningmastery.com/non-linear-regression-in-r-with-decision-trees/
Parsing URL:  https://machinelearningmastery.com/non-linear-regression-in-r/
Parsing URL:  https

Parsing URL:  https://machinelearningmastery.com/6-practical-books-for-beginning-machine-learning/
Parsing URL:  https://machinelearningmastery.com/how-to-implement-a-machine-learning-algorithm/
Parsing URL:  https://machinelearningmastery.com/applied-machine-learning-is-a-meritocracy/
Parsing URL:  https://machinelearningmastery.com/what-if-im-not-a-good-programmer/
Parsing URL:  https://machinelearningmastery.com/what-if-i-dont-have-a-degree/
Parsing URL:  https://machinelearningmastery.com/what-if-im-not-good-at-mathematics/
Parsing URL:  https://machinelearningmastery.com/how-to-learn-a-machine-learning-algorithm/
Parsing URL:  https://machinelearningmastery.com/hands-on-big-data-by-peter-norvig/
Parsing URL:  https://machinelearningmastery.com/reproducible-machine-learning-results-by-default/
Parsing URL:  https://machinelearningmastery.com/what-is-data-mining-and-kdd/
Parsing URL:  https://machinelearningmastery.com/self-study-machine-learning-projects/
Parsing URL:  https://mach

In [8]:
### Step 6: Download all articles in Machine Learning Mastery articles folder ###

articles_dir = "Machine Learning Mastery articles"

# open() does not create missing directories, so make sure the folder exists.
os.makedirs(articles_dir, exist_ok=True)

for entry in raw_json:
    # Titles are free text: replace path separators, characters that are
    # reserved in file names on common platforms, and stray line breaks/tabs,
    # so a title such as "A/B testing" cannot point into another directory.
    safe_title = re.sub(r'[\\/:*?"<>|\r\n\t]', '_', entry['title'])
    file_name = os.path.join(articles_dir, safe_title + ".html")
    # Explicit UTF-8 keeps non-ASCII article text writable regardless of the
    # platform's default encoding; the `with` block closes the file.
    with open(file_name, 'w', encoding='utf-8') as outfile:
        outfile.write(str(entry['content']))
print("All HTML files downloaded!!!")

All HTML files downloaded!!!


{'title': 'A Gentle Introduction to Early Stopping to Avoid Overtraining Deep Learning Neural Network Models', 'link': 'https://machinelearningmastery.com/early-stopping-to-avoid-overtraining-neural-network-models/', 'content': 'a gentle introduction to early stopping to avoid overtraining deep learning neural network models{"@context":"https:\\/\\/schema.org","@type":"organization","url":"https:\\/\\/machinelearningmastery.com\\/","sameas":["https:\\/\\/www.facebook.com\\/machine-learning-mastery-1429846323896563\\/","https:\\/\\/www.linkedin.com\\/in\\/jasonbrownlee","https:\\/\\/plus.google.com\\/u\\/0\\/b\\/117073416089354242117\\/+machinelearningmasteryhome\\/","https:\\/\\/twitter.com\\/teachthemachine"],"@id":"https:\\/\\/machinelearningmastery.com\\/#organization","name":"machine learning mastery","logo":"https:\\/\\/machinelearningmastery.com\\/wp-content\\/uploads\\/2016\\/09\\/cropped-icon.png"}window._wpemojisettings = {"baseurl":"https:\\/\\/s.w.org\\/images\\/core\\/emoji

In [12]:
## Step 7: Write the plain texts to `Scrubbed MLM articles.json`
# Serialise every scrubbed entry as 4-space-indented JSON; leaving the
# `with` block closes the file.
with open("Scrubbed MLM articles.json", 'w') as outfile:
    payload = json.dumps(scrubbed_json, indent=4)
    outfile.write(payload)
print("Plain texts written to Scrubbed MLM articles.json")

Plain texts written to Scrubbed MLM articles.json
