# Prerequisites

In [1]:
import datalake
import logging
import os
import pymongo
import stackoverflow

# Stack Overflow tag whose question listings this notebook scrapes and loads.
TAG = 'data-science'

# Logging policy: third-party libraries are capped at WARNING, while the root
# logger (and therefore this project's own modules) is configured at DEBUG.
for _library in ("requests", "urllib3", "hdfs", 'chardet'):
    logging.getLogger(_library).setLevel(logging.WARNING)

logging.basicConfig(
    level=logging.DEBUG,
    datefmt='%Y-%m-%d %H:%M:%S',
    format='%(asctime)s - %(levelname)-6s - %(name)15s - %(message)s',
)

# Data lake handle used to store and re-read the raw HTML pages.
# NOTE(review): the three arguments appear to form the storage path prefix
# (/raw/www.stackoverflow.com/questions-by-tag/...) -- confirm in datalake.
mylake = datalake.DataLake('raw', 'www.stackoverflow.com', 'questions-by-tag')

# Scraper used to search Stack Overflow by tag.
so = stackoverflow.Scraper()

# MongoDB handles: server `mongodb`, database `stackoverflow`,
# collection `questions_by_tag`.
client = pymongo.MongoClient(host='mongodb')
db = client['stackoverflow']
coll = db['questions_by_tag']


# Retrieve Pages from Stack Overflow, Store in HDFS

In [2]:

class HtmlLakeWriter(object):
    """Callback object that stores each scraped HTML page in the data lake.

    Pages are written under names of the form ``tag-<tag>-<page:06d>``; the
    page counter advances by one after every successfully stored result.

    Args:
        lake: Object exposing ``store_html(name, content)``. When omitted,
            the notebook-level ``mylake`` is looked up on every call.
        tag: Tag embedded in the stored name. When omitted, the
            notebook-level ``TAG`` is looked up on every call.
        first_page: Number given to the first stored page (default 1).
    """

    def __init__(self, lake=None, tag=None, first_page=1):
        self._lake = lake
        self._tag = tag
        # Public counter: number that the NEXT stored page will receive.
        self.page = first_page

    def on_result(self, result):
        """Store ``result`` under the current page number, then advance it."""
        # Fall back to the notebook globals at call time (not at construction)
        # so a no-argument writer behaves exactly as it did before.
        lake = mylake if self._lake is None else self._lake
        tag = TAG if self._tag is None else self._tag

        lake.store_html('tag-{tag}-{page:06d}'.format(tag=tag, page=self.page),
                        result)
        # Only reached when the store succeeded, so a failed write does not
        # consume a page number.
        self.page += 1

# Writer whose on_result callback stores every page it is handed in the lake.
lakeWriter = HtmlLakeWriter()

# Search Stack Overflow for TAG, passing on_result as the result callback.
# NOTE(review): presumably called once per result page, in page order --
# confirm in stackoverflow.Scraper.search_tag.
so.search_tag(lakeWriter.on_result, TAG)


2021-10-02 02:41:43 - INFO   - stackoverflow.Scraper - Searching StackOverflow for questions with tag: data-science
2021-10-02 02:41:44 - DEBUG  - stackoverflow.Scraper - Retrieving page 2 of 139
2021-10-02 02:41:46 - DEBUG  - stackoverflow.Scraper - Retrieving page 3 of 139
2021-10-02 02:41:47 - DEBUG  - stackoverflow.Scraper - Retrieving page 4 of 139
2021-10-02 02:41:49 - DEBUG  - stackoverflow.Scraper - Retrieving page 5 of 139
2021-10-02 02:41:51 - DEBUG  - stackoverflow.Scraper - Retrieving page 6 of 139
2021-10-02 02:41:52 - DEBUG  - stackoverflow.Scraper - Retrieving page 7 of 139
2021-10-02 02:41:53 - DEBUG  - stackoverflow.Scraper - Retrieving page 8 of 139
2021-10-02 02:41:55 - DEBUG  - stackoverflow.Scraper - Retrieving page 9 of 139
2021-10-02 02:41:56 - DEBUG  - stackoverflow.Scraper - Retrieving page 10 of 139
2021-10-02 02:41:57 - DEBUG  - stackoverflow.Scraper - Retrieving page 11 of 139
2021-10-02 02:41:59 - DEBUG  - stackoverflow.Scraper - Retrieving page 12 of 139
2

# Verify HDFS Contents

In [3]:
# List what the lake currently holds, to check that the scraped pages landed.
mylake.list()

/raw/www.stackoverflow.com/questions-by-tag/2021/10/02/tag-data-science-000001.html
/raw/www.stackoverflow.com/questions-by-tag/2021/10/02/tag-data-science-000002.html
/raw/www.stackoverflow.com/questions-by-tag/2021/10/02/tag-data-science-000003.html
/raw/www.stackoverflow.com/questions-by-tag/2021/10/02/tag-data-science-000004.html
/raw/www.stackoverflow.com/questions-by-tag/2021/10/02/tag-data-science-000005.html
/raw/www.stackoverflow.com/questions-by-tag/2021/10/02/tag-data-science-000006.html
/raw/www.stackoverflow.com/questions-by-tag/2021/10/02/tag-data-science-000007.html
/raw/www.stackoverflow.com/questions-by-tag/2021/10/02/tag-data-science-000008.html
/raw/www.stackoverflow.com/questions-by-tag/2021/10/02/tag-data-science-000009.html
/raw/www.stackoverflow.com/questions-by-tag/2021/10/02/tag-data-science-000010.html
/raw/www.stackoverflow.com/questions-by-tag/2021/10/02/tag-data-science-000011.html
/raw/www.stackoverflow.com/questions-by-tag/2021/10/02/tag-data-science-0000

# Retrieve from HDFS, Store in MongoDB

In [4]:
def upsert_question(question):
    """Insert or update one question document in MongoDB, matched on ``qid``."""
    return coll.update_one({'qid': question['qid']},
                           {'$set': question},
                           upsert=True)


# Walk the lake, load each stored file as a PageOfTaggedQuestions and upsert
# every question it yields, so re-running the cell does not duplicate documents.
for path, dirs, files in mylake.walk():
    for filename in files:
        page_path = os.path.join(path, filename)
        content = mylake.get_html(page_path, stackoverflow.PageOfTaggedQuestions)
        content.iterate_questions(upsert_question)


# Verify Contents of MongoDB

In [7]:
%%bash

# Connection string shared by both mongosh invocations below.
MONGO_URI="mongodb://mongodb:27017/stackoverflow"

# Number of documents currently in the questions_by_tag collection.
item_count=$(mongosh --eval "db.questions_by_tag.countDocuments()" --quiet "$MONGO_URI")
echo "Count of items: ${item_count}"

# NOTE: findOne() with no filter returns the first document in natural order,
# so this is a sample document rather than a truly random one.
echo "One random example:"
mongosh --eval "db.questions_by_tag.findOne()" --quiet "$MONGO_URI"


Count of items: 6951
One random example:
{
  _id: ObjectId("6157b90f4fc9c909416ef64a"),
  qid: '35123939',
  link: 'https://stackoverflow.com/questions/35123939/cannot-see-ipython-notebook-interface-on-a-browser-using-docker-kitematic',
  tags: [ 'docker', 'data-science', 'docker-image', 'kitematic' ],
  title: 'Cannot see Ipython notebook interface on a browser using Docker Kitematic'
}
