## Discovering my interests in Hacker News with NLP
# Part II: Scraping Hacker News favorites

For scraping we use Selenium instead of plain `requests` because the majority of the pages aggregated on Hacker News generate their content with JavaScript.

In [6]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from typing import Union, Tuple, List
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
import pandas as pd
import time
import multiprocessing
import os
import json

In [7]:
# Location of the chromedriver executable handed to Selenium. It is resolved
# against the current working directory, so the notebook has to be started
# from the folder that contains `contrib/`.
CHROME_DRIVER_PATH = os.path.join(os.getcwd(), 'contrib/chromedriver')

A basic class for scraping with Selenium and Chrome

In [8]:
class ScraperWebJS:
    """
    Small wrapper around a headless Chrome session driven by Selenium.

    The browser is started lazily by the first ``scrape`` call (or by an
    explicit ``prepare``) and is released with ``finish``.
    """

    # True only while ``self.drv`` holds a live browser session.
    prepared = False
    # Selenium driver of the current session; None until ``prepare`` succeeds.
    drv = None
    # Command-line switches handed to Chrome. The window-size switch needs the
    # leading dashes and the singular "window" spelling to be recognised.
    CHROME_ARGS = ('--headless', '--window-size=1920,1080')
    # Value passed to the driver's ``implicitly_wait`` (seconds).
    IMPLICIT_WAIT_SECONDS = 10

    def prepare(self):
        """
        Prepare the headless browser for a scraping session.

        Does nothing when a session is already open. The ``prepared`` flag is
        raised only after the driver has been created and configured, so a
        failed start-up can simply be retried by calling ``prepare`` again.

        Raises:
            Whatever exception Selenium raises while starting Chrome.
        """
        if self.prepared:
            return
        opts = Options()
        for arg in self.CHROME_ARGS:
            opts.add_argument(arg)
        # NOTE(review): ``executable_path`` is the Selenium 3 style of passing
        # the driver location; newer Selenium releases expect a Service
        # object instead - confirm against the installed version.
        drv = webdriver.Chrome(executable_path=CHROME_DRIVER_PATH, options=opts)
        try:
            drv.implicitly_wait(self.IMPLICIT_WAIT_SECONDS)
        except Exception:
            # Do not leave a half-configured browser process behind.
            drv.quit()
            raise
        self.drv = drv
        self.prepared = True

    def scrape(self, url: str) -> Union[str, None]:
        """
        Scrape ``url`` using Selenium with the Chrome backend.

        Args:
            url: Address of the page to load.

        Returns:
            The page source reported by the driver, or ``None`` when loading
            the page raised an exception. In that case the error is printed
            and the browser session is closed; the next call starts a new one.
        """
        if not self.prepared:
            self.prepare()
        try:
            self.drv.get(url)
        except Exception as err:
            # Report the failure instead of dropping it silently.
            print(f"Error can't load {url} => {err}")
            self.finish()
            return None
        return self.drv.page_source

    def finish(self):
        """
        Finish the scraping session and release the browser.

        Safe to call when no session is open. The ``prepared`` flag is reset
        even if quitting the driver raises, so the object can be reused.
        """
        if not self.prepared:
            return
        try:
            self.drv.quit()
        finally:
            self.prepared = False

## Load the favorite URLs to scrape

In [9]:
# Read the list of favorites from disk; the context manager closes the file
# handle as soon as the JSON document has been parsed.
with open('hn_favs.json', 'r', encoding='utf-8') as favs_file:
    hn_favs = json.load(favs_file)
# Collect the address of every favorite (an entry without a 'url' key raises
# KeyError).
urls = [x['url'] for x in hn_favs]

In [10]:
scrapper = ScraperWebJS()
scraped_urls = []
try:
    for url in tqdm(urls):
        try:
            content = scrapper.scrape(url)
        except Exception as err:
            print(f"Error can't scrape {url} => {err}")
            continue
        # scrape() returns None when the page could not be loaded; skip it.
        if content is None:
            continue
        scraped_urls.append({'url': url, 'content': content})
finally:
    # Always shut the headless browser down, even when the loop is
    # interrupted, so no Chrome process is left running.
    scrapper.finish()

HBox(children=(FloatProgress(value=0.0, max=207.0), HTML(value='')))




We use pandas to "dump" the scraped data

We save the scraped content so that it can be processed in the last part.

In [11]:
# One row per successfully scraped page, built from the collected records
# and written to disk as a pickle file.
df = pd.DataFrame(data=scraped_urls)
df.to_pickle(path='hn_favs_scrapped.pickle')