This Notebook crawls all Investopedia articles and returns a matrix representation of the link relationships between articles. This matrix will be used for the PageRank in another Notebook.

Input: The root domain of the Investopedia web page. The input stays the same and will not be user-defined. Other pages that Investopedia will most likely not work with this crawler as this crawler is specifically designed for Investopedia articles.

Output: A matrix prepresentation of a graph network (with only 0s and 1). The graph represents the relationships between Investopedia articles. All relationships are one-way relationships. A relationship from article A to article B is defined as A having a link that leads to article B.

In [140]:
# Imports

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException

import urllib.request
from bs4 import BeautifulSoup
# For sleeper
import time
# For random sleeper times
import random
# For decimal numbers
import decimal
# For permanent disk storage of results as CSV
import csv
# For matrix manipulation
import numpy as np
# For finding duplicates, https://stackoverflow.com/questions/38386387/find-duplicates-in-a-array-list-of-integers
import collections
# For fileNotFound error
import errno
import os

In [141]:
# Settings

# https://stackoverflow.com/questions/44503576/selenium-python-how-to-stop-page-loading-when-certain-element-gets-loaded
caps = DesiredCapabilities.CHROME
caps["pageLoadStrategy"] = "none"  #  interactive, don't wait for the whole page to load
driver = webdriver.Chrome("/home/janspoerer/code/janspoerer/page_rank_simple/chromedriver", desired_capabilities=caps)
wait = WebDriverWait(driver, 20)

root_investopedia_url = "https://www.investopedia.com/dictionary/"
sleeper_time = 1.5
crawl_everything_again = False

In [142]:
def _map_and_write_link_to_matrix_position(link, matrix):
    pass 
    return matrix

def _get_all_links_from_article():
    pass

def _open_article(article_link):
    pass

def _get_all_article_links_from_overview():
    list_with_articles_from_this_page = []
    article_elements = driver.find_elements_by_css_selector(".item-title a")
    for article_element in article_elements:
        list_with_articles_from_this_page.append(article_element.get_attribute("href"))
    return list_with_articles_from_this_page

def _open_link_and_select_window(link):
    try:
        driver.find_element_by_tag_name('body').send_keys(Keys.COMMAND + 't') 
    except:
        driver.refresh()
        driver.find_element_by_tag_name('body').send_keys(Keys.COMMAND + 't') 
    driver.switch_to.window(driver.window_handles[-1])
    driver.get(link)
    time.sleep(sleeper_time)
    return
    
def _get_all_article_urls(root_investopdia_url):
    list_all_urls = []
    
    # Open article overview (high-level overview with
    # list of available categories sorted by starting
    # letter of the article)
    driver.get(root_investopedia_url)
    
    letter_elements = driver.find_elements_by_css_selector(".alphabet a")
    
    letter_links = []
    for letter_element in letter_elements:
        letter_links.append(letter_element.get_attribute("href"))

    for letter_link in letter_links:
        _open_link_and_select_window(letter_link)
        
        # Some letters do not have several pages (because
        # there are only few results). If this is the case,
        # the article links are scraped right away without
        # looping through (inexistent) sub-pages.
        try:            
            # This selects only the element that points to the last page
            # for this letter. This allows us to set an upper boundary
            # for our loop. We will stop our loop when this is reached.
            last_element_number = driver.find_element_by_css_selector(".layout-content div:nth-child(1) .ellipsis+ li .btn").text                
        except NoSuchElementException:
            # Open, read links, and close
            list_all_urls = list_all_urls + _get_all_article_links_from_overview()
        else:
            try:
               last_element_number = int(last_element_number) 
            except:
                list_all_urls = list_all_urls + _get_all_article_links_from_overview()
            else:
                page_links = []
                for element_number in range(1, last_element_number):
                    page_links.append(letter_link + "?page=" + str(element_number))
                for page_link in page_links:
                    # Open, read links, and close
                    _open_link_and_select_window(page_link)
                    all_article_links_from_overview = _get_all_article_links_from_overview()
                    list_all_urls = list_all_urls + all_article_links_from_overview
                    # Write to csv
                    with open('links.csv','a') as fd: # 'a' option allows for appending. https://stackoverflow.com/questions/2363731/append-new-row-to-old-csv-file-python#2363742
                        writer = csv.writer(fd, delimiter=',')
                        for url in all_article_links_from_overview:
                            print(url)
                            writer.writerow([url])
                    driver.execute_script("window.close();")
    print("Crawl successful, ", len(list_all_urls), " articles were found.")
    return list_all_urls

def _get_article_texts(list_all_urls):
    # Open one tab; otherwise, the open_link_and_select_window() function will fail
    driver.get(list_all_urls[-1])
    time.sleep(sleeper_time)
    for url in list_all_urls:
        _open_link_and_select_window(url)
        try:  
            wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#article-heading_2-0")))
            article_caption = driver.find_element_by_css_selector("#article-heading_2-0").text
            article_caption = article_caption.replace(",","")
            article_caption = article_caption.replace("'","")
            article_caption = article_caption.replace('"',"")
            
            wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#mntl-sc-page_1-0 p")))
            wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#mntl-sc-page_1-0 a")))
            
            # Gets links/hrefs only
            # JAN STUCK HERE
            article_link_elements = driver.find_elements_by_css_selector("#mntl-sc-page_1-0 a")
            string_article_links = ""
            print("\n\n", article_caption, "with", len(article_link_elements) ,"elements:\n") # delete
            for article_link in article_link_elements:
                print("Element: ", article_link) # delete
                print("Text: ", article_link.text) # delete
                print("href: ", article_link.get_attribute("href")) # delete
                attrs = driver.execute_script('var items = {}; for (index = 0; index < arguments[0].attributes.length; ++index) { items[arguments[0].attributes[index].name] = arguments[0].attributes[index].value }; return items;', article_link) # delete
                pprint(attrs) # delete
                string_article_links = string_article_links + ";" + article_link.get_attribute("href")
            # Remove leading semicolon
            string_article_links = string_article_links[1:]
            
            current_article_row = [url, article_caption, string_article_links]
        except Exception as e:
            current_article_row = [url, "ERROR FOR THIS URL", str(e)] 
            
        # Write to csv
        with open('articles.csv','a') as fd: # 'a' option allows for appending. https://stackoverflow.com/questions/2363731/append-new-row-to-old-csv-file-python#2363742
            writer = csv.writer(fd, delimiter=',')
            writer.writerow(current_article_row)
        driver.execute_script("window.close();")
        
    return
            
def _load_csv_with_links(filename_of_csv_with_links):
        try:
            csvfile = open(filename_of_csv_with_links, "rt")
        except:
            raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), filename_of_csv_with_links)
        csvReader = csv.reader(csvfile, delimiter=",")
        list_all_urls = []
        for row in csvReader:
            list_all_urls.append((row[0]))
        print("Loaded links successfully from csv.", len(list_all_urls), "links were found.\n\n\n")
        return list_all_urls

def run_crawler(root_investopedia_url,filename_of_csv_with_links=""):
    """
    This Notebook crawls all Investopedia articles and returns a matrix representation of the link relationships between articles. This matrix will be used for the PageRank in another Notebook.

    Input: The root url of the Investopedia web page. The input stays the same and will not be user-defined. Other pages that Investopedia will most likely not work with this crawler as this crawler is specifically designed for Investopedia articles.

    Output: A matrix prepresentation of a graph network (with only 0s and 1). The graph represents the relationships between Investopedia articles. All relationships are one-way relationships. A relationship from article A to article B is defined as A having a link that leads to article B.
    """
    if filename_of_csv_with_links=="":
        list_all_urls = _get_all_article_urls(root_investopedia_url)
    else:
        list_all_urls = _load_csv_with_links(filename_of_csv_with_links)
    
    # Check for duplicate links
    print("Duplicates:\n", [item for item, count in collections.Counter(list_all_urls).items() if count > 1], "\n\n\n")
    
    # Result is stored in a csv, return is empty
    _get_article_texts(list_all_urls)
    
    matrix_representation_of_article_graph = []
    
    return matrix_representation_of_article_graph

In [143]:
run_crawler(root_investopedia_url,filename_of_csv_with_links="links.csv") #  root_investopedia_url defined above

Loaded links successfully from csv. 12270 links were found.



Duplicates:
 [] 





 TZS (Tanzanian Shilling) with 11 elements:

Element:  <selenium.webdriver.remote.webelement.WebElement (session="79313bef8e7fc6961224af143d33be92", element="0.8421822799879741-4")>
Text:  
href:  None


NoSuchWindowException: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=70.0.3538.77)
  (Driver info: chromedriver=2.45.615279 (12b89733300bd268cff3b78fc76cb8f3a7cc44e5),platform=Linux 4.15.0-43-generic x86_64)
