pip install html5print

# Wikipedia Tables Crawler

## Imports

In [284]:
from bs4 import BeautifulSoup as soup
import scrapy
import os
import pandas as pd
from urllib import request, response, error
from html5print import HTMLBeautifier
from typing import NewType, Set
from time import sleep
from datetime import datetime
from enum import Enum

## Types

In [155]:
URL = NewType('URL', str)
HTML = NewType('HTML', str)
SoupObj = NewType('SoupObj', soup)
Path = NewType('Path', str)

## Initial Configuration
Import the whole Wikipedia links

In [349]:
ROOT_PATH = "/Users/jvsn/Documents/tcc"
WIKIPEDIA_PATH = '/Users/jvsn/Documents/tcc/csv_folder/links_webtables.csv'
WIKIPEDIA_ROOT_LINK = 'https://en.wikipedia.org'
SLEEP_TIME = 5
LOG_PATH = f'{ROOT_PATH}/logs.txt'

## Regex Configuration

In [255]:
WIKI_PAGES = r'^/wiki/.*'

## Enums

In [297]:
class LogTypes(Enum):
    INFO = 0
    WARNING = 1
    ERROR = 2

## Code

In [290]:
class Logger:
    def __init__(self):
        # Check if exists a log file
        if not os.path.exists(LOG_PATH) or not os.path.isfile(LOG_PATH):
            self._create_log()

    # Private Methods
    def _create_log(self):
        now = self._get_now()
        
        with open(LOG_PATH, 'w+') as log_file:
            log_file.write(f'Starting Log at {now}')

    def _get_now(self):
        return str(datetime.now())

    # Public Methods
    @classmethod
    def log(self, log_type: LogTypes, message: str):
        now = self._get_now()

        with open(LOG_PATH, 'a+') as log_file:
            log_file.write(f'[{now}][{log_type}] - {message}')

In [205]:
def get_html(url: Url) -> response:
    return request.urlopen(url)

In [317]:
def get_urls_from_csv(csv_path: str, limit: int = -1) -> {URL}:
    '''
    Function that reads a CSV file with urls. The CSV should follow the pattern:
    file.csv:
        <title> # This line will be ignored
        <url_1>
        ...
        <url_n>
    
    Arguments:
    path: URL to be donwloaded
    
    Optional arguments:
    limit: limit of the number of lines returned, default infinite
    
    Return:
    A set of URL. The set approach is to avoid repeated urls.
    '''
    with open(csv_path, 'r') as paths_file:
        paths_file.readline() # The first line is useless
        output = set()
        
        while paths_file and limit:
            output.add(paths_file.readline())
            # If limit is negative, it value will never change
            limit = limit - 1 if limit > 0 else limit
            
        return output

In [211]:
def to_beautiful_soup(html: HTML, parser = "html5lib") -> SoupObj:
    return soup(html, parser)

In [212]:
def print_html(soup_object: SoupObj) -> None:
    print(soup_object.prettify())

In [325]:
def main():
    urls = get_urls_from_csv(WIKIPEDIA_PATH, 1)
    size_urls = len(urls)
    should_sleep = size_urls > 1
    page_references = dict()
    
    for (idx, url) in enumerate(urls):
        try:
            soup_obj = to_beautiful_soup(get_html(url))
        except Exception as e:
            Logger.log(log_type=LogTypes.ERROR, message = e.strerror)
        
        page_references[url] = wikipedia_table_parser(soup_obj, url)
    
    return page_references

In [345]:
def wikipedia_table_parser(soup_obj, page_url: URL):
    '''
    This is a custom parser to get all the tables from a wikipedia page.
    From each table, the goal is to get all references. The references follow the pattern:
    - /wiki/.* : other wiki pages. Build the url as {WIKIPEDIA_ROOT_LINK}{<match>}
    - #cite : citations in the same wiki. Need to search the {current_page}<citation> and get the href of it
    - #CITE : same as #cite
    - http[s]?://<url> common url
    - //<url> : same as putting http[s]?://<url>. This case needs to add the http
    '''
    references_url = set()
    invalid_urls_regex = r'.*(Wikipedia:|Talk:)'
    
    for soup_obj_table in soup_obj.find_all('table', attrs={'class': 'wikitable'}):
        soup_obj_with_href = soup_obj_table.find_all(href=True)
        
        for href in soup_obj_with_href:
            href = href.get('href')
            
            # Filter invalid urls
            href = href.split('#')[0] # Get only the first url slice if the link is <url>#session
            if re.match(invalid_urls_regex, href):
                continue
            
            # Match possibilities
            if re.match(r'^/wiki/.*', href):
                references_url.add(f'{WIKIPEDIA_ROOT_LINK}{href}')
            elif re.match(r'^#(cite|CITE).*', href):
                new_url = f'{page_url}{href}'
            elif re.match(r'^//.*', href):
                references_url.add(f'https:{href}')
            elif re.match(r'^https?.*', href):
                references_url.add(href)
            
    return references_url

In [305]:
def parser(tag: str, attrs: dict):
    pass

## Testing

In [85]:
import unittest

In [183]:
class TestHTMLRequests(unittest.TestCase):
    def test_sucessful_request(self):
        url = "http://www.google.com"
        response = get_html(url)
        self.assertEqual(200, response.status)
        
    def test_fail_request(self):
        url = "http://www.aodhasda.com"
        with self.assertRaises(error.URLError):
            get_html(url)

In [184]:
a = TestHTMLRequests()
a.test_fail_request()

In [185]:
suite = unittest.TestSuite()

In [188]:
suite.addTest(TestHTMLRequests('test_sucessful_request'))
suite.addTest(TestHTMLRequests('test_fail_request'))

In [303]:
try:
    del a
    a = 2 / 0
except Exception as e:
    print(e)
a

division by zero


NameError: name 'a' is not defined

In [315]:
re.match(r'https?', 'https')

<re.Match object; span=(0, 5), match='https'>

In [350]:
main()

{'https://en.wikipedia.org/wiki/Timeline_of_the_evolutionary_history_of_life\n': {'https://en.wikipedia.org/wiki/Abiogenesis',
  'https://en.wikipedia.org/wiki/Acanthostega',
  'https://en.wikipedia.org/wiki/Acasta_Gneiss',
  'https://en.wikipedia.org/wiki/Accretion_disc',
  'https://en.wikipedia.org/wiki/Acipenseridae',
  'https://en.wikipedia.org/wiki/Acritarch',
  'https://en.wikipedia.org/wiki/Actinopterygii',
  'https://en.wikipedia.org/wiki/Adelobasileus',
  'https://en.wikipedia.org/wiki/Adenosine_triphosphate',
  'https://en.wikipedia.org/wiki/Africa',
  'https://en.wikipedia.org/wiki/Agnatha',
  'https://en.wikipedia.org/wiki/Alaska',
  'https://en.wikipedia.org/wiki/Alligator',
  'https://en.wikipedia.org/wiki/Alluvium',
  'https://en.wikipedia.org/wiki/Ammonite',
  'https://en.wikipedia.org/wiki/Ammonoidea',
  'https://en.wikipedia.org/wiki/Amniote',
  'https://en.wikipedia.org/wiki/Angiosperm',
  'https://en.wikipedia.org/wiki/Animal',
  'https://en.wikipedia.org/wiki/Ant',

In [344]:
re.match(r'.*(Wikipedia:|Talk:)', 'https://en.wikipedia.org/wiki/Toothed_whale')

<re.Match object; span=(0, 35), match='https://en.wikipedia.org/wiki/Talk:'>

In [373]:
def a(*args):
    my_set = set([3])
    my_set.update(args)
    print(my_set)

In [375]:
a(1,2)

{1, 2, 3}


In [376]:
a = set([1,2,3,4,5])

In [377]:
for idx, k in enumerate(a):
    print(idx, k)

0 1
1 2
2 3
3 4
4 5
