In [None]:
from IPython.core.display import HTML
# Read the stylesheet that sits one directory above this notebook and wrap its
# text in IPython's HTML object.  The object is the last expression of the
# cell, so it is presumably what the notebook renders as the cell's output.
with open ("../style.css", "r") as file:
    css = file.read()
HTML(css)

# A Simple Email Harvester

We will use three libraries:
* `re` is the library for regular expressions.  
   + `re.compile(r)` compiles a regular expression `r` into a
     *finite state machine* that implements this regular expression.
   + `o.findall(s)` takes a finite state machine `o` and 
     a string `s`.  It returns a list containing all substrings of `s`
     that are matched by the regular expression that was compiled into `o`.
   + `re.sub(o, t, s)` receives three arguments:
     1. `o` is a finite state machine that is the result of compiling some 
        regular expression `r`.
     2. `t` is a string.
     3. `s` is a string.
     
     The function finds all substrings that are matched by `r` and replaces these substrings with `t`.
     The resulting string is returned.
* `requests` is used to send `HTTP` requests.

  We will use this library to download webpages.  The function
  ```
  response = requests.get(url)
  ```
  is used to download a web page.  The text of this webpage can then be retrieved as
  ```
  page = response.text
  ```
* `urllib.parse`  defines functions to manipulate URLs and their component parts.
  + `urljoin(base_url, relative_url)` combines `base_url` and `relative_url` into a url.
  + `urlparse(url)` creates an object that has the attribute `netloc`.  This attribute can be used
    to check the host that provides the given url.

In [None]:
import re
import requests
from urllib.parse import urljoin, urlparse

First, we compile some regular expressions into finite state machines and store these FSMs
in global variables.
1. The regular expression `r'\<span style="display: none;"\>[^<>]*\</span\>'`
   is used because most email addresses are disguised as follows:
   ```
   karl.stroetmann<span style="display: none;"> No Spam </span>@dhbw-mannheim.de
   ```
   We have to remove the part `<span style="display: none;"> No Spam </span>` from the web page
   so that email addresses can be recognized.
2. The regular expression `r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'` is used
   to locate email addresses.
3. The regular expression `r'<a [^>]*href=["\'](.*?)["\']'` is used
   to locate hypertext links.

In [None]:
# Matches a hidden `<span style="display: none;"> ... </span>` element whose
# content contains no further tags.  Such spans are inserted into email
# addresses to hide them from harvesters.
INVISIBLE = re.compile(r'\<span style="display: none;"\>[^<>]*\</span\>')
# Matches an email address: local part, '@', domain, and a final dot followed
# by at least two letters.
EMAIL     = re.compile(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}')
# Matches an `<a ... href="...">` tag; group 1 captures the quoted link target
# (non-greedy, single or double quotes).
LINK      = re.compile(r'<a [^>]*href=["\'](.*?)["\']')

The function `retrieve_page(url)` retrieves the text of the web page at the given `url`.
It also removes text like `<span style="display: none;"> No Spam </span>` from this text.
Text of this kind is sometimes inserted into email addresses to make it more difficult to
harvest them.

In [None]:
def retrieve_page(url, timeout=10):
    """Download the web page at `url` and return its text without hidden spans.

    Every substring matched by `INVISIBLE` (hidden `<span>` elements that are
    used to disguise email addresses) is removed from the page text.

    Args:
        url: Address of the page to download.
        timeout: Number of seconds to wait for the server before giving up.
            Without a timeout, `requests.get` may block indefinitely when a
            server stops responding.

    Returns:
        The text of the page with all hidden spans removed.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    response = requests.get(url, timeout=timeout)   # make http request
    return INVISIBLE.sub('', response.text)         # strip the hidden spans

The function `extract_emails(url)` takes a web address and tries to find all email addresses that occur on web pages
that are reachable from `url` and are hosted on the same server.

In [None]:
def extract_emails(url):
    """Crawl the server hosting `url` and collect all email addresses found.

    Starting at `url`, pages are downloaded with `retrieve_page`, scanned for
    email addresses with `EMAIL`, and scanned for links with `LINK`.  Links
    are followed only if they point to the same host as the page they occur
    on, contain no query string, and do not end in one of the skipped file
    extensions.  Each address is printed when it is seen for the first time.

    Pages that cannot be downloaded and links that cannot be parsed are
    skipped, so that a single bad page does not abort the whole crawl.

    Args:
        url: Address of the page where the crawl starts.

    Returns:
        The set of all email addresses that have been found.
    """
    # Extensions of files that are not web pages and are never downloaded.
    skipped_endings = ('.pdf', '.docx', '.png', '.jpg', '.xlsx', '.mp4')
    Emails        = set()                # set to store unique Emails
    ProcessedUrls = set()                # set to store unique processed URLs
    URL_Stack     = [url]                # stack to manage URLs to be scraped
    while URL_Stack:
        current_url = URL_Stack.pop()
        if current_url in ProcessedUrls:     # URL has already been processed
            continue
        ProcessedUrls.add(current_url)       # mark URL as processed
        try:
            page = retrieve_page(current_url)
        except requests.RequestException as error:
            # an unreachable page must not discard the emails found so far
            print(f'Skipping {current_url}: {error}')
            continue
        # extract Emails using regex
        for mail in EMAIL.findall(page):
            if mail not in Emails:
                print(mail)
                Emails.add(mail)
        # extract links using regex and follow those on the same server
        host = urlparse(current_url).netloc
        for link in LINK.findall(page):
            try:
                # drop the fragment: 'page#a' and 'page#b' are the same page
                next_url = urljoin(current_url, link).split('#', 1)[0]
                same_host = urlparse(next_url).netloc == host
            except ValueError:               # malformed link, e.g. bad IPv6 host
                continue
            if not same_host or '?' in next_url:
                continue
            if next_url.lower().endswith(skipped_endings):
                continue
            if next_url not in ProcessedUrls:
                URL_Stack.append(next_url)
    return Emails

In [None]:
# Example usage:
# Crawl the site starting at its home page and collect the email addresses.
url = 'https://dhbw-mannheim.de'
emails = extract_emails(url)
# Print the complete result set.  NOTE: extract_emails already prints every
# address when it is first found, so each address shows up twice in the output.
for email in emails:
    print(email)