### Web Scraping


## Import all necessary libraries

In [None]:
import re
import pandas as pd
import requests
from bs4 import BeautifulSoup

## Scraping bioinformatics jobs

### Get the page

In [None]:
LENGTH = 10 # number of jobs to retrieve
JOB_URL = 'https://www.bioinformatics.org/jobs/?group_id=101&summaries=1&length=%s' % (LENGTH-1)

# get the HTML of the page with requests.get()
# the timeout keeps the cell from hanging forever if the server never answers
r = requests.get(JOB_URL, timeout=30)

# stop here on an HTTP error (4xx/5xx) instead of parsing an error page
r.raise_for_status()

# now pass the page content to bs4, naming the parser explicitly so that the
# parsed tree does not depend on which optional parsers happen to be installed
soup = BeautifulSoup(r.content, 'html.parser')

### Get the elements of interest

Looking at the HTML of the web page, the job opportunities are in _tables_: specifically, in the tables whose content includes the text "Opportunity".

Get all the tables, and then retain only those with "Opportunity" in the content, storing them in a list.

In [None]:
# collect every <table> element of the page
tables = soup.find_all('table')

# the jobs are prefixed by the text "Opportunity".
# Verify this by dumping the text content of each table.
for candidate in tables:
    print(candidate.get_text())

In [None]:
# keep, in a list, only the tables whose text mentions "Opportunity"
entries = [table for table in tables if 'Opportunity' in table.text]

In [None]:
# drop the first two items of the list in place, keeping the remaining items
del entries[:2]

In [None]:
# check what an entry looks like, using entries[0] as an example
# (being the last expression of the cell, its value is displayed)
entries[0]

In [None]:
# each entry is a BeautifulSoup structure; type() shows its exact class
type(entries[0])

In [None]:
# We can apply several bs4 methods and attributes to it;
# for example, "text" gives the text content of the entry.
entries[0].text

In [None]:
# The "a" HTML tags identify links.
# Take a single entry and use the bs4 find_all method to collect
# its 'a' tags that carry an href attribute.
first_entry = entries[0]
links = first_entry.find_all('a', href=True)

# check what we got
links

In [None]:
# "links" is a list: only its first element is of interest,
# because the second refers to the person who posted the job opportunity.

# show the "href" (i.e. the actual link) of every HTML tag that was found
for anchor in links:
    print(anchor['href'])

In [None]:
# the "href" of the first element only, i.e. the link to keep:
print(links[0]['href'])

### Parse the text and store it in a list

Parse the text of each item of the 'entries' list using some Python constructs, as well as a _regular expression_. Look at the text for an item (see above); it is something like:

```
'\n\n\n\nOpportunity: Bioinformatics Data Analyst @ Bowie State University -- Bowie, MD (US)\nSubmitted by Konda Reddy Karnati; posted on Friday,\xa0January\xa028,\xa02022 \n\n\n\n'
```

- use strip() to remove the \n characters at the beginning and at the end of the text
- the text contained between ':' and '@' is the job title
- the text contained between '@' and '\n' is the job location
- the text after 'posted on ' is the publication date. Here we also need to replace \xa0 with a regular space

Also, a link will be something like this (see above):

```
https://www.bioinformatics.org/forums/forum.php?forum_id=14619
```

- the text after 'forum_id=' is the job number, so  extract that as well

Put all the extracted elements in the list called `my_jobs`.

In [None]:
# Regular expressions used to parse each entry, compiled once outside the loop.
# Entry text:
#   group 1: job title (between 'Opportunity: ' and ' @ ')
#   group 2: job location (between ' @ ' and the end of the first line)
#   group 3: publication date (after 'posted on ', up to the end of the text)
entry_pattern = re.compile(r'Opportunity: (.+?) @ (.+?)\n.+?; posted on (.+?)$')
# Job URL:
#   group 1: everything after 'forum_id=', i.e. the job number
job_id_pattern = re.compile(r'.*forum_id=(.+)$')

# Each item of my_jobs is a list: [job_id, title, location, link, date]
my_jobs = list()
for entry in entries:
    # the job link is the first link of the entry with 'forum' in its URL;
    # it stays None when the entry has no such link
    link = None
    for anchor in entry.find_all('a', href=True):
        if 'forum' in anchor['href']:
            link = anchor['href']
            break

    text = entry.text.strip()

    # parse the text, grouping the interesting parts as explained above
    m = entry_pattern.search(text)
    if m is None:
        # the table mentions "Opportunity" but is not laid out like a job
        # entry: report it and move on instead of failing on m.group()
        print("Skipping entry with unexpected format: %r" % text)
        continue

    # extract the various groups of the regular expression
    title = m.group(1)
    location = m.group(2)
    date = m.group(3).replace('\xa0', ' ') # replace also \xa0 with a space

    # extract the job id from the URL (None if there is no link,
    # or if the link carries no 'forum_id=' part)
    job_id = None
    if link:
        id_match = job_id_pattern.search(link)
        if id_match is not None:
            job_id = id_match.group(1)

    # finally, append all the extracted elements to a list
    my_jobs.append([job_id, title, location, link, date])

In [None]:
# check what we got in the end:
# one [job_id, title, location, link, date] list per job
my_jobs

In [None]:
# print every job as a small formatted record;
# each item of my_jobs unpacks into its five fields
for number, role, place, url, posted in my_jobs:
    print("Job number: %s" % number)
    print("\tTitle: %s" % role)
    print("\tLocation: %s" % place)
    print("\tURL: %s" % url)
    print("\tPublished on: %s\n" % posted)

### Getting information from a secondary page

Apply the same web scraping techniques also to the pages detailing each job. For example, go through the list of jobs that are retrieved so far, visit the respective URLs, and fetch the "DEADLINE". Store this deadline into a python dictionary called `my_deadlines`; in this dictionary, define the key to be the job ID, and as value the deadline.

Looking at the HTML for one of the URLs, the deadline label is an element with the class `sf-news` whose text is "DEADLINE". There might be more than one element with the `sf-news` class, so loop through all of them, and stop when you find the one with the DEADLINE text. The actual deadline is contained in the next element, which can be reached through the `next_sibling` attribute of that element.

The deadline string will have some \r or \n characters before and after it, so we will remove them with `strip()`.

__However__, the "DEADLINE" field is _not_ mandatory, so it might not be present on a page. Therefore consider also this case.

In [None]:
# check what a job looks like, displaying for example my_jobs[0]
# (index 0 is the job ID and index 3 is the link, both used in the next cell)
my_jobs[0]

In [None]:
# my_deadlines maps each job ID to its deadline string,
# or to 'No deadline' when the job page shows none
my_deadlines = dict()
for job in my_jobs:
    job_id = job[0]
    job_link = job[3]

    # a job without a link has no detail page to visit (requests.get(None)
    # would raise), so there is nothing to look up for it
    if not job_link:
        continue

    # get the page referenced by the current job
    # (the timeout avoids waiting forever on an unresponsive server)
    r = requests.get(job_link, timeout=30)

    # pass the page content to bs4, naming the parser explicitly so that the
    # result does not depend on which optional parsers are installed
    soup = BeautifulSoup(r.content, 'html.parser')

    # the DEADLINE field is not mandatory: start from the fallback value
    deadline = 'No deadline'

    # find all the "sf-news" classes and get the one with text "DEADLINE"
    for c in soup.find_all(class_='sf-news'):
        if c.text.strip() == "DEADLINE":
            # the deadline is held by the node right after the label
            sibling = c.next_sibling
            if isinstance(sibling, str):
                # plain text node: remove the surrounding \r and \n
                deadline = sibling.strip()
            elif sibling is not None:
                # the next node is a tag: use its text content
                deadline = sibling.get_text().strip()
            # label found: no need to look at the other elements
            break

    my_deadlines[job_id] = deadline

In [None]:
# check that my_deadlines contains what we want:
# job IDs as keys, deadline strings (or 'No deadline') as values
my_deadlines

### Combine everything into a single block of code and store the info in a database

Combine everything into a single block of code, and for example print out all the gathered information.


In [None]:
# this would be the final code, from start to end:
# fetch the job listing, parse every job entry, visit each job page to read
# its deadline, and print everything with some formatting

LENGTH = 10 # number of jobs to retrieve
JOB_URL = 'https://www.bioinformatics.org/jobs/?group_id=101&summaries=1&length=%s' % (LENGTH-1)

# get the HTML of the page with requests.get()
# (the timeout avoids waiting forever on an unresponsive server)
r = requests.get(JOB_URL, timeout=30)

# stop here on an HTTP error (4xx/5xx) instead of parsing an error page
r.raise_for_status()

# pass the page content to bs4, naming the parser explicitly so that the
# result does not depend on which optional parsers are installed
soup = BeautifulSoup(r.content, 'html.parser')

# get all the tables
tables = soup.find_all('table')

# store the interesting tables (those mentioning "Opportunity") in a list
entries = [table for table in tables if 'Opportunity' in table.text]

# remove the first two entries
entries = entries[2:]

# regular expressions, compiled once outside the loop
# entry text: group 1 = title, group 2 = location, group 3 = publication date
entry_pattern = re.compile(r'Opportunity: (.+?) @ (.+?)\n.+?; posted on (.+?)$')
# job URL: group 1 = everything after 'forum_id=', i.e. the job number
job_id_pattern = re.compile(r'.*forum_id=(.+)$')

for entry in entries:
    # the job link is the first link of the entry with 'forum' in its URL;
    # it stays None when the entry has no such link
    link = None
    for anchor in entry.find_all('a', href=True):
        if 'forum' in anchor['href']:
            link = anchor['href']
            break

    text = entry.text.strip()

    # parse the text, grouping the interesting parts
    m = entry_pattern.search(text)
    if m is None:
        # the table mentions "Opportunity" but is not laid out like a job
        # entry: report it and move on instead of failing on m.group()
        print("Skipping entry with unexpected format: %r" % text)
        continue

    # extract the various groups of the regular expression
    title = m.group(1)
    location = m.group(2)
    date = m.group(3).replace('\xa0', ' ') # replace also \xa0 with a space

    # values used when the entry has no link to follow
    job_id = None
    deadline = 'unknown'

    if link:
        # extract the job id from the link
        id_match = job_id_pattern.search(link)
        if id_match is not None:
            job_id = id_match.group(1)

        # get the deadline by scraping the link
        link_page = requests.get(link, timeout=30)
        # pass the page content to bs4
        link_soup = BeautifulSoup(link_page.content, 'html.parser')

        # the DEADLINE field is not mandatory: start from the fallback value
        deadline = 'No deadline'

        # find all the "sf-news" classes and get the one with text "DEADLINE"
        for c in link_soup.find_all(class_='sf-news'):
            if c.text.strip() == "DEADLINE":
                # the deadline is held by the node right after the label
                sibling = c.next_sibling
                if isinstance(sibling, str):
                    # plain text node: remove spurious characters around it
                    deadline = sibling.strip()
                elif sibling is not None:
                    # the next node is a tag: use its text content
                    deadline = sibling.get_text().strip()
                # label found: no need to look at the other elements
                break

    # print out all that we have gathered, with some formatting
    print("Job number: %s" % job_id)
    print("\tTitle: %s" % title)
    print("\tLocation: %s" % location)
    print("\tURL: %s" % link)
    print("\tPublished on: %s" % date)
    print("\tDeadline: %s" % deadline)
    