In [70]:
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Web Scraping

In [6]:
def get_job_links(url_search, url_base, timeout=30):
    """
    Return absolute links to the job postings listed on a search-results page.

    Params:
        url_search: URL of the search-results page to download.
        url_base: base URL that each job's href is resolved against.
        timeout: seconds to wait for the server before giving up.
    Returns:
        A list of job URLs in page order. Empty when the page contains no
        anchor with the class "SerpJob-titleLink".
    Raises:
        requests.RequestException: on connection failure, timeout, or a
            4xx/5xx response.
    """
    # A timeout keeps a stalled connection from hanging the notebook forever.
    r = requests.get(url_search, timeout=timeout)
    # Fail loudly on an error page instead of silently returning no links.
    r.raise_for_status()
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(r.content, "html.parser")

    # extract the links from the job titles
    jobs = soup.find_all("a", {"class": "SerpJob-titleLink"})

    job_links = []
    for job in jobs:
        href = job.get('href')
        # An anchor without an href would otherwise produce "<base>None".
        if href:
            # urljoin copes with both relative and already-absolute hrefs.
            job_links.append(urljoin(url_base, href))
    return job_links

In [7]:
def get_job_details(urls, timeout=30):
    """
    Download each job posting and extract its main fields.

    This is best-effort: a posting that cannot be downloaded, or that lacks
    one of the expected page elements, is reported and skipped so that one
    bad page does not abort the whole run.

    Params:
        urls: iterable of job-posting URLs.
        timeout: seconds to wait for each page before skipping it.
    Returns:
        A list with one dict per successfully parsed posting, with the keys
        'title', 'company', 'location', 'desc' and 'key-words'
        ('key-words' is a list of strings, the rest are strings).
    """
    print("Scrapping RSS Job Search Feed")
    print("="*100)
    job_detail = []
    for url in urls:
        try:
            # get url content, convert to soup
            r = requests.get(url, timeout=timeout)
            r.raise_for_status()
            soup = BeautifulSoup(r.content, "html.parser")

            # extract content; a missing element makes find() return None,
            # so .text raises and the posting is skipped below
            title = soup.find("div",{'class':'ViewJobHeaderTitle'}).text
            company = soup.find("div",{'class':'ViewJobHeaderCompany'}).text
            location = soup.find("span",{'class':'ViewJobHeaderPropertiesLocation'}).text
            desc = soup.find("div",{'class':'viewjob-description'}).text

            # extract key words (skills, education, benefits)
            keys = soup.find_all('li',{'class':'nav-item'})
            key_words = [w.text for w in keys]

            # build result-set
            job_detail.append(
                {'title':title,'company':company,'location':location,'desc':desc,'key-words':key_words}
            )
            # print progress
            print(f"  =>{title[:35]:{40}}| {company[:25]:{25}}| {location} | {key_words[:4]}...")
        except Exception as e:
            # Keep going, but say which posting failed and why so that a
            # systematic failure (site redesign, blocked requests) is visible.
            print(f"   =>Skipping Job... ({url}: {type(e).__name__}: {e})")
    print("="*100)
    return job_detail

In [8]:

# Site root; job hrefs found on the results page are resolved against it.
url_base = 'https://www.workopolis.com'
# Search-results page to scrape (query parameters are part of the URL).
url_search = (
    'https://www.workopolis.com/jobsearch/find-jobs?l=Toronto%2C%20ON&ak=data%20science%20-intern&t=7&sr=10&s=r&job=zD_1ItN-5JXVPUCwXgZvg_joKB7qyQNp5ttncJBtECQ1FoqVIwMR8znJuj3fF7lK'
)

# Step 1: collect the posting links; step 2: download and parse each posting.
job_links = get_job_links(url_search=url_search, url_base=url_base)
job_details = get_job_details(urls=job_links)


Scrapping RSS Job Search Feed
  =>Data Scientist                          | Zany Consulting Group    | Toronto, ON | ['Doctoral degree', "Master's degree", 'Data modeling', 'Internet of things']...
  =>Data Scientist                          | Rakuten Kobo             | Toronto, ON | ['R', 'Tableau', 'SQL', 'Machine learning']...
  =>Senior Data Analyst                     | Cineplex                 | Toronto, ON | ['Post-secondary education', 'Business intelligence', 'Tableau', 'SQL']...
  =>Senior Data Analyst                     | NielsenIQ                | Toronto, ON | ["Master's degree", "Bachelor's", 'Azure', 'Writing skills']...
  =>Business Intelligence Specialist (R     | Altus Group              | Toronto, ON | ["Bachelor's", 'Power BI', 'Azure', 'Business intelligence']...
  =>Trainee Developer - RPA/AI              | Tangentia                | Toronto, ON | []...
  =>Data Scientist MAP Centre on Drug P     | St. Michael's Hospital   | Toronto, ON | ["Master's degree", 'SAS

# Resume Scraping

# NLP

In [109]:
# Sumy: parser, tokenizer, summarizers, stemmer and stop words used by
# get_text_summary
import sumy
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
import nltk # required for sumy
nltk.download('punkt') # required for sumy

# spaCy: `nlp` is the shared pipeline object used by get_keywords and by the
# analysis cell that builds `doc`
import spacy
from spacy import displacy
nlp = spacy.load('en_core_web_lg')
# Currently disabled imports (not used by any code in this section):
# from spacy.matcher import Matcher,PhraseMatcher
# from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
# from sklearn.decomposition import LatentDirichletAllocation, NMF

# tabulate renders the tables printed by show_pos / show_entities;
# Counter does the frequency counting in get_keywords
from tabulate import tabulate
from collections import Counter

[nltk_data] Downloading package punkt to /home/ken/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [19]:
def show_pos(text):
    """
    Print a table of part-of-speech details for every token.

    Params:
        text: a parsed spaCy document (any iterable of tokens), or a plain
            string, which is parsed with the notebook's `nlp` pipeline first.
    Returns:
        None. The table is printed in GitHub-markdown format.
    """
    # Use the argument, not the notebook-global `doc`: relying on the global
    # breaks on a fresh kernel and silently shows the wrong document otherwise.
    parsed = nlp(text) if isinstance(text, str) else text

    rows = [
        {
            "Token": token.text,
            "POS": token.pos_,
            "POS-Desc": spacy.explain(token.pos_),
            "Lemma": token.lemma_,
            "Stop-Word": token.is_stop,
            "POS-Detail": token.tag_,
            "POS-Detail-Desc": spacy.explain(token.tag_),
        }
        for token in parsed
    ]
    print(tabulate(rows, headers="keys", tablefmt="github"))

In [20]:
def show_entities(doc,display=True):
    """
    Collect the named entities of a parsed document.

    Params:
        doc: parsed document exposing `.ents`.
        display: when True (and entities exist) print the entity table and
            render the displacy entity view.
    Returns:
        A list with one dict per entity (keys "Word", "Entity", "Start",
        "Stop", "Entity-Desc"); an empty list when there are no entities,
        in which case "no entities found" is printed regardless of `display`.
    """
    # Guard clause: nothing to tabulate or render.
    if not doc.ents:
        print("no entities found")
        return []

    rows = [
        {
            "Word":ent,
            "Entity":ent.label_,
            "Start":ent.start,
            "Stop":ent.end,
            "Entity-Desc":spacy.explain(ent.label_),
        }
        for ent in doc.ents
    ]

    if display:
        print(tabulate(rows,headers="keys",tablefmt="github"))
        print("\n")
        displacy.render(doc,style='ent',jupyter=True)
        print("\n")
    return rows

In [21]:
def attr_counts(doc,att):
    """
    Return token counts grouped by one token attribute.

    Params:
        doc: parsed document exposing `count_by` and `vocab`.
        att: attribute name, case-insensitive. One of 'pos', 'tag', 'lemma',
            'entity', 'email', 'url', 'currency', 'numbers'.
    Returns:
        A dict mapping the text looked up in `doc.vocab` for each counted
        key to its count. Tokens without a value show up under ''.
    Raises:
        ValueError: if `att` is not one of the supported names.
    """
    attribute_ids = {
        'pos': spacy.attrs.POS,
        'tag': spacy.attrs.TAG,
        'lemma': spacy.attrs.LEMMA,
        'entity': spacy.attrs.ENT_TYPE,
        'email': spacy.attrs.LIKE_EMAIL,
        'url': spacy.attrs.LIKE_URL,
        'currency': spacy.attrs.IS_CURRENCY,
        'numbers': spacy.attrs.IS_DIGIT,
    }
    key = att.lower()
    # An unknown name used to fall through the elif chain and crash with an
    # UnboundLocalError on `cnts`; report the real problem instead.
    if key not in attribute_ids:
        raise ValueError(
            f"att must be one of {sorted(attribute_ids)}, got {att!r}"
        )
    cnts = doc.count_by(attribute_ids[key])

    # lookup attribute name
    # NOTE(review): for the flag attributes (email, url, currency, numbers)
    # the counted keys are presumably flag values rather than string ids;
    # confirm the vocab lookup gives meaningful labels for them.
    return {doc.vocab[k].text: v for k, v in cnts.items()}

In [87]:
def get_text_summary(text,text_source,summarizer,num_sentences,print_summary=True):
    """
    Generate a text summary from a document (extractive)
    Params:
        text: plain text or path to a document
        text_source: 'text' or 'file'
        summarizer: text-summarization method: 'lex-rank', 'text-rank' or 'lsa'
        num_sentences: Number of summary sentences to generate
        print_summary: T/F
    Returns:
        A list of the sentences selected by the summarizer.
    Raises:
        ValueError: if text_source or summarizer is not a supported value.
    """
    # Create parser from text-source
    if text_source=='text':
        parser = PlaintextParser.from_string(text,Tokenizer('english'))
    elif text_source=='file':
        parser = PlaintextParser.from_file(text,Tokenizer('english'))
    else:
        raise ValueError("text source must be 'text' or 'file'!")

    # Select summarizer method
    if summarizer=='lex-rank':
        # Lex Rank:  A graph-based summarization method that uses keyword extractions
        s = LexRankSummarizer()
    elif summarizer=='text-rank':
        # Text rank: A graph-based summarization method that uses keyword extractions
        s = TextRankSummarizer()
    elif summarizer=='lsa':
        # Latent semantic analysis
        # Combines term frequency with singular value decomposition
        s = LsaSummarizer(Stemmer("english"))
        s.stop_words = get_stop_words("english")
    else:
        # Only list the methods that are actually implemented above.
        raise ValueError("Summarizer must be 'lex-rank','text-rank' or 'lsa'!")

    # create summary; materialise it so it can be both printed and returned
    summary = list(s(parser.document,num_sentences))

    # print summary
    if print_summary:
        print(f'Text Summary:Top {num_sentences} sentences using a {summarizer} summarizer')
        print('='*100)
        for sentence in summary:
            print(sentence)
        print('='*100,'\n')
    # Return the sentences (as documented), not the summarizer object.
    return summary

In [116]:
def get_keywords(text,num, print_words=True):
    """
    Return the most frequent noun-like words of a text.

    The text is lower-cased and run through the shared `nlp` pipeline; tokens
    tagged NOUN or PROPN whose text is not in the pipeline's default stop-word
    set are counted.

    Params:
        text: plain text to analyse.
        num: how many of the most frequent words to return.
        print_words: when True, print the (word, count) pairs.
    Returns:
        A list of the `num` most frequent words, most frequent first.
    """
    wanted_pos = ('NOUN','PROPN')
    parsed = nlp(text.lower())
    stop_words = nlp.Defaults.stop_words

    candidates = [
        tok.text
        for tok in parsed
        if tok.text not in stop_words and tok.pos_ in wanted_pos
    ]

    # (word, count) pairs, highest count first
    keyword_frequency = Counter(candidates).most_common(num)
    keywords = [term for term, _count in keyword_frequency]

    if print_words:
        print(keyword_frequency)
    return keywords

In [115]:
# Run the NLP helpers on the description of the first scraped posting.
doc = nlp(job_details[0]['desc'])

# Top-5 keywords by frequency, then a 2-sentence LSA summary.
keywords = get_keywords(job_details[0]['desc'], num=5)
s = get_text_summary(
    text=job_details[0]['desc'],
    text_source='text',
    summarizer='lsa',
    num_sentences=2,
)

# Named entities, entity-type counts, and the per-token POS table.
show_entities(doc, display=True)
print(attr_counts(doc, att='entity'))
show_pos(doc)

[('data', 16), ('experience', 9), ('process', 5), ('ability', 3), ('job', 2)]
Text Summary:Top 2 sentences using a lsa summarizer
Key Accountabilities: Partner with internal stakeholders from multiple departments to identify opportunity for applying data science to solve complex business challenges such as maximize yield, process robustness, predictable supply, proactive identification of potential issues.
Experience with developing business requirements, use cases and user stories in a data analytics context.

| Word                                       | Entity      |   Start |   Stop | Entity-Desc                             |
|--------------------------------------------|-------------|---------|--------|-----------------------------------------|
| PhD                                        | WORK_OF_ART |     153 |    154 | Titles of books, songs, etc.            |
| Chemical Process or Industrial Engineering | ORG         |     155 |    160 | Companies, agencies, institutions, et



{'': 315, 'WORK_OF_ART': 2, 'ORG': 9, 'DATE': 2}
| Token            | POS   | POS-Desc                 | Lemma            | Stop-Word   | POS-Detail   | POS-Detail-Desc                                    |
|------------------|-------|--------------------------|------------------|-------------|--------------|----------------------------------------------------|
| Job              | PROPN | proper noun              | Job              | False       | NNP          | noun, proper singular                              |
| Posting          | NOUN  | noun                     | posting          | False       | NN           | noun, singular or mass                             |
| Start            | PROPN | proper noun              | Start            | False       | NNP          | noun, proper singular                              |
| Date             | PROPN | proper noun              | Date             | False       | NNP          | noun, proper singular                              |
| :    