In [238]:
from bs4 import BeautifulSoup
import requests
import os
import PyPDF2 

# Web Scraping

In [6]:
def get_job_links(url_search,url_base):
    """ 
    Return a list of links to jobs returned from the url_search link

    Params:
        url_search: URL of the search-results page to download
        url_base: prefix prepended to every href found on that page
    Returns:
        A list of strings, one per anchor with class "SerpJob-titleLink"
        that carries an href.
    Raises:
        requests.RequestException: the download failed or timed out
    """
    # bound the wait so a stalled server cannot hang the notebook forever
    r = requests.get(url_search, timeout=30)
    # name the parser explicitly so the parse tree does not depend on which
    # optional parsers happen to be installed
    soup = BeautifulSoup(r.content, "html.parser")

    # extract the links from the job titles
    jobs = soup.find_all("a",{"class":"SerpJob-titleLink"})

    # skip anchors without an href; formatting None would yield "<url_base>None"
    job_links = []
    for job in jobs:
        href = job.get('href')
        if href:
            job_links.append(f"{url_base}{href}")
    return job_links

In [124]:
def get_job_details(urls):
    """
    Download each job page and extract its headline fields.

    Params:
        urls: iterable of job-page URLs
    Returns:
        A list of dicts with the keys 'title', 'company', 'location', 'desc'
        and 'keywords' (the text of every <li class="nav-item"> on the page).
        A page that cannot be downloaded, or that lacks one of the expected
        elements, is skipped and the reason is printed.
    """
    print("Scrapping RSS Job Search Feed")
    print("="*100)
    job_detail = []
    for url in urls:
        try:
            # get url content, convert to soup (explicit parser, bounded wait)
            r = requests.get(url, timeout=30)
            soup = BeautifulSoup(r.content, "html.parser")

            # extract content; find() returns None for a missing element, so
            # .text raises AttributeError and the page is skipped below
            title = soup.find("div",{'class':'ViewJobHeaderTitle'}).text
            company = soup.find("div",{'class':'ViewJobHeaderCompany'}).text
            location = soup.find("span",{'class':'ViewJobHeaderPropertiesLocation'}).text
            desc = soup.find("div",{'class':'viewjob-description'}).text

            # extract key words (skills, education, benefits)
            key_words = [item.text for item in soup.find_all('li',{'class':'nav-item'})]

            # build result-set
            job_detail.append(
                {'title':title,'company':company,'location':location,'desc':desc,'keywords':key_words}
            )
            # print progress
            print(f"  =>{title[:35]:{40}}| {company[:25]:{25}}| {location} | {key_words[:4]}...")
        except Exception as e:
            # best-effort scrape: keep going, but say why this page was dropped
            print(f"   =>Skipping Job... ({type(e).__name__}: {e})")
    print("="*100)
    return job_detail

In [165]:

# Search-results page to scrape; the query string encodes location "Toronto, ON"
# and the search phrase "data science -intern"
url_search = 'https://www.workopolis.com/jobsearch/find-jobs?l=Toronto%2C%20ON&ak=data%20science%20-intern&t=7&sr=10&s=r&job=zD_1ItN-5JXVPUCwXgZvg_joKB7qyQNp5ttncJBtECQ1FoqVIwMR8znJuj3fF7lK'
# Prefix prepended to the hrefs found on the results page
url_base  ='https://www.workopolis.com'

# Collect the job links from the results page, then download each job page;
# job_details is indexed by later cells
job_links = get_job_links(url_search, url_base)
job_details = get_job_details(job_links)


Scrapping RSS Job Search Feed
  =>Data Scientist                          | Zany Consulting Group    | Toronto, ON | ['Doctoral degree', "Master's degree", 'Data modeling', 'Internet of things']...
  =>Data Scientist II - Machine Learnin     | Credit Sesame            | Toronto, ON | ['Git', 'SQL', 'Machine learning', 'Python']...
  =>Trainee Developer - RPA/AI              | Tangentia                | Toronto, ON | []...
  =>Senior Data Analyst                     | NielsenIQ                | Toronto, ON | ["Master's degree", "Bachelor's", 'Azure', 'Writing skills']...
  =>Data Scientist II                       | TD Bank                  | Toronto, ON | ["Bachelor's", 'Data mining', 'SAS', 'R']...
  =>Business Intelligence Specialist (R     | Altus Group              | Toronto, ON | ["Bachelor's", 'Power BI', 'Azure', 'Business intelligence']...
  =>Manager, Data Scientist                 | Rogers Communications    | Toronto, ON | ["Master's degree", "Bachelor's", 'Power BI', 'Data m

# Resume Scraping

# NLP

In [247]:
# Sumy: parser, tokenizer, summarizers, stemmer and stop words used by get_text_summary
import sumy
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
import nltk # required for sumy
nltk.download('punkt') # required for sumy

# spacy
import spacy
from spacy import displacy
# module-level pipeline shared by the functions in this notebook that call nlp(...)
nlp = spacy.load('en_core_web_lg')
# from spacy.matcher import Matcher,PhraseMatcher
# from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
# from sklearn.decomposition import LatentDirichletAllocation, NMF

# table printing, frequency counting and .docx text extraction
from tabulate import tabulate
from collections import Counter
import docx2txt

[nltk_data] Downloading package punkt to /home/ken/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [160]:
def show_pos(text):    
    """
    Print a table of part-of-speech details for every token.

    Params:
        text: a spaCy document, or a plain string (parsed with the
              module-level `nlp` pipeline before tabulating)
    Returns:
        None; the table is printed in GitHub-markdown format.
    """
    # Iterate the argument itself. The previous version looped over a global
    # named `doc`, so the table ignored whatever was passed in.
    tokens = nlp(text) if isinstance(text, str) else text

    d = [
        {
            "Token":token.text,
            "POS":token.pos_,
            "POS-Desc":spacy.explain(token.pos_),
            "Lemma":token.lemma_,
            "Stop-Word":token.is_stop,
            "POS-Detail":token.tag_,
            "POS-Detail-Desc":spacy.explain(token.tag_)
        }
        for token in tokens
    ]
    print(tabulate(d,headers="keys",tablefmt="github"))

In [161]:
def show_entities(doc,display=True):
    """
    Collect the named entities of a spaCy document.

    Params:
        doc: parsed document exposing `.ents`
        display: when True (and entities exist) print a table of the
                 entities and render the displacy entity view
    Returns:
        A list with one dict per entity (keys: Word, Entity, Start, Stop,
        Entity-Desc). "Word" holds the entity object itself. The list is
        empty when the document has no entities, in which case
        "no entities found" is printed regardless of `display`.
    """
    # guard clause: nothing to tabulate or render
    if not doc.ents:
        print("no entities found")
        return []

    rows = [
        {
            "Word":ent,
            "Entity":ent.label_,
            "Start":ent.start,
            "Stop":ent.end,
            "Entity-Desc":spacy.explain(ent.label_),
        }
        for ent in doc.ents
    ]

    if display:
        print(tabulate(rows,headers="keys",tablefmt="github"))
        print("\n")
        displacy.render(doc,style='ent',jupyter=True)
        print("\n")
    return rows

In [162]:
def attr_counts(doc,att):
    """
    Return a table of counts by attributes

    Params:
        doc: spaCy document to count over
        att: attribute selector (case-insensitive), one of 'pos', 'tag',
             'lemma', 'entity', 'email', 'url', 'currency', 'numbers'
    Returns:
        A dict mapping the looked-up attribute text to its count.
    Raises:
        ValueError: `att` is not one of the supported selectors
    """
    # selector -> name of the constant in spacy.attrs
    attr_names = {
        'pos':'POS',
        'tag':'TAG',
        'lemma':'LEMMA',
        'entity':'ENT_TYPE',
        'email':'LIKE_EMAIL',
        'url':'LIKE_URL',
        'currency':'IS_CURRENCY',
        'numbers':'IS_DIGIT',
    }
    key = att.lower()
    # Fail with a clear message; previously an unknown selector fell through
    # every branch and crashed with UnboundLocalError on `cnts`.
    if key not in attr_names:
        raise ValueError(f"Unsupported attribute '{att}': expected one of {sorted(attr_names)}")

    cnts = doc.count_by(getattr(spacy.attrs, attr_names[key]))

    # lookup attribute name
    # NOTE(review): for the boolean flags (email/url/currency/numbers) the keys
    # are presumably 0/1 rather than string ids, so this lookup may not give
    # meaningful labels - confirm.
    return {doc.vocab[k].text:v for k,v in cnts.items()}

In [163]:
def get_text_summary(text,text_source,summarizer,num_sentences,print_summary=True):
    """
    Generate a text summary from a document (extractive)
    Params:
        text: plain text or path to a document
        text_source: 'text' or 'file'
        summarizer: text-summarization method: 'lex-rank', 'text-rank' or 'lsa'
        num_sentences: Number of summary sentences to generate
        print_summary: T/F
    Returns:
        A list of sentences (strings).
    Raises:
        ValueError: text_source or summarizer is not a supported value
    """
    # Validate both selectors up front so a typo fails before any parsing work
    if text_source not in ('text','file'):
        raise ValueError("text source must be 'text' or 'file'!")
    if summarizer not in ('lex-rank','text-rank','lsa'):
        # the message lists only the methods that are actually implemented
        raise ValueError("Summarizer must be 'lex-rank','text-rank' or 'lsa'!")

    # Create parser from text-source
    if text_source=='text':
        parser = PlaintextParser.from_string(text,Tokenizer('english'))
    else:
        parser = PlaintextParser.from_file(text,Tokenizer('english'))

    # Select summarizer method
    if summarizer=='lex-rank':
        # Lex Rank:  A graph-based summarization method that uses keyword extractions
        s = LexRankSummarizer()
    elif summarizer=='text-rank':
        # Text rank: A graph-based summarization method that uses keyword extractions
        s = TextRankSummarizer()
    else:
        # Latent semantic analysis
        # Combines term frequency with singular value decomposition
        s = LsaSummarizer(Stemmer("english"))
        s.stop_words = get_stop_words("english")

    # create summary; convert to plain strings so the documented
    # "list of sentences" is what the caller receives
    summary = [str(sentence) for sentence in s(parser.document,num_sentences)]

    # print summary
    if print_summary:
        print(f'Text Summary:Top {num_sentences} sentences using a {summarizer} summarizer')
        print('='*100)
        for sentence in summary:
            print(sentence)
        print('='*100,'\n')

    # return the sentences, not the summarizer object as before
    return summary

In [206]:
def get_terms(path):
    """
    Load data science terms from a text file, one term per line.

    Every line is lower-cased and stripped of surrounding whitespace; the
    distinct results are returned as a set (a blank line contributes '').
    """
    with open(path,'r') as handle:
        return {raw.lower().strip() for raw in handle.readlines()}

In [164]:
def get_keywords(text,num, print_words=True):
    """
    Return the most frequent nouns / proper nouns found in a text.

    The text is lower-cased and parsed with the module-level `nlp` pipeline.
    Tokens are kept when they are not spaCy stop words, not in a small list
    of generic job-posting words, and tagged NOUN or PROPN.

    Params:
        text: text to analyse
        num: number of top keywords to keep
        print_words: when True, print the (word, count) pairs
    Returns:
        A list of the top `num` words, most frequent first.
    """
    allowed_pos = ['NOUN','PROPN']
    common_job_keys = ['experience','job','ability','posting','process','date','stakeholders','skills','solutions','project']
    stop_words = nlp.Defaults.stop_words
    parsed = nlp(text.lower())

    candidates = [
        tok.text
        for tok in parsed
        if tok.text not in stop_words
        and tok.text.lower() not in common_job_keys
        and tok.pos_ in allowed_pos
    ]

    # (word, count) pairs for the top keywords by frequency
    keyword_frequency = Counter(candidates).most_common(num)

    if print_words:
        print(keyword_frequency)
    return [term for term, _count in keyword_frequency]

In [267]:
def get_resume_text(path,print_text=True):
    """ 
    Read resume text content from file (pdf or docx)

    Params:
        path: path to a .pdf or .docx file (extension matched case-insensitively)
        print_text: when True, print the extracted text
    Returns:
        The extracted text as a single string.
    Raises:
        ValueError: the file extension is neither .pdf nor .docx
    """
    # Check the extension before opening anything; previously an unsupported
    # extension crashed later with UnboundLocalError on `text`.
    file_ext = os.path.splitext(path)[1].lower()
    if file_ext not in ('.pdf','.docx'):
        raise ValueError(f"Unsupported resume format '{file_ext}': expected .pdf or .docx")

    # the context manager closes the handle even if extraction raises
    with open(path,'rb') as file:
        if file_ext=='.pdf':
            # PdfReader / .pages are the current names for the deprecated
            # PdfFileReader / numPages / getPage
            pdf = PyPDF2.PdfReader(file)
            # guard against a page yielding no text
            text = "".join(page.extract_text() or "" for page in pdf.pages)
        else:
            text = docx2txt.process(file)

    if print_text:
        print(text)

    return text

# Try the reader on both supported formats. print_text defaults to True, so
# each call prints the extracted text; the last call's return value is also
# shown as the cell result.
get_resume_text('data/resume.pdf')
get_resume_text('data/resume.docx')

 
 
KEN  CONSTABLE , CFA , MSC  
647.349.6500  
ken.constable@gmail.com   
Github: https://github.com/kconstable  
LinkedIn: kenconstableconsulting  
• DATA SCIENTIST, QUANTITATIVE ANALYST/DEVELOPER AND CFA CHARTER HOLDER WITH OVER 15 YEARS OF 
EXPERIENCE ANALYSING MARKET DATA  
• IDEALLY SUITED IN THE APPLICATION OF  MACHINE LEARNING IN INVESTMENT  SETTINGS  
• A WEA LTH OF EXPERIENCE IN DATA WAREHOUSING AND AUTOMATING DATA -INTENSIVE PROCESSES  
• MASTER 'S DEGREE IN DATA SCIENCE/MACHINE LEARNING  
EDUCATION  
MASTER'S IN DATA SCIENCE (MSC), REGIS UNIVERSITY  
Focus on machine learning, deep learning, reinforcement learning, natural language processing, and data 
engineering . Master’s capstone project – Predicting Bitcoin Prices and Applied Trading Strategy using LSTM Deep 
Learning Ensemble Models  
 
BACHELOR'S IN SCIENCE (BSC), UNIVERSITY OF WATERLOO  
 
CHARTERED FINANCIAL ANALYST (CFA), CFA INSTITUTE  
REL EVA NT HIGHLIGHTS  
MACHINE LEARNING, FINANCIAL MODELING , QUANTITATIVE 

"Contact\n\n Ken Constable, BSc. MSc. CFA\n\n ken.constable@gmail.com\n\n 647-349-6500\n\n\n\n1706-11 Charlotte St\n\nToronto\n\nOntario\n\nM5V 0M6\n\n\n\nEducation\n\nMSc.\n\nRegis University\n\nData Science\n\n\n\nBSc.\n\nChemistry\n\nUniversity of Waterloo\n\n\n\nCFA Charter holder\n\nCFA Institute\n\n\n\nTechnical skills\n\nFinancial Modeling: Monte Carlo Simulations, back-testing, scenario-testing\n\nData Analysis: Python, Pandas, numpy, Scikit-learn, Tensorflow, Keras, R, Excel,VBA, xlwings\n\nMarket Data: Bloomberg, Factset, BarraOne, Eagle, Alphavantage\n\nBusiness Intelligence: T-SQL, SQL SERVER, ORACLE, HIVE, POSTGRE, SSIS, SSRS, MySQL, Redshift, S3, Airflow, AWS Glue\n\nMachine Learning: NLP, natural language processing, Deep Learning, Reinforcement Learning, time-series analysis, Supervised Learning, Unsupervised learning, Statistics, Optimization\n\nData Visualization: Tableau, Plotly, Dash, R-Shiny, matplotlib\n\n\n\n\n\nAcademic Projects in machine learning\n\nBitcoin pr

In [217]:
def search_terms(text,terms_path,print_words=True):
    """
    Count how often each known term occurs in a text.

    The text is lower-cased and tokenized with the module-level `nlp`
    pipeline; a term is counted only when it equals a single token's text.

    Params:
        text: text to search
        terms_path: path to the terms file read by get_terms
        print_words: accepted but not used by this function
    Returns:
        A list of (term, count) tuples sorted by count, highest first.
    """
    vocabulary = get_terms(terms_path)
    parsed = nlp(text.lower())

    # tally only the tokens that are known terms
    hits = Counter(tok.text for tok in parsed if tok.text in vocabulary)

    return sorted(hits.items(),key=lambda pair:pair[1],reverse=True)
        
        
# Term frequencies for the description of the second scraped job (index 1)
search_terms(job_details[1]['desc'],'data/terms.csv',True)

[('models', 10),
 ('python', 3),
 ('ml', 2),
 ('sql', 2),
 ('programming', 2),
 ('optimization', 2),
 ('predictive', 1),
 ('mlops', 1),
 ('analytics', 1),
 ('looker', 1),
 ('modeling', 1),
 ('xgboost', 1),
 ('algorithms', 1),
 ('regression', 1),
 ('boosting', 1),
 ('matplotlib', 1),
 ('seaborn', 1),
 ('plotly', 1),
 ('streamlit', 1),
 ('git', 1),
 ('quantitative', 1),
 ('statistics', 1),
 ('excel', 1)]

In [154]:
# index into job_details of the posting analysed in this cell
job_key = 15

# parse the description once, then extract the top-5 keywords and a
# two-sentence LSA summary from the same text
doc = nlp(job_details[job_key]['desc'])
keywords = get_keywords(job_details[job_key]['desc'],5)
s = get_text_summary(job_details[job_key]['desc'],'text','lsa',2)
# NOTE(review): bare expression that is not the last one in the cell, so its
# value is neither displayed nor stored
job_details[job_key]['keywords']


# tabulate and render the named entities; the returned list is the cell result
show_entities(doc,True)
# print(attr_counts(doc,'entity'))
# show_pos(doc)


[('analytics', 12), ('diabetes', 10), ('data', 10), ('canada', 9), ('insights', 8)]
Text Summary:Top 2 sentences using a lsa summarizer
We’re sharing knowledge and creating connections for individuals and the health-care professionals who care for them; advocating through public policy; and funding research to improve treatments and find a cure to end diabetes.
The successful candidate will work closely with cross-functional teams to establish KPIs, champion a test and learn culture, track performance, report regularly and identify opportunities for improvement.

| Word                         | Entity      |   Start |   Stop | Entity-Desc                                    |
|------------------------------|-------------|---------|--------|------------------------------------------------|
| Diabetes Canada              | ORG         |      76 |     78 | Companies, agencies, institutions, etc.        |
| Digital Analytics & Insights | ORG         |      92 |     96 | Companies, agencies





[{'Word': Diabetes Canada,
  'Entity': 'ORG',
  'Start': 76,
  'Stop': 78,
  'Entity-Desc': 'Companies, agencies, institutions, etc.'},
 {'Word': Digital Analytics & Insights,
  'Entity': 'ORG',
  'Start': 92,
  'Stop': 96,
  'Entity-Desc': 'Companies, agencies, institutions, etc.'},
 {'Word': Canada,
  'Entity': 'GPE',
  'Start': 104,
  'Stop': 105,
  'Entity-Desc': 'Countries, cities, states'},
 {'Word': digital analytics program,
  'Entity': 'ORG',
  'Start': 126,
  'Stop': 129,
  'Entity-Desc': 'Companies, agencies, institutions, etc.'},
 {'Word': 1300,
  'Entity': 'CARDINAL',
  'Start': 187,
  'Stop': 188,
  'Entity-Desc': 'Numerals that do not fall under another type'},
 {'Word': University Avenue,
  'Entity': 'ORG',
  'Start': 190,
  'Stop': 192,
  'Entity-Desc': 'Companies, agencies, institutions, etc.'},
 {'Word': Toronto,
  'Entity': 'GPE',
  'Start': 193,
  'Stop': 194,
  'Entity-Desc': 'Countries, cities, states'},
 {'Word': Ontario,
  'Entity': 'GPE',
  'Start': 195,
  'St