In [268]:
from bs4 import BeautifulSoup
import requests
import os
import PyPDF2 

# Web Scrapping

In [269]:
def get_job_links(url_search,url_base):
    """ 
    Return a list of links to jobs returned from the url_search link
    """
    # use requests to get the search url contents, convert to a soup object
    r = requests.get(url_search)
    soup = BeautifulSoup(r.content)
    
    # extract the links from the job titles
    jobs = soup.find_all("a",{"class":"SerpJob-titleLink"})
    
    # return a list of job-links
    job_links = []
    for job in jobs:
        job_links.append(f"{url_base}{job.get('href')}")
    return job_links

In [270]:
def get_job_details(urls):
    """
    """
    print("Scrapping RSS Job Search Feed")
    print("="*100)
    job_detail = []
    for url in urls:
        try:
            # get url content, convert to soup
            r = requests.get(url)
            soup = BeautifulSoup(r.content)

            # extract content
            title = soup.find("div",{'class':'ViewJobHeaderTitle'}).text
            company = soup.find("div",{'class':'ViewJobHeaderCompany'}).text
            location = soup.find("span",{'class':'ViewJobHeaderPropertiesLocation'}).text
            desc = soup.find("div",{'class':'viewjob-description'}).text

            # extract key words (skills, education, benefits)
            keys = soup.find_all('li',{'class':'nav-item'})
            key_words = [w.text for w in keys]

            # build result-set
            job_detail.append(
                {'title':title,'company':company,'location':location,'desc':desc,'keywords':key_words}
            )
            # print progress
            print(f"  =>{title[:35]:{40}}| {company[:25]:{25}}| {location} | {key_words[:4]}...")
        except Exception as e:
            print("   =>Skipping Job...")
            continue
    print("="*100)
    return job_detail

In [271]:

url_search = 'https://www.workopolis.com/jobsearch/find-jobs?l=Toronto%2C%20ON&ak=data%20science%20-intern&t=7&sr=10&s=r&job=zD_1ItN-5JXVPUCwXgZvg_joKB7qyQNp5ttncJBtECQ1FoqVIwMR8znJuj3fF7lK'
url_base  ='https://www.workopolis.com'

job_links = get_job_links(url_search, url_base)
job_details = get_job_details(job_links)


Scrapping RSS Job Search Feed
  =>Data Scientist                          | Zany Consulting Group    | Toronto, ON | ['Doctoral degree', "Master's degree", 'Data modeling', 'Internet of things']...
  =>Data Science Manager - Contract         | Scotiabank               | Toronto, ON | ["Master's degree", 'Microsoft Powerpoint', 'Microsoft Word', 'Power BI']...
  =>Data Scientist II - Machine Learnin     | Credit Sesame            | Toronto, ON | ['Git', 'SQL', 'Machine learning', 'Python']...
  =>Data Scientist II                       | TD Bank                  | Toronto, ON | ["Bachelor's", 'Data mining', 'SAS', 'R']...
  =>Senior Data Analyst                     | NielsenIQ                | Toronto, ON | ["Master's degree", "Bachelor's", 'Azure', 'Writing skills']...
  =>Manager, Data Scientist                 | Rogers Communications    | Toronto, ON | ["Master's degree", "Bachelor's", 'Power BI', 'Data modeling']...
  =>Data Quality Analyst GEMINI Data Op     | St. Michael's Hospita

# Resume Scrapping

# NLP

In [376]:
# Sumy
import sumy
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
import nltk # required for sumy
nltk.download('punkt') # required for sumy

# spacy
import spacy
from spacy import displacy
nlp = spacy.load('en_core_web_lg')
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


from tabulate import tabulate
from collections import Counter
import docx2txt
import pandas as pd

[nltk_data] Downloading package punkt to /home/ken/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [273]:
def show_pos(text):    
    """Extracts POS from a spacy document and prints the results"""
    d=[]
    for token in doc:
        d.append({
            "Token":token.text,
            "POS":token.pos_,
            "POS-Desc":spacy.explain(token.pos_),
            "Lemma":token.lemma_,
            "Stop-Word":token.is_stop,
            "POS-Detail":token.tag_,
            "POS-Detail-Desc":spacy.explain(token.tag_)
            }
        )
    print(tabulate(d,headers="keys",tablefmt="github"))

In [274]:
def show_entities(doc,display=True):
    """Print a list of named entities and their descriptions"""
    d=[]
    if doc.ents:
        for entity in doc.ents:
            d.append({
                "Word":entity,
                "Entity":entity.label_,
                "Start":entity.start,
                "Stop":entity.end,
                "Entity-Desc":spacy.explain(entity.label_),
                }
            )
        if display:
            print(tabulate(d,headers="keys",tablefmt="github"))
            print("\n")
            displacy.render(doc,style='ent',jupyter=True)
            print("\n")
    else:
        print("no entities found")
    return d   

In [275]:
def attr_counts(doc,att):
    """Return a table of counts by attributes"""
    if att.lower() == 'pos':
        cnts = doc.count_by(spacy.attrs.POS)
    elif att.lower() =='tag':
        cnts = doc.count_by(spacy.attrs.TAG)
    elif att.lower() =='lemma':
        cnts = doc.count_by(spacy.attrs.LEMMA)
    elif att.lower() =='entity':
        cnts = doc.count_by(spacy.attrs.ENT_TYPE)
    elif att.lower() =='email':
        cnts = doc.count_by(spacy.attrs.LIKE_EMAIL)
    elif att.lower() =='url':
        cnts = doc.count_by(spacy.attrs.LIKE_URL)
    elif att.lower() =='currency':
        cnts = doc.count_by(spacy.attrs.IS_CURRENCY)
    elif att.lower() =='numbers':
        cnts = doc.count_by(spacy.attrs.IS_DIGIT)
        
    # lookup attribute name
    d = {doc.vocab[k].text:v for k,v in cnts.items()}
    return d

In [276]:
def get_text_summary(text,text_source,summarizer,num_sentences,print_summary=True):
    """
    Generate a text summary from a document (extractive)
    Params:
        text: plain text or path to a document
        text_source: text or file
        summarizer: text-summarization method. lex-rank, text-rank,lsa
        num_sentences: Number of summary sentences to generate
        print_summary: T/F
    Returns:
        A list of sentences.
    """
    # Create parser from text-source
    if text_source=='text':
        parser = PlaintextParser.from_string(text,Tokenizer('english'))
    elif text_source=='file':
        parser = PlaintextParser.from_file(text,Tokenizer('english'))
    else:
        raise Exception(f"text source must be 'text' or 'file'!")
        
    # Select summarizer method
    if summarizer=='lex-rank':
        # Lex Rank:  A graph-based summarization method that uses keyword extractions
        s = LexRankSummarizer()
    elif summarizer=='text-rank':
        # Text rank: A graph-based summarization method that uses keyword extractions
        s = TextRankSummarizer()
    elif summarizer=='lsa':
        # Latent semantic analysis 
        # Combines term frequency with singular value decomposition
        s = LsaSummarizer(Stemmer("english"))
        s.stop_words = get_stop_words("english")
    else:
        raise Exception(f"Summarizer must be 'lex-rank','luhn','text-rank' or 'lsa'!")
    
    # create summary
    summary = s(parser.document,num_sentences)
    
    # print summary
    if print_summary:
        print(f'Text Summary:Top {num_sentences} sentences using a {summarizer} summarizer')
        print('='*100)
        for sentence in summary:
            print(sentence)
        print('='*100,'\n')
    return s

In [277]:
def get_terms(path):
    """read a csv of data science terms & return themn as a list"""
    with open(path,'r') as f:
        lst = [line.lower().strip() for line in f.readlines()]
    return set(lst)

In [278]:
def get_keywords(text,num, print_words=True):
    words = []
    pos = ['NOUN','PROPN']
    common_job_keys = ['experience','job','ability','posting','process','date','stakeholders','skills','solutions','project']
    tokens = nlp(text.lower())
    
    for token in tokens:
        if (token.text not in nlp.Defaults.stop_words and token.text.lower() not in common_job_keys and token.pos_ in pos):
            words.append(token.text)
            
    # get the top keywords by frequency
    keyword_frequency = [word for word in Counter(words).most_common(num)]
    keywords = [word[0] for word in keyword_frequency]
    
    if print_words:
        print(keyword_frequency)
    return keywords            

In [283]:
def get_resume_text(path,print_text=True):
    """ 
    Read resume text content from file (pdf or docx)
    """
    
    # open file and get file extension
    file = open(path,'rb')
    file_name, file_ext = os.path.splitext(path)
    
    # extract text content
    if file_ext=='.pdf':
        # read pdf
        pdf = PyPDF2.PdfFileReader(file)
        
        text = """"""
        for page_num in range(pdf.numPages):
            page = pdf.getPage(page_num)
            content = page.extract_text()
            text += content
    elif file_ext=='.docx':
        text = docx2txt.process(file)
    
    if print_text:
        print(text)

    file.close()
    return text

In [282]:
def search_terms(text,terms_path,print_words=True):
    
    # get key terms
    terms = get_terms(terms_path)
    
    # tokenize the text content
    tokens = nlp(text.lower())
    
    # term freq dict
    term_frequency = {}
    
    # count the number of times each term is found in the text
    for token in tokens:
        if token.text in terms:
            if token.text in term_frequency.keys():
                term_frequency[token.text] = term_frequency[token.text] + 1
            else:
                term_frequency[token.text] = 1
    return sorted(term_frequency.items(),key=lambda x:x[1],reverse=True)

In [348]:
def preprocess_text(text):
    
    # replace carriage returns and convert to lowercase
    doc = text.replace("\n","").replace("  ","").lower()
    
    # tokenize
    tokens = nlp(doc)

    text_list = []
    for token in tokens:
        if token.text in nlp.Defaults.stop_words:
            continue
        if token.is_punct:
            continue
        if token.is_digit:
            continue
        if token.like_email:
            continue
        if token.lemma_=='-PRON-':
            continue
        # if token.text in '\n\n \n\n\n !"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n ':
        #     continue
        # lemmetize
        text_list.append(token.lemma_)
    return " ".join(text_list)

In [350]:
def calc_similarity(doc1,doc2):
    """ """
    d1 = preprocess_text(doc1)
    d2 = preprocess_text(doc2)
    
    X = TfidfVectorizer().fit_transform([d1,d2])
    score = cosine_similarity(X[0],X[1])
    return score[0][0]
    

In [382]:
def calc_similarities(resume, jobs):
    """ """
    scores = []
    titles = []
    companies =[]
    locs = []
    for job in jobs:
        titles.append(job['title'])
        companies.append(job['company'])
        locs.append(job['location'])
        scores.append(calc_similarity(resume, job['desc']))
    
    df = pd.DataFrame(list(zip(titles, companies,locs,scores)),columns = ['title','company','location','similarity-score'])
    
    return df.sort_values(by=['similarity-score'], ascending=False)

In [383]:
calc_similarities(r2,job_details)

Unnamed: 0,title,company,location,similarity-score
18,Sr. IT Data Analyst,TD Bank,"Toronto, ON",0.338126
8,Senior Business Intelligence (BI) & Data Analyst,University Pension Plan,"Toronto, ON",0.278989
14,Sr. Data Analyst (Pharma),Tiger Analytics,"Toronto, ON",0.267301
1,Data Science Manager - Contract,Scotiabank,"Toronto, ON",0.258833
5,"Manager, Data Scientist",Rogers Communications,"Toronto, ON",0.257125
24,"Manager, AML/ATF Models & Analytics (Contract)",Scotiabank,"Toronto, ON",0.239999
0,Data Scientist,Zany Consulting Group,"Toronto, ON",0.238062
22,Senior Power BI Developer,BMO Financial Group,"Toronto, ON",0.227275
3,Data Scientist II,TD Bank,"Toronto, ON",0.22539
7,Data Scientist (HYBRID),Workplace Safety and Insurance Board,"Toronto, ON",0.222095


In [286]:
# r1=get_resume_text('data/resume.pdf',False)
# r2=get_resume_text('data/resume.docx',False)

print(search_terms(job_details[1]['desc'],'data/terms.csv',True))
print(search_terms(r1,'data/terms.csv',True))
print(search_terms(r2,'data/terms.csv',True))

[('models', 13), ('analysis', 6), ('modeling', 5), ('quantitative', 4), ('modelling', 2), ('programming', 2), ('python', 2), ('analytics', 1), ('optimization', 1), ('sql', 1), ('sas', 1), ('r', 1), ('regression', 1), ('statistics', 1), ('excel', 1)]
[('analysis', 35), ('sql', 26), ('models', 14), ('vba', 12), ('python', 12), ('quantitative', 8), ('modeling', 8), ('forecasting', 5), ('aws', 5), ('r', 5), ('analytics', 5), ('plotly', 5), ('optimization', 5), ('redshift', 4), ('tensorflow', 4), ('pandas', 4), ('predictive', 3), ('sagemaker', 3), ('oracle', 3), ('nlp', 3), ('excel', 3), ('communication', 3), ('lstm', 2), ('etl', 2), ('xgboost', 2), ('s3', 2), ('tableau', 2), ('prediction', 1), ('programming', 1), ('keras', 1), ('hive', 1), ('glue', 1), ('statistics', 1), ('modelling', 1)]
[('sql', 31), ('analysis', 27), ('vba', 18), ('models', 16), ('python', 11), ('modeling', 7), ('quantitative', 7), ('plotly', 6), ('modelling', 6), ('aws', 5), ('optimization', 5), ('forecasting', 5), ('p

In [299]:
# get_keywords(r2,5)
# get_text_summary(r2,'text','lsa',2)

# doc = nlp(r1)
# show_entities(doc,True)

In [300]:
job_key = 1

doc = nlp(job_details[job_key]['desc'])
keywords = get_keywords(job_details[job_key]['desc'],5)
s = get_text_summary(job_details[job_key]['desc'],'text','lsa',2)
job_details[job_key]['keywords']


show_entities(doc,True)
# print(attr_counts(doc,'entity'))
# show_pos(doc)


[('risk', 20), ('models', 13), ('business', 10), ('credit', 9), ('bank', 9)]
Text Summary:Top 2 sentences using a lsa summarizer
You will be responsible for understanding the goals & priorities set for you, executing them efficiently with a perpetual eye on quality, asking questions often and delivering results in harmony with your teammates.
You believe, first and foremost, in always doing the right thing and doing things the right way You are comfortable with ambitious but realistic goals and are committed to achieving them You are social, engaging, not afraid to ask questions and relish the opportunity to learn from global partners across the firm You relish the opportunity and yet also understand the responsibility and weight that comes with the fact that the output of your models will directly influence the bank’s financial statements You can read complex and lengthy regulatory requirements and identify opportunities for optimization while always remaining committed to compliance 





[{'Word': 174574,
  'Entity': 'CARDINAL',
  'Start': 3,
  'Stop': 4,
  'Entity-Desc': 'Numerals that do not fall under another type'},
 {'Word': The Internal Ratings Management,
  'Entity': 'ORG',
  'Start': 26,
  'Stop': 30,
  'Entity-Desc': 'Companies, agencies, institutions, etc.'},
 {'Word': Global Risk Management,
  'Entity': 'ORG',
  'Start': 32,
  'Stop': 35,
  'Entity-Desc': 'Companies, agencies, institutions, etc.'},
 {'Word': Bank’s Business Banking,
  'Entity': 'ORG',
  'Start': 42,
  'Stop': 46,
  'Entity-Desc': 'Companies, agencies, institutions, etc.'},
 {'Word': Business Banking,
  'Entity': 'ORG',
  'Start': 67,
  'Stop': 69,
  'Entity-Desc': 'Companies, agencies, institutions, etc.'},
 {'Word': The Leading Bank,
  'Entity': 'ORG',
  'Start': 109,
  'Stop': 112,
  'Entity-Desc': 'Companies, agencies, institutions, etc.'},
 {'Word': Americas,
  'Entity': 'LOC',
  'Start': 114,
  'Stop': 115,
  'Entity-Desc': 'Non-GPE locations, mountain ranges, bodies of water'},
 {'Word