# Web Scrapping

In [1]:
from bs4 import BeautifulSoup
import requests
import os
import PyPDF2 

In [2]:
def get_job_links(url_search,url_base):
    """ 
    Return a list of links to jobs returned from the url_search link
    """
    # use requests to get the search url contents, convert to a soup object
    r = requests.get(url_search)
    soup = BeautifulSoup(r.content)
    
    # extract the links from the job titles
    jobs = soup.find_all("a",{"class":"SerpJob-titleLink"})
    
    # return a list of job-links
    job_links = []
    for job in jobs:
        job_links.append(f"{url_base}{job.get('href')}")
    return job_links

In [3]:
def get_job_details(urls):
    """
    """
    print("Scrapping RSS Job Search Feed")
    print("="*100)
    job_detail = []
    for url in urls:
        try:
            # get url content, convert to soup
            r = requests.get(url)
            soup = BeautifulSoup(r.content)

            # extract content
            title = soup.find("div",{'class':'ViewJobHeaderTitle'}).text
            company = soup.find("div",{'class':'ViewJobHeaderCompany'}).text
            location = soup.find("span",{'class':'ViewJobHeaderPropertiesLocation'}).text
            desc = soup.find("div",{'class':'viewjob-description'}).text

            # extract key words (skills, education, benefits)
            keys = soup.find_all('li',{'class':'nav-item'})
            key_words = [w.text for w in keys]

            # build result-set
            job_detail.append(
                {'title':title,'company':company,'location':location,'desc':desc,'keywords':key_words}
            )
            # print progress
            print(f"  =>{title[:35]:{40}}| {company[:25]:{25}}| {location} | {key_words[:4]}...")
        except Exception as e:
            print("   =>Skipping Job...")
            continue
    print("="*100)
    return job_detail

In [4]:

url_search = 'https://www.workopolis.com/jobsearch/find-jobs?l=Toronto%2C%20ON&ak=data%20science%20-intern&t=7&sr=10&s=r&job=zD_1ItN-5JXVPUCwXgZvg_joKB7qyQNp5ttncJBtECQ1FoqVIwMR8znJuj3fF7lK'
url_base  ='https://www.workopolis.com'

job_links = get_job_links(url_search, url_base)
job_details = get_job_details(job_links)


Scrapping RSS Job Search Feed
  =>Data Science Co-op (R&D) - Summer t     | Swift Medical            | Toronto, ON | ["Master's degree", "Bachelor's", 'Machine learning', 'Software development']...
  =>Data Analyst - Operations               | Flair Airlines Ltd.      | Toronto, ON | ["Bachelor's", 'Power BI', 'Microsoft Excel', 'Business intelligence']...
  =>Data Scientist II - Machine Learnin     | Credit Sesame            | Toronto, ON | ['Git', 'SQL', 'Machine learning', 'Python']...
  =>Data Scientist                          | Wish                     | Toronto, ON | ["Master's degree", "Bachelor's", 'MongoDB', 'Tableau']...
  =>Data Science Manager - Contract         | Scotiabank               | Toronto, ON | ["Master's degree", 'Microsoft Powerpoint', 'Microsoft Word', 'Power BI']...
  =>Data Scientist II                       | TD Bank                  | Toronto, ON | ["Bachelor's", 'Data mining', 'SAS', 'R']...
  =>Data Quality Analyst GEMINI Data Op     | St. Michael's Hosp

# NLP

In [139]:
# Sumy
import sumy
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
import nltk # required for sumy
nltk.download('punkt') # required for sumy

# spacy
import spacy
from spacy import displacy
NLP = spacy.load('en_core_web_lg')
from spacy.matcher import PhraseMatcher
# from spacy.matcher import Matcher

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


from tabulate import tabulate
from collections import Counter
import docx2txt
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots

[nltk_data] Downloading package punkt to /home/ken/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Resume Scrapping

In [6]:
def get_resume_text(path,print_text=True):
    """ 
    Read resume text content from file (pdf or docx)
    Params:
        path: string- path to the resume file
        print_text: boolean.  Prints the text
    Returns: A text object with the resume file content
    """
    
    # open file and get file extension
    file = open(path,'rb')
    file_name, file_ext = os.path.splitext(path)
    
    # extract text content
    if file_ext=='.pdf':
        # read pdf
        pdf = PyPDF2.PdfFileReader(file)
        
        text = """"""
        for page_num in range(pdf.numPages):
            page = pdf.getPage(page_num)
            content = page.extract_text()
            text += content
    elif file_ext=='.docx':
        text = docx2txt.process(file)
    
    if print_text:
        print(text)

    file.close()
    return text

In [7]:
r1 = get_resume_text('data/resume.docx',False)

In [8]:
def show_pos(text):    
    """Extracts POS from a spacy document and prints the results"""
    d=[]
    for token in doc:
        d.append({
            "Token":token.text,
            "POS":token.pos_,
            "POS-Desc":spacy.explain(token.pos_),
            "Lemma":token.lemma_,
            "Stop-Word":token.is_stop,
            "POS-Detail":token.tag_,
            "POS-Detail-Desc":spacy.explain(token.tag_)
            }
        )
    print(tabulate(d,headers="keys",tablefmt="github"))

In [9]:
def show_entities(doc,display=True):
    """Print a list of named entities and their descriptions"""
    d=[]
    if doc.ents:
        for entity in doc.ents:
            d.append({
                "Word":entity,
                "Entity":entity.label_,
                "Start":entity.start,
                "Stop":entity.end,
                "Entity-Desc":spacy.explain(entity.label_),
                }
            )
        if display:
            print(tabulate(d,headers="keys",tablefmt="github"))
            print("\n")
            displacy.render(doc,style='ent',jupyter=True)
            print("\n")
    else:
        print("no entities found")
    return d   

In [10]:
def attr_counts(doc,att):
    """Return a table of counts by attributes"""
    if att.lower() == 'pos':
        cnts = doc.count_by(spacy.attrs.POS)
    elif att.lower() =='tag':
        cnts = doc.count_by(spacy.attrs.TAG)
    elif att.lower() =='lemma':
        cnts = doc.count_by(spacy.attrs.LEMMA)
    elif att.lower() =='entity':
        cnts = doc.count_by(spacy.attrs.ENT_TYPE)
    elif att.lower() =='email':
        cnts = doc.count_by(spacy.attrs.LIKE_EMAIL)
    elif att.lower() =='url':
        cnts = doc.count_by(spacy.attrs.LIKE_URL)
    elif att.lower() =='currency':
        cnts = doc.count_by(spacy.attrs.IS_CURRENCY)
    elif att.lower() =='numbers':
        cnts = doc.count_by(spacy.attrs.IS_DIGIT)
        
    # lookup attribute name
    d = {doc.vocab[k].text:v for k,v in cnts.items()}
    return d

In [11]:
def get_text_summary(text,text_source,summarizer,num_sentences,print_summary=True):
    """
    Generate a text summary from a document (extractive)
    Params:
        text: plain text or path to a document
        text_source: text or file
        summarizer: text-summarization method. lex-rank, text-rank,lsa
        num_sentences: Number of summary sentences to generate
        print_summary: T/F
    Returns:
        A list of sentences.
    """
    # Create parser from text-source
    if text_source=='text':
        parser = PlaintextParser.from_string(text,Tokenizer('english'))
    elif text_source=='file':
        parser = PlaintextParser.from_file(text,Tokenizer('english'))
    else:
        raise Exception(f"text source must be 'text' or 'file'!")
        
    # Select summarizer method
    if summarizer=='lex-rank':
        # Lex Rank:  A graph-based summarization method that uses keyword extractions
        s = LexRankSummarizer()
    elif summarizer=='text-rank':
        # Text rank: A graph-based summarization method that uses keyword extractions
        s = TextRankSummarizer()
    elif summarizer=='lsa':
        # Latent semantic analysis 
        # Combines term frequency with singular value decomposition
        s = LsaSummarizer(Stemmer("english"))
        s.stop_words = get_stop_words("english")
    else:
        raise Exception(f"Summarizer must be 'lex-rank','luhn','text-rank' or 'lsa'!")
    
    # create summary
    summary = s(parser.document,num_sentences)
    
    # print summary
    if print_summary:
        print(f'Text Summary:Top {num_sentences} sentences using a {summarizer} summarizer')
        print('='*100)
        for sentence in summary:
            print(sentence)
        print('='*100,'\n')
    return s

In [74]:
# def extract_keywords(text,num, print_words=True):
#     """
#     Extracts keywords from a text object.  Ignores common job positng phrases
#     Params:
#         text: string text object
#         num: integer.  The number of keywords to extract
#         print_words: boolean. Prints the output
#     Returns: A list of the top keywords in the text object
#     """
#     words = []
#     pos = ['NOUN','PROPN']
#     common_job_keys = ['experience','job','ability','posting','process','date','stakeholders','skills','solutions','project']
#     tokens = NLP(text.lower())
    
#     for token in tokens:
#         if (token.text not in NLP.Defaults.stop_words and token.text.lower() not in common_job_keys and token.pos_ in pos):
#             words.append(token.text)
            
#     # get the top keywords by frequency
#     keyword_frequency = [word for word in Counter(words).most_common(num)]
#     keywords = [word[0] for word in keyword_frequency]
    
#     if print_words:
#         print(keyword_frequency)
#     return keywords            

In [26]:
def preprocess_text(text):
    """
    Preprocess text documents in preparation for analysis
    Params:
        text: a text object
    Returns: 
        A text object after converting to lowercase lemmas, removing stop-words,punctuation, numbers, and emails
    """
    
    # replace carriage returns and convert to lowercase
    doc = text.replace("\n","").replace("  ","").lower()
    
    # tokenize
    tokens = NLP(doc)

    text_list = []
    for token in tokens:
        if token.text in NLP.Defaults.stop_words:
            continue
        if token.is_punct:
            continue
        if token.is_digit:
            continue
        if token.like_email:
            continue
        if token.lemma_=='-PRON-':
            continue
            
        # lemmetize
        text_list.append(token.lemma_)
    return " ".join(text_list)

In [27]:
def calc_similarity(doc1,doc2):
    """ 
    Calculate the cosine similarity between two documents
    Params:
        doc1 - text document 
        doc2 - text document
    Returns: The cosine similairty score between the documents
    """
    d1 = preprocess_text(doc1)
    d2 = preprocess_text(doc2)
    
    X = TfidfVectorizer().fit_transform([d1,d2])
    score = cosine_similarity(X[0],X[1])
    return score[0][0]

In [28]:
def calc_similarities(resume, jobs):
    """ 
    Calculate the cosine similarity between a resume and a list of job descriptions
    Params:
        resume: a resume (text)
        jobs: a dictionary of jobs listings (output from get_job_details)
    Returns: A dataframe of job title, company, location and similarity score
    """
    scores = []
    titles = []
    companies =[]
    locs = []
    for job in jobs:
        titles.append(job['title'])
        companies.append(job['company'])
        locs.append(job['location'])
        scores.append(calc_similarity(resume, job['desc']))
    
    df = pd.DataFrame(list(zip(titles, companies,locs,scores)),columns = ['title','company','location','similarity-score'])
    
    return df.sort_values(by=['similarity-score'], ascending=False)

In [29]:
calc_similarities(r1,job_details)

Unnamed: 0,title,company,location,similarity-score
20,Sr. IT Data Analyst,TD Bank,"Toronto, ON",0.336437
14,Sr. Data Analyst (Pharma),Tiger Analytics,"Toronto, ON",0.267301
4,Data Science Manager - Contract,Scotiabank,"Toronto, ON",0.258833
3,Data Scientist,Wish,"Toronto, ON",0.249669
23,"Senior Manager, Data and AI Risk",Scotiabank,"Toronto, ON",0.228092
19,Senior Power BI Developer,BMO Financial Group,"Toronto, ON",0.227275
5,Data Scientist II,TD Bank,"Toronto, ON",0.22539
7,Data Scientist (HYBRID),Workplace Safety and Insurance Board,"Toronto, ON",0.222095
8,Senior Data Analyst,Sun Life,"Toronto, ON",0.221648
1,Data Analyst - Operations,Flair Airlines Ltd.,"Toronto, ON",0.221136


In [12]:
def get_terms(path):
    """
    Read a csv of data science terms & return themn as a list
    Params: path -string
    Returns: A list of terms
    """
    with open(path,'r') as f:
        lst = [line.lower().strip() for line in f.readlines()]
    return set(lst)

In [77]:
def get_phrases_from_keywords(keywords):
    lst = []
    for keyword in keywords:
        if " " in keyword:
            lst.append(keyword.lower().replace(" ","-"))
        lst.append(keyword.lower())
    return set(lst)

In [206]:
def get_phrase_matches(keywords,text):
    """
    spacy 3.5 has a built in fuzzy matching option
    """
    # when spacy 3.5 is available on anaconda
#     matcher = Matcher(NLP.vocab)
#     patterns = [{"TEXT": {"FUZZY": "modelling"}},{"TEXT": {"FUZZY": "machine learning"}}]

    # get the term list
    terms = get_phrases_from_keywords(keywords)
    
    # convert text to tokens
    doc = NLP(text.lower())

    # phrase matching
    patterns = [NLP(text) for text in terms]

    # add to matcher
    matcher = PhraseMatcher(NLP.vocab)
    matcher.add('KEYWORDS',patterns)

    # search for phrase matches
    found_matches = matcher(doc)

    matched_phrases = []
    for match_id, start, end in found_matches:
        matched_phrases.append(doc[start:end].text)

    return dict(Counter(matched_phrases)) 

In [264]:
def compare_keywords(resume,job,keyword_path):
    """
    """
    # path to csv of terms
    keys = get_phrases_from_keywords(get_terms(keyword_path))
    
    # match phrases
    matches_resume = get_phrase_matches(keys,resume)
    matches_job = get_phrase_matches(keys,job)

    # convert to dataframes
    df_resume = pd.DataFrame.from_dict(matches_resume,orient='index',columns=['resume'])
    df_job = pd.DataFrame.from_dict(matches_job,orient='index',columns=['job'])
    
    # combine results & sort by job keywords
    df = df_job.join(df_resume)
    df.sort_values(by=['job'], ascending=False,inplace=True)
    
    # calculate the weighted difference in keyword frequencies
    df['wt'] = df['job']/df['job'].sum()
    df['wt-score'] = df['wt']*(df['resume']-df['job'])
    score = df['wt-score'].sum()
    
    return df, score

In [245]:
def compare_job_specific_keywords(resume, job_keywords):
    """
    """
    # match phrases
    matches = get_phrase_matches(job_keywords,resume)
    
    # add missing job keywords to the dict
    for key in job_keywords:
        if key.lower() in matches.keys():
            continue
        else:
            matches[key.lower()]=0
    
    # convert to dateframe
    df = pd.DataFrame.from_dict(matches,orient='index',columns=['resume'])
    
    return df

In [266]:
df,score = compare_keywords(r1,job_details[0]['desc'],'data/terms.csv')
# spectral, diverging
print(score)
df

1.1851851851851851


Unnamed: 0,job,resume,wt,wt-score
data science,9,1.0,0.333333,-2.666667
machine learning,4,10.0,0.148148,0.888889
models,3,16.0,0.111111,1.444444
ai,2,,0.074074,
predictive,2,3.0,0.074074,0.074074
analytics,1,4.0,0.037037,0.111111
computer vision,1,1.0,0.037037,0.0
analysis,1,27.0,0.037037,0.962963
algorithms,1,,0.037037,
master’s degree,1,,0.037037,


In [285]:
def plot_keyword_comparison(resume, job_details, keyword_path):
    """ 
    Plot Job keywords vs Resume keywords
    Params: df- dataframe output from compare_keywords
    Returns: None
    """
    
    # compare common keywords, split into matches/no-matches
    df,score = compare_keywords(resume,job_details['desc'],keyword_path)
    df_missing  = df.query('resume.isnull()')
    df_matching = df.query('~resume.isnull()')
    
    # compare to job specific keywords
    df_spec = compare_job_specific_keywords(resume, job_details['keywords'])
    
    col_scale = [
        [0,'rgba(252,76,1,0.9)'],
        [0.25,'rgba(252,76,1,0.5)'],
        [0.5,'rgba(83,128,141,0.25)'],
        [0.75,'rgba(83,128,141,0.5)'],
        [1.0,'rgba(83,128,141,0.9)']
    ]
    
    # create plots
    fig = make_subplots(
        rows=1,
        cols=3,
        subplot_titles = ('Matching Keywords','Job Specific Keywords','Missing Keywords')
    )
    fig.add_trace(
        go.Bar(
            name='Weighted Frequency',
            y = df_matching.index,
            x = df_matching['wt-score'],
            orientation='h',
            marker = dict(color=df_matching['wt-score'],colorscale=col_scale)
        ),row=1,col=1
    )
    fig.add_trace(
        go.Bar(
            name='Job Specific Keywords',
            y = df_spec.index,
            x = df_spec.resume,
            orientation='h',
            marker = dict(color='rgba(83,128,141,0.8)')
        ),row=1,col=2
    )
    fig.add_trace(
        go.Bar(
            name='Missing Keywords',
            y = df_missing.job,
            x = df_missing.index,
            marker = dict(color='rgba(252,76,1,0.8)')
        ),row=1,col=3
    )
    fig.update_layout(
        template='plotly_white',
        title='Keyword Frequencies: Resume vs Job Posting',
        width = 900,
        height=500
    )
    fig.show()

In [286]:

plot_keyword_comparison(r1,job_details[5],'data/terms.csv')

In [97]:
# job_key = 1
# doc = nlp(job_details[job_key]['desc'])
# keywords = get_keywords(job_details[job_key]['desc'],5)
# s = get_text_summary(job_details[job_key]['desc'],'text','lsa',2)
# job_details[job_key]['keywords']


# show_entities(doc,True)
# print(attr_counts(doc,'entity'))
# show_pos(doc)
