In [38]:
from datetime import datetime
import sys
import time
import sqlite3
import pandas as pd
from gensim.parsing.preprocessing import strip_multiple_whitespaces, preprocess_string, remove_stopwords, strip_tags, strip_punctuation 
import gensim
import re
from gensim.summarization.mz_entropy import mz_keywords

In [39]:
#load
data = pd.read_parquet("data.parquet", engine="fastparquet")

In [40]:
def get_cat(row):
    if ("ENG - Software Engineering" in row["targeted_disciplines"]) and ("MATH - Computer Science" in row["targeted_disciplines"]):
        return True
    else:
        return False
def merge_duplicates(df):
    df = df.drop_duplicates(subset=['organization', 'job_title'], keep="first") 
    df = df.drop_duplicates(subset=['job_responsibilities', 'job_summary', 'required_skills'], keep="first")
    return df
def clean(x):
    newstr = strip_multiple_whitespaces(x)
    return newstr
def preprocess(x):
    CUSTOM_FILTERS = [lambda x: x.lower(), strip_tags, strip_punctuation, remove_stopwords]
    x = re.sub(r"http\S+", "", x)
    x = preprocess_string(x, CUSTOM_FILTERS)
    x = " ".join(x)
    return x

In [41]:
#Clean data
data["job_responsibilities"] = data["job_responsibilities"].map(lambda x: clean(x))
data["job_summary"] = data["job_summary"].map(lambda x: clean(x))
data["required_skills"] = data["required_skills"].map(lambda x: clean(x))
data = merge_duplicates(data)
m = data.apply(get_cat, axis=1)
data = data[m]

In [49]:
resume = "Worked on proof-of-concept projects to demonstrate ML feasibility in key supply chain areas. Pre-processed time series data, trained forecasting models, and evaluated predictions. Mentored other interns on data science libraries, Jupyter, natural language processing, and ML best practices. Secured over $15k+ worth of sponsorship for over 6 events (1200+ attendees combined) since inception. \
Created fully functional platform to collect hackathon data for analytics. Hosted one of Canada’s first game development hackathons with Red Bull Canada: redbull.com/adrenalan. Experimented with state of the art NLP techniques to detect type of toxicity in online comments. \
Experimented with LSTMs, word embeddings (Glove, FastText), and techniques such as sequence bucketing. \
Fine-tuned and ensembled Bidirectional Encoder Representations from Transformers (BERT). \
Integrated an “auto-swipe” feature into Tinder based on real-time brain EEG data collected by the Muse. \
Tested on participants at Hack The 6ix 2017 and achieved a 75% accuracy. \
Knowledgeable in: Git, Docker, Python (Pandas, Numpy, Scikit-Learn), Keras, Pytorch, Java, Javascript, VueJS, Firebase \
"

In [50]:
import spacy

In [51]:
from collections import OrderedDict
import numpy as np
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

nlp = spacy.load('en_core_web_sm')

class TextRank4Keyword():
    """Extract keywords from text"""
    
    def __init__(self):
        self.d = 0.85 # damping coefficient, usually is .85
        self.min_diff = 1e-5 # convergence threshold
        self.steps = 10 # iteration steps
        self.node_weight = None # save keywords and its weight

    
    def set_stopwords(self, stopwords):  
        """Set stop words"""
        for word in STOP_WORDS.union(set(stopwords)):
            lexeme = nlp.vocab[word]
            lexeme.is_stop = True
    
    def sentence_segment(self, doc, candidate_pos, lower):
        """Store those words only in cadidate_pos"""
        sentences = []
        for sent in doc.sents:
            selected_words = []
            for token in sent:
                # Store words only with cadidate POS tag
                if token.pos_ in candidate_pos and token.is_stop is False:
                    if lower is True:
                        selected_words.append(token.text.lower())
                    else:
                        selected_words.append(token.text)
            sentences.append(selected_words)
        return sentences
        
    def get_vocab(self, sentences):
        """Get all tokens"""
        vocab = OrderedDict()
        i = 0
        for sentence in sentences:
            for word in sentence:
                if word not in vocab:
                    vocab[word] = i
                    i += 1
        return vocab
    
    def get_token_pairs(self, window_size, sentences):
        """Build token_pairs from windows in sentences"""
        token_pairs = list()
        for sentence in sentences:
            for i, word in enumerate(sentence):
                for j in range(i+1, i+window_size):
                    if j >= len(sentence):
                        break
                    pair = (word, sentence[j])
                    if pair not in token_pairs:
                        token_pairs.append(pair)
        return token_pairs
        
    def symmetrize(self, a):
        return a + a.T - np.diag(a.diagonal())
    
    def get_matrix(self, vocab, token_pairs):
        """Get normalized matrix"""
        # Build matrix
        vocab_size = len(vocab)
        g = np.zeros((vocab_size, vocab_size), dtype='float')
        for word1, word2 in token_pairs:
            i, j = vocab[word1], vocab[word2]
            g[i][j] = 1
            
        # Get Symmeric matrix
        g = self.symmetrize(g)
        
        # Normalize matrix by column
        norm = np.sum(g, axis=0)
        g_norm = np.divide(g, norm, where=norm!=0) # this is ignore the 0 element in norm
        
        return g_norm

    
    def get_keywords(self, number=10):
        """Print top number keywords"""
        keywords = []
        node_weight = OrderedDict(sorted(self.node_weight.items(), key=lambda t: t[1], reverse=True))
        for i, (key, value) in enumerate(node_weight.items()):
            keywords.append([key, value])
            if i > number:
                break
        return keywords
        
    def analyze(self, text, 
                candidate_pos=['NOUN', 'PROPN'], 
                window_size=4, lower=False, stopwords=list()):
        """Main function to analyze text"""
        
        # Set stop words
        self.set_stopwords(stopwords)
        
        # Pare text by spaCy
        doc = nlp(text)
        
        # Filter sentences
        sentences = self.sentence_segment(doc, candidate_pos, lower) # list of list of words
        
        # Build vocabulary
        vocab = self.get_vocab(sentences)
        
        # Get token_pairs from windows
        token_pairs = self.get_token_pairs(window_size, sentences)
        
        # Get normalized matrix
        g = self.get_matrix(vocab, token_pairs)
        
        # Initionlization for weight(pagerank value)
        pr = np.array([1] * len(vocab))
        
        # Iteration
        previous_pr = 0
        for epoch in range(self.steps):
            pr = (1-self.d) + self.d * np.dot(g, pr)
            if abs(previous_pr - sum(pr))  < self.min_diff:
                break
            else:
                previous_pr = sum(pr)

        # Get weight for each node
        node_weight = dict()
        for word, index in vocab.items():
            node_weight[word] = pr[index]
        
        self.node_weight = node_weight

In [52]:
tr4w = TextRank4Keyword()
tr4w.analyze(resume, candidate_pos = ['NOUN', 'PROPN'], window_size=4, lower=False)
keywords = tr4w.get_keywords(50)

In [53]:
print(keywords)

[['data', 2.5637175099206346], ['ML', 1.8211539682539684], ['time', 1.50946875], ['Glove', 1.281397222222222], ['FastText', 1.281397222222222], ['feasibility', 1.2247305555555554], ['NLP', 1.1900458333333335], ['type', 1.1900458333333335], ['Canada', 1.1678277777777777], ['hackathons', 1.1678277777777777], ['Jupyter', 1.1644969246031747], ['Pandas', 1.1545347222222224], ['Java', 1.1545347222222224], ['Tinder', 1.1532571924603174], ['brain', 1.152207341269841], ['Numpy', 1.1292708333333334], ['Pytorch', 1.129270833333333], ['libraries', 1.107890972222222], ['Scikit', 1.1066041666666666], ['Keras', 1.1066041666666666], ['Learn', 1.104361111111111], ['embeddings', 1.0959555555555553], ['techniques', 1.0959555555555553], ['language', 1.086504365079365], ['sponsorship', 1.0814583333333332], ['events', 1.0814583333333332], ['attendees', 1.0814583333333332], ['Encoder', 1.0814583333333332], ['Representations', 1.0814583333333332], ['Transformers', 1.0814583333333332], ['supply', 1.05293611111

In [54]:
def score(keywords, job):
    tr4w.analyze(job, candidate_pos = ['NOUN', 'PROPN'], window_size=4, lower=False)
    job_keywords = tr4w.get_keywords(50)
    score = 0
    for w1 in keywords:
        for w2 in job_keywords:
            if w1[0] == w2[0]:
                score += w1[1]
    return score

In [56]:
import time
start = time.time()
scores = []
for i, (doc1, doc2, doc3) in enumerate(zip(data["job_responsibilities"].values, data["job_summary"].values, data["required_skills"].values)):
    doc = doc1 + " " + doc2 + " " + doc3
    scores.append((i, score(keywords, doc)))
print(time.time() - start)

70.39730668067932


In [58]:
scores = sorted(scores,key=lambda x: x[1], reverse=True)

In [59]:
scores

[(767, 10.015698263888888),
 (675, 9.882897569444443),
 (61, 9.846139285714285),
 (877, 9.717430902777778),
 (12, 9.034172271825396),
 (867, 8.949150347222222),
 (379, 8.7885408234127),
 (78, 8.334419593253967),
 (1155, 8.154538343253968),
 (271, 8.02239032738095),
 (1211, 7.677435367063491),
 (150, 7.532724255952379),
 (290, 7.424377926587301),
 (969, 7.307236458333332),
 (967, 7.259154117063491),
 (159, 7.170313938492062),
 (649, 7.169614037698411),
 (403, 7.154502083333332),
 (743, 7.104878273809522),
 (274, 7.04620044642857),
 (710, 6.987813541666666),
 (31, 6.95897931547619),
 (1209, 6.917317509920634),
 (723, 6.907920287698412),
 (667, 6.885702232142856),
 (298, 6.879226041666666),
 (447, 6.846085615079364),
 (926, 6.844099454365079),
 (45, 6.843202232142857),
 (552, 6.842895287698411),
 (692, 6.82867296626984),
 (267, 6.813830009920634),
 (70, 6.8107268353174595),
 (547, 6.804378819444444),
 (476, 6.754755009920634),
 (477, 6.754755009920634),
 (615, 6.754755009920634),
 (758, 6

In [60]:
for job in scores[:50]:
    X = job[0]
    print("ID: ", X)
    print(data["job_title"].values[X], ", ", data["organization"].values[X])
    print(data["job_responsibilities"].values[X])
    print(data["job_summary"].values[X])
    print(data["required_skills"].values[X])
    print("================================")

ID:  767
Data Analyst ,  The Globe and Mail
· Performing data analyses against big data using SQL and NoSQL databases · Maintaining and enhancing our analytics reporting functionalities · Developing dashboards for stakeholders · Familiarize and understand current projects and document project details · Perform on-going project QA · Supporting the continued evolution of the Globe in Mail into a highly data driven business.
The Globe and Mail is seeking a Data Analyst to support leading edge data analytics projects to increase the Globe and Mail's competitive advantage. The Data Analyst will collaborate with the Digital & Data Science team to build new data-driven technologies to support business functions. Projects will include prototyping new features, enhancing and refining existing projects, and QA testing for production. Successful applicants will work in Lab 351, the Globe's own innovation lab, at our new downtown office, located in Toronto's Distillery District. Other benefits inc