In [60]:
from datetime import datetime
import sys
import time
import sqlite3
import pandas as pd
from gensim.parsing.preprocessing import strip_multiple_whitespaces
import gensim

In [2]:
def initialize_db(db_name):
    connection = sqlite3.connect(db_name)
    connection.row_factory = sqlite3.Row

    cur = connection.cursor()
    cur.execute("SELECT * FROM Jobs")

    return connection, cur

def parse_job(row):
    return {
        'organization' : row['organization'],
        'job_title' : row['job_title'],
        'number_of_openings' : row['number_of_openings'],
        'address_line1' : row['address_line1'],
        'city' : row['city'],
        'province_state' : row['province_state'],
        'postal_code' : row['postal_code'],
        'job_summary' : row['job_summary'],
        'job_responsibilities' : row['job_responsibilities'],
        'required_skills' : row['required_skills'],
        'targeted_disciplines' : row['targeted_disciplines'].replace('\r', '').replace('\t', ' ').replace('\n', '').replace('Targeted Clusters', '')
    }

In [3]:
connection, cur = initialize_db("data/fall2017/jobs.db")

In [4]:
count = 0
jobs = []
while True:
    count += 1
    row = cur.fetchone()

    if row is None:
        print("========================== All records processed ============================")
        break

    job = parse_job(row)
    
    jobs.append(job)
    
print("Number of rows loaded: ", count)
connection.close()

Number of rows loaded:  3933


In [5]:
#save to pandas dataframe
df = pd.DataFrame.from_dict(jobs, orient="columns")

In [7]:
df.to_parquet('data/data.parquet', compression="gzip", engine='fastparquet')

In [32]:
#load
data = pd.read_parquet("data/data.parquet")

In [33]:
data.head()

Unnamed: 0,address_line1,city,job_responsibilities,job_summary,job_title,number_of_openings,organization,postal_code,province_state,required_skills,targeted_disciplines
0,Canada Centre for Inland Waters,Burlington,Working with Water Resources staff within the ...,The Water Resources Student will work with Wat...,Water Resources Student,1,Environment and Climate Change Canada,L7S 1A1,Ontario,Preference given to Canadian Citizenship\r\nKn...,- Theme - Natural Resource Manag...
1,187 King Street South,Waterloo,Responsibilities\r\n-Under the direction of st...,"Validus Research Inc., based in Waterloo, Onta...",Developer,1,Validus Group,N2J 1R1,Ontario,Essential Skills/Qualifications\r\n-Undergradu...,- Theme - Business Administratio...
2,225 Wicksteed Avenue,Toronto,- Perform mechanical properties testing\r\n- ...,"Reporting to Chemist (Toronto), this position ...",Chemist's Assistant,1,Siltech Corporation,M4H 1G5,Ontario,- Excellent analytical skills\r\n- Excellent...,- Theme - Scientific Experimenta...
3,555 Richmond Street West,Toronto,Technical Tasks\r\n- Assist in the maintenance...,Synaptive Medical is a newly-formed company ba...,Systems Engineering Co-op (MRI),1,Synaptive Medical,M5V 3B1,Ontario,Required\r\n- Enrollment in Systems Design Eng...,- Theme - Medical Devices and Di...
4,3190 Steeles Avenue East,Markham,Summary: \r\nReporting to the Senior Project ...,Geranium is a medium sized well established bu...,Project Assistant,1,Geranium Corporation,L3R 1G9,Ontario,The Project Assistant should have: \r\n- Stro...,- Theme - Construction and Infra...


In [52]:
def clean(x):
    newstr = strip_multiple_whitespaces(x)
    return newstr


In [55]:
#Clean data
data["job_responsibilities"] = data["job_responsibilities"].map(lambda x: clean(x))
data["job_summary"] = data["job_summary"].map(lambda x: clean(x))
data["required_skills"] = data["required_skills"].map(lambda x: clean(x))

In [57]:
data["required_skills"].values[:3]

array(['Preference given to Canadian Citizenship Knowledge of water resources related issues, particularly related to the Great Lakes Knowledge of Microsoft Office software including Excel, Word and Powerpoint Knowledge of basic data and information management concepts including metadata Abilities - organized and thorough Abilities - communicate effectively both in writing and verbally Personal Suitability - work well with others Personal Suitability - adaptable Personal Suitability - critical thinking',
       'Essential Skills/Qualifications -Undergraduate and/or graduate students in computer science or other technical background with the relevant programming experience and aptitude -Experience with at least one of: Javascript, Java -Experience developing web based application. Familiar with HTML/CSS -Experience with SQL -Enthusiasm and strong work ethic -Ability to work in team environment Advantages -You will grow at much faster pace than working for FB, GO, AZ. You are not a numbe

In [98]:
#create corpus
def read_corpus():
    for i, (doc1, doc2, doc3) in enumerate(zip(data["job_responsibilities"].values, data["job_summary"].values, data["required_skills"].values)):
        doc = doc1 + " " + doc2 + " " + doc3
        yield gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(doc), [i])

train_corpus = list(read_corpus())

In [99]:
train_corpus[:2]

[TaggedDocument(words=['working', 'with', 'water', 'resources', 'staff', 'within', 'the', 'unit', 'the', 'job', 'would', 'require', 'organizing', 'existing', 'media', 'reports', 'tracking', 'new', 'media', 'reports', 'and', 'identifying', 'emerging', 'research', 'and', 'studies', 'related', 'to', 'the', 'impact', 'of', 'fluctuating', 'great', 'lakes', 'water', 'levels', 'on', 'key', 'socio', 'economic', 'and', 'environmental', 'outcomes', 'summarizing', 'emerging', 'findings', 'and', 'results', 'from', 'identified', 'media', 'reports', 'and', 'studies', 'documenting', 'and', 'reporting', 'important', 'findings', 'back', 'to', 'the', 'glam', 'committee', 'and', 'identifying', 'potential', 'strategies', 'for', 'undertaking', 'similar', 'work', 'in', 'the', 'future', 'contributing', 'to', 'additional', 'efforts', 'in', 'support', 'of', 'great', 'lakes', 'water', 'level', 'management', 'which', 'may', 'include', 'reviewing', 'and', 'applying', 'existing', 'modelling', 'tools', 'to', 'asses

In [100]:
model = gensim.models.doc2vec.Doc2Vec(vector_size=300, min_count=2, epochs=40)

In [101]:
model.build_vocab(train_corpus)

In [102]:
%time model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

CPU times: user 1min 33s, sys: 702 ms, total: 1min 34s
Wall time: 33.3 s


In [152]:
resume = "Secured over $15k+ worth of sponsorship for over 6 events (1200+ attendees combined) since inception. Hosted one of Canada’s first game development hackathons with Red Bull Canada: redbull.com/adrenalan"
# resume = "Machine Learning Intern Worked on proof-of-concept projects to demonstrate ML feasibility in key supply chain areas. \
# Pre-processed time series data, trained forecasting models, and evaluated predictions. \
# Mentored other interns on data science libraries, Jupyter, natural language processing, and ML best practices. \
# Secured over $15k+ worth of sponsorship for over 6 events (1200+ attendees combined) since inception. \
# Created fully functional platform to collect hackathon data for analytics. \
# Hosted one of Canada’s first game development hackathons with Red Bull Canada: redbull.com/adrenalan \
# Experimented with state of the art NLP techniques to detect type of toxicity in online comments. \
# Experimented with LSTMs, word embeddings (Glove, FastText), and techniques such as sequence bucketing. \
# Fine-tuned and ensembled Bidirectional Encoder Representations from Transformers (BERT). \
# Integrated an “auto-swipe” feature into Tinder based on real-time brain EEG data collected by the Muse. \
# Tested on participants at Hack The 6ix 2017 and achieved a 75% accuracy. \
# Knowledgeable in: Git, Docker, Python (Pandas, Numpy, Scikit-Learn), Keras, Pytorch, Java, Javascript, VueJS, Firebase \
# "

print(gensim.utils.simple_preprocess(resume))
inferred_vector = model.infer_vector(gensim.utils.simple_preprocess(resume))
sims = model.docvecs.most_similar([inferred_vector], topn=30)

['secured', 'over', 'worth', 'of', 'sponsorship', 'for', 'over', 'events', 'attendees', 'combined', 'since', 'inception', 'hosted', 'one', 'of', 'canada', 'first', 'game', 'development', 'hackathons', 'with', 'red', 'bull', 'canada', 'redbull', 'com', 'adrenalan']


In [153]:
sims

[(1863, 0.3852059841156006),
 (3793, 0.3828640580177307),
 (1282, 0.3617967963218689),
 (1431, 0.35404014587402344),
 (3785, 0.3507872223854065),
 (711, 0.3393417000770569),
 (1530, 0.338154137134552),
 (3402, 0.33568012714385986),
 (1182, 0.33392512798309326),
 (661, 0.3332894742488861),
 (2488, 0.33199015259742737),
 (955, 0.32992246747016907),
 (3892, 0.32974690198898315),
 (3581, 0.32532793283462524),
 (3609, 0.32360512018203735),
 (2324, 0.32139715552330017),
 (534, 0.3209558129310608),
 (1010, 0.32037433981895447),
 (3289, 0.318112313747406),
 (2507, 0.3170558214187622),
 (466, 0.3161781430244446),
 (1135, 0.3138604164123535),
 (2442, 0.3126385807991028),
 (142, 0.31148380041122437),
 (779, 0.31139060854911804),
 (627, 0.3058198094367981),
 (3847, 0.3057766556739807),
 (3048, 0.3040385842323303),
 (102, 0.30346807837486267),
 (854, 0.30213463306427)]

In [154]:
for job in sims:
    X = job[0]
    print(data["job_title"].values[X])
    print(data["job_responsibilities"].values[X])
    print(data["job_summary"].values[X])
    print(data["required_skills"].values[X])
    print("================================")

Finance Intern
Candidates must have a keen interest in the media and entertainment industry and a desire to explore non-traditional aspects of business. Candidates should be able to demonstrate a commitment to the Corus values of win together, think beyond, make it happen, learn every day and show we care. Not only should candidates perform given tasks to an exemplary level, they should also be able to identify needs within their business unit and offer ways in which they can exploit these opportunities.
Corus Entertainment is looking for one to three (1 - 3) finance interns to work with us during the winter of 2018. The candidate will participate in a rotational program throughout the company, working with a variety of different teams such as planning & analysis, risk & compliance, and accounting. This position is located in Corus's award-winning offices in downtown Toronto. About Corus Entertainment Inc. Corus Entertainment Inc. (TSX: CJR.B) is a leading media and content company tha