In [70]:
from datetime import datetime
import sys
import time
import sqlite3
import pandas as pd
from gensim.parsing.preprocessing import strip_multiple_whitespaces, preprocess_string, remove_stopwords, strip_tags, strip_punctuation 
import gensim
import re

In [None]:
def initialize_db(db_name):
    connection = sqlite3.connect(db_name)
    connection.row_factory = sqlite3.Row

    cur = connection.cursor()
    cur.execute("SELECT * FROM Jobs")

    return connection, cur

def parse_job(row):
    return {
        'organization' : row['organization'],
        'job_title' : row['job_title'],
        'number_of_openings' : row['number_of_openings'],
        'address_line1' : row['address_line1'],
        'city' : row['city'],
        'province_state' : row['province_state'],
        'postal_code' : row['postal_code'],
        'job_summary' : row['job_summary'],
        'job_responsibilities' : row['job_responsibilities'],
        'required_skills' : row['required_skills'],
        'targeted_disciplines' : row['targeted_disciplines'].replace('\r', '').replace('\t', ' ').replace('\n', '').replace('Targeted Clusters', '')
    }

In [4]:
connection, cur = initialize_db("data/fall2017/jobs.db")

OperationalError: unable to open database file

In [None]:
count = 0
jobs = []
while True:
    count += 1
    row = cur.fetchone()

    if row is None:
        print("========================== All records processed ============================")
        break

    job = parse_job(row)
    
    jobs.append(job)
    
print("Number of rows loaded: ", count)
connection.close()

In [None]:
#save to pandas dataframe
df = pd.DataFrame.from_dict(jobs, orient="columns")

In [None]:
df.to_parquet('data/data.parquet', compression="gzip", engine='fastparquet')

In [61]:
#load
data = pd.read_parquet("data.parquet", engine="fastparquet")

In [86]:
print(data["organization"].values[971])
print(data["job_title"].values[971])
print(data["required_skills"].values[971])
print(len(data))

Shoplogix Inc
Front-end Developer
If you have a portfolio of work you've produced then we'd love to see it. We're looking for a highly motivated developer that takes pride in creating the best possible user experience Coding challenge: ++++++++++[>+>+++>+++++++>++++++++++<<<<-]>>>+++++++. >+. +++++++++. ++++++. -----------. ++++++. -. <<++. ++. >++. >+++++++. -. ----. -----------. +++++++++++++. ------------. +++++++++. +++. -----. <<. --. >>-------. +++++++++. +++. <<. >>---------. +++++. ++++++. ---------------. +++++++++++++. ++++. -------------. ----. ++++++++++++++++++. <<. >+++++++++++++++++++. >--------. -. +++++++. --. <<. >>---. -. ------. +++++. ++++++. -. <<++++++++++++++. --------------. . >-----------------------------. >+++++. ----. --. -----------------. <<. >>+. +++++++++++++. -. +++++++. --. <-----------. <. >>++++. . . <------------. >---. ---------------. ++++++++++++++. +. ----------------. +++++++++++. --. --------. <. >--. ++++++++++++. --. <+. >------------. ++++

In [63]:
def get_cat(row):
    if ("ENG - Software Engineering" in row["targeted_disciplines"]) and ("MATH - Computer Science" in row["targeted_disciplines"]):
        return True
    else:
        return False

In [131]:
def merge_duplicates(df):
    df = df.drop_duplicates(subset=['organization', 'job_title'], keep="first") 
    df = df.drop_duplicates(subset=['job_responsibilities', 'job_summary', 'required_skills'], keep="first")
    return df

In [132]:
def clean(x):
    newstr = strip_multiple_whitespaces(x)
    return newstr


In [133]:
#Clean data
data["job_responsibilities"] = data["job_responsibilities"].map(lambda x: clean(x))
data["job_summary"] = data["job_summary"].map(lambda x: clean(x))
data["required_skills"] = data["required_skills"].map(lambda x: clean(x))
data = merge_duplicates(data)

In [134]:
m = data.apply(get_cat, axis=1)
data = data[m]

In [135]:
data.head()

Unnamed: 0,address_line1,city,job_responsibilities,job_summary,job_title,number_of_openings,organization,postal_code,province_state,required_skills,targeted_disciplines
1,187 King Street South,Waterloo,Responsibilities -Under the direction of staff...,"Validus Research Inc., based in Waterloo, Onta...",Developer,1,Validus Group,N2J 1R1,Ontario,Essential Skills/Qualifications -Undergraduate...,- Theme - Business Administratio...
11,"144 Front Street West, Suite 685",Toronto,Peraso's Product Validation engineers are resp...,Company Profile Peraso is a fabless semiconduc...,Validation Engineering,1,Peraso Technologies Inc,M5J 2L7,Ontario,Minimum Qualifications: - Understanding of C/C...,- Theme - Computing: Information...
12,67 Yonge Street,Toronto,We are looking for a QA Automation Developer t...,"QuickTapSurvey, created by TabbleDabble helps ...",Automation Developer,1,QuickTapSurvey created by TabbleDabble Inc.,M5E 1J8,Ontario,You must have development skills that consist ...,- Theme - Computing: Information...
14,609 Kumpf Drive,Waterloo,- Build strong partnerships with software engi...,As part of the Global R&D; Software organizati...,Technical Writer,1,DEMATIC,N2V 1K8,Ontario,- Exceptional writing skills - Proficiency wit...,- Theme - Computing: Information...
15,,,RESPONSIBILITIES Review and develop understand...,Are you looking for the opportunity to be invo...,Manufacturing Software Development,1,Ford Motor Company of Canada Limited,,Ontario,ESSENTIAL SKILLS AND QUALIFICATIONS Working to...,- Theme - Computing: Hardware ...


In [136]:
#create corpus
def preprocess(x):
    CUSTOM_FILTERS = [lambda x: x.lower(), strip_tags, strip_punctuation, remove_stopwords]
    x = re.sub(r"http\S+", "", x)
    x = preprocess_string(x, CUSTOM_FILTERS)
    return x
    
def read_corpus():
    for i, (doc1, doc2, doc3) in enumerate(zip(data["job_responsibilities"].values, data["job_summary"].values, data["required_skills"].values)):
        doc = doc1 + " " + doc2 + " " + doc3
        yield gensim.models.doc2vec.TaggedDocument(preprocess(doc), [i]) #formerly gensim.utils.simple_preprocess

train_corpus = list(read_corpus())

In [137]:
model = gensim.models.doc2vec.Doc2Vec(vector_size=500, min_count=2, epochs=75)

In [138]:
model.build_vocab(train_corpus)

In [139]:
%time model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

CPU times: user 1min 55s, sys: 1.06 s, total: 1min 56s
Wall time: 42.4 s


In [143]:
resume = "Worked on proof-of-concept projects to demonstrate ML feasibility in key supply chain areas. \
Pre-processed time series data, trained forecasting models, and evaluated predictions. \
Mentored other interns on data science libraries, Jupyter, natural language processing, and ML best practices."
# resume = "Secured over $15k+ worth of sponsorship for over 6 events (1200+ attendees combined) since inception. \
# Created fully functional platform to collect hackathon data for analytics. \
# Hosted one of Canada’s first game development hackathons with Red Bull Canada: redbull.com/adrenalan "
# resume = "Experimented with state of the art NLP techniques to detect type of toxicity in online comments. \
# Experimented with LSTMs, word embeddings (Glove, FastText), and techniques such as sequence bucketing. \
# Fine-tuned and ensembled Bidirectional Encoder Representations from Transformers (BERT). "
# Integrated an “auto-swipe” feature into Tinder based on real-time brain EEG data collected by the Muse. \
# Tested on participants at Hack The 6ix 2017 and achieved a 75% accuracy. \
# Knowledgeable in: Git, Docker, Python (Pandas, Numpy, Scikit-Learn), Keras, Pytorch, Java, Javascript, VueJS, Firebase \
# "

print(gensim.utils.simple_preprocess(resume))
inferred_vector = model.infer_vector(preprocess(resume))
sims = model.docvecs.most_similar([inferred_vector], topn=30)

['worked', 'on', 'proof', 'of', 'concept', 'projects', 'to', 'demonstrate', 'ml', 'feasibility', 'in', 'key', 'supply', 'chain', 'areas', 'pre', 'processed', 'time', 'series', 'data', 'trained', 'forecasting', 'models', 'and', 'evaluated', 'predictions', 'mentored', 'other', 'interns', 'on', 'data', 'science', 'libraries', 'jupyter', 'natural', 'language', 'processing', 'and', 'ml', 'best', 'practices']


In [144]:
sims

[(61, 0.4429358243942261),
 (406, 0.3491681218147278),
 (440, 0.32728835940361023),
 (1002, 0.31627583503723145),
 (962, 0.3137148320674896),
 (540, 0.31088000535964966),
 (602, 0.30809494853019714),
 (994, 0.29312780499458313),
 (379, 0.2906227707862854),
 (483, 0.2863311171531677),
 (224, 0.28553855419158936),
 (213, 0.2854747474193573),
 (77, 0.2850501835346222),
 (1036, 0.2826984226703644),
 (335, 0.2771909832954407),
 (456, 0.274921178817749),
 (1033, 0.274872362613678),
 (127, 0.2719246745109558),
 (1078, 0.270953893661499),
 (573, 0.2705705761909485),
 (232, 0.2698262631893158),
 (199, 0.2687980532646179),
 (866, 0.2583879232406616),
 (611, 0.25711262226104736),
 (878, 0.25243133306503296),
 (642, 0.25061482191085815),
 (228, 0.24966290593147278),
 (1015, 0.24874156713485718),
 (413, 0.24649274349212646),
 (272, 0.24523122608661652)]

In [145]:
for job in sims:
    X = job[0]
    print("ID: ", X)
    print(data["job_title"].values[X], ", ", data["organization"].values[X])
    print(data["job_responsibilities"].values[X])
    print(data["job_summary"].values[X])
    print(data["required_skills"].values[X])
    print("================================")

ID:  61
Artificial Intelligence Research ,  ContextLogic
AI is not just a buzzword for Wish. It forms the foundational layer of everything we do. We treat data as a natural resource and the projects you work on will push the boundaries of AI at massive scale. You will get a great deal of responsibility for each project you work on, and will have the flexibility to influence design and execution. The field of AI is moving quickly and we need people who can adapt to build state of the art systems. Some of the most exciting areas you will work on are computer vision, NLP and deep reinforcement learning.
Our engineers move extremely fast, while solving unusual and challenging problems. They're smart, hands-on, and have a solid foundation in computer science, math or software engineering. They have strong competencies in data structures, algorithms and software design, and are able to contribute under a great deal of independence. They also have a strong desire and passion for hacking socia

In [109]:
#Get consistent results