In [60]:
from datetime import datetime
import sys
import time
import sqlite3
import pandas as pd
from gensim.parsing.preprocessing import strip_multiple_whitespaces
import gensim

In [2]:
def initialize_db(db_name):
    connection = sqlite3.connect(db_name)
    connection.row_factory = sqlite3.Row

    cur = connection.cursor()
    cur.execute("SELECT * FROM Jobs")

    return connection, cur

def parse_job(row):
    return {
        'organization' : row['organization'],
        'job_title' : row['job_title'],
        'number_of_openings' : row['number_of_openings'],
        'address_line1' : row['address_line1'],
        'city' : row['city'],
        'province_state' : row['province_state'],
        'postal_code' : row['postal_code'],
        'job_summary' : row['job_summary'],
        'job_responsibilities' : row['job_responsibilities'],
        'required_skills' : row['required_skills'],
        'targeted_disciplines' : row['targeted_disciplines'].replace('\r', '').replace('\t', ' ').replace('\n', '').replace('Targeted Clusters', '')
    }

In [3]:
connection, cur = initialize_db("data/fall2017/jobs.db")

In [4]:
count = 0
jobs = []
while True:
    count += 1
    row = cur.fetchone()

    if row is None:
        print("========================== All records processed ============================")
        break

    job = parse_job(row)
    
    jobs.append(job)
    
print("Number of rows loaded: ", count)
connection.close()

Number of rows loaded:  3933


In [5]:
#save to pandas dataframe
df = pd.DataFrame.from_dict(jobs, orient="columns")

In [7]:
df.to_parquet('data/data.parquet', compression="gzip", engine='fastparquet')

In [32]:
#load
data = pd.read_parquet("data/data.parquet")

In [33]:
data.head()

Unnamed: 0,address_line1,city,job_responsibilities,job_summary,job_title,number_of_openings,organization,postal_code,province_state,required_skills,targeted_disciplines
0,Canada Centre for Inland Waters,Burlington,Working with Water Resources staff within the ...,The Water Resources Student will work with Wat...,Water Resources Student,1,Environment and Climate Change Canada,L7S 1A1,Ontario,Preference given to Canadian Citizenship\r\nKn...,- Theme - Natural Resource Manag...
1,187 King Street South,Waterloo,Responsibilities\r\n-Under the direction of st...,"Validus Research Inc., based in Waterloo, Onta...",Developer,1,Validus Group,N2J 1R1,Ontario,Essential Skills/Qualifications\r\n-Undergradu...,- Theme - Business Administratio...
2,225 Wicksteed Avenue,Toronto,- Perform mechanical properties testing\r\n- ...,"Reporting to Chemist (Toronto), this position ...",Chemist's Assistant,1,Siltech Corporation,M4H 1G5,Ontario,- Excellent analytical skills\r\n- Excellent...,- Theme - Scientific Experimenta...
3,555 Richmond Street West,Toronto,Technical Tasks\r\n- Assist in the maintenance...,Synaptive Medical is a newly-formed company ba...,Systems Engineering Co-op (MRI),1,Synaptive Medical,M5V 3B1,Ontario,Required\r\n- Enrollment in Systems Design Eng...,- Theme - Medical Devices and Di...
4,3190 Steeles Avenue East,Markham,Summary: \r\nReporting to the Senior Project ...,Geranium is a medium sized well established bu...,Project Assistant,1,Geranium Corporation,L3R 1G9,Ontario,The Project Assistant should have: \r\n- Stro...,- Theme - Construction and Infra...


In [52]:
def clean(x):
    newstr = strip_multiple_whitespaces(x)
    return newstr


In [55]:
#Clean data
data["job_responsibilities"] = data["job_responsibilities"].map(lambda x: clean(x))
data["job_summary"] = data["job_summary"].map(lambda x: clean(x))
data["required_skills"] = data["required_skills"].map(lambda x: clean(x))

In [57]:
data["required_skills"].values[:3]

array(['Preference given to Canadian Citizenship Knowledge of water resources related issues, particularly related to the Great Lakes Knowledge of Microsoft Office software including Excel, Word and Powerpoint Knowledge of basic data and information management concepts including metadata Abilities - organized and thorough Abilities - communicate effectively both in writing and verbally Personal Suitability - work well with others Personal Suitability - adaptable Personal Suitability - critical thinking',
       'Essential Skills/Qualifications -Undergraduate and/or graduate students in computer science or other technical background with the relevant programming experience and aptitude -Experience with at least one of: Javascript, Java -Experience developing web based application. Familiar with HTML/CSS -Experience with SQL -Enthusiasm and strong work ethic -Ability to work in team environment Advantages -You will grow at much faster pace than working for FB, GO, AZ. You are not a numbe

In [59]:
resume = "Machine Learning Intern Worked on proof-of-concept projects to demonstrate ML feasibility in key supply chain areas. Pre-processed time series data, trained forecasting models, and evaluated predictions. Mentored other interns on data science libraries, Jupyter, natural language processing, and ML best practices."

In [65]:
#create corpus
def read_corpus():
    for i, doc in enumerate(data["required_skills"].values):
        yield gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(doc), [i])

train_corpus = list(read_corpus())

In [66]:
train_corpus[:2]

[TaggedDocument(words=['preference', 'given', 'to', 'canadian', 'citizenship', 'knowledge', 'of', 'water', 'resources', 'related', 'issues', 'particularly', 'related', 'to', 'the', 'great', 'lakes', 'knowledge', 'of', 'microsoft', 'office', 'software', 'including', 'excel', 'word', 'and', 'powerpoint', 'knowledge', 'of', 'basic', 'data', 'and', 'information', 'management', 'concepts', 'including', 'metadata', 'abilities', 'organized', 'and', 'thorough', 'abilities', 'communicate', 'effectively', 'both', 'in', 'writing', 'and', 'verbally', 'personal', 'suitability', 'work', 'well', 'with', 'others', 'personal', 'suitability', 'adaptable', 'personal', 'suitability', 'critical', 'thinking'], tags=[0]),
 TaggedDocument(words=['essential', 'skills', 'qualifications', 'undergraduate', 'and', 'or', 'graduate', 'students', 'in', 'computer', 'science', 'or', 'other', 'technical', 'background', 'with', 'the', 'relevant', 'programming', 'experience', 'and', 'aptitude', 'experience', 'with', 'at',

In [None]:
model = gensim.models.doc2vec.Doc2Vec(vector_size=100, min_count=2, epochs=40)

In [None]:
model.build_vocab(train_corpus)