In [115]:
import numpy as np
import pandas as pd
import subprocess
import argparse
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from bs4 import BeautifulSoup
import pickle

import re

def clean_html(raw_html):
    """Remove HTML/XML tags (every non-greedy ``<...>`` match) from text.

    Parameters
    ----------
    raw_html : str, pandas.Series or pandas.DataFrame
        A single string, or a pandas container whose string cells should be
        cleaned.

    Returns
    -------
    str, pandas.Series or pandas.DataFrame
        The same kind of object as the input with tags stripped from every
        string value. Non-string values (e.g. missing values, numbers) are
        passed through unchanged.
    """
    tag_pattern = re.compile('<.*?>')

    def _strip(value):
        # Only strings can contain markup; leave everything else untouched.
        return tag_pattern.sub('', value) if isinstance(value, str) else value

    if isinstance(raw_html, pd.DataFrame):
        # Column-wise Series.map keeps this working across pandas versions.
        return raw_html.apply(lambda column: column.map(_strip))
    if isinstance(raw_html, pd.Series):
        return raw_html.map(_strip)
    return _strip(raw_html)

# 1. CareerBuilder jobs processing

In [28]:
# Base data directory (relative to this notebook) and the dataset sub-folder.
path = "../../data/"
dataset = "cb12/"

# One folder per processing stage, all located under <path><dataset>.
raw_path, interim_path, processed_path = (
    f"{path}{dataset}{stage}/" for stage in ("raw", "interim", "processed")
)

In [101]:
# Read data
def _read_tsv_skipping_bad_lines(file_path):
    """Read a tab-separated file with a header row, warning about and skipping malformed rows.

    ``error_bad_lines`` was deprecated in pandas 1.3 and removed in 2.0 in
    favour of ``on_bad_lines``; the keyword is chosen from what the installed
    ``pd.read_csv`` accepts so the cell runs on both old and new versions.
    """
    import inspect

    if "on_bad_lines" in inspect.signature(pd.read_csv).parameters:
        # "warn" reports each malformed row and skips it, matching the old
        # error_bad_lines=False behaviour (warn_bad_lines defaulted to True).
        return pd.read_csv(file_path, header=0, sep='\t', on_bad_lines='warn')
    return pd.read_csv(file_path, header=0, sep='\t', error_bad_lines=False)


# Load the raw postings, normalise the column names and index the frame by
# the job identifier.
jobs = (
    _read_tsv_skipping_bad_lines(raw_path + "jobs.tsv")
    .rename(columns={"JobID": "item_id", "State": "state", "Country": "country", "City": "city", "Zip5": "zip5"})
    .set_index("item_id")
)

b'Skipping line 122433: expected 11 fields, saw 12\n'
b'Skipping line 602576: expected 11 fields, saw 12\n'
b'Skipping line 990950: expected 11 fields, saw 12\n'
  interactivity=interactivity, compiler=compiler, result=result)


In [103]:
def _strip_markup(text):
    """Remove markup residue from one free-text field.

    Applied in order: drop HTML tags, drop the literal two-character sequences
    backslash-r and backslash-n, turn ``&nbsp;`` into a space, collapse runs
    of em dashes into a space, and turn ``/`` into a space.
    """
    text = re.sub('<[^<]+?>', '', text)
    text = re.sub('\\\\r', '', text)
    text = re.sub('\\\\n', '', text)
    text = re.sub('&nbsp;', ' ', text)
    text = re.sub('[—]+', ' ', text)
    text = re.sub('/', ' ', text)
    return text


# Missing text becomes "", is cleaned, then lower-cased. The result is
# assigned back to the column instead of calling fillna(inplace=True) on a
# column selection, which does not update the frame under pandas
# copy-on-write.
# NOTE(review): only &nbsp; is handled here; terms such as "bull", "ndash"
# and "rsquo" in the topic listings further down suggest other HTML entities
# survive this step - confirm whether they should be unescaped as well.
jobs['Requirements'] = jobs['Requirements'].fillna("").map(_strip_markup).str.lower()
jobs['Description'] = jobs['Description'].fillna("").map(_strip_markup).str.lower()
# Titles only get the slash replacement, not the full markup clean-up.
jobs['Title'] = jobs['Title'].fillna("").map(lambda x: re.sub('/', ' ', x)).str.lower()

jobs.head()

Unnamed: 0_level_0,WindowID,Title,Description,Requirements,city,state,country,zip5,StartDate,EndDate
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,1,security engineer technical lead,security clearance required: top secret job n...,skill set network security tools: webdefend we...,Washington,DC,US,20531.0,2012-03-07 13:17:01.643,2012-04-06 23:59:59
4,1,sap business analyst wm,no corp. to corp resumes are being considered ...,what you need: four year college degreeminimum...,Charlotte,NC,US,28217.0,2012-03-21 02:03:44.137,2012-04-20 23:59:59
7,1,p t human resources assistant,p t human resources assistant 1-2 ye...,please refer to the job description to view th...,Winter Park,FL,US,32792.0,2012-03-02 16:36:55.447,2012-04-01 23:59:59
8,1,route delivery drivers,city beverages come to work for the best in th...,please refer to the job description to view th...,Orlando,FL,US,,2012-03-03 09:01:10.077,2012-04-02 23:59:59
9,1,housekeeping,i make sure every part of their day is magica...,please refer to the job description to view th...,Orlando,FL,US,,2012-03-03 09:01:11.88,2012-04-02 23:59:59


In [104]:
# Persist the cleaned postings as a tab-separated file, then display the
# number of distinct requirement texts.
jobs.to_csv(f"{interim_path}jobs_cleaned.tsv", sep="\t")
jobs["Requirements"].unique().size

513498

In [105]:
# Report how many distinct values each location column holds
# (a missing value counts as one distinct value in `unique()`).
print("\n".join(
    f"Unique {label}: {jobs[column].unique().size}"
    for label, column in (
        ("cities", "city"),
        ("states", "state"),
        ("zip codes", "zip5"),
        ("countries", "country"),
    )
))

Unique cities: 11075
Unique states: 61
Unique zip codes: 33832
Unique countries: 67


In [106]:
def display_topics(model, feature_names, no_top_words):
    """Print, for every topic of a fitted model, its highest-weighted terms.

    Parameters
    ----------
    model : object exposing ``components_``
        One row of term weights per topic.
    feature_names : sequence of str
        Term for each column index of ``components_``.
    no_top_words : int
        How many terms to list per topic, heaviest first.
    """
    for topic_idx, weights in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        # argsort is ascending, so reverse it to get the heaviest terms first.
        ranked_columns = weights.argsort()[::-1]
        top_terms = [feature_names[col] for col in ranked_columns[:no_top_words]]
        print(" ".join(top_terms))
        
def calc_lda(df, no_features = 1000, no_topics = 20, no_top_words = 5):
    """Fit an LDA topic model on a collection of texts and print its topics.

    Parameters
    ----------
    df : iterable of str
        The documents to model (in this notebook, one text column of ``jobs``).
    no_features : int
        Maximum vocabulary size kept by the ``CountVectorizer``.
    no_topics : int
        Number of LDA topics (``n_components``).
    no_top_words : int
        Number of top terms printed per topic.

    Returns
    -------
    (lda, tf_feature_names)
        The fitted ``LatentDirichletAllocation`` model and the vocabulary as a
        plain list of str, ordered by column index of the count matrix.
    """
    # LDA is a probabilistic graphical model, so it is fitted on raw term
    # counts rather than TF-IDF weights.
    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
    tf = tf_vectorizer.fit_transform(df)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the newer
    # accessor when present and convert its array to a list so the return
    # type stays the same on every version.
    if hasattr(tf_vectorizer, "get_feature_names_out"):
        tf_feature_names = tf_vectorizer.get_feature_names_out().tolist()
    else:
        tf_feature_names = tf_vectorizer.get_feature_names()

    # Run LDA (fixed random_state so repeated runs give the same topics)
    lda = LatentDirichletAllocation(n_components=no_topics, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(tf)

    display_topics(lda, tf_feature_names, no_top_words)
    return lda, tf_feature_names

In [107]:
# Topic model over the cleaned requirement texts; prints the topics as a side effect.
lda_req, tf_feature_names_req = calc_lda(df=jobs["Requirements"])

Topic 0:
requirements temp resident conditions car
Topic 1:
care license nurse health driver
Topic 2:
aflac companies company fortune quality
Topic 3:
ability management business team knowledge
Topic 4:
bull ndash matco 350 road
Topic 5:
experience years degree required preferred
Topic 6:
job description view refer requirements
Topic 7:
sales business insurance outside marketing
Topic 8:
experience systems development knowledge engineering
Topic 9:
professionals office industry half contact
Topic 10:
work opportunity time home retail
Topic 11:
able ability required work experience
Topic 12:
server responsible windows maintaining wireless
Topic 13:
accounting finance automotive franchise financial
Topic 14:
manager marketing sales management service
Topic 15:
job temp conditions characteristics pt
Topic 16:
benefits opportunity employer equal com
Topic 17:
maintenance mechanical electrical repair central
Topic 18:
skills experience ability communication strong
Topic 19:
fast state paced

In [108]:
# Topic model over the cleaned description texts; prints the topics as a side effect.
lda_desc, tf_feature_names_desc = calc_lda(df=jobs["Description"])

Topic 0:
care patient health medical patients
Topic 1:
position candidates experience resume com
Topic 2:
duties information assist support reports
Topic 3:
benefits dental company paid medical
Topic 4:
equipment safety work procedures duties
Topic 5:
career technology company professional ll
Topic 6:
automotive driver truck residents property
Topic 7:
rsquo training work team career
Topic 8:
management team ensure performance development
Topic 9:
business marketing company market products
Topic 10:
food states united offices great
Topic 11:
customer customers service products needs
Topic 12:
sales business products new selling
Topic 13:
project bull financial business accounting
Topic 14:
technical design systems software development
Topic 15:
com www construction visit online
Topic 16:
staffing contract aerotek specialist job
Topic 17:
experience skills ability work required
Topic 18:
store manager retail customer associates
Topic 19:
client clients services financial solutions


In [109]:
# Topic model over the cleaned job titles; prints the topics as a side effect.
lda_title, tf_feature_names_title = calc_lda(df=jobs["Title"])

Topic 0:
driver openings truck agent licensed
Topic 1:
sales representative associate project marketing
Topic 2:
support tech franchise teller controller
Topic 3:
nurse account registered cdl office
Topic 4:
retail director development clerk loan
Topic 5:
rn care occupational production prn
Topic 6:
engineer software design architect electrical
Topic 7:
technician maintenance lead technical field
Topic 8:
clinical needed certified auditor commercial
Topic 9:
service customer specialist consultant management
Topic 10:
financial services administrator accountant ii
Topic 11:
team designer product healthcare hr
Topic 12:
level entry restaurant quality job
Topic 13:
store advisor ft work tx
Topic 14:
assistant sr executive medical health
Topic 15:
analyst business mechanic travel days
Topic 16:
senior supervisor training operator shift
Topic 17:
manager time operations network branch
Topic 18:
coordinator therapist automotive home physical
Topic 19:
developer nursing java professional web


In [116]:
def _dump_pickle(obj, file_path):
    """Serialise ``obj`` to ``file_path`` with pickle protocol 4.

    The ``with`` block guarantees the file is flushed and closed, which the
    bare ``open(...)`` passed straight to ``pickle.dump`` did not.
    """
    with open(file_path, 'wb') as handle:
        pickle.dump(obj, handle, protocol=4)


# Store each fitted model next to its vocabulary (feature names).
_dump_pickle(lda_title, interim_path + "lda_title.model")
_dump_pickle(tf_feature_names_title, interim_path + "lda_title.fnames")

_dump_pickle(lda_desc, interim_path + "lda_desc.model")
_dump_pickle(tf_feature_names_desc, interim_path + "lda_desc.fnames")

_dump_pickle(lda_req, interim_path + "lda_req.model")
_dump_pickle(tf_feature_names_req, interim_path + "lda_req.fnames")

In [119]:
def _load_pickle(file_path):
    """Load and return the object pickled at ``file_path``, closing the file afterwards.

    SECURITY: unpickling can execute arbitrary code. Only load files that were
    written by this notebook, never files from an untrusted source.
    """
    with open(file_path, 'rb') as handle:
        return pickle.load(handle)


# Restore each fitted model together with its vocabulary (feature names).
lda_title = _load_pickle(interim_path + "lda_title.model")
tf_feature_names_title = _load_pickle(interim_path + "lda_title.fnames")

lda_desc = _load_pickle(interim_path + "lda_desc.model")
tf_feature_names_desc = _load_pickle(interim_path + "lda_desc.fnames")

lda_req = _load_pickle(interim_path + "lda_req.model")
tf_feature_names_req = _load_pickle(interim_path + "lda_req.fnames")

#display_topics(lda_title, tf_feature_names_title, 10)
#display_topics(lda_desc, tf_feature_names_desc, 10)
#display_topics(lda_req, tf_feature_names_req, 10)

In [183]:
def apply_topics(text, model, feature_names, fn_dict):
    """Return the index of the topic that best matches ``text``.

    Each topic is scored by summing its weights for the vocabulary terms that
    occur in ``text``; a term counts once however often it appears. The
    highest-scoring topic wins, and the earliest topic wins ties.

    Parameters
    ----------
    text : str
        Document to classify; tokenised with the pattern ``[\\w']+``.
    model : object exposing ``components_``
        One row of term weights per topic.
    feature_names : sequence of str
        Vocabulary terms of the model.
    fn_dict : dict
        Maps each vocabulary term to its column index in ``components_``.

    Returns
    -------
    int or None
        Index of the best topic. When no vocabulary term occurs in ``text``
        every topic scores 0.0, so the first topic (index 0) is returned.
        ``None`` is returned only if no topic scores above -1.0, e.g. when
        ``components_`` is empty.
    """
    # A set makes each membership test O(1); testing against the token list
    # cost O(len(words)) for every vocabulary term.
    words = set(re.findall(r"[\w']+", text))

    # Resolve the column index of every matching term once, rather than once
    # per topic. Iterating feature_names keeps the original summation order.
    overlap_columns = [fn_dict[value] for value in feature_names if value in words]

    max_sum_t = -1.0
    max_topic_idx = None
    for topic_idx, topic in enumerate(model.components_):
        sum_t = 0.0
        for column in overlap_columns:
            sum_t += topic[column]

        # Strict comparison: on ties the earliest topic is kept.
        if sum_t > max_sum_t:
            max_sum_t = sum_t
            max_topic_idx = topic_idx

    return max_topic_idx

        
def gen_fn_dict(feature_names):
    """Build a lookup from each feature name to its position in ``feature_names``.

    If a name occurs more than once, the position of its last occurrence is kept.
    """
    return {name: position for position, name in enumerate(feature_names)}

# Term -> column-index lookups for each of the three vocabularies.
fn_dict_req = gen_fn_dict(tf_feature_names_req)
fn_dict_desc = gen_fn_dict(tf_feature_names_desc)
fn_dict_title = gen_fn_dict(tf_feature_names_title)


# Label every posting with its best-matching topic per text field.
# Series.map is used because each label depends on a single column; the
# row-wise DataFrame.apply(axis=1) built a full row object per posting only
# to read one field from it.
jobs['ReqTopic'] = jobs['Requirements'].map(
    lambda text: apply_topics(text, lda_req, tf_feature_names_req, fn_dict_req)
)
jobs['DescTopic'] = jobs['Description'].map(
    lambda text: apply_topics(text, lda_desc, tf_feature_names_desc, fn_dict_desc)
)
jobs['TitTopic'] = jobs['Title'].map(
    lambda text: apply_topics(text, lda_title, tf_feature_names_title, fn_dict_title)
)


jobs.head()

Unnamed: 0_level_0,WindowID,Title,Description,Requirements,city,state,country,zip5,StartDate,EndDate,ReqTopic,DescTopic,TitTopic
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,1,security engineer technical lead,security clearance required: top secret job n...,skill set network security tools: webdefend we...,Washington,DC,US,20531.0,2012-03-07 13:17:01.643,2012-04-06 23:59:59,18,8,6
4,1,sap business analyst wm,no corp. to corp resumes are being considered ...,what you need: four year college degreeminimum...,Charlotte,NC,US,28217.0,2012-03-21 02:03:44.137,2012-04-20 23:59:59,18,8,15
7,1,p t human resources assistant,p t human resources assistant 1-2 ye...,please refer to the job description to view th...,Winter Park,FL,US,32792.0,2012-03-02 16:36:55.447,2012-04-01 23:59:59,15,17,14
8,1,route delivery drivers,city beverages come to work for the best in th...,please refer to the job description to view th...,Orlando,FL,US,,2012-03-03 09:01:10.077,2012-04-02 23:59:59,15,3,3
9,1,housekeeping,i make sure every part of their day is magica...,please refer to the job description to view th...,Orlando,FL,US,,2012-03-03 09:01:11.88,2012-04-02 23:59:59,15,7,10


In [184]:
# Write the cleaned postings again, this time including the topic columns.
jobs.to_csv(f"{interim_path}jobs_cleaned.tsv", sep="\t")

# Keep only the location and topic columns for the item table.
# NOTE(review): this rebinds `jobs`, so re-running the cell fails because the
# dropped columns are already gone.
jobs = jobs.drop(
    ["WindowID", "Title", "Description", "Requirements", "StartDate", "EndDate"],
    axis=1,
)

# The file is tab-separated despite its .csv extension.
jobs.to_csv(f"{interim_path}items.csv", sep="\t")
jobs.shape[0]

1091923

In [185]:
# Preview the first five rows of the reduced item table.
jobs.head(n=5)

Unnamed: 0_level_0,city,state,country,zip5,ReqTopic,DescTopic,TitTopic
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,Washington,DC,US,20531.0,18,8,6
4,Charlotte,NC,US,28217.0,18,8,15
7,Winter Park,FL,US,32792.0,15,17,14
8,Orlando,FL,US,,15,3,3
9,Orlando,FL,US,,15,7,10
