In [177]:
import numpy as np
import pandas as pd
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string
import re

# PROOF OF CONCEPT - LDA TOPIC MODELING - JOBS DATA


This jupyter notebook provides code showing a potential implementation of LDA for JOBS data.  A dictionary of words/documents based on the job description and title is created.  A topic model is created by factorizing an overall document/word matrix into separate document/word - topic/word matrices.  Extensive validation/ideal topic number & word number were carried out to ensure that enough/not too many topics are returned/matched given a previous click.  The ultimate utilization would be for a single city or cross-city matching environment.

jobs_move = pd.read_csv('jobs.csv')

## Data cleaning
Stop words removed.  Headings more easily identified.  Floats removed.  

In [179]:
jobs_move.columns = ['ID1', 'ID2', 'City', 'Job_ID', 'Title', 'URL', 'Company', 'Snippet', 'State', 'Date_posted']


In [180]:
for j in range(len(jobs_move.Snippet)):

    if type(jobs_move.iloc[j,3]) == float:
        jobs_move.iloc[j,3] = 'blank'

In [181]:
jobs_move['Combined'] = jobs_move.Snippet + jobs_move.Title

In [182]:
jobs_move['Combined'].replace(regex=True,inplace=True,to_replace=r'\W',value=r' ')

In [183]:
jobs_move['Combined'].replace(regex=True,inplace=True,to_replace=r'\d',value=r' ')

In [184]:
document_set = jobs_move.Combined

In [185]:
document_set[0]

'    years experience building  b front  b   b end  b  applications  Participate in the definition and roadmap of the new features and functionalities for partners    Front End Engineer'

Lemmatization and splitting of words

In [186]:

stop = set(stopwords.words('english'))
exclude = set(string.punctuation) 
lemma = WordNetLemmatizer()
def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

doc_clean = [clean(doc).split() for doc in document_set]   


In [187]:
doc_final = []
for i in range(len(doc_clean)):
    s = doc_clean[i]
    y = [word for word in s if len(word) >= 2]
    doc_final.append(y)

In [211]:
jobs_move

Unnamed: 0,ID1,ID2,City,Job_ID,Title,URL,Company,Snippet,State,Date_posted,Combined,cleaned,topic_probs
0,0,13503,New York,8b45ea8bc3872013,Front End Engineer,http://www.indeed.com/viewjob?jk=8b45ea8bc3872...,Dailymotion,3-5 years experience building <b>front</b> <b>...,NY,"Fri, 07 Apr 2017 03:10:12 GMT",years experience building b front b b ...,"[year, experience, building, front, end, appli...","[6, 14, 139, 221, 274, 300, 326, 446, 487, 620..."
1,1,13504,New York,1fbdfac9fada8fb3,Frontend Engineer,http://www.indeed.com/viewjob?jk=1fbdfac9fada8...,Clark,Clark (www.hiclark.com) is looking for an enth...,NY,"Tue, 04 Apr 2017 15:15:15 GMT",Clark www hiclark com is looking for an enth...,"[clark, www, hiclark, com, looking, enthusiast...","[63, 211, 472, 487, 522, 527, 620, 628]"
2,2,13505,New York,ecab36c84848c578,Freelance Front-End Engineer,http://www.indeed.com/viewjob?jk=ecab36c84848c...,"Fluid, Inc",You will be experienced or eager to gain exper...,NY,"Fri, 07 Apr 2017 01:12:27 GMT",You will be experienced or eager to gain exper...,"[experienced, eager, gain, experience, end, en...","[57, 99, 129, 287, 311, 400, 412, 443, 459, 48..."
3,3,13506,New York,77c88d636976d818,"Front End Software Engineer, Category Experien...",http://www.indeed.com/viewjob?jk=77c88d636976d...,fiverr,3+ years experience in client facing <b>front<...,NY,"Thu, 09 Mar 2017 23:26:06 GMT",years experience in client facing b front ...,"[year, experience, client, facing, front, end,...","[69, 139, 188, 266, 267, 499, 620, 664, 752]"
4,4,13507,New York,da3b04c1dc96fcd5,Software Engineer (Front End),http://www.indeed.com/viewjob?jk=da3b04c1dc96f...,CommonBond,<b>Front</b> <b>End</b> <b>Engineers</b> at Co...,NY,"Thu, 06 Apr 2017 05:56:02 GMT",b Front b b End b b Engineers b at Co...,"[front, end, engineer, commonbond, create, fas...","[96, 296, 362, 499, 512, 535, 564, 620, 752, 765]"
5,5,13508,New York,8085fa37f700d40c,Front-End Software Engineer,http://www.indeed.com/viewjob?jk=8085fa37f700d...,Everyday Health,Years of experience with <b>front</b> <b>end</...,NY,"Wed, 05 Apr 2017 20:39:21 GMT",Years of experience with b front b b end ...,"[year, experience, front, end, development, au...","[88, 139, 178, 221, 267, 362, 544, 566, 620, 752]"
6,6,13509,New York,83daa7f70ab5f0dc,Front-end Engineer,http://www.indeed.com/viewjob?jk=83daa7f70ab5f...,Transfix.io,"As a <b>Front</b>-<b>end</b> <b>Engineer</b>, ...",NY,"Sun, 26 Feb 2017 11:34:54 GMT",As a b Front b b end b b Engineer b ...,"[front, end, engineer, building, world, class,...","[15, 134, 187, 243, 487, 620, 646]"
7,7,13510,New York,b50d6d7c46e38929,Front-End Engineer,http://www.indeed.com/viewjob?jk=b50d6d7c46e38...,Call9,You are capable of iterating prototypes quickl...,NY,"Thu, 06 Apr 2017 20:44:19 GMT",You are capable of iterating prototypes quickl...,"[capable, iterating, prototype, quickly, worki...","[57, 282, 302, 503, 507, 579, 620, 631]"
8,8,13511,New York,4c2213444bc6ca9d,Frontend Software Engineer,http://www.indeed.com/viewjob?jk=4c2213444bc6c...,Collage.com,We are seeking a senior-level <b>front</b>-<b>...,NY,"Mon, 27 Mar 2017 23:03:38 GMT",We are seeking a senior level b front b b ...,"[seeking, senior, level, front, end, software,...","[50, 54, 84, 115, 214, 241, 270, 300, 329, 571..."
9,9,13512,New York,5cc36a513a9837b3,Front End Engineer,http://www.indeed.com/viewjob?jk=5cc36a513a983...,Bitly,As a <b>Front</b> <b>End</b> <b>Engineer</b> a...,NY,"Fri, 31 Mar 2017 20:05:48 GMT",As a b Front b b End b b Engineer b a...,"[front, end, engineer, bitly, bitly, seeking, ...","[12, 15, 100, 141, 571, 620, 641, 685, 765]"


In [189]:
jobs_move['cleaned'] = doc_final

### Dictionary creation
Dictionary creation based on split words.  Also doc matrix found.

In [190]:
# Importing Gensim
import gensim
from gensim import corpora

# Creating the term dictionary of our courpus, where every unique term is assigned an index. dictionary = corpora.Dictionary(doc_clean)

dictionary = corpora.Dictionary(doc_final)


# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_final]

In [191]:
jobs_move

Unnamed: 0,ID1,ID2,City,Job_ID,Title,URL,Company,Snippet,State,Date_posted,Combined,cleaned
0,0,13503,New York,8b45ea8bc3872013,Front End Engineer,http://www.indeed.com/viewjob?jk=8b45ea8bc3872...,Dailymotion,3-5 years experience building <b>front</b> <b>...,NY,"Fri, 07 Apr 2017 03:10:12 GMT",years experience building b front b b ...,"[year, experience, building, front, end, appli..."
1,1,13504,New York,1fbdfac9fada8fb3,Frontend Engineer,http://www.indeed.com/viewjob?jk=1fbdfac9fada8...,Clark,Clark (www.hiclark.com) is looking for an enth...,NY,"Tue, 04 Apr 2017 15:15:15 GMT",Clark www hiclark com is looking for an enth...,"[clark, www, hiclark, com, looking, enthusiast..."
2,2,13505,New York,ecab36c84848c578,Freelance Front-End Engineer,http://www.indeed.com/viewjob?jk=ecab36c84848c...,"Fluid, Inc",You will be experienced or eager to gain exper...,NY,"Fri, 07 Apr 2017 01:12:27 GMT",You will be experienced or eager to gain exper...,"[experienced, eager, gain, experience, end, en..."
3,3,13506,New York,77c88d636976d818,"Front End Software Engineer, Category Experien...",http://www.indeed.com/viewjob?jk=77c88d636976d...,fiverr,3+ years experience in client facing <b>front<...,NY,"Thu, 09 Mar 2017 23:26:06 GMT",years experience in client facing b front ...,"[year, experience, client, facing, front, end,..."
4,4,13507,New York,da3b04c1dc96fcd5,Software Engineer (Front End),http://www.indeed.com/viewjob?jk=da3b04c1dc96f...,CommonBond,<b>Front</b> <b>End</b> <b>Engineers</b> at Co...,NY,"Thu, 06 Apr 2017 05:56:02 GMT",b Front b b End b b Engineers b at Co...,"[front, end, engineer, commonbond, create, fas..."
5,5,13508,New York,8085fa37f700d40c,Front-End Software Engineer,http://www.indeed.com/viewjob?jk=8085fa37f700d...,Everyday Health,Years of experience with <b>front</b> <b>end</...,NY,"Wed, 05 Apr 2017 20:39:21 GMT",Years of experience with b front b b end ...,"[year, experience, front, end, development, au..."
6,6,13509,New York,83daa7f70ab5f0dc,Front-end Engineer,http://www.indeed.com/viewjob?jk=83daa7f70ab5f...,Transfix.io,"As a <b>Front</b>-<b>end</b> <b>Engineer</b>, ...",NY,"Sun, 26 Feb 2017 11:34:54 GMT",As a b Front b b end b b Engineer b ...,"[front, end, engineer, building, world, class,..."
7,7,13510,New York,b50d6d7c46e38929,Front-End Engineer,http://www.indeed.com/viewjob?jk=b50d6d7c46e38...,Call9,You are capable of iterating prototypes quickl...,NY,"Thu, 06 Apr 2017 20:44:19 GMT",You are capable of iterating prototypes quickl...,"[capable, iterating, prototype, quickly, worki..."
8,8,13511,New York,4c2213444bc6ca9d,Frontend Software Engineer,http://www.indeed.com/viewjob?jk=4c2213444bc6c...,Collage.com,We are seeking a senior-level <b>front</b>-<b>...,NY,"Mon, 27 Mar 2017 23:03:38 GMT",We are seeking a senior level b front b b ...,"[seeking, senior, level, front, end, software,..."
9,9,13512,New York,5cc36a513a9837b3,Front End Engineer,http://www.indeed.com/viewjob?jk=5cc36a513a983...,Bitly,As a <b>Front</b> <b>End</b> <b>Engineer</b> a...,NY,"Fri, 31 Mar 2017 20:05:48 GMT",As a b Front b b End b b Engineer b a...,"[front, end, engineer, bitly, bitly, seeking, ..."


### Topic Modelling
LDA model created.  Extensive topic and pass validation were done to ensure successful model.

In [192]:
# Creating the object for LDA model using gensim library
Lda = gensim.models.ldamodel.LdaModel

# Running and Trainign LDA model on the document term matrix.
ldamodel = Lda(doc_term_matrix, num_topics=800, id2word = dictionary, passes=50)

In [193]:
topics = ldamodel.print_topics(num_topics=800, num_words=5)
topics

[(0,
  u'0.401*"believe" + 0.056*"headspring" + 0.000*"texas" + 0.000*"role" + 0.000*"city"'),
 (1,
  u'0.141*"independent" + 0.129*"however" + 0.090*"leasing" + 0.077*"agent" + 0.077*"budgeting"'),
 (2,
  u'0.795*"day" + 0.082*"run" + 0.025*"engineer" + 0.019*"started" + 0.010*"sa"'),
 (3,
  u'0.328*"termination" + 0.163*"exit" + 0.118*"interview" + 0.055*"preventive" + 0.055*"military"'),
 (4,
  u'0.543*"case" + 0.192*"assistance" + 0.158*"bar" + 0.000*"manager" + 0.000*"lawyer"'),
 (5,
  u'0.911*"management" + 0.056*"experience" + 0.016*"team" + 0.000*"director" + 0.000*"interaction"'),
 (6,
  u'0.249*"warehousing" + 0.249*"functionality" + 0.146*"licensing" + 0.073*"relational" + 0.024*"roblox"'),
 (7,
  u'0.459*"head" + 0.165*"ca" + 0.064*"business" + 0.054*"support" + 0.042*"gap"'),
 (8,
  u'0.591*"vp" + 0.238*"operational" + 0.023*"edi" + 0.012*"trust" + 0.012*"adhering"'),
 (9,
  u'0.555*"administrator" + 0.079*"expectation" + 0.059*"bronx" + 0.021*"exceed" + 0.020*"thrives"'),

### Running of model

Individual topics found for each job.  Shown in final column below.

In [194]:
topic_holder = []

for j in range(len(jobs_move)):
    

    doc_new = [jobs_move.iloc[j, 11]]

# Creating the term dictionary of our courpus, where every unique term is assigned an index. dictionary = corpora.Dictionary(doc_clean)



# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
    doc_term_matrix2 = [dictionary.doc2bow(doc) for doc in doc_new]

    output = ldamodel[doc_term_matrix2]
    
    topic_holder.append(output)

In [195]:
jobs_move['topic_probs'] = 'hold'

for k in range(len(jobs_move)):
    jobs_move.topic_probs[k] = topic_holder[k][0]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [196]:
for m in range(len(jobs_move)):
    n = len(jobs_move.topic_probs[m])
    hold = jobs_move.topic_probs[m]
    jobs_move.topic_probs[m] = [hold[n][0] for n in range(n)]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [197]:
jobs_move.head()

Unnamed: 0,ID1,ID2,City,Job_ID,Title,URL,Company,Snippet,State,Date_posted,Combined,cleaned,topic_probs
0,0,13503,New York,8b45ea8bc3872013,Front End Engineer,http://www.indeed.com/viewjob?jk=8b45ea8bc3872...,Dailymotion,3-5 years experience building <b>front</b> <b>...,NY,"Fri, 07 Apr 2017 03:10:12 GMT",years experience building b front b b ...,"[year, experience, building, front, end, appli...","[6, 14, 139, 221, 274, 300, 326, 446, 487, 620..."
1,1,13504,New York,1fbdfac9fada8fb3,Frontend Engineer,http://www.indeed.com/viewjob?jk=1fbdfac9fada8...,Clark,Clark (www.hiclark.com) is looking for an enth...,NY,"Tue, 04 Apr 2017 15:15:15 GMT",Clark www hiclark com is looking for an enth...,"[clark, www, hiclark, com, looking, enthusiast...","[63, 211, 472, 487, 522, 527, 620, 628]"
2,2,13505,New York,ecab36c84848c578,Freelance Front-End Engineer,http://www.indeed.com/viewjob?jk=ecab36c84848c...,"Fluid, Inc",You will be experienced or eager to gain exper...,NY,"Fri, 07 Apr 2017 01:12:27 GMT",You will be experienced or eager to gain exper...,"[experienced, eager, gain, experience, end, en...","[57, 99, 129, 287, 311, 400, 412, 443, 459, 48..."
3,3,13506,New York,77c88d636976d818,"Front End Software Engineer, Category Experien...",http://www.indeed.com/viewjob?jk=77c88d636976d...,fiverr,3+ years experience in client facing <b>front<...,NY,"Thu, 09 Mar 2017 23:26:06 GMT",years experience in client facing b front ...,"[year, experience, client, facing, front, end,...","[69, 139, 188, 266, 267, 499, 620, 664, 752]"
4,4,13507,New York,da3b04c1dc96fcd5,Software Engineer (Front End),http://www.indeed.com/viewjob?jk=da3b04c1dc96f...,CommonBond,<b>Front</b> <b>End</b> <b>Engineers</b> at Co...,NY,"Thu, 06 Apr 2017 05:56:02 GMT",b Front b b End b b Engineers b at Co...,"[front, end, engineer, commonbond, create, fas...","[96, 296, 362, 499, 512, 535, 564, 620, 752, 765]"


## Proof of concept displayed
A random job description is chosen.  In this case a front end engineer.  Suggested jobs are then found by matching jobs with similar derived topic numbers.  In this case the resulting matches are jobs with similar skillsets, for example a machine learning engineer.  Please note that the jobs model was trained to allow far more topics and description matches then the events data.

In [198]:
test_index = np.random.choice(range(len(jobs_move)))

In [199]:
test_index

1065

In [202]:
jobs_move.Title[1065]

'Front End Engineer'

In [203]:
similar_topics = []
for p in jobs_move.topic_probs[test_index]:
    for q in range(len(jobs_move)):
        if q != test_index:
            if p in jobs_move.topic_probs[q]:
                similar_topics.append(q)
            
similar_topics = list((set(similar_topics)))

In [205]:
similar_topics

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 2085,
 38,
 39,
 2396,
 45,
 55,
 2740,
 2106,
 2107,
 2108,
 2109,
 2110,
 2111,
 64,
 65,
 2114,
 2115,
 2116,
 2117,
 2118,
 2119,
 2120,
 73,
 74,
 75,
 76,
 77,
 78,
 2127,
 2128,
 2129,
 82,
 83,
 84,
 85,
 86,
 697,
 89,
 91,
 2140,
 93,
 94,
 95,
 96,
 98,
 2147,
 2148,
 2150,
 103,
 2155,
 109,
 111,
 113,
 702,
 118,
 121,
 122,
 2173,
 2174,
 2175,
 128,
 2177,
 131,
 2180,
 133,
 134,
 2183,
 2184,
 2185,
 139,
 2188,
 2189,
 142,
 2191,
 2192,
 2193,
 2194,
 2195,
 2196,
 2197,
 161,
 2211,
 165,
 166,
 167,
 1052,
 2218,
 2223,
 2224,
 2225,
 2226,
 2227,
 2228,
 2230,
 2232,
 187,
 2237,
 190,
 192,
 197,
 2247,
 201,
 202,
 203,
 207,
 2265,
 37,
 2277,
 240,
 2289,
 248,
 2301,
 2303,
 2305,
 2309,
 2310,
 2311,
 2312,
 269,
 2323,
 280,
 1072,
 290,
 297,
 299,
 2380,
 2384,
 57,
 345,
 348,
 2400,
 

In [210]:
jobs_move.Title[2740]


'Machine Learning Engineer'