## Import necessary libraries

In [1]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from pattern import en
from scipy import spatial
import pickle

ModuleNotFoundError: ignored

## Global Variables

In [0]:
# Input CSV files, given as bare file names (resolved against the working
# directory by pandas).
data_repository = "candidates_2019_05_24_16_24.csv"  # read by get_resume_data()
job_repository = "indeed_job_dataset_1.csv"  # read by get_jobprofile_data()

# Artifacts written and re-read by the functions below.
word2vecModel = "resume_word2vec"  # saved Word2Vec model
word2vecResume = "resume_w2v_array"  # pickled list of (mean vector, row index)

## Functions

In [0]:
def get_resume_data(path=None):
    """Load the candidate CSV and add a combined ``whole_text`` column.

    ``whole_text`` holds every column of a row rendered as text and joined
    with single spaces; missing values contribute an empty string.

    Args:
        path: CSV file to read. Defaults to the module-level
            ``data_repository``.

    Returns:
        The loaded DataFrame with the extra ``whole_text`` column.
    """
    if path is None:
        path = data_repository
    dfdata = pd.read_csv(path)
    # Cast every cell to str before joining: Series.str.join returns NaN for
    # a row whose list contains any non-string item (e.g. a numeric column),
    # which would leave that resume without usable text.
    rows = dfdata.fillna('').astype(str).values.tolist()
    dfdata['whole_text'] = [' '.join(cells) for cells in rows]
    return dfdata

In [0]:
def get_jobprofile_data():
    """Read the job-posting CSV named by ``job_repository``.

    The frame is returned exactly as loaded; unlike ``get_resume_data`` no
    combined ``whole_text`` column is added.
    """
    return pd.read_csv(job_repository)

In [0]:
def create_w2v_model(dataset):
    """Train a Word2Vec model on the resume texts and save it to disk.

    All ``whole_text`` values are concatenated, lower-cased and parsed as a
    single corpus with ``pattern.en``. Each parsed sentence becomes one
    training sentence consisting of the lemmas of its chunk words tagged
    exactly ``NN`` or ``VB``.

    Args:
        dataset: DataFrame with a string ``whole_text`` column.

    Side effects:
        Saves the trained model to the path in ``word2vecModel``.
    """
    # Build the corpus with one join; appending to a string inside the row
    # loop re-copies the accumulated text on every iteration.
    alltext = ' ' + ''.join(' ' + text for text in dataset['whole_text'])
    alltext = alltext.lower()

    keep_tags = ('NN', 'VB')
    sentences = []
    for sentence in en.parsetree(alltext, tokenize=True, lemmata=True, tags=True):
        sentences.append([
            word.lemma
            for chunk in sentence.chunks
            for word in chunk.words
            if word.tag in keep_tags
        ])

    # NOTE(review): ``size=`` is the gensim 3.x keyword (renamed to
    # ``vector_size`` in gensim 4) - confirm against the installed version.
    model = Word2Vec(sentences, size=200, window=5, min_count=3, workers=4)
    model.save(word2vecModel)

In [0]:
def get_job_desc(skill, description):
    """Return the lemmas describing a job posting as one flat list.

    The list starts with the lemmas of the ``description`` words tagged
    exactly ``NN`` or ``VB``, followed by the lemmas of every word in
    ``skill`` regardless of tag. Only words that belong to a parsed chunk
    are considered.
    """
    parse_options = dict(tokenize=True, lemmata=True, tags=True)

    lemmas = [
        word.lemma
        for sentence in en.parsetree(description, **parse_options)
        for chunk in sentence.chunks
        for word in chunk.words
        if word.tag in ('NN', 'VB')
    ]
    # Skill words are kept unfiltered.
    lemmas.extend(
        word.lemma
        for sentence in en.parsetree(skill, **parse_options)
        for chunk in sentence.chunks
        for word in chunk.words
    )
    return lemmas

In [0]:
def create_w2v_resume(df_resume, model=None):
    """Embed each resume as the mean of its word vectors and pickle the result.

    Args:
        df_resume: DataFrame with a string ``whole_text`` column.
        model: Trained Word2Vec model to take vectors from. When omitted,
            the model saved at ``word2vecModel`` is loaded.

    Side effects:
        Prints one progress line per resume and pickles a list of
        ``(mean_vector, row_index)`` tuples to the path in
        ``word2vecResume``.
    """
    # Load the saved model (as get_recommend does) instead of relying on a
    # global ``model`` that nothing in this file defines.
    if model is None:
        model = Word2Vec.load(word2vecModel)

    D_w2v = []
    for index, row in df_resume.iterrows():
        print("Processing resume " + str(index))
        text = row['whole_text'].lower()
        vectors = []
        for sentence in en.parsetree(text, tokenize=True, lemmata=True, tags=True):
            for chunk in sentence.chunks:
                for word in chunk.words:
                    lemma = word.lemma
                    if lemma in model.wv.vocab:
                        vectors.append(model.wv[lemma])
                    elif lemma.lower() in model.wv.vocab:
                        vectors.append(model.wv[lemma.lower()])
        # np.mean of an empty list is NaN, so a resume with no in-vocabulary
        # word is stored with a NaN "vector".
        D_w2v.append((np.mean(vectors, axis=0), index))

    with open(word2vecResume, 'wb') as fp:
        pickle.dump(D_w2v, fp)

In [0]:
def get_recommend_initialrun(job_profile, df_resume):
    """Rank resumes against a job profile, computing resume vectors first.

    The job profile and every resume's ``whole_text`` are lower-cased and
    represented by the mean Word2Vec vector of their in-vocabulary lemmas.
    The resume vectors are also pickled to ``word2vecResume`` so that
    ``get_recommend`` can reuse them.

    Args:
        job_profile: Job profile text.
        df_resume: DataFrame with a string ``whole_text`` column.

    Returns:
        List of ``(cosine_similarity, row_index)`` tuples sorted in
        descending order.
    """
    job_profile = job_profile.lower()
    model = Word2Vec.load(word2vecModel)

    def mean_vector(text):
        # Average the vectors of all chunk words whose lemma (or its
        # lower-cased form) is in the model vocabulary.
        found = []
        for sentence in en.parsetree(text, tokenize=True, lemmata=True, tags=True):
            for chunk in sentence.chunks:
                for word in chunk.words:
                    lemma = word.lemma
                    if lemma in model.wv.vocab:
                        found.append(model.wv[lemma])
                    elif lemma.lower() in model.wv.vocab:
                        found.append(model.wv[lemma.lower()])
        return np.mean(found, axis=0)

    Q_w2v = mean_vector(job_profile)

    print("completed job profile screening")

    # Each document is represented by the average of its term vectors.
    D_w2v = []
    for index, row in df_resume.iterrows():
        print("Processing resume " + str(index))
        D_w2v.append((mean_vector(row['whole_text'].lower()), index))
    with open(word2vecResume, 'wb') as fp:
        pickle.dump(D_w2v, fp)

    # Score every document by cosine similarity to the query vector.
    retrieval = []
    for position, (doc_vector, row_index) in enumerate(D_w2v):
        print('Calulating cosine similarity for resume: ' + str(position))
        similarity = 1 - spatial.distance.cosine(Q_w2v, doc_vector)
        retrieval.append((similarity, row_index))
    retrieval.sort(reverse=True)
    return retrieval
    #with app.app_context(), app.test_request_context():
        #ret_data = {"cv1":url_for('static', filename="test/"+retrieval[0][1][retrieval[0][1].rfind('/')+1:]), "score1": str(round(retrieval[0][0], 4)), "cv2":url_for('static', filename="test/"+retrieval[1][1][retrieval[1][1].rfind('/')+1:]), "score2": str(round(retrieval[1][0], 4)),"cv3":url_for('static', filename="test/"+retrieval[2][1][retrieval[2][1].rfind('/')+1:]), "score3": str(round(retrieval[2][0], 4))   }
        #return jsonify(ret_data)
        #return ret_data

In [0]:
def get_recommend(job_profile, df_resume):
    """Rank the pre-computed resume vectors against a job profile.

    The job profile is lower-cased and represented by the mean Word2Vec
    vector of its in-vocabulary lemmas. Resume vectors are read from the
    pickle at ``word2vecResume``; ``df_resume`` is not used by this function.

    Args:
        job_profile: Job profile text.
        df_resume: Unused here; kept for a signature parallel to
            ``get_recommend_initialrun``.

    Returns:
        List of ``(cosine_similarity, row_index)`` tuples sorted in
        descending order.
    """
    job_profile = job_profile.lower()
    model = Word2Vec.load(word2vecModel)

    def in_vocab_vectors(text):
        # Yield the vector of each chunk word whose lemma (or its
        # lower-cased form) is in the model vocabulary.
        for sentence in en.parsetree(text, tokenize=True, lemmata=True, tags=True):
            for chunk in sentence.chunks:
                for word in chunk.words:
                    lemma = word.lemma
                    if lemma in model.wv.vocab:
                        yield model.wv[lemma]
                    elif lemma.lower() in model.wv.vocab:
                        yield model.wv[lemma.lower()]

    Q_w2v = np.mean(list(in_vocab_vectors(job_profile)), axis=0)

    # Each stored entry is (mean document vector, row index).
    # NOTE: pickle.load must only be used on files this code wrote itself.
    with open(word2vecResume, 'rb') as fp:
        D_w2v = pickle.load(fp)

    # Score every document by cosine similarity to the query vector.
    scored = [
        (1 - spatial.distance.cosine(Q_w2v, entry[0]), entry[1])
        for entry in D_w2v
    ]
    return sorted(scored, reverse=True)
    #with app.app_context(), app.test_request_context():
        #ret_data = {"cv1":url_for('static', filename="test/"+retrieval[0][1][retrieval[0][1].rfind('/')+1:]), "score1": str(round(retrieval[0][0], 4)), "cv2":url_for('static', filename="test/"+retrieval[1][1][retrieval[1][1].rfind('/')+1:]), "score2": str(round(retrieval[1][0], 4)),"cv3":url_for('static', filename="test/"+retrieval[2][1][retrieval[2][1].rfind('/')+1:]), "score3": str(round(retrieval[2][0], 4))   }
        #return jsonify(ret_data)
        #return ret_data

## Implementation

In [0]:
# Load the candidate CSV; the result includes the combined 'whole_text' column.
dfdata = get_resume_data()

In [0]:
# Load the job-posting CSV as-is.
dfjob = get_jobprofile_data()

In [0]:
# Train Word2Vec on the resume texts and save it to the path in word2vecModel.
create_w2v_model(dfdata)

In [0]:
#model = Word2Vec.load("resume_word2vec")

In [0]:
#if 'matlab' in word_vectors.vocab:
 #   print("Yes")
#else:
 #   print("No")

Yes


In [0]:
# Join skill and description with a space so the last skill token and the
# first description token are not fused into a single word.
job_profile = dfjob.Skill[0] + ' ' + dfjob.Description[0]

In [0]:
# Rank resumes against the job profile using the pickled resume vectors.
# NOTE(review): get_recommend reads the word2vecResume file, but no visible
# cell calls create_w2v_resume / get_recommend_initialrun to create it -
# confirm it exists before running this.
result = get_recommend(job_profile, dfdata)