## Import necessary libraries

In [1]:
import pandas as pd
import numpy as np
from pattern import en
from scipy import spatial
import pickle

## Global Variables

In [17]:
data_repository = 'candidates_2019_06_18_5_16.csv'
job_repository = 'indeed_job_dataset_1.csv'
gloveModel = 'glove.6B.200d.txt'
word2vecResume = 'resume_w2v_array'

## Functions

In [3]:
def get_resume_data():
    dfdata = pd.read_csv(data_repository)
    dfdata['whole_text'] = pd.Series(dfdata.fillna('').values.tolist()).str.join(' ')
    return dfdata

In [4]:
def get_jobprofile_data():
    dfjob = pd.read_csv(job_repository)
    #dfjob['whole_text'] = pd.Series(dfjob.fillna('').values.tolist()).str.join(' ')
    return dfjob

In [5]:
def loadGloveModel(gloveFile):
    #print("Loading Glove Model")
    f = open(gloveFile,'r',encoding="utf8")
    model = {}
    for line in f:
        splitLine = line.split()
        word = splitLine[0]
        embedding = np.array([float(val) for val in splitLine[1:]])
        model[word] = embedding
    #print("Done.",len(model)," words loaded!")
    return model

In [6]:
def get_job_desc(skill, description):
    vector = []
    for sentence in en.parsetree(description, tokenize=True, lemmata=True, tags=True):
        for chunk in sentence.chunks:
            for word in chunk.words:
                if word.tag == 'NN' or word.tag == 'VB':
                    vector.append(word.lemma)
    for sentence in en.parsetree(skill, tokenize=True, lemmata=True, tags=True):
        for chunk in sentence.chunks:
            for word in chunk.words:
                vector.append(word.lemma)
    return vector

In [8]:
def create_w2v_resume(df_resume, model):
    D_w2v = []
    for index, row in df_resume.iterrows():
        print("Processing resume " + str(index))
        yd = row['whole_text']
        w2v = []
        for sentence in en.parsetree(yd.lower(), tokenize=True, lemmata=True, tags=True):
            for chunk in sentence.chunks:
                for word in chunk.words:
                    if word.lemma in model.wv.vocab:
                        w2v.append(model.wv[word.lemma])
                    else:
                        if word.lemma.lower() in model.wv.vocab:
                            w2v.append(model.wv[word.lemma.lower()])
        D_w2v.append((np.mean(w2v, axis=0),index))
    with open(word2vecResume, 'wb') as fp:
        pickle.dump(D_w2v, fp)

In [9]:
def get_recommend_initialrun(job_profile, df_resume):  
    #data = request.args.get('value')
    w2v = []
    job_profile = job_profile.lower()
    model = Word2Vec.load(word2vecModel)
    for sentence in en.parsetree(job_profile, tokenize=True, lemmata=True, tags=True):
        for chunk in sentence.chunks:
            for word in chunk.words:
                if word.lemma in model.wv.vocab:
                    w2v.append(model.wv[word.lemma])
                else:
                    if word.lemma.lower() in model.wv.vocab:
                        w2v.append(model.wv[word.lemma.lower()])
    Q_w2v = np.mean(w2v, axis=0)
    
    print("completed job profile screening")
    
    # Example of document represented by average of each document term vectors.
    D_w2v = []
    for index, row in df_resume.iterrows():
        print("Processing resume " + str(index))
        yd = row['whole_text']
        w2v = []
        for sentence in en.parsetree(yd.lower(), tokenize=True, lemmata=True, tags=True):
            for chunk in sentence.chunks:
                for word in chunk.words:
                    if word.lemma in model.wv.vocab:
                        w2v.append(model.wv[word.lemma])
                    else:
                        if word.lemma.lower() in model.wv.vocab:
                            w2v.append(model.wv[word.lemma.lower()])
        D_w2v.append((np.mean(w2v, axis=0),index))
    with open(word2vecResume, 'wb') as fp:
        pickle.dump(D_w2v, fp)
    
    # Make the retrieval using cosine similarity between query and document vectors.
    retrieval = []
    for i in range(len(D_w2v)):
        print('Calulating cosine similarity for resume: ' + str(i))
        retrieval.append((1 - spatial.distance.cosine(Q_w2v, D_w2v[i][0]),D_w2v[i][1]))
    retrieval.sort(reverse=True)
    return retrieval
    #with app.app_context(), app.test_request_context():
        #ret_data = {"cv1":url_for('static', filename="test/"+retrieval[0][1][retrieval[0][1].rfind('/')+1:]), "score1": str(round(retrieval[0][0], 4)), "cv2":url_for('static', filename="test/"+retrieval[1][1][retrieval[1][1].rfind('/')+1:]), "score2": str(round(retrieval[1][0], 4)),"cv3":url_for('static', filename="test/"+retrieval[2][1][retrieval[2][1].rfind('/')+1:]), "score3": str(round(retrieval[2][0], 4))   }
        #return jsonify(ret_data)
        #return ret_data

In [22]:
def get_recommend(job_profile, df_resume):  
    #data = request.args.get('value')
    w2v = []
    job_profile = job_profile.lower()
    model = loadGloveModel(gloveModel)
    for sentence in en.parsetree(job_profile, tokenize=True, lemmata=True, tags=True):
        for chunk in sentence.chunks:
            for word in chunk.words:
                if word.string in model:
                    w2v.append(model[word.string])
                else:
                    if word.string.lower() in model:
                        w2v.append(model[word.string.lower()])
    Q_w2v = np.mean(w2v, axis=0)
    
    # Document represented by average of each document term vectors.
    #D_w2v = []
    with open(word2vecResume, 'rb') as fp:
        D_w2v = pickle.load(fp)
    
    # Make the retrieval using cosine similarity between query and document vectors.
    retrieval = []
    for i in range(len(D_w2v)):
        retrieval.append((1 - spatial.distance.cosine(Q_w2v, D_w2v[i][0]),D_w2v[i][1]))
    retrieval.sort(reverse=True)
    return retrieval
    #with app.app_context(), app.test_request_context():
        #ret_data = {"cv1":url_for('static', filename="test/"+retrieval[0][1][retrieval[0][1].rfind('/')+1:]), "score1": str(round(retrieval[0][0], 4)), "cv2":url_for('static', filename="test/"+retrieval[1][1][retrieval[1][1].rfind('/')+1:]), "score2": str(round(retrieval[1][0], 4)),"cv3":url_for('static', filename="test/"+retrieval[2][1][retrieval[2][1].rfind('/')+1:]), "score3": str(round(retrieval[2][0], 4))   }
        #return jsonify(ret_data)
        #return ret_data

## Implementation

In [11]:
dfdata = get_resume_data()

In [12]:
dfjob = get_jobprofile_data()

In [14]:
job_profile = dfjob.Skill[0] + dfjob.Description[0]

In [15]:
job_profile

'SAP, SQLPOSITION SUMMARY, \r\r\nThe Business Analyst role is the primary architect of reporting and dashboard solutions for internal and external clients. Utilizing ESI corporate standard development tools this position is responsible for the design, development, implementation, analysis, interpretation and communication of business information based on the needs of individual clients. The ability to balance overall aesthetics with robust and intuitive functionality is a critical requirement for success in this position., \r\r\nESSENTIAL FUNCTIONS, \r\r\nSuccessfully design and implement external client data reporting and dashboard solutions with a strong focus on product aesthetics and functionality.\r\r\nAid in the design, development, and implementation of new product ideas for external and internal clients.\r\r\nMaintain Live and Data Warehouse Business Objects Universes; add new fields, modify table joins, implement data structures that streamline report extraction and data analy

In [23]:
result = get_recommend(job_profile, dfdata)

In [24]:
result

[(0.02495492514869324, 197),
 (0.023483383519039003, 163),
 (0.0166733759498382, 301),
 (0.010417911714156736, 130),
 (0.0051756439306798185, 446),
 (0.002494362051347232, 548),
 (0.0022790201604862004, 139),
 (0.0008207192126068463, 585),
 (-0.0014704829963698707, 934),
 (-0.001832588085251663, 603),
 (-0.005009771332711965, 380),
 (-0.005498282317270897, 440),
 (-0.006448862692151058, 298),
 (-0.008971083885608522, 720),
 (-0.008976726506396249, 826),
 (-0.009068948092011864, 210),
 (-0.010428928385432634, 688),
 (-0.010428928385432634, 687),
 (-0.010428928385432634, 686),
 (-0.010428928385432634, 685),
 (-0.010428928385432634, 684),
 (-0.010428928385432634, 682),
 (-0.010428928385432634, 680),
 (-0.011276571584181827, 600),
 (-0.011464423799754098, 249),
 (-0.013963458178141153, 576),
 (-0.014697932822672177, 772),
 (-0.014974107168501227, 567),
 (-0.015267036484360563, 72),
 (-0.015281647937861242, 113),
 (-0.015821198239177603, 491),
 (-0.01640862752398875, 751),
 (-0.018060074261

In [18]:
dfdata.iloc[197]

First Name                                                Anthony
Last Name                                                  Birley
Position        Lead Consultant - Data Science, Big Data, BI a...
Company                                             Searchability
Location                                  Chester, United Kingdom
Experience1     Lead Consultant - Business Intelligence, Data ...
Experience2     Lead Consultant - Subsurface, Oscar Energy, 20...
Experience3                 Managing Consultant, Oscar, 2008-2012
Experience4     Recruitment Consultant, Senitor Associates, 20...
Experience5                 Managing Consultant, Oscar, 2008-2012
Experience6                                                   NaN
Experience7                                                   NaN
Experience8                                                   NaN
Experience9                                                   NaN
Experience10                                                  NaN
Experience