In [1]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load a previously trained Word2Vec model from disk; later cells read its
# keyed vectors through `google_model.wv`.
# NOTE(review): the variable name suggests Google's pretrained vectors, but the
# file loaded here is a local model file — confirm which model this is.
google_model = Word2Vec.load("../data/model_word2vec.bin")

In [3]:
# Sanity check of the embedding space: list the nearest neighbours of "microsoft".
google_model.wv.most_similar(positive=["microsoft"])

[('autocad', 0.7432770729064941),
 ('photoshop', 0.7404437065124512),
 ('vmware', 0.7233696579933167),
 ('turbotax', 0.7154563069343567),
 ('unix', 0.6999496817588806),
 ('macintosh', 0.6942809820175171),
 ('matlab', 0.6902979016304016),
 ('oem', 0.6757484674453735),
 ('ibm', 0.6701933145523071),
 ('lightroom', 0.6684433817863464)]

In [4]:
# Path of the pickled resume DataFrame used for the recommendations below.
file_dataframe = "../data/recommender_resumes_train.pkl"

# SECURITY: unpickling can execute arbitrary code; only load files from a trusted source.
df = pd.read_pickle(file_dataframe)
# NOTE(review): head(300) renders a very large preview; a plain df.head() is usually enough.
df.head(300)

Unnamed: 0,_id,text,class,raw_text
0,d59170ea-e3cf-41a3-982b-cf163efa2b4b,healthcar consult execut profil seminar medium...,estandar\HEALTHCARE,HEALTHCARE CONSULTANT\nExecutive Profile\nsemi...
1,9a422a34-8db0-4734-ba35-1a4a0c81cfb3,labor career focu look entri level challeng no...,estandar\AVIATION,LABORER\nCareer Focus\nLooking for an entry le...
2,892c1107-e5cc-4e18-bc90-d65408b4d166,holder summari highli organ effici multitask ...,estandar\APPAREL,KEY HOLDER\nSummary\nHighly organized efficien...
3,b6536847-0bdc-42f2-bd3b-886dd14a7d73,design summari seek posit respect compani all...,estandar\DESIGNER,CAD DESIGNER\nSummary\n\nSeeking a position wi...
4,85f31ffa-d926-4d73-ab37-a5df76b344e9,consult summari posit accountingregulatori ban...,estandar\CONSULTANT,CONSULTANT\nSummary\nPosition in Accounting/Re...
...,...,...,...,...
295,ed4a8b72-b3ca-4930-a26f-7d682c2b1cda,sale associ experi current sale associ compani...,estandar\SALES,SALES ASSOCIATE\nExperience\n04/2016 to Curren...
296,27e57901-1b3f-481c-874e-58bfff82151c,coownerexecut chef career focu perform driven ...,estandar\CHEF,CO-OWNER/EXECUTIVE CHEF\nCareer Focus\nPerform...
297,aa9fd349-632f-4db0-902f-503ceb61d73a,account summari profession account year fullra...,estandar\ACCOUNTANT,ACCOUNTANT II\nSummary\nA Professional Account...
298,a5e33855-779d-4cfa-b207-99c7cda7277b,director financ execut profil ambiti financ di...,estandar\FINANCE,DIRECTOR OF FINANCE\nExecutive Profile\nAmbiti...


In [5]:
# Generate the average word2vec vector for each resume text

def vectors(x, model=None):
    """Build one averaged word2vec vector per row of ``x['text']``.

    Each text is split on whitespace; the vectors of the tokens present in the
    model vocabulary are averaged. A row without any in-vocabulary token gets a
    zero vector, so the result always has exactly one entry per row and list
    positions stay aligned with row positions.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame with a ``text`` column of whitespace-separated tokens. For
        backward compatibility, any non-DataFrame argument falls back to the
        module-level ``df``.
    model : optional
        Object exposing ``.wv`` with ``key_to_index``, ``vector_size`` and
        item access by word. Defaults to the module-level ``google_model``.

    Returns
    -------
    list
        The averaged vectors, one per row. The same list is also stored in the
        module-level ``word_embeddings`` for callers that rely on the global.
    """
    global word_embeddings

    # Earlier versions ignored the argument and always read the global `df`.
    frame = x if isinstance(x, pd.DataFrame) else df
    keyed_vectors = (google_model if model is None else model).wv
    vocabulary = keyed_vectors.key_to_index

    embeddings = []
    for line in frame['text']:
        known = [keyed_vectors[word] for word in line.split() if word in vocabulary]
        if known:
            embeddings.append(np.mean(known, axis=0))
        else:
            # Keep a placeholder so later positional lookups are not shifted.
            embeddings.append(np.zeros(keyed_vectors.vector_size, dtype=np.float32))

    word_embeddings = embeddings
    return word_embeddings

In [6]:
# Recommend the resumes most similar to a given resume

def recommendations(title, top_n=5):
    """Return the ``top_n`` rows of ``df`` most similar to the row whose ``_id`` is ``title``.

    Similarity is the cosine similarity between the averaged word2vec vectors
    produced by ``vectors(df)``. The query row itself is always excluded.

    Parameters
    ----------
    title : str
        Value of the ``_id`` column identifying the query row.
    top_n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Columns: ``rank`` (1-based), ``common_texts`` (the ``raw_text`` of the
        recommended row), ``title`` (its ``_id``), ``cosine_similarity`` and
        ``index_df`` (its index label in ``df``).

    Raises
    ------
    KeyError
        If no row of ``df`` has the requested ``_id``.
    ValueError
        If the number of embeddings differs from the number of rows in ``df``,
        because positional lookups would then point at the wrong rows.
    """
    # (Re)build the document vectors; vectors() publishes them through the
    # module-level `word_embeddings` list.
    vectors(df)

    if len(word_embeddings) != len(df):
        raise ValueError(
            "word_embeddings has %d entries but df has %d rows; "
            "embeddings are not aligned with the DataFrame"
            % (len(word_embeddings), len(df))
        )

    # Locate the query by position (not by index label) so that the lookup
    # into the embeddings and the `iloc` below agree for any DataFrame index.
    ids = df['_id'].astype(str).to_numpy()
    matches = np.flatnonzero(ids == str(title))
    if matches.size == 0:
        raise KeyError(title)
    position = int(matches[0])

    # Only the query row of the similarity matrix is needed.
    matrix = np.asarray(word_embeddings)
    scores = cosine_similarity(matrix[position:position + 1], matrix)[0]

    # Highest score first; a stable sort keeps row order among ties. Drop the
    # query explicitly instead of assuming it is ranked first.
    order = np.argsort(-scores, kind="stable")
    order = order[order != position][:max(top_n, 0)]

    recommend = df.iloc[order]
    output = [
        [rank, raw_text, doc_id, score, label]
        for rank, (raw_text, doc_id, score, label) in enumerate(
            zip(recommend['raw_text'], recommend['_id'], scores[order], recommend.index),
            start=1,
        )
    ]

    result = pd.DataFrame(output, columns=["rank", "common_texts", "title", "cosine_similarity", "index_df"])

    return result

In [10]:
# Query with the _id of the first row shown in the df.head() preview above.
result = recommendations("d59170ea-e3cf-41a3-982b-cf163efa2b4b")
result

Unnamed: 0,rank,common_texts,title,cosine_similarity,index_df
0,1,HEALTHCARE\nExecutive Profile\nCynthia was als...,8bb2f260-b9df-44ea-a519-e1d692c7cca9,0.958835,251
1,2,PREVENTATIVE HEALTHCARE INTERN\nProfessional S...,ad427244-d92c-4b4b-a831-9c55ccc3859b,0.955878,1978
2,3,PROGRAM MANAGER\nHighlights\n\nMicrosoft Outlo...,da518f69-b678-4543-bc42-a56fd4829f7b,0.953056,27
3,4,HEALTH COACH\n\nwww.linkedin.com/in/caitlinmas...,d88120cf-632c-41f1-9f0f-38815ac095a8,0.947463,2446
4,5,DISABILITY ADVOCATE\nProfessional Summary\nDed...,a8a997ef-ec8d-4245-a1e4-6935ebdbc187,0.945228,241


In [11]:
# Show the full text of the top-ranked recommendation. `index_df` holds an
# index label of `df`, so look it up with `.loc` (not positional `.iloc`) and
# read it from `result` instead of hard-coding the number.
print(df["raw_text"].loc[result.loc[0, "index_df"]])

HEALTHCARE
Executive Profile
Cynthia was also an instructor for the Cap Gemini Ernst & Young Global Regulatory Compliance Boot Camp where members of the CGE&Y
Regulatory Compliance Team are immersed in global regulatory compliance, Good Manufacturing Practices, Good Clinical Practices, Good
Laboratory Practices, CGE&Y validation tools and methodology. Working with the international workforce of CGE&Y, Cynthia was required to
be flexible, sensitive and creative in her methods of teaching Cynthia's nursing career focus was in Pain Management, Emergency Nursing, Legal
Nurse Consulting and Life Care Planning. She has traveled extensively throughout the U. S. and worldwide as a Health Care and Life Sciences
Consultant for G Major Consulting, Cap Gemini Ernst & Young, Accenture, and Ness Technologies. While working in clinical nursing, Cynthia
worked in acute, home care and long term care arenas. As a Case Manager and Life Care Planner Cynthia was an adjunct instructor for the
University of 