In [20]:
import numpy as np
import pandas as pd

from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from gensim.utils import simple_preprocess

from gensim.models.phrases import Phraser, Phrases

## data

In [14]:
df = pd.read_csv('/data/notebooks/challenge04/day04/data/job_ofer.csv')

In [16]:
df.shape

(36109, 8)

In [19]:
def prepare_corpus(corpus, bigram):
    for sentence in corpus:
        yield bigram[sentence] + sentence

In [17]:
title_corpus = df['title'].map(simple_preprocess)

## phrasing

In [18]:
title_bigram = Phraser( Phrases(title_corpus, min_count=2, threshold=2))

In [39]:
title_corpus_phrased = prepare_corpus(title_corpus, title_bigram)

## tagging

In [40]:
title_corpus_tagged = [TaggedDocument(words=sentence, tags=[i]) for i, sentence in enumerate(title_corpus_phrased)]

## model

In [65]:
title_model = Doc2Vec(vector_size=300, window_size=1, min_count=2)
title_model.build_vocab(title_corpus_tagged)

In [66]:
%time title_model.train(title_corpus_tagged, total_examples=title_model.corpus_count, epochs=10)

CPU times: user 25.1 s, sys: 19 s, total: 44 s
Wall time: 32.2 s


In [97]:
def list_similars(model, tagged_corpus, tag, n=5):
    print(f'Similars to {tag}, {tagged_corpus[tag]}:\n')
    for sim_tag, prob in model.docvecs.most_similar(tag):
        title = df[ df.index==sim_tag].title
        print(str(title).split('\n')[0])

In [98]:
list_similars(title_model, title_corpus_tagged, 0)

Similars to 0, TaggedDocument(['machine_learning', 'engineer', 'machine', 'learning', 'engineer'], [0]):

446    Senior Machine Learning Engineer (Relocate to ...
14076    Senior Data Pipeline Engineer
19590    Senior Computer Vision R&D Engineer
362    Senior Deep Learning Engineer
12266    Senior Computer Vision R&D Engineer
9442    Machine Learning and Big Data Researcher - Roc...
357    Software Development Engineer - AWS AI Deep Le...
9556    NLP Engineer
10611    AI Programmer
10148    Artificial Intelligence Engineer (1744)


In [96]:
list_similars(title_model, title_corpus_tagged, 9658)

Similars to 9658, TaggedDocument(['data', 'engineer', 'data', 'engineer'], [9658]):

2095    Software Backend Engineer
3022    Aii Sales Manager - Wichita, KS
2589    Industrial Air Sales Manager - Appleton, WI wi...
13356    iOS Software Engineer (1596)
26174    Senior Software Engineer - Microservices
15657    BAT/Rewards Chromium Engineer
341    Machine Learning Engineer
33933    Northern Maine: OBGYN Opportunity
384    Machine Learning Engineer
8440    Route Delivery Driver CDL
