In [1]:
from functools import reduce
from gensim.corpora.wikicorpus import tokenize
from gensim.corpora import Dictionary, HashDictionary, MmCorpus, WikiCorpus, TextCorpus
from gensim import utils, models, similarities
from gensim.models import TfidfModel

from IPython.display import HTML, display
import tabulate, bs4, unicodedata
import psycopg2

def list_versions(*modules):
    """Display an HTML table of ``(module name, version)`` rows.

    The table covers every module object bound to a name in this notebook's
    global namespace (reduced to its top-level package name) plus any extra
    module names given in ``modules``.

    Parameters
    ----------
    *modules : str
        Additional top-level module names to report, e.g. ``'IPython'``.

    Notes
    -----
    Names that are not currently imported, and modules without a
    ``__version__`` attribute, are left out of the table instead of raising.
    Rows are sorted by module name so the output is stable between runs.
    """
    import sys
    import types

    # Snapshot the globals first so the namespace cannot change while we scan it.
    names = {
        value.__name__.split('.')[0]
        for value in list(globals().values())
        if isinstance(value, types.ModuleType)
    }
    names.update(modules)

    rows = []
    for name in sorted(names):
        module = sys.modules.get(name)  # None when the name was never imported
        if module is not None and hasattr(module, '__version__'):
            rows.append((name, module.__version__))
    display(HTML(tabulate.tabulate(rows, tablefmt='html')))
# Show library versions. IPython is passed by name because only names from
# IPython.display are imported above, not the module object itself.
list_versions('IPython')

# Common filename prefix for every artifact this notebook writes or reads.
outp = 'seek'
# No in-memory dictionary yet; later cells fall back to loading it from disk
# via `dictionary or Dictionary.load_from_text(...)`.
dictionary = None
    

0,1
psycopg2,2.6.2 (dt dec pq3 ext lo64)
gensim,3.4.0
IPython,6.2.1
bs4,4.5.3
tabulate,0.8.2


In [2]:
# Connection settings for the PostgreSQL database queried by this notebook.
pgconfig = dict(
    host='postgresql',
    port=5432,
    database='dev',
    user='dev',
    password=None,  # no password configured
)

In [3]:
%load_ext sql
dsl = 'postgres://{user}:{password}@{host}:{port}/{database}'.format(**pgconfig)
%sql $dsl

'Connected: dev@dev'

In [4]:
class JobAdCorpus(TextCorpus):
    """Gensim text corpus over job-ad rows returned by a SQL query.

    Every row must unpack as ``(id, title, content)``, where ``content`` is
    the HTML body of the ad.  The ``dictionary`` attribute is not set here;
    callers assign it on the instance after construction.
    """

    def __init__(self, query, metadata=None):
        """Store the row source; ``TextCorpus.__init__`` is not invoked.

        Parameters
        ----------
        query : iterable of (id, title, content)
            Rows to tokenise.  Iterated afresh on every ``get_texts()`` call.
        metadata : bool, optional
            When truthy, ``get_texts`` yields ``(tokens, (id, title))`` pairs
            instead of bare token lists.
        """
        self.metadata = metadata
        self.input = None
        self.fname = 'none'
        self.df = query

    def get_texts(self):
        """Yield one tokenised document per row, in row order.

        The HTML is parsed with html5lib, reduced to its text, NFKD-normalised
        and then tokenised.

        Yields
        ------
        tokens, or ``(tokens, (id, title))`` when ``self.metadata`` is truthy.
        """
        for row_id, title, content in self.df:
            # A NULL HTML column arrives as None, which BeautifulSoup rejects
            # with a TypeError.  Treat it as an empty document so that document
            # positions still line up with the rows (and their metadata).
            html = content if content is not None else ''
            plain_text = bs4.BeautifulSoup(html, 'html5lib').text
            tokens = tokenize(unicodedata.normalize("NFKD", plain_text))
            if self.metadata:
                yield (tokens, (row_id, title))
            else:
                yield tokens
                

# Build Dictionary

In [6]:
DEFAULT_DICT_SIZE = 100000
query = %sql select id, title, mobile_ad_template from jobs where sub_classification__id=6287
jobad = JobAdCorpus(metadata=False, query=query)
dictionary = Dictionary(jobad.get_texts())
jobad.dictionary = dictionary
jobad.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
jobad.dictionary.save_as_text(outp + '_wordids.txt.bz2')

21203 rows affected.


# Plain Corpus -> Mm Corpus

In [7]:
query = %sql select id, title, mobile_ad_template from jobs where sub_classification__id=6287 and location_hierarchy__state='Victoria' and listing_date::date>now()::date-30; 
jobad = JobAdCorpus(query=query, metadata=True)
dictionary = dictionary or Dictionary.load_from_text(outp + '_wordids.txt.bz2')
jobad.dictionary = dictionary
MmCorpus.serialize(outp + '_bow.mm', jobad, progress_cnt=10000, metadata=True)

1143 rows affected.


# TF-IDF

In [8]:
# Fall back to the saved vocabulary when no dictionary is held in memory.
if not dictionary:
    dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')

# Fit TF-IDF weights on the serialized bag-of-words corpus, then store the
# re-weighted corpus next to it.
corpus_mm = MmCorpus(outp + '_bow.mm')
tfidf = TfidfModel(corpus_mm, normalize=True, id2word=dictionary)
MmCorpus.serialize(outp + '_tfidf.mm', tfidf[corpus_mm], progress_cnt=10000)

# LSI

In [6]:
# Fall back to the saved vocabulary when no dictionary is held in memory.
if not dictionary:
    dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')

# Train a 200-topic LSI model on the TF-IDF corpus and store the projected corpus.
corpus_tfidf = MmCorpus(outp + '_tfidf.mm')
lsi = models.LsiModel(corpus_tfidf, num_topics=200, id2word=dictionary)
corpus_lsi = lsi[corpus_tfidf]
MmCorpus.serialize(outp + '_lsi.mm', corpus_lsi, progress_cnt=10000)

# Query

In [7]:
# Free-text query: whitespace-separated terms to match against the indexed ads.
keywords = "react reactjs graphql koa nodejs python docker kubernetes k8s"
# Number of best-matching ads to display.
TOP_N = 50

dictionary = dictionary or Dictionary.load_from_text(outp + '_wordids.txt.bz2')

# Project the query into the same LSI space as the corpus.
vec_bow = dictionary.doc2bow(keywords.lower().split())
vec_lsi = lsi[vec_bow]

# Give the index its dimensionality up front; otherwise MatrixSimilarity has to
# scan the whole corpus once just to work out the number of features.
index = similarities.MatrixSimilarity(corpus_lsi, num_features=lsi.num_topics)
sims = index[vec_lsi]
# (document position, similarity) pairs, best match first.
sims = sorted(enumerate(sims), key=lambda item: -item[1])

# Per-document (id, title) pairs saved when the bag-of-words corpus was serialized.
# NOTE(review): unpickling runs arbitrary code from the file -- only load files
# that this notebook itself produced.
metadata = utils.unpickle(outp + '_bow.mm.metadata.cpickle')

# A list (not a one-shot `map` iterator) so `result` can still be inspected
# after it has been displayed.
result = [(score, metadata[docno][1]) for docno, score in sims[:TOP_N]]
display(HTML(tabulate.tabulate(result, headers=['Similarity', 'Title'], tablefmt='html')))

Similarity,Title
0.491399,Senior React / JavaScript Developer
0.450245,FullStack Developer - Node / Golang
0.415908,Senior Developer
0.389994,REACT Developer (3 months initially)
0.374252,ReactJS Developer - Melbourne
0.35333,Experienced React Dev
0.337109,Python Developer
0.335757,Artificial Intelligence & Machine Learning Engineer
0.31741,Senior Full Stack Developer
0.31741,Senior Full Stack Developer
