## Utilizando el módulo de Gensim

In [1]:
from gensim import corpora, models, similarities
from collections import defaultdict



In [2]:
documents = [
    'real solido de atun en aceite x 170 gr',  # doc_id 0
    'a-1 filete atun aceite 3 x 80 gr',  # doc_id 1
    'a-1 filete de atun en aceite x 170 gr',  # doc_id 2
    'a-1 grated de atun en aceite x 170 gr',  # doc_id 3
    'a-1 lomito atun aceite x 170 g',  # doc_id 4
    'real sólido de atún en aceite vegetal. lata 170gr',  # doc_id 5
    'a1 ver todos sólido de atún en aceite vegetal lata x170gr', # doc_id 6
    'fanny sólido de atún en aceite vegetal lata 170 g', # doc_id 7
    'fanny solido de atun en aceite vegetal x 170 gr', # doc_id 8
    'real solido de atun en aceite oliva x 170 gr', # doc_id 9
]

In [3]:
stoplist = set(['la', 'en','de'])

texts = [[word.lower() for word in document.split()
          if word.lower() not in stoplist]
         for document in documents]

print texts

[['real', 'solido', 'atun', 'aceite', 'x', '170', 'gr'], ['a-1', 'filete', 'atun', 'aceite', '3', 'x', '80', 'gr'], ['a-1', 'filete', 'atun', 'aceite', 'x', '170', 'gr'], ['a-1', 'grated', 'atun', 'aceite', 'x', '170', 'gr'], ['a-1', 'lomito', 'atun', 'aceite', 'x', '170', 'g'], ['real', 's\xc3\xb3lido', 'at\xc3\xban', 'aceite', 'vegetal.', 'lata', '170gr'], ['a1', 'ver', 'todos', 's\xc3\xb3lido', 'at\xc3\xban', 'aceite', 'vegetal', 'lata', 'x170gr'], ['fanny', 's\xc3\xb3lido', 'at\xc3\xban', 'aceite', 'vegetal', 'lata', '170', 'g'], ['fanny', 'solido', 'atun', 'aceite', 'vegetal', 'x', '170', 'gr'], ['real', 'solido', 'atun', 'aceite', 'oliva', 'x', '170', 'gr']]


In [4]:
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1
print frequency

defaultdict(<type 'int'>, {'gr': 6, '80': 1, '170': 7, 'ver': 1, 'filete': 2, '170gr': 1, '3': 1, 'lata': 3, 'grated': 1, 'atun': 7, 'vegetal.': 1, 'fanny': 2, 'todos': 1, 'x170gr': 1, 'real': 3, 's\xc3\xb3lido': 3, 'a1': 1, 'aceite': 10, 'at\xc3\xban': 3, 'a-1': 4, 'vegetal': 3, 'oliva': 1, 'g': 2, 'solido': 3, 'lomito': 1, 'x': 7})


In [5]:
texts = [[token for token in text if frequency[token] > 1]
         for text in texts]
print texts

[['real', 'solido', 'atun', 'aceite', 'x', '170', 'gr'], ['a-1', 'filete', 'atun', 'aceite', 'x', 'gr'], ['a-1', 'filete', 'atun', 'aceite', 'x', '170', 'gr'], ['a-1', 'atun', 'aceite', 'x', '170', 'gr'], ['a-1', 'atun', 'aceite', 'x', '170', 'g'], ['real', 's\xc3\xb3lido', 'at\xc3\xban', 'aceite', 'lata'], ['s\xc3\xb3lido', 'at\xc3\xban', 'aceite', 'vegetal', 'lata'], ['fanny', 's\xc3\xb3lido', 'at\xc3\xban', 'aceite', 'vegetal', 'lata', '170', 'g'], ['fanny', 'solido', 'atun', 'aceite', 'vegetal', 'x', '170', 'gr'], ['real', 'solido', 'atun', 'aceite', 'x', '170', 'gr']]


In [6]:
#This module implements the concept of Dictionary – a mapping between words and their integer ids
dictionary = corpora.Dictionary(texts)
print type(dictionary)
print dictionary

<class 'gensim.corpora.dictionary.Dictionary'>
Dictionary(15 unique tokens: [u'real', u's\xf3lido', u'gr', u'g', u'filete']...)


In [7]:
# doc2bow counts the number of occurences of each distinct word,
# converts the word to its integer word id and returns the result
# as a sparse vector

corpus = [dictionary.doc2bow(text) for text in texts]
print corpus

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)], [(1, 1), (3, 1), (4, 1), (5, 1), (7, 1), (8, 1)], [(1, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1)], [(1, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)], [(3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (9, 1)], [(0, 1), (3, 1), (10, 1), (11, 1), (12, 1)], [(3, 1), (10, 1), (11, 1), (12, 1), (13, 1)], [(3, 1), (6, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1)], [(1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (13, 1), (14, 1)], [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)]]


In [8]:
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)
print lsi

LsiModel(num_terms=15, num_topics=2, decay=1.0, chunksize=20000)


In [9]:
doc = "real solido de atun en aceite x 170 gr" 
vec_bow = dictionary.doc2bow(doc.lower().split())
print vec_bow

[(3, 1), (5, 1), (8, 1)]


In [10]:
# convert the query to LSI space
vec_lsi = lsi[vec_bow]
index = similarities.MatrixSimilarity(lsi[corpus])

# perform a similarity query against the corpus
sims = index[vec_lsi]
sims = sorted(enumerate(sims), key=lambda item: -item[1])

print sims

[(4, 0.99994826), (0, 0.99736017), (9, 0.99736017), (8, 0.99396342), (3, 0.98814189), (2, 0.98311067), (1, 0.97171909), (7, 0.45667768), (5, 0.38584575), (6, 0.32286653)]
