In [101]:
# Imports

import gensim
from gensim import corpora
from gensim import models, similarities
from gensim.similarities import Similarity
from gensim.models import LsiModel
import os
import collections
import smart_open
import random

import pandas as pd
import numpy as np


from pprint import pprint
import string
import re

In [3]:
!ls

Similarity.ipynb                  processed_products_data.csv
Testing Similarity.ipynb          products_corpus.dict
bow_classifier.py                 products_corpus.mm
enwiki-latest-pages-articles1.xml products_corpus.mm.index
make_text_corpus.py               similarities
model                             similarities.index.npy
model.projection                  [1m[36mw2v_model[m[m
model.projection.u.npy            [1m[36mwiki-archive[m[m
[1m[36moutput_data[m[m                       word2vec.ipynb
pretrained_glove.py


In [205]:
def remove_punctuation(row):
#     row['document'] = row['document'].translate(translator)
    row['name'] = row['name'].decode("utf-8").translate(translator)
    row['departments_name'] = row['departments_name'].decode("utf-8").translate(translator)
    row['categories_name'] = row['categories_name'].decode("utf-8").translate(translator)
    description = row['description_en']
    row['description_en'] = row['description_en'].decode("utf-8").translate(translator)
    return row

In [207]:
def remove_unicode(series):
    return [s.encode('ascii', 'ignore').strip()
               for s in series.str.decode('unicode_escape')]

In [179]:
!ls

Similarity.ipynb                  processed_products_data.csv
Testing Similarity.ipynb          products_corpus.dict
bow_classifier.py                 products_corpus.mm
enwiki-latest-pages-articles1.xml products_corpus.mm.index
make_text_corpus.py               similarities
model                             similarities.index.npy
model.projection                  [1m[36mw2v_model[m[m
model.projection.u.npy            [1m[36mwiki-archive[m[m
[1m[36moutput_data[m[m                       word2vec.ipynb
pretrained_glove.py


In [180]:
# Paths

DATA_FOLDER = 'output_data'
DICTIONARY_PATH = 'products_corpus.dict'
MODEL_FILENAME = 'model'
SIMILARITIES_FILENAME = 'similarities'

In [181]:
def load_dictionary():
    """
    loads dictionary from the disk
    """
    dictionary = corpora.Dictionary.load(DICTIONARY_PATH)
    
    return dictionary

In [182]:
def load_model():
    """
    read model from the disk
    """
    
    model = LsiModel.load(MODEL_FILENAME)
    return model

In [183]:
def load_similarities():
    """
    read similarity indexes from list
    """
    
    index = Similarity.load(SIMILARITIES_FILENAME)
    return index

### Load from disk

In [184]:
model = load_model()
dictionary = load_dictionary()
index = load_similarities()

In [185]:
!ls

Similarity.ipynb                  processed_products_data.csv
Testing Similarity.ipynb          products_corpus.dict
bow_classifier.py                 products_corpus.mm
enwiki-latest-pages-articles1.xml products_corpus.mm.index
make_text_corpus.py               similarities
model                             similarities.index.npy
model.projection                  [1m[36mw2v_model[m[m
model.projection.u.npy            [1m[36mwiki-archive[m[m
[1m[36moutput_data[m[m                       word2vec.ipynb
pretrained_glove.py


### Test Sample doc

In [186]:

# print(vec_lsi)

In [209]:
# Reading product data from disk
def get_train_data():
    """
    SQL here 
    """
    data = pd.read_csv('~/Desktop/grocery_products.csv')
    return data

data = get_train_data()
data = data.head(1000)
data = data.replace(np.nan, '', regex=True)

data['name'] = remove_unicode(data['name'])
data['departments_name'] = remove_unicode(data['departments_name'])
data['categories_name'] = remove_unicode(data['categories_name'])
data['description_en'] = remove_unicode(data['description_en'])

data = data.apply(remove_punctuation, axis=1)
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 18 columns):
name                 1000 non-null object
departments_name     1000 non-null object
categories_name      1000 non-null object
description_en       1000 non-null object
catalog_name         1000 non-null object
price                1000 non-null float64
bulkiness            1000 non-null int64
is_fragile           1000 non-null bool
is_perishable        1000 non-null bool
weight               1000 non-null float64
country_of_origin    1000 non-null object
tradename_en         1000 non-null object
url                  1000 non-null object
image_url            1000 non-null object
source_product_id    1000 non-null int64
source_brand_id      1000 non-null int64
source_catalog_id    1000 non-null int64
source_country_id    1000 non-null int64
dtypes: bool(2), float64(2), int64(5), object(9)
memory usage: 127.0+ KB
None


In [210]:
selected_data = data[['source_product_id', 'name', 'departments_name', 'categories_name', 'description_en', 'image_url']]
selected_data['document'] = selected_data['name'] + ' ' + selected_data['departments_name'] + ' ' + selected_data['categories_name'] + ' ' + selected_data['description_en']
# save dataset to file
selected_data.to_csv('processed_products_data.csv')
docs = selected_data['document'].tolist()
product_ids = selected_data['source_product_id'].tolist()

# Mapping from document index 
doc_index_to_id = {}
for index, product_id in enumerate(product_ids):
    doc_index_to_id[index] = product_id

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


## Create Data for UI

In [211]:
data_for_ui = selected_data[['source_product_id', 'name']]

In [212]:
data_for_ui = data_for_ui.sample(frac=1)
data_for_ui.rename(columns={'source_product_id':'id'}, 
                 inplace=True)
dfui = data_for_ui.head(500)

In [213]:
dfui.to_json(orient='records')

'[{"id":5627136,"name":"Ebi  Gratin Shrimp  Cheese"},{"id":5029878,"name":"Yam Bun"},{"id":4774236,"name":"Tempura Prawn 5 Pcs"},{"id":4978902,"name":"Sarsaparilla Traditional Soda"},{"id":2877319,"name":"Dairy Cheerfully Chiko Ice Cream"},{"id":5162281,"name":"Artisan Nuts  Nutty Pop"},{"id":5015026,"name":"Summer Azumi Wasabi"},{"id":5439920,"name":"Korean Burdock Crispy Chips"},{"id":935392,"name":"Pro 1 Probiotics"},{"id":2769524,"name":"Chicken Curry with Potatoes"},{"id":2768948,"name":"Women AntiPerspirant Deodorant Spray  Advanced Whitening"},{"id":3030988,"name":"Durum Wheat Spaghetti"},{"id":4981154,"name":"Glass Wipes 177cm x 279cm"},{"id":3561264,"name":"Coco Water"},{"id":5439188,"name":"Instant Noodle Shio Ramen"},{"id":5607763,"name":"DUPLICATED Soda Water"},{"id":5046852,"name":"Coconut Oil Retail Pack"},{"id":5626850,"name":"Concentrated Laundry Detergent Liquid Superior Clothes Care"},{"id":2893284,"name":"Chocolate Swirl Pound Cake"},{"id":4980934,"name":"Tuna Sandwi

# Playground

In [81]:
data = pd.read_csv('processed_products_data.csv')
document = data[data['source_product_id'] == int('4644188')].iloc[0]['document']
document

'Spaghetti Aglio Olio with Mushroom Department - (Just Acia Menu 2 (just-acia)) Spaghetti Set '

In [45]:
! open . 

# Get similarity

In [45]:
def get_similarity(doc):
    sample_doc = doc

    # Pre - process document
    vec_bow = dictionary.doc2bow(sample_doc.lower().split())
    vec_lsi = model[vec_bow]
    
    similarity_scores = index[vec_lsi]
    similarity_scores = list(enumerate(similarity_scores))
    
    
    

    # Sort in descending order of similarity
    sorted_similarity_scores = sorted(similarity_scores, key=lambda tup: tup[1], reverse=True)
    similar_products = []
    for doc_index, score in sorted_similarity_scores[:10]:
        product_id = doc_index_to_id[doc_index]
        product_info = selected_data[selected_data['source_product_id'] == product_id].iloc[0]
    #     product_info += " : " + str(selected_data[selected_data['source_product_id'] == product_id]['description_en'])
#         print(f'{product_info} : {score}')
    #     print(str(selected_data[selected_data['source_product_id'] == product_id]['description_en']))
        similar_products.append(product_info.to_json())
    return similar_products

In [46]:
similar_products = {}
for id in dfui['id'][0:1]:
#     print(id)
    document = selected_data[selected_data['source_product_id'] == id]['document'].iloc[0]
    print("=========")
    print(document)
    print(get_similarity(document))

Xin's Gold Leaf Salmon Yusheng Chinese New Year Ready to Eat 
['{"source_product_id":1156968,"name":"Xin\'s Gold Leaf Salmon Yusheng","departments_name":"Chinese New Year","categories_name":"Ready to Eat","description_en":"","image_url":"https:\\/\\/assets.honestbee.com\\/products\\/images\\/480\\/holiday-inn-atrium-cny-2017_hiacny201701_hiacny201701-1.jpg","document":"Xin\'s Gold Leaf Salmon Yusheng Chinese New Year Ready to Eat "}', '{"source_product_id":1229304,"name":"Xin\'s Gold Leaf Salmon Yusheng","departments_name":"Chinese New Year","categories_name":"Ready to Eat","description_en":"","image_url":"https:\\/\\/assets.honestbee.com\\/products\\/images\\/480\\/holiday-inn-atrium-cny-2017_hiacny201701small_hiacny201701small-1.jpg","document":"Xin\'s Gold Leaf Salmon Yusheng Chinese New Year Ready to Eat "}', '{"source_product_id":3199222,"name":"Hamachi and Salmon Yu Sheng \\u6cb9\\u7518\\u9c7c\\u4e09\\u6587\\u9c7c\\u751f\\u635e\\u8d77 (Large)","departments_name":"Chinese New Year

In [147]:
import string
#make translator object
translator=str.maketrans('','',string.punctuation)
string = "D24 Durian Nian Gao \\u9996\\u521bD24 \\u69b4\\u83b2\\u5e74\\u7cd5 Chinese New Year Ready to Eat "

In [145]:
# string = string.encode(encoding='UTF-8',errors='strict')
string = string.encode('ascii',errors='ignore')
# string=string.translate(translator)
string

b'D24 Durian Nian Gao \\u9996\\u521bD24 \\u69b4\\u83b2\\u5e74\\u7cd5 Chinese New Year Ready to Eat '

In [194]:
s = "\\"Ko Song\\" Plain Pasta"

SyntaxError: invalid syntax (<ipython-input-194-5cb48b15ffaa>, line 1)