In [2]:
import pandas as pd
import numpy as np
import sys

sys.path.insert(1, '../')

In [20]:
import os
os.chdir('../')

In [26]:
import pandas as pd
import numpy as np
import pdb
from tqdm import tqdm
import nltk
import string
import pickle 
import yaml
import os
import torch 

from sentence_transformers import SentenceTransformer

from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

import pickle


In [27]:
CONFIG_FILE = 'config.yaml'

def text_preprocessing(text: str):
    
    # Lowercase
    text = text.lower()
    # Remove Punctuation
    # text = "".join([char for char in text if char not in string.punctuation])
    words = word_tokenize(text)  
    # Remove Stopwords
    stop_words = stopwords.words('english')
    filtered_words = [word for word in words if word not in stop_words]
    ## Stemming
    # porter = PorterStemmer()
    # stemmed = [porter.stem(word) for word in filtered_words]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized = [lemmatizer.lemmatize(word) for word in filtered_words]
    
    return ' '.join(lemmatized)

def get_data_from_file(filename:str):

    with open(filename, 'rb') as file:
        data = pickle.load(file)
    return data


def get_top_n_indices(array, top_n):
    return array.argsort()[-top_n:][::-1]

In [112]:
class Recommender():

    def __init__(self, primary_column:str = 'title', top_n:int = 10) -> None:

        # Open the configuration file to load parameters
        with open(CONFIG_FILE, "r") as file:
            try:
                self.params = yaml.safe_load(file)
            except yaml.YAMLError as exc:
                print(exc)
        
        self.top_n = top_n
        self.primary_column = primary_column
        metadata_file = self.params['METADATA_FILE']
        self.metadata = pd.read_csv(metadata_file, index_col=0)
        self.preprocess_metadata()
        print("INFO: Initializing Model")
        self.model = SentenceTransformer('msmarco-distilbert-base-dot-prod-v3')
        print("INFO: Creating embeddings")
        self.embeddings = self.get_embeddings(column=primary_column)
        # self.product_feature_positiveness = get_data_from_file(self.params['product_feature_ratings'])


    def preprocess_metadata(self) -> None:

        self.metadata['description'] = self.metadata['description'].apply(lambda x: text_preprocessing(eval(x)[0]))

    def get_embeddings(self, column) -> np.array:

        # If embedding is locally saved already, load it 
        try:
            with open(self.params['EMBEDDING_FILE'], 'rb') as file:
                embeddings = pickle.load(file)
            print("INFO : Loaded Product Embeddings")
            return embeddings
        
        # If embedding is not locally available, create embeddings
        except:
            print("INFO : Creating Embeddings")
            if torch.cuda.is_available():
                embeddings = self.model.encode(self.metadata[column].tolist(), device='cuda') 
            else:
                embeddings = self.model.encode(self.metadata[column].tolist())
            embeddings = np.asarray(embeddings.astype('float32'))   
            
            print("INFO: Saving embeddings")
            if not os.path.exists(os.path.dirname(self.params['EMBEDDING_FILE'])):
                os.mkdir(os.path.dirname(self.params['EMBEDDING_FILE']))
            with open(self.params['EMBEDDING_FILE'],'wb') as file:
                pickle.dump(embeddings, file, protocol=pickle.HIGHEST_PROTOCOL)

            return embeddings

    def return_most_similar(self, query):

        print("INFO: Retrieving items for query")
        query_vector = self.model.encode([query])
        similarity = np.dot(self.embeddings,query_vector.T)
        top_items = similarity.flatten().argsort()[-self.top_n:][::-1]
        print(self.metadata['title'].iloc[top_items])
        return list(top_items), list(self.metadata['title'].iloc[top_items])


    def character_similarity(self, character_list:list, subset_indices:list, method:int = 1):

        # Method 1 - Join all characteristics to make an expanded query
        if method == 1 :
            character_query = ' '.join(character_list)
            character_query_vector = self.model.encode(character_query)
            similarity = np.dot(self.embeddings[subset_indices],character_query_vector.T)
            # top_items =  get_top_n_indices(similarity.flatten(), self.top_n)       
            return similarity / np.linalg.norm(similarity)

        # Method 2 - Rank items based on individual characteristics
        elif method == 2:
            similarity_list =[]
            for characteristic in character_list:
                character_vector = self.model.encode(characteristic)
                similarity = np.dot(self.embeddings[subset_indices],character_vector.T)
                # top_items_list.append(get_top_n_indices(similarity.flatten(), top_n=200)) 
                similarity_list.append(similarity.flatten() / np.linalg.norm(similarity.flatten())) 
            agg_similarity = np.array(similarity_list).mean(axis=0)
            return agg_similarity
        
    def feature_similarity(self, feature_imp):

        agg_feature_poitiveness = np.zeros(self.product_feature_positiveness.shape()[0])
        for ind, imp in enumerate(feature_imp):
            agg_feature_poitiveness += imp*self.product_feature_positiveness[ind]
        return agg_feature_poitiveness


    def return_most_similar_v1(self, query:str, character_list:list, character_method:int, feature_imp:list = None):

        print("INFO: Retrieving items for query")
        query_vector = self.model.encode([query + ' '.join(character_list)])
        query_similarity = np.dot(self.embeddings,query_vector.T).flatten()
        query_similarity  /= np.linalg.norm(query_similarity)
        final_similarity = query_similarity
        indices = get_top_n_indices(final_similarity, 10)
        # character_similarity = self.character_similarity(character_list, method=1)
        # feature_similarity = self.feature_similarity(feature_imp)

        return indices
                 

    def get_top_items_for_features(top_n):
        
        self
        final_product_embeddings = self.product_feature_ratings.mean(axis=1)
        top_item_ind = get_top_n_indices(final_product_embeddings, top_n=5)
        return top_item_ind

In [113]:
recommender = Recommender(primary_column='description')


INFO: Initializing Model
INFO: Creating embeddings
INFO : Loaded Product Embeddings


In [104]:
query = 'Women leather shoes'

# Query similarity
query_similarity = np.dot(recommender.embeddings,recommender.model.encode([query]).T).flatten()
query_similarity  /= np.linalg.norm(query_similarity)

query_top_20 = get_top_n_indices(query_similarity, 20)
# indices = get_top_n_indices(final_similarity, 20)

character_list = ['Women', 'Leather']
character_similarity = recommender.character_similarity(character_list=character_list, subset_indices=query_top_20, method=1)

final_similarity = character_similarity
sub_indices = get_top_n_indices(final_similarity, 10)
main_indices = query_top_20[sub_indices]
recommender.metadata['title'].iloc[main_indices].to_list()

["Fangsto Women's Cowhide Leather Loafers Flats Sandals Slip-On",
 "Skechers USA Men's Caswell Oxford",
 "Roper Women's Lace and Underlay Western Boot",
 "Jambu Women's Pecan Mary Jane Flat",
 "KEEN Women's Terradora Mid Wp-w Hiking Boot",
 "JARO VEGA Women's Soft Goatskin Genuine Leather Pumps Slender Block Heel Closed Almond Toe Dress Shoes",
 "FRYE Women's Patty Artisan Zip Bootie",
 "ECCO Men's Soft 7 Fashion Sneaker",
 "Easy Spirit Women's Realflex Walking Shoe",
 "ASICS Women's GT-2000 3 Running Shoe"]

In [115]:
query = "Women's shoes"
character_list = ['Shiny', 'Sexy', 'Black']
recommender.metadata['title'].iloc[recommender.return_most_similar_v1(query=query, character_list=character_list, character_method=1)].to_list()

INFO: Retrieving items for query


['SoftSpots Perri Women&rsquo;s Slip-On',
 "Blue Q Men's Crew Socks - Fits Men's Shoe Size 7-12",
 "Lacoste Men's Malahini Deck 316 1 Spm Fashion Sneaker",
 'In Touch Bamboo Above The Knee Skirt',
 "Clarks Women's Daelyn Summit Slip-On Loafer",
 "Luichiny Women's Case Closed Snow Boot",
 "Steve Madden Women's Pierce Ankle Bootie",
 "Muck Boot Women's Breezy Low Boot",
 "Skechers Sport Women's Scene Stealer Fashion Sneaker",
 'Soft Jersey Blend Sleep Hat Comfortable Soft Hat Liner Beanie Skull Cap Chemo Hair Loss Head Covering']

In [97]:
main_indices

array([4266, 1967, 1929, 4669, 4094, 2543, 2580,  896, 1814, 4878])

In [116]:
l = [1,2,3,4,5,6,7]
l[:100]

[1, 2, 3, 4, 5, 6, 7]

In [80]:
recommender.metadata['title'].iloc[get_top_n_indices(character_similarity, 10)].to_list()

['Crocs A Leigh 2-Strap Miniwedge',
 "Womens Soft Leather Cigarette Case Holds Regular and 100's USA Made",
 "Cole Haan Men's Calhoun Lace-Up Derby Shoe",
 "Geox Men's Federico 9 Shoe",
 "Skechers USA Men's Caswell Oxford",
 'Woly German Suede Nubuck Brush 5&quot; Removes Dirt &amp; Stains on Designer Shoes, Boots, Handbags, Clothes.',
 "Birkenstock Women's Madrid Birko-Flor Sandal",
 "Lucky Women's Galvann",
 'Collonil Nubuck + Velours/Suede Waterproof Protector Repellent Spray, 200 ml',
 "Fangsto Women's Cowhide Leather Loafers Flats Sandals Slip-On"]

In [81]:
recommender.metadata['title'].iloc[get_top_n_indices(query_similarity, 10)].to_list()

['SoftSpots Perri Women&rsquo;s Slip-On',
 "Lacoste Men's Malahini Deck 316 1 Spm Fashion Sneaker",
 "Rocket Dog Women's Rainy Rubber Rain Boot",
 "Clarks Women's Blanche Nora Ballet Flat",
 "Bloch Dance Women's Jazzsoft Split Sole Leather Jazz Shoe",
 'Teva Scamper Water Shoe (Toddler/Little Kid/Big Kid)',
 "Ollio Women's Shoe Cross Braided Multi Color Flat Sandal",
 'Adidas Pretereo 2 Wrestling Shoes - Collegiate Royal/White/Black',
 "Dr. Martens Men's Octavius Lace Shoe",
 "New Balance Women's Minimus Sport Spikeless Golf Shoe"]

In [35]:
import pandas as pd
import numpy as np

metadata = pd.read_csv('../Project_Data/Metadata.csv', index_col=0)
metadata['product_detail'] = metadata['title'] + ' ' + metadata['category'].apply(lambda x: ' '.join(eval(x)[1:])) + ' ' + metadata['description'] + ' ' + metadata['feature'].apply(lambda x: ' '.join(eval(x)[:4]))
metadata.to_csv('../Project_Data/Metadata_W_Product_detail.csv')

In [4]:
metadata.sample(5)

Unnamed: 0.1,Unnamed: 0,category,tech1,description,fit,title,also_buy,image,tech2,brand,feature,rank,also_view,details,main_cat,similar_item,date,price,asin
4895,47670,"['Clothing, Shoes & Jewelry', 'Women', 'Shoes'...",,['The iconic chuck taylor all star high top sn...,"class=""a-normal a-align-center a-spacing-smal...",Converse Women's Chuck Taylor All Star Leather...,"['B074V4DTPL', 'B0774PWGRQ', 'B07BS1P8T5', 'B0...",['https://images-na.ssl-images-amazon.com/imag...,,,"['100% Leather', 'Imported', 'Rubber sole', 'S...","31,038 in Clothing, Shoes & Jewelry (","['B078J5W5Z2', 'B074V4DTPL', 'B0741XXSRZ', 'B0...",{},"<img src=""https://images-na.ssl-images-amazon....",,"<div class=""a-fixed-left-grid a-spacing-none"">...",$44.49 - $135.47,B007PBF3B8
458,43530,"['Clothing, Shoes & Jewelry', 'Women', 'Handba...",,"['No matter the weather, this durable tote is ...",,Deluxe Canvas Tote Bag,"['B07BQXC688', 'B07DM4K3CY', 'B0027A7K2C', 'B0...",['https://images-na.ssl-images-amazon.com/imag...,,Ensign Peak,"['22 inch handles', 'Interoir zippered pocket'...","380,644 in Clothing, Shoes & Jewelry (","['B07BQXC688', 'B07D1VJS1B', 'B07DM4K3CY', 'B0...",{},"<img src=""https://images-na.ssl-images-amazon....",,"<div class=""a-fixed-left-grid a-spacing-none"">...",$15.99,B001SM0OZK
553,96247,"['Clothing, Shoes & Jewelry', 'Women', 'Clothi...",,['Lace-trimmed underwire bra is the perfect co...,"class=""a-normal a-align-center a-spacing-smal...",Hanes Women`s Everyday Classic Underwire Bra,"['B00UN8D3BC', 'B00UN8EQG8', 'B00GMJ2F9C', 'B0...",['https://images-na.ssl-images-amazon.com/imag...,,,"['nylon, polyester, spandex', 'Hook and Eye cl...","137,212 in Clothing, Shoes & Jewelry (","['B00UN8D3BC', 'B00GMJ2F9C', 'B00UN8EQG8', 'B0...",{},"<img src=""https://images-na.ssl-images-amazon....",,"<div class=""a-fixed-left-grid a-spacing-none"">...",$13.99 - $19.88,B002WJLBFU
5304,14301,"['Clothing, Shoes & Jewelry', 'Baby', 'Baby Gi...",,['Dress her for dreamtime like the princess yo...,,Baby Aspen Baby-girls Newborn Big Dreamzzz Pri...,"['B005VIFUEC', 'B0727X6119', 'B00X22RIOY', 'B0...",['https://images-na.ssl-images-amazon.com/imag...,,Baby Aspen,[],[],"['B0727X6119', 'B00PGMP93K', 'B005VIFUEC', 'B0...",{},Baby,,,$22.09,B00C0K0MHO
2880,2166,"['Clothing, Shoes & Jewelry', 'Men', 'Clothing...",,['Van Heusen regular fit flex collar long slee...,"class=""a-normal a-align-center a-spacing-smal...",Van Heusen Men's Flex Regular Fit Solid Spread...,"['B07976STYP', 'B00C7P5HKE', 'B008R50SZM', 'B0...",['https://images-na.ssl-images-amazon.com/imag...,,,"['55% Cotton, 45% Polyester', 'Imported', 'But...","3,700 in Clothing, Shoes & Jewelry (","['B00C7P5HKE', 'B009F8R15A', 'B07976STYP', 'B0...",{},"<img src=""https://images-na.ssl-images-amazon....",,"<div class=""a-fixed-left-grid a-spacing-none"">...",$20.11 - $126.35,B014128YZU


In [133]:
metadata.detailed_text = metadata['category'].apply(lambda x: ' '.join(eval(x)[1:]))

In [144]:
# import re
# CLEANR = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')

# def cleanhtml(raw_html):
#   cleantext = re.sub(CLEANR, '', raw_html)
#   return cleantext

metadata.cleaned_features = metadata.feature.apply(lambda x: eval(x)[:4])

  metadata.cleaned_features = metadata.feature.apply(lambda x: eval(x)[:4])


In [140]:
metadata.feature.iloc[[34,4576,282,4519,259,957]].to_list()

["['Move worry-free with this wireless sports bra from Enell. Made from high quality Nylon and LYCRA, this sports bra prevents skin chafing while providing optimum support. The front hook closure helps you wear or take off the bra easily. Contructed with Naturexx Performance Fabric that wicks away moisture to keep you cool and odor-free during the most intense workouts']",
 "['Soft and smooth suede upper. Pull-on design. Smooth faux fur lining and insole to keep you warm in cold temperatures. Cushioned synthetic smooth fur footbed. Mid-calf, approximately 11.25-inch shaft height. Approximately 15-inch circumference. Approximately 0.75-inch man-made outsole with treads. Flat heel.']",
 '["The Hanes men\'s comfort blend EcoSmart sweatpants are made with different front and back measurements, so they actually fit. They\'re also made with up to 5 percent recycled polyester from plastic bottles, so you can look good and feel good."]',
 '["Men\'s Nike Zoom Train Incredibly Fast Training Shoe

In [7]:
metadata.title.iloc[1:5].to_list()

["Enell Women's Wire-Free Sports Bra",
 'Converse Chuck Taylor All Star Core Ox',
 'Aeromax Jr. Fire Fighter Bunker Gear, Black, Size 4/6',
 "Stacy Adams Men's Garrison Wingtip Oxford"]

In [12]:
metadata.columns

Index(['Unnamed: 0', 'category', 'tech1', 'description', 'fit', 'title',
       'also_buy', 'image', 'tech2', 'brand', 'feature', 'rank', 'also_view',
       'details', 'main_cat', 'similar_item', 'date', 'price', 'asin'],
      dtype='object')

In [31]:
# metadata['product_detail'] = metadata.map(lambda x: 
#                                         #  x.title + 
#                                         #  eval(x.category)[1:]
#                                         x['title']
#                                         #  eval(x['category'])[1:] +
#                                         #  x['description'] +
#                                         #  eval(x['feature'])[:4]
#                                          )



In [32]:
metadata['product_detail'].sample(5).to_list()

 "YEESAM Muslim Swimsuit Islamic Full Cover Modest Swimwear Beachwear Burkini Women Clothing ['<br /> \\n<br /> \\n<b>Assalam Alaikum!<br />Find more great new arrival products to our shop - U.R.Beautiful <br /><br />Tips to Buy</b> \\n<br /> We use the \\n<b>Int&apos;l Clothing Size Table</b> as below. \\n<b>Please do not confuse with the Amazon General Size/Europe size/US size etc.</b> \\n<br /> Fabrics have a certain flexibility, and there are still 1-2cm measurement errors. \\n<br /> If you want to make sure the Modest style, you might consider a bit larger size. But, that is only our advice, you should decide by yourself! \\n<b> <br /> In order to avoid the inconvenience of return or exchange, please carefully choose your own size, especially please pay attention to the bust size.</b> \\n<br /> \\n<br /> \\n<b>Int&apos;l Clothing Size Table</b> \\n<br />1 inch = 2.54 cm \\n<br /> \\n<br />Int&apos;l S ~~ Ref. US Size 2-4 \\n<br />Bust: 89 cm / 35 inches, Waist: 74 cm, Trouser: 96 

In [44]:
import pickle

with open('../Output/image_vector_dict.pickle', 'rb') as file:
    image_vec_dict = pickle.load(file)

with open('../Output/Image_similar_items_dict.pickle', 'rb') as file:
    image_sim_items = pickle.load(file)

In [43]:
image_vec_dict['B000072UMA_0.jpg'].shape

torch.Size([512])

In [45]:
len(image_sim_items.keys())

5543

In [55]:
# Create Image Indicator
ind_image = {}
for asin, similar_products in image_sim_items.items():
    if similar_products == 0:
        ind_image[asin] = False
    else:
        ind_image[asin] = True

with open('../Output/Image_ind_dict.pickle', 'wb') as file:
    pickle.dump(ind_image, file, protocol=pickle.HIGHEST_PROTOCOL)

In [57]:
ind_image['B004BOU5TM']

True

In [51]:
from PIL import Image

Image.open('../Project_Data/Image')

In [52]:
with open('../Output/asin_mapping.pickle', 'rb') as file:
   asin_mapp = pickle.load(file)
asin_mapp

In [62]:
!pip install contextualSpellCheck

Collecting contextualSpellCheck
  Using cached contextualSpellCheck-0.4.3-py3-none-any.whl (128 kB)
Collecting editdistance==0.6.0 (from contextualSpellCheck)
  Using cached editdistance-0.6.0.tar.gz (29 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting spacy>=3.0.0 (from contextualSpellCheck)
  Using cached spacy-3.5.2-cp311-cp311-macosx_10_9_x86_64.whl (6.8 MB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy>=3.0.0->contextualSpellCheck)
  Using cached spacy_legacy-3.0.12-py2.py3-none-any.whl (29 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy>=3.0.0->contextualSpellCheck)
  Using cached spacy_loggers-1.0.4-py3-none-any.whl (11 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy>=3.0.0->contextualSpellCheck)
  Using cached murmurhash-1.0.9-cp311-cp311-macosx_10_9_x86_64.whl (18 kB)
Collecting cymem<2.1.0,>=2.0.2 (from spacy>=3.0.0->contextualSpellCheck)
  Using cached cymem-2.0.7-cp311-cp311-macosx_10_9_x86_64.whl (31 kB)
Collecting preshed<3.1.0,>=3.0.2 (from 

In [61]:
!pip install pip install editdistance

Collecting editdistance
  Downloading editdistance-0.6.2-cp311-cp311-macosx_10_9_x86_64.whl (21 kB)
Installing collected packages: editdistance
Successfully installed editdistance-0.6.2


In [64]:
reviews = 'something knvef rkvkfvn'
reviews.split('_')

['something knvef rkvkfvn']