In [2]:
import pandas as pd
import numpy as np
import sys

sys.path.insert(1, '../')

In [20]:
import os
os.chdir('../')

In [26]:
import pandas as pd
import numpy as np
import pdb
from tqdm import tqdm
import nltk
import string
import pickle 
import yaml
import os
import torch 

from sentence_transformers import SentenceTransformer

from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

import pickle


In [27]:
CONFIG_FILE = 'config.yaml'

def text_preprocessing(text: str):
    
    # Lowercase
    text = text.lower()
    # Remove Punctuation
    # text = "".join([char for char in text if char not in string.punctuation])
    words = word_tokenize(text)  
    # Remove Stopwords
    stop_words = stopwords.words('english')
    filtered_words = [word for word in words if word not in stop_words]
    ## Stemming
    # porter = PorterStemmer()
    # stemmed = [porter.stem(word) for word in filtered_words]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized = [lemmatizer.lemmatize(word) for word in filtered_words]
    
    return ' '.join(lemmatized)

def get_data_from_file(filename:str):

    with open(filename, 'rb') as file:
        data = pickle.load(file)
    return data


def get_top_n_indices(array, top_n):
    return array.argsort()[-top_n:][::-1]

In [112]:
class Recommender():

    def __init__(self, primary_column:str = 'title', top_n:int = 10) -> None:

        # Open the configuration file to load parameters
        with open(CONFIG_FILE, "r") as file:
            try:
                self.params = yaml.safe_load(file)
            except yaml.YAMLError as exc:
                print(exc)
        
        self.top_n = top_n
        self.primary_column = primary_column
        metadata_file = self.params['METADATA_FILE']
        self.metadata = pd.read_csv(metadata_file, index_col=0)
        self.preprocess_metadata()
        print("INFO: Initializing Model")
        self.model = SentenceTransformer('msmarco-distilbert-base-dot-prod-v3')
        print("INFO: Creating embeddings")
        self.embeddings = self.get_embeddings(column=primary_column)
        # self.product_feature_positiveness = get_data_from_file(self.params['product_feature_ratings'])


    def preprocess_metadata(self) -> None:

        self.metadata['description'] = self.metadata['description'].apply(lambda x: text_preprocessing(eval(x)[0]))

    def get_embeddings(self, column) -> np.array:

        # If embedding is locally saved already, load it 
        try:
            with open(self.params['EMBEDDING_FILE'], 'rb') as file:
                embeddings = pickle.load(file)
            print("INFO : Loaded Product Embeddings")
            return embeddings
        
        # If embedding is not locally available, create embeddings
        except:
            print("INFO : Creating Embeddings")
            if torch.cuda.is_available():
                embeddings = self.model.encode(self.metadata[column].tolist(), device='cuda') 
            else:
                embeddings = self.model.encode(self.metadata[column].tolist())
            embeddings = np.asarray(embeddings.astype('float32'))   
            
            print("INFO: Saving embeddings")
            if not os.path.exists(os.path.dirname(self.params['EMBEDDING_FILE'])):
                os.mkdir(os.path.dirname(self.params['EMBEDDING_FILE']))
            with open(self.params['EMBEDDING_FILE'],'wb') as file:
                pickle.dump(embeddings, file, protocol=pickle.HIGHEST_PROTOCOL)

            return embeddings

    def return_most_similar(self, query):

        print("INFO: Retrieving items for query")
        query_vector = self.model.encode([query])
        similarity = np.dot(self.embeddings,query_vector.T)
        top_items = similarity.flatten().argsort()[-self.top_n:][::-1]
        print(self.metadata['title'].iloc[top_items])
        return list(top_items), list(self.metadata['title'].iloc[top_items])


    def character_similarity(self, character_list:list, subset_indices:list, method:int = 1):

        # Method 1 - Join all characteristics to make an expanded query
        if method == 1 :
            character_query = ' '.join(character_list)
            character_query_vector = self.model.encode(character_query)
            similarity = np.dot(self.embeddings[subset_indices],character_query_vector.T)
            # top_items =  get_top_n_indices(similarity.flatten(), self.top_n)       
            return similarity / np.linalg.norm(similarity)

        # Method 2 - Rank items based on individual characteristics
        elif method == 2:
            similarity_list =[]
            for characteristic in character_list:
                character_vector = self.model.encode(characteristic)
                similarity = np.dot(self.embeddings[subset_indices],character_vector.T)
                # top_items_list.append(get_top_n_indices(similarity.flatten(), top_n=200)) 
                similarity_list.append(similarity.flatten() / np.linalg.norm(similarity.flatten())) 
            agg_similarity = np.array(similarity_list).mean(axis=0)
            return agg_similarity
        
    def feature_similarity(self, feature_imp):

        agg_feature_poitiveness = np.zeros(self.product_feature_positiveness.shape()[0])
        for ind, imp in enumerate(feature_imp):
            agg_feature_poitiveness += imp*self.product_feature_positiveness[ind]
        return agg_feature_poitiveness


    def return_most_similar_v1(self, query:str, character_list:list, character_method:int, feature_imp:list = None):

        print("INFO: Retrieving items for query")
        query_vector = self.model.encode([query + ' '.join(character_list)])
        query_similarity = np.dot(self.embeddings,query_vector.T).flatten()
        query_similarity  /= np.linalg.norm(query_similarity)
        final_similarity = query_similarity
        indices = get_top_n_indices(final_similarity, 10)
        # character_similarity = self.character_similarity(character_list, method=1)
        # feature_similarity = self.feature_similarity(feature_imp)

        return indices
                 

    def get_top_items_for_features(top_n):
        
        self
        final_product_embeddings = self.product_feature_ratings.mean(axis=1)
        top_item_ind = get_top_n_indices(final_product_embeddings, top_n=5)
        return top_item_ind

In [113]:
recommender = Recommender(primary_column='description')


INFO: Initializing Model
INFO: Creating embeddings
INFO : Loaded Product Embeddings


In [104]:
query = 'Women leather shoes'

# Query similarity
query_similarity = np.dot(recommender.embeddings,recommender.model.encode([query]).T).flatten()
query_similarity  /= np.linalg.norm(query_similarity)

query_top_20 = get_top_n_indices(query_similarity, 20)
# indices = get_top_n_indices(final_similarity, 20)

character_list = ['Women', 'Leather']
character_similarity = recommender.character_similarity(character_list=character_list, subset_indices=query_top_20, method=1)

final_similarity = character_similarity
sub_indices = get_top_n_indices(final_similarity, 10)
main_indices = query_top_20[sub_indices]
recommender.metadata['title'].iloc[main_indices].to_list()

["Fangsto Women's Cowhide Leather Loafers Flats Sandals Slip-On",
 "Skechers USA Men's Caswell Oxford",
 "Roper Women's Lace and Underlay Western Boot",
 "Jambu Women's Pecan Mary Jane Flat",
 "KEEN Women's Terradora Mid Wp-w Hiking Boot",
 "JARO VEGA Women's Soft Goatskin Genuine Leather Pumps Slender Block Heel Closed Almond Toe Dress Shoes",
 "FRYE Women's Patty Artisan Zip Bootie",
 "ECCO Men's Soft 7 Fashion Sneaker",
 "Easy Spirit Women's Realflex Walking Shoe",
 "ASICS Women's GT-2000 3 Running Shoe"]

In [115]:
query = "Women's shoes"
character_list = ['Shiny', 'Sexy', 'Black']
recommender.metadata['title'].iloc[recommender.return_most_similar_v1(query=query, character_list=character_list, character_method=1)].to_list()

INFO: Retrieving items for query


['SoftSpots Perri Women&rsquo;s Slip-On',
 "Blue Q Men's Crew Socks - Fits Men's Shoe Size 7-12",
 "Lacoste Men's Malahini Deck 316 1 Spm Fashion Sneaker",
 'In Touch Bamboo Above The Knee Skirt',
 "Clarks Women's Daelyn Summit Slip-On Loafer",
 "Luichiny Women's Case Closed Snow Boot",
 "Steve Madden Women's Pierce Ankle Bootie",
 "Muck Boot Women's Breezy Low Boot",
 "Skechers Sport Women's Scene Stealer Fashion Sneaker",
 'Soft Jersey Blend Sleep Hat Comfortable Soft Hat Liner Beanie Skull Cap Chemo Hair Loss Head Covering']

In [97]:
main_indices

array([4266, 1967, 1929, 4669, 4094, 2543, 2580,  896, 1814, 4878])

['Avery Hill Boys Shiny Or Matte Patent Leather Special Occasion Christening Shoes',
 'Jumping Jacks Destiny Ballet Flat (Toddler/Little Kid/Big Kid)',
 "Jambu Women's Pecan Mary Jane Flat",
 'Polo Ralph Lauren Ankle Sport Socks 6-Pack',
 "Lacoste Men's Malahini Deck 316 1 Spm Fashion Sneaker",
 "Bloch Dance Women's Jazzsoft Split Sole Leather Jazz Shoe",
 "ECCO Men's BIOM Hydromax Golf Shoe",
 "Skechers Men's Diameter-Guy Thing Oxford Sneaker",
 "UGG Men's Hendren Tl Winter Boot",
 'SoftSpots Perri Women&rsquo;s Slip-On']

In [80]:
recommender.metadata['title'].iloc[get_top_n_indices(character_similarity, 10)].to_list()

['Crocs A Leigh 2-Strap Miniwedge',
 "Womens Soft Leather Cigarette Case Holds Regular and 100's USA Made",
 "Cole Haan Men's Calhoun Lace-Up Derby Shoe",
 "Geox Men's Federico 9 Shoe",
 "Skechers USA Men's Caswell Oxford",
 'Woly German Suede Nubuck Brush 5&quot; Removes Dirt &amp; Stains on Designer Shoes, Boots, Handbags, Clothes.',
 "Birkenstock Women's Madrid Birko-Flor Sandal",
 "Lucky Women's Galvann",
 'Collonil Nubuck + Velours/Suede Waterproof Protector Repellent Spray, 200 ml',
 "Fangsto Women's Cowhide Leather Loafers Flats Sandals Slip-On"]

In [81]:
recommender.metadata['title'].iloc[get_top_n_indices(query_similarity, 10)].to_list()

['SoftSpots Perri Women&rsquo;s Slip-On',
 "Lacoste Men's Malahini Deck 316 1 Spm Fashion Sneaker",
 "Rocket Dog Women's Rainy Rubber Rain Boot",
 "Clarks Women's Blanche Nora Ballet Flat",
 "Bloch Dance Women's Jazzsoft Split Sole Leather Jazz Shoe",
 'Teva Scamper Water Shoe (Toddler/Little Kid/Big Kid)',
 "Ollio Women's Shoe Cross Braided Multi Color Flat Sandal",
 'Adidas Pretereo 2 Wrestling Shoes - Collegiate Royal/White/Black',
 "Dr. Martens Men's Octavius Lace Shoe",
 "New Balance Women's Minimus Sport Spikeless Golf Shoe"]