# The Home Depot Decor Case

Getting Started  | Data Exploration  |  Preprocessing  |  **Benchmark Model** 

In [1]:
__author__ = 'Jaime Garvey'
__email__ = 'jaimemgarvey@gmail.com'

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math

import gensim
from gensim.corpora.dictionary import Dictionary
from gensim.models import KeyedVectors as kv

from nltk.corpus import stopwords
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from collections import Counter

# other imports
%matplotlib inline

In [9]:
import sys
import inspect
sys.path.insert(0, '../modules')


# now read in new functions
from helpers import read_in_dataset, get_num_of_levels, flatten_categories, search_cons_status
from hierarchy import Hierarchy
from textpreprocessor import TextPreprocessor

**Read in Data**

In [32]:
# Read in product related data
#verbose_opt = False
#catalog = read_in_dataset('Decor_catalog.csv', verbose=verbose_opt)
#prod_desc = read_in_dataset('Product_name_description.csv', verbose=verbose_opt)
#prod_engagement = read_in_dataset('Product_engagement.csv', verbose=verbose_opt)

# Read in search related data
#navigations = read_in_dataset('Visual_navigations.csv', verbose=verbose_opt)
#search_imp = read_in_dataset('Search_impression.csv', verbose=verbose_opt)

# Overview

For our benchmark model, we explore the collaborative filtering approach outlined below.

---------- Data ---------------
1. Get Product Hierarchy Dictionary
2. Preprocess Search Terms
3. Consolidate Search Terms


---------- Recommender ------------ 

_Query Understanding_
4. Search similarity Matrix
  - Get Search Product Matrix
  - Compute Cosine Similarity
  
5. Match Search Term
  - Get Top Recommendation
  
_Query Classification_
6. Get Search-Cat Map
7. Get Top N Nodes
8. Classify Node
9. Make Recommendations


# Build Product Taxonomy (Hierarchy)

In [3]:
# Load the decor catalog through the project helper, with verbose output switched off.
# NOTE(review): the directory read_in_dataset reads from is defined in helpers.py — not visible here.
catalog = read_in_dataset('Decor_catalog.csv', verbose=False)

In [96]:
# Build the product taxonomy from the catalog.
# `h` is used further down through its node2id, prod_map and id2node lookups.
# NOTE(review): num_levels=6 is presumably the depth of the category path — confirm in hierarchy.py.
h = Hierarchy(data=catalog, num_levels=6)

# Preprocess & Consolidate Search Terms

In [10]:
# Load the search impressions through the project helper, with verbose output switched off.
# The cells below read its 'Search_term' and 'Impression' columns.
search_imp = read_in_dataset('Search_impression.csv', verbose=False)

In [12]:
# Raw search terms as an array (not referenced again in the cells shown here)
search_corpus = search_imp['Search_term'].values


tp = TextPreprocessor()

# Clean the 'Search_term' column with the project preprocessor (stemmer variant).
# The result is later wrapped in a pd.Series and whitespace-split, so it is
# presumably a sequence of cleaned strings — confirm in textpreprocessor.py.
search_docs = tp.clean_text(search_imp, 'Search_term', method='stemmer')

# Consolidate searches after preprocessing.
# NOTE(review): what compare_clean_searches returns is defined in textpreprocessor.py — not visible here.
consolidated_search = tp.compare_clean_searches(search_docs, search_imp)

# Disabled experiments (trigram model / vectorizing the cleaned searches):
#searches_trigram = tp.trigram_model(search_docs, verbose=False)
#searches_vec, searches_matrix = tp.vectorize(search_docs, format='docs')

In [37]:
# Tokenize the cleaned searches: wrap them in a Series so the vectorized
# string accessor can split each document on whitespace.
search_docs_series = pd.Series(search_docs)

# One list of tokens per search document
search_tokens = search_docs_series.str.split().tolist()

# Query Understanding

In [38]:
# Reload the catalog so this section starts from a fresh copy
# (a later cell overwrites catalog['Category'] in place).
catalog = read_in_dataset('Decor_catalog.csv', verbose=False)

In [39]:
# Build the search-term x product (SKU) count matrix from the impressions.

# Turn each ';'-separated impression string into a whitespace-separated
# "document" of SKU ids so CountVectorizer can tokenize it.
# regex=False: ';' is a literal separator, not a pattern.
search_imp['Impression'] = search_imp['Impression'].str.replace(';', ' ', regex=False)

# Impressions as an array of documents (one per search term)
imp = search_imp['Impression'].values

# Vectorize: one row per search term, one column per SKU id
vec = CountVectorizer()
seach_prod = vec.fit_transform(imp)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when the installed version provides it.
sku_columns = (
    vec.get_feature_names_out()
    if hasattr(vec, 'get_feature_names_out')
    else vec.get_feature_names()
)

# Preview: densify only the rows that are displayed instead of converting the
# whole sparse matrix to a dense array just to call .head().
PREVIEW_ROWS = 5
pd.DataFrame(
    seach_prod[:PREVIEW_ROWS].toarray(),
    index=search_imp['Search_term'].values[:PREVIEW_ROWS],
    columns=sku_columns,
)

Unnamed: 0,100012014,100022800,100023109,100023973,100036137,100044505,100048075,100051570,100061089,100074869,...,307717049,307717219,307717221,307727052,307833295,307920434,307939445,307939707,307940057,307940314
coffee,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
bag chair,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
kitchen wall tile,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
mirror tile,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
entryway,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [40]:
# Pairwise cosine similarity between search terms. With a single argument
# the rows of the search x product matrix are compared against themselves.
search_similarity_matrix = cosine_similarity(seach_prod)

# Label both axes of the similarity matrix with the search terms
index = search_imp['Search_term'].values
search_similarity_matrix_df = pd.DataFrame(
    data=search_similarity_matrix,
    columns=index,
    index=index,
)

In [41]:
# get term to index
def term2index_dict(search_terms_lst):
    """Map every term to its position in `search_terms_lst`.

    When a term occurs more than once, the position of its last
    occurrence is kept.
    """
    return {term: position for position, term in enumerate(search_terms_lst)}

# Map each search term to its row/column position in the similarity matrix.
# search_imp is not indexed by search term at this point ('Search_term' is
# still a regular column), so the labels of the similarity frame are used
# rather than search_imp.index.
term2index = term2index_dict(search_similarity_matrix_df.index)

# Top-n closest queries
def get_topn_query(term, sim_matrix=search_similarity_matrix_df, n=1):
    """Return the `n` highest similarity scores found in the column for `term`.

    The result is a Series indexed by search term, sorted from most to
    least similar.
    """
    term_scores = sim_matrix.loc[:, term]
    ranked_scores = term_scores.sort_values(ascending=False)
    return ranked_scores.head(n)

In [42]:
# With the default n=1 the only hit here is the term itself (self-similarity 1.0);
# pass n > 1 to see neighbouring queries.
get_topn_query('coffee')

coffee    1.0
Name: coffee, dtype: float64

# Query Classification

1. Get Search Term-Cat Matrix: Extract Categories from Top 24 Products 
2. Get Top N Nodes
3. Flatten: Create Node/Category Vectors with Node and Children*
4. Compute Cosine Similarity*
5. Take Top Match (Node/Category)*


In [43]:
# SKU ids that make up the impression vectorizer's vocabulary (i.e. products
# shown for at least one search term); used below to restrict the catalog.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when the installed version provides it.
sku_vocab = (
    vec.get_feature_names_out()
    if hasattr(vec, 'get_feature_names_out')
    else vec.get_feature_names()
)
search_labels = pd.Series(sku_vocab, name='prod')
search_labels.index = search_labels

In [44]:
# Expand the path column into a list of nodes
# (e.g. 'Furniture>Living Room>Accent Tables' -> ['Furniture', 'Living Room', 'Accent Tables']).
# Only strings are split, so re-running this cell leaves already-split rows
# untouched (.str.split on lists would turn them into NaN).
catalog['Category'] = catalog['Category'].apply(
    lambda path: path.split('>') if isinstance(path, str) else path
)

cat = catalog['Category'].values

# Vectorize Product Categories (SKU-ID - Category Matrix)
def dummy(doc):
    """Identity function: each document is already a list of category tokens."""
    return doc

vec_cat = CountVectorizer(lowercase=False, preprocessor=dummy, tokenizer=dummy)

# Fit Transform
prod_cat = vec_cat.fit_transform(cat)

# Category vocabulary. get_feature_names() was removed in scikit-learn 1.2;
# use its replacement when the installed version provides it.
cat_labels = (
    vec_cat.get_feature_names_out()
    if hasattr(vec_cat, 'get_feature_names_out')
    else vec_cat.get_feature_names()
)

# Reduce the catalog to products that appear in the impressions.
# The vectorizer vocabulary holds SKU ids as strings, while the catalog is
# looked up with integer SKU ids further down (catalog_reduced.loc[int(prod)]),
# so compare as integers instead of relying on implicit dtype coercion.
impression_skus = pd.to_numeric(search_labels, errors='coerce').dropna().astype(int)

# set_index returns a new frame, which avoids mutating a slice of `catalog` in place.
catalog_reduced = catalog[catalog['SKU_ID'].isin(impression_skus)].set_index('SKU_ID')

# Index the impressions by search term. Guarded so that re-running the cell
# does not raise KeyError once the column has already become the index.
if 'Search_term' in search_imp.columns:
    search_imp.set_index('Search_term', inplace=True)

In [45]:
# Distinct search terms, taken from the impressions index
search_terms_unique = search_imp.index.unique().tolist()

In [46]:
# Split each impression document into a list of SKU-id tokens.
# Only strings are split, so re-running this cell does not turn the
# already-split lists into NaN (which .str.split() would do).
search_imp['Impression'] = search_imp['Impression'].apply(
    lambda doc: doc.split() if isinstance(doc, str) else doc
)

In [47]:
# Target structure built in this cell:
#   {term1: {cat1: count, cat2: count},
#    term2: {cat4: count, cat9: count}, ...}
def get_labels(prod_lst, catalog=None):
    """Count the category nodes of every product in `prod_lst`.

    Parameters
    ----------
    prod_lst : iterable
        Product (SKU) ids; each one is converted with ``int()`` before lookup.
    catalog : pandas.DataFrame, optional
        Frame indexed by integer SKU id with a 'Category' column holding a
        list of category nodes per row. Defaults to the notebook-level
        ``catalog_reduced``.

    Returns
    -------
    collections.Counter
        Number of occurrences of each category node across all rows of all
        listed products. Keys appear in first-seen order.

    Raises
    ------
    KeyError
        If a product id is not present in the catalog index.
    """
    if catalog is None:
        catalog = catalog_reduced

    labels = Counter()
    for prod in prod_lst:
        # Selecting with a list of labels always yields a Series of category
        # lists, whether the SKU has one catalog row or several and however
        # many other columns the catalog has. (Checking len() of .loc[sku]
        # cannot tell a single row with several columns from several rows.)
        for path in catalog.loc[[int(prod)], 'Category']:
            labels.update(path)

    return labels

def get_term_cat_dict(search_term_lst):
    """Build a {search term: category counts} mapping.

    For each term, its impressions are looked up in the notebook-level
    ``search_imp`` frame and passed to ``get_labels``.
    """
    return {
        term: get_labels(search_imp.loc[term]['Impression'])
        for term in search_term_lst
    }

In [48]:
# Category counts for every distinct search term: {term: Counter of category nodes}
term_cat_dict = get_term_cat_dict(search_terms_unique)

# Recommendation Example

In [49]:
# Example search term used throughout the walkthrough cells below
query = 'accent table'

In [50]:
# Candidate category nodes for the example query, with how often each node
# occurs among the products shown for that query
candidates = term_cat_dict[query]
candidates

Counter({'Furniture': 27,
         'Living Room Furniture': 24,
         'Accent Tables': 24,
         'End Tables': 21,
         'Entryway Furniture': 3,
         'Entryway Tables': 3,
         'Console Tables': 3})

In [51]:
def flatten_nodes(candidate=None, n=5, hierarchy=None):
    """Describe each candidate node by the names of its child nodes.

    Parameters
    ----------
    candidate : mapping, optional
        Candidate category nodes (node name -> count). Defaults to the
        notebook-level ``candidates``.
    n : int
        Number of candidate nodes to keep, taken in the mapping's key order.
    hierarchy : object, optional
        Taxonomy exposing ``node2id``, ``prod_map`` and ``id2node`` lookups.
        Defaults to the notebook-level ``h``.

    Returns
    -------
    dict
        {node name: space-joined names of that node's children}.
    """
    # The argument is now honoured; previously the body always read the
    # notebook-level `candidates`, whatever was passed in.
    if candidate is None:
        candidate = candidates
    if hierarchy is None:
        hierarchy = h

    # NOTE(review): key order of a Counter is first-seen order, not order by
    # count — confirm that this is the intended "top n".
    top_nodes = list(candidate.keys())[:n]

    candidate_docs = {}
    for node in top_nodes:
        node_id = hierarchy.node2id[node]
        children_ids = hierarchy.prod_map[node_id]
        # Convert the child ids back to names and join them into one document
        candidate_docs[node] = ' '.join(hierarchy.id2node[child_id] for child_id in children_ids)

    return candidate_docs

In [52]:
# One "document" per candidate node of the example query: the names of the
# node's children joined into a single string
candidate_docs = flatten_nodes()
candidate_docs

{'Furniture': 'Furniture Accessories & Replacement Parts Kids & Baby Furniture Folding Tables & Chairs Home Office Furniture Entryway Furniture Living Room Furniture Kitchen & Dining Room Furniture Bedroom Furniture',
 'Living Room Furniture': 'Accent Tables Chairs Futons Slipcovers Sectionals Ottomans Media Storage TV Stands Living Room Sets Sofas & Loveseats',
 'Accent Tables': 'End Tables Indoor Plant Stands Console Tables Coffee Tables',
 'End Tables': 'nan',
 'Entryway Furniture': 'Umbrella Holders Coat Racks Hall Trees Entryway Benches & Trunks Entryway Tables nan'}

In [53]:
# Preprocess docs
t = TextPreprocessor()

# Clean the query together with the candidate documents; the query comes
# first in the input list
node_docs = t.clean_docs([query] + list(candidate_docs.values()))

# Vectorize the cleaned documents.
# NOTE(review): the weights shown below look like TF-IDF — confirm in textpreprocessor.py.
vec_cand, matrix_cand = t.vectorize(node_docs)

In [54]:
# Document-term weights, labelled with the query first and the candidate nodes after it.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when the vectorizer provides it.
cand_terms = (
    vec_cand.get_feature_names_out()
    if hasattr(vec_cand, 'get_feature_names_out')
    else vec_cand.get_feature_names()
)
node_term = pd.DataFrame(
    matrix_cand.toarray(),
    index=[query] + list(candidate_docs.keys()),
    columns=cand_terms,
)
node_term

Unnamed: 0,accent,accessory,baby,bedroom,bench,chair,coat,coffee,console,dining,...,sectional,slipcover,sofa,stand,storage,table,tree,trunk,tutor,umbrella
accent table,0.848083,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.529863,0.0,0.0,0.0,0.0
Furniture,0.0,0.126019,0.126019,0.126019,0.0,0.103337,0.0,0.0,0.0,0.126019,...,0.0,0.0,0.0,0.0,0.0,0.064563,0.0,0.0,0.0,0.0
Living Room Furniture,0.24051,0.0,0.0,0.0,0.0,0.24051,0.0,0.0,0.0,0.0,...,0.293299,0.293299,0.293299,0.24051,0.293299,0.150265,0.0,0.0,0.293299,0.0
Accent Tables,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.37703,0.37703,0.0,...,0.0,0.0,0.0,0.30917,0.0,0.579488,0.0,0.0,0.0,0.0
End Tables,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Entryway Furniture,0.0,0.0,0.0,0.0,0.322336,0.0,0.322336,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.165141,0.322336,0.322336,0.0,0.322336


In [55]:
# Cosine similarity between every pair of rows (query and candidate nodes).
# With a single argument the matrix is compared against itself.
query_node_sim_matrix = cosine_similarity(matrix_cand)

# Label both axes with the query / node names used for the term matrix
query_node_sim_df = pd.DataFrame(
    data=query_node_sim_matrix,
    columns=node_term.index,
    index=node_term.index,
)
query_node_sim_df

Unnamed: 0,accent table,Furniture,Living Room Furniture,Accent Tables,End Tables,Entryway Furniture
accent table,1.0,0.034209,0.283592,0.30705,0.0,0.087502
Furniture,0.034209,1.0,0.109116,0.037413,0.0,0.06529
Living Room Furniture,0.283592,0.109116,1.0,0.161435,0.0,0.024815
Accent Tables,0.30705,0.037413,0.161435,1.0,0.0,0.095697
End Tables,0.0,0.0,0.0,0.0,1.0,0.26432
Entryway Furniture,0.087502,0.06529,0.024815,0.095697,0.26432,1.0


In [56]:
# Rank all rows (the query itself and every candidate node) by their similarity to the query
query_node_sim_df.loc[:,query].sort_values(ascending=False)

accent table             1.000000
Accent Tables            0.307050
Living Room Furniture    0.283592
Entryway Furniture       0.087502
Furniture                0.034209
End Tables               0.000000
Name: accent table, dtype: float64

In [57]:
def predict_node(sim_df=None, query_term=None):
    """Return the candidate node most similar to the query.

    Parameters
    ----------
    sim_df : pandas.DataFrame, optional
        Square similarity frame whose rows and columns are labelled with the
        query and the candidate nodes. Defaults to the notebook-level
        ``query_node_sim_df``.
    query_term : str, optional
        Label of the query inside ``sim_df``. Defaults to the notebook-level
        ``query``.

    Returns
    -------
    Label of the candidate node with the highest similarity to the query
    (the first one in row order when several tie).

    Raises
    ------
    ValueError
        If ``sim_df`` holds no candidate besides the query.
    """
    if sim_df is None:
        sim_df = query_node_sim_df
    if query_term is None:
        query_term = query

    scores = sim_df.loc[:, query_term]

    # Drop the query's own entry explicitly rather than assuming it sorts
    # first: when the query vector is all zeros (self-similarity 0) or a
    # candidate also scores 1.0, position 1 of the sorted scores is not
    # guaranteed to be the best candidate.
    candidate_scores = scores.drop(labels=query_term)

    return candidate_scores.idxmax()
    
def get_recommendations(node_pred):
    """Return the display names of the children of `node_pred`.

    The node name is translated to its id, the ids of its children are
    looked up, and each child id is translated back to a name, all through
    the notebook-level hierarchy ``h``.
    """
    children_ids = h.prod_map[h.node2id[node_pred]]
    return [h.id2node[child_id] for child_id in children_ids]

In [58]:
# Predict the best-matching category node for the example query
# (called without arguments, so it works from the notebook-level similarity frame and query)
node_pred = predict_node()

# The recommendations are the names of that node's children
recs = get_recommendations(node_pred)

print(f'Search Term:{query}')
print(f'Predicted Category: {node_pred}')
print(f'Recommended Display Names: {recs}')

Search Term:accent table
Predicted Category: Accent Tables
Recommended Display Names: ['End Tables', 'Indoor Plant Stands', 'Console Tables', 'Coffee Tables']


# Run Recommendation Engine

In [60]:
def get_recommendation(query, verbose=False, n=5):
    """Recommend display names for a search term.

    Every step works on the data of the `query` that is passed in, rather
    than on the notebook-level example variables.

    Parameters
    ----------
    query : str
        Search term; must be a key of the notebook-level ``term_cat_dict``.
    verbose : bool
        When True, print the query, the predicted node and the recommendations.
    n : int
        Number of candidate nodes to consider, taken in key order of the
        query's category counts.

    Returns
    -------
    list
        Names of the children of the predicted category node; an empty list
        when the query has no candidate nodes.

    Raises
    ------
    KeyError
        If `query` is not in ``term_cat_dict`` or a node is unknown to ``h``.
    """
    candidates = term_cat_dict[query]

    # One document per candidate node: the names of the node's children.
    # Built here from this query's candidates so the result does not depend
    # on which candidates happen to be stored at notebook level.
    candidate_docs = {}
    for node in list(candidates.keys())[:n]:
        children_ids = h.prod_map[h.node2id[node]]
        candidate_docs[node] = ' '.join(h.id2node[child_id] for child_id in children_ids)

    if not candidate_docs:
        return []

    t = TextPreprocessor()

    # Clean the query together with the candidate documents (query first)
    node_docs = t.clean_docs([query] + list(candidate_docs.values()))

    # Vectorize
    vec_cand, matrix_cand = t.vectorize(node_docs)

    # Similarity between all rows; row 0 is taken to be the query and rows 1..
    # the candidates, in the order the documents were passed in.
    query_node_sim_matrix = cosine_similarity(matrix_cand, matrix_cand)

    # Similarity of each candidate to the query, excluding the query itself
    node_scores = pd.Series(query_node_sim_matrix[0, 1:], index=list(candidate_docs.keys()))

    # Get prediction
    node_pred = node_scores.idxmax()

    recs = get_recommendations(node_pred)

    if verbose:
        print(f'Search Term:{query}')
        print(f'Predicted Category: {node_pred}')
        print(f'Recommended Display Names: {recs}')

    return recs

# Evaluator 

In [89]:
class Evaluator:
    """Load evaluation data and score predicted recommendations against it.

    On construction the evaluation file is read and grouped by search term
    (see ``run``). Afterwards ``eval_terms`` lists the search terms and
    ``eval_target`` maps each term to its list of display names.
    """

    def __init__(self, eval_data_file=None, metric=None):
        self.eval_data_file = eval_data_file
        # Stored only; no method of this class reads it.
        self.metric = metric
        self.run()

    def load_data(self, data_folder='raw', data_type='csv'):
        """Read the evaluation file from ../data/<data_folder>/ into ``self.eval_data``.

        Raises
        ------
        ValueError
            If `data_type` is anything other than 'csv'.
        """
        if data_type != 'csv':
            # Fail with a clear message instead of hitting an unbound local below.
            raise ValueError(f"Unsupported data_type: {data_type!r} (only 'csv' is supported)")

        self.eval_data = pd.read_csv('../data/{}/{}'.format(data_folder, self.eval_data_file))

    def get_xy(self):
        """Group the display names by search term and collect the distinct terms."""
        self.eval_target = self.eval_data.groupby('Search_term')['Display_name'].apply(list)
        self.eval_terms = self.eval_target.index.tolist()

    def jaccard_sim(self, query, recs_pred):
        """Jaccard similarity between predicted and actual display names of `query`.

        Both sides are compared as sets, so repeated names count once in the
        intersection and in the union alike. Returns 0.0 when both are empty.
        """
        predicted = set(recs_pred)
        actual = set(self.eval_target.loc[query])

        union = predicted | actual
        if not union:
            return 0.0

        return len(predicted & actual) / len(union)

    def run(self):
        """Load the evaluation data and derive the targets and terms."""
        self.load_data()
        self.get_xy()

In [90]:
# Build the evaluator; construction reads the file from ../data/raw/ and groups display names by search term
e=Evaluator('Visual_navigations.csv')

In [91]:
def mean_similarity():
    """Average Jaccard similarity over all evaluation terms.

    For each term held by the notebook-level evaluator ``e``, recommendations
    are generated and scored against the evaluation target; the scores are
    then averaged.
    """
    total = sum(
        e.jaccard_sim(term, get_recommendation(term))
        for term in e.eval_terms
    )
    return total / len(e.eval_terms)

In [None]:
# Benchmark score: mean Jaccard similarity across all evaluation search terms
mean_similarity()