# The Home Depot Decor Case

Getting Started  |  Data Prep  |  Data Exploration  |  Preprocessing  |  **Model Selection & Tuning**  |  Final Model

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math

import gensim
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import surprise

# other imports
%matplotlib inline

In [3]:
import sys
import inspect
sys.path.insert(0, '../modules')


# now read in new functions
from helpers import read_in_dataset, get_num_of_levels, flatten_categories, search_cons_status
from hierarchy import Hierarchy
from textpreprocessor import TextPreprocessor

**Read in Data**

In [4]:
# Read in product related data
verbose_opt = False
catalog = read_in_dataset('Decor_catalog.csv', verbose=verbose_opt)
# Product names/descriptions are required by the "Product Names" preprocessing
# cell further down (it reads prod_desc['Product_name']), so load them here to
# keep the notebook runnable from a fresh kernel.
prod_desc = read_in_dataset('Product_name_description.csv', verbose=verbose_opt)
#prod_engagement = read_in_dataset('Product_engagement.csv', verbose=verbose_opt)

# Read in search related data
#navigations = read_in_dataset('Visual_navigations.csv', verbose=verbose_opt)
search_imp = read_in_dataset('Search_impression.csv', verbose=verbose_opt)

# Get Hierarchy

In [None]:
# Get Product Taxonomy
# NOTE(review): this binds the Hierarchy *class* itself, not an instance, so the
# cell only displays the class object. The constructor arguments are not visible
# in this notebook — confirm the intended call before relying on `h`.

h = Hierarchy

h


# Preprocess & Vectorize Training Data

In [5]:
# Define List of Stop Words
# The NLTK corpus reader is imported under an alias so that rebinding `stopwords`
# to the final set does not destroy the object this cell reads from; the cell can
# therefore be re-run without an AttributeError on `.words`.
from nltk.corpus import stopwords as nltk_stopwords

# Units and dimension tokens that carry no meaning in product/search text
new_stop_words = ['in', 'sq','ft', 'yd', 'cm', 'mm','gal','lb' ,'lbs','qt','oz', 'h', 'w', 'ii', 'x']
stopwords = set(nltk_stopwords.words('english') + new_stop_words)

In [6]:
# Search
# Raw values of the 'Search_term' column, one entry per row of search_imp
search_corpus = search_imp['Search_term'].values

# Use preprocessor on searches: preprocess -> trigram_model -> vectorize.
# All three are methods of the project's TextPreprocessor (../modules); their
# internals are not shown in this notebook.
tps = TextPreprocessor
search = tps()
clean_searches = list(search.preprocess(search_corpus))
searches_trigram = search.trigram_model(clean_searches, verbose=False)
# vectorize() returns a pair; presumably (fitted vectorizer, document-term matrix) — TODO confirm
searches_vec, searches_matrix = search.vectorize(searches_trigram)



In [20]:
# Product Names
# NOTE: requires `prod_desc` to be loaded in the "Read in Data" cell.
names_corpus = prod_desc['Product_name'].values

# Use preprocessor on product names.
# The pipeline runs on its own TextPreprocessor instance (`name`). Previously the
# instance was created but every call went through `search`, the object used for
# the search terms, so the two corpora shared one preprocessor.

tpp = TextPreprocessor

name = tpp()
clean_names = list(name.preprocess(names_corpus))
names_trigram = name.trigram_model(clean_names, verbose=False)
names_vec, names_matrix = name.vectorize(names_trigram)



# Search Term - Product (Impressions) Matrix

In [7]:
# Get impressions as a document of SKU ids: the ';' separators are replaced by
# spaces so each row becomes one whitespace-delimited string of ids.
# Note: this overwrites search_imp['Impression'] in place.
search_imp['Impression'] = search_imp['Impression'].str.replace(';', ' ')

# impressions array
imp = search_imp['Impression'].values

In [8]:
# Vectorize Data
# Each impression string is a document; each distinct SKU id token becomes a column.
vec = CountVectorizer()

#Fit Transform
# NOTE: the name `seach_prod` (sic) is referenced by later cells under this spelling.
seach_prod = vec.fit_transform(imp)

# Preview Sparse Matrix (rows labelled by search term, columns by SKU token)
# NOTE(review): `get_feature_names` was replaced by `get_feature_names_out` in newer
# scikit-learn releases — confirm the installed version before upgrading.
pd.DataFrame(seach_prod.toarray(), index=search_imp['Search_term'].values, columns=vec.get_feature_names()).head()

Unnamed: 0,100012014,100022800,100023109,100023973,100036137,100044505,100048075,100051570,100061089,100074869,...,307717049,307717219,307717221,307727052,307833295,307920434,307939445,307939707,307940057,307940314
coffee,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
bag chair,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
kitchen wall tile,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
mirror tile,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
entryway,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Category - Product Matrix

In [9]:
# Expand Nested Columns
# Split each 'A>B>C' category path into a list of levels. Only string values are
# split, so re-running the cell leaves already-split lists untouched
# (`.str.split` applied to a column of lists would replace every value with NaN).
catalog['Category'] = catalog['Category'].apply(
    lambda path: path.split('>') if isinstance(path, str) else path
)

cat = catalog['Category'].values

In [10]:
# Vectorize Data

def dummy(doc):
    '''Identity hook: hand `doc` back untouched.

    Passed to CountVectorizer as both `preprocessor` and `tokenizer` so that
    documents which are already lists of tokens are counted as-is.
    '''
    return doc

# Documents are already lists of category levels, so lowercasing is disabled and
# both the preprocessor and tokenizer are the identity function `dummy`.
vec_cat = CountVectorizer(lowercase=False, preprocessor=dummy,tokenizer=dummy)

#Fit Transform
prod_cat = vec_cat.fit_transform(cat)

# Preview Sparse Matrix
# Dense product x category-level frame: rows labelled by SKU_ID, columns by category level.
prod_cat_df  = pd.DataFrame(prod_cat.toarray(), index=catalog['SKU_ID'].values, columns=vec_cat.get_feature_names())

prod_cat_df.head()

Unnamed: 0,AV Accessories,Abrasives,Accent Chairs,Accent Tables,Accessories,Acoustic Wall Paneling,Acrylic Sheets,Activity Tables,Address Plaques,Address Signs,...,Work Aprons,Work Gloves,Work Hats,Work Hoodies & Sweatshirts,Work Lights,Workbenches & Workbench Accessories,Workwear,Writing Utensils,Yoga Mats,Yogurt Makers
302087889,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
301688935,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
206788920,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
302087892,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
302087891,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Product (SKU) tokens found in the search impressions, in the column order of
# `vec` (despite the variable name, these are products, not search terms).
search_labels = pd.Series(vec.get_feature_names(), name='prod')

# Get category list (column order of `vec_cat`)
cat_labels = vec_cat.get_feature_names()

In [12]:
# Index the Series by its own values so it can be used as the right side of an index join.
search_labels.index = search_labels

In [13]:
# Restrict/align the product x category frame to the products that occur in the
# search impressions: exactly one row per entry of `search_labels`, same order,
# so the rows line up with the columns of the search x product matrix.
prod_cat_reduced = (
    prod_cat_df
    .rename(index=str)            # vectorizer tokens are strings; make the SKU index comparable
    .groupby(level=0).sum()       # a SKU listed on several catalog rows collapses to one row
    .reindex(search_labels.values, fill_value=0)  # SKUs missing from the catalog -> all-zero row
)

In [15]:
seach_prod

<354x4764 sparse matrix of type '<class 'numpy.int64'>'
	with 8696 stored elements in Compressed Sparse Row format>

In [None]:
# Search-term x category counts. The product goes through the sparse matrix's own
# `dot` (np.dot is not sparse-aware) against a plain ndarray; rows that the join
# could not match (NaN) are treated as "no category".
term_cat_matrix = seach_prod.dot(prod_cat_reduced.fillna(0).values)

In [None]:
# Products shown for the search term 'end table'.
# `.str.split()` yields a Series whose elements are lists of SKU strings
# (one list per matching row), still indexed by the original row label.
at_prods = search_imp[search_imp['Search_term'] =='end table']['Impression'].str.split()
#a = at_prods.to_list()
#at_prods= a[0]

# labels
def get_labels(prod_lst=at_prods):
    '''Count how often each category level occurs across a set of products.

    Parameters
    ----------
    prod_lst : iterable
        SKU ids (str or int), or lists/tuples of SKU ids such as the elements of
        a `Series.str.split()` result. Defaults to `at_prods` as bound when the
        function was defined.

    Returns
    -------
    collections.Counter
        Category level -> number of occurrences over all matching catalog rows.
        SKUs that are not present in the catalog are skipped.
    '''
    from collections import Counter  # not imported in the notebook's import cell

    # Key the categories by SKU. `catalog` carries SKU_ID as a regular column, so
    # a bare `catalog.loc[sku]` would look up row labels instead of SKUs.
    by_sku = catalog.set_index('SKU_ID') if 'SKU_ID' in catalog.columns else catalog
    categories = by_sku['Category']

    labels = []
    for prod in prod_lst:
        # Accept either a single SKU or a list of SKUs per element.
        skus = prod if isinstance(prod, (list, tuple)) else [prod]
        for sku in skus:
            sku = int(sku)
            if sku not in categories.index:
                continue
            # List-based .loc always yields a Series, whether the SKU appears on
            # one catalog row or several, so both cases share one code path.
            for levels in categories.loc[[sku]]:
                if isinstance(levels, (list, tuple)):  # skip missing (NaN) categories
                    labels.extend(levels)

    return Counter(labels)
    

In [None]:
# Scratch: collect the category levels of every product in `at_prods`.
# NOTE(review): `int(prod)` needs each element to be a single SKU string, but
# `.str.split()` produces lists; and `catalog.loc[...]` looks up row labels,
# not SKU_ID values — confirm before reusing this cell.
labels = []
for prod in at_prods:
    labels.extend(catalog.loc[int(prod)]['Category'])

In [None]:
# Scratch: appends the whole catalog row (not just its 'Category') to `labels`.
labels.append(catalog.loc[int(at_prods[0])])

In [None]:
# Scratch: inspect the category of the first product.
# NOTE(review): `at_prods[0]` is a label lookup on the filtered Series — verify that label 0 exists.
catalog.loc[int(at_prods[0])]['Category']

### New heading

# Get Top Categories

In [12]:
# get index and col labels
# Column labels of prod_cat_df (the category levels); the row index is not read here.
cat_labels = prod_cat_df.columns.to_list()

In [16]:
# Similarity Matrix of Products
 
def get_prod_similarity(prod1, prod2, categories):
    '''Cosine similarity between the category vectors of two products.

    Parameters
    ----------
    prod1, prod2 : hashable
        Keys into `categories`.
    categories : mapping
        Maps a product key to an equal-length sequence of numbers.

    Returns
    -------
    float
        sum(x*y) / sqrt(sum(x*x) * sum(y*y)); 0.0 when either vector is all
        zeros (the cosine is undefined there, and this avoids a division by zero).

    Raises
    ------
    ValueError
        If the two vectors differ in length.
    '''
    cat1 = categories[prod1]
    cat2 = categories[prod2]

    if len(cat1) != len(cat2):
        raise ValueError('Category vectors must have the same length.')

    sum_xy, sum_xx, sum_yy = 0, 0, 0

    for x, y in zip(cat1, cat2):
        sum_xx += x * x  # squared norm of the first vector (was x * y)
        sum_yy += y * y
        sum_xy += x * y

    norm = math.sqrt(sum_xx * sum_yy)
    if norm == 0:
        return 0.0
    return sum_xy / norm

In [24]:
# create empty df
# Empty square frame to be filled with pairwise similarities.
# NOTE(review): rows/columns are `cat_labels` (category levels) although the name says "prod" — confirm intent.
prod_similarity_matrix = pd.DataFrame(index=cat_labels, columns=cat_labels)

In [26]:
class Recommender:
    '''Collaborative-filtering helper that builds a pairwise similarity matrix.

    Parameters
    ----------
    data : pandas.DataFrame
        Interaction table; the columns named in `products` hold one numeric
        vector per item.
    prod_col : str
        Stored as `self.prod_col`; not otherwise used by this class.
    search_terms : sequence
        Labels used for the rows/columns of the matrix when `cf_method='user'`.
    products : sequence
        Item column names of `data`.
    cf_method : {'item', 'user'}
        Whether similarities are computed between item columns or between rows.
    similarity : {'pearson', 'jaccard', 'cosine'}
        Similarity measure.
    '''

    def __init__(self, data, prod_col, search_terms, products, cf_method='item', similarity='pearson'):
        self.data = data
        self.prod_col = prod_col
        self.search_terms = search_terms
        self.products = products
        # NOTE(review): the methods below read `self.item_cols`, which was never
        # assigned. It is taken to be the `products` columns of `data` — confirm.
        self.item_cols = list(products)
        self.cf_method = cf_method
        self.similarity = similarity
        self.similarity_matrix = []
        self.term_prod_scores = []
        self.recs = []

    def create_similarity_matrix(self):
        '''Create correlation/similarity matrix for all items and stores result as self.similarity_matrix'''

        self.similarity_matrix = self._create_empty_df(self.cf_method)
        self._fill_similarity_matrix(self.similarity_matrix, self.similarity)

    def _create_empty_df(self, cf_type):
        '''creates and returns empty df with items (or search terms) as rows and columns'''
        if cf_type == 'item':
            labels = self.item_cols
        elif cf_type == 'user':
            # NOTE(review): the original read an undefined `user_col`. Search terms
            # play the role of "users" in this notebook — confirm.
            labels = list(self.search_terms)
        else:
            raise ValueError('Invalid collaborative filtering technique.  Please specify "item" or "user".')
        return pd.DataFrame(index=labels, columns=labels)

    def _fill_similarity_matrix(self, similarity_matrix, similarity):
        '''calculates correlation between items using specified similarity and saves results in similarity_matrix
           valid similarity types: jaccard, pearson, cosine

           The matrix is filled in place. The diagonal is fixed at 1.0 and only the
           upper triangle is computed; the lower triangle is mirrored from it.'''
        self._similarity_fn(similarity)  # fail fast on an unknown similarity name

        # One column per entity being compared.
        vectors = self.data[self.item_cols].to_numpy(dtype=float)
        if self.cf_method == 'user':
            vectors = vectors.T  # compare rows of `data` instead of item columns

        size = similarity_matrix.shape[0]
        if vectors.shape[1] != size:
            raise ValueError('similarity_matrix does not match the number of vectors to compare.')

        try:
            # Optional progress bar; the notebook never imports tqdm itself.
            from tqdm.auto import tqdm
            rows = tqdm(range(size), mininterval=5)
        except ImportError:
            rows = range(size)

        # Accumulate in a plain array: per-cell DataFrame writes are very slow.
        scores = np.eye(size)
        for i in rows:
            x = vectors[:, i]
            for j in range(i + 1, size):
                scores[i, j] = scores[j, i] = self._get_similarity(x, vectors[:, j], similarity)

        similarity_matrix.iloc[:, :] = scores

    def _similarity_fn(self, similarity):
        '''returns the bound method implementing `similarity`; raises ValueError for unknown names'''
        measures = {
            'pearson': self._pearson_similarity,
            'jaccard': self._jaccard_similarity,
            'cosine': self._cosine_similarity,
        }
        if similarity not in measures:
            raise ValueError('Invalid similarity type.  Please specify "cosine", "pearson", or "jaccard".')
        return measures[similarity]

    def _get_similarity(self, x, y, similarity):
        '''calculated specified correlation between two vectors and returns result'''
        return self._similarity_fn(similarity)(x, y)

    def _pearson_similarity(self, x, y):
        '''returns pearson correlation between x and y: covariance(x,y)/(std_dev(x)*std_dev(y))

           Returns 0.0 when either vector is empty or constant (correlation undefined).'''
        #effective if data can be transformed to normal distribution
        x = np.asarray(x, dtype=float)
        y = np.asarray(y, dtype=float)
        if x.size == 0 or y.size == 0:
            return 0.0
        x_centered = x - x.mean()
        y_centered = y - y.mean()
        denom = np.sqrt(np.dot(x_centered, x_centered) * np.dot(y_centered, y_centered))
        if denom == 0:
            return 0.0
        return float(np.dot(x_centered, y_centered) / denom)

    def _jaccard_similarity(self, x, y):
        '''returns jaccard correlation between x and y: |intsection(x,y)|/|union(x,y)|'''
        #ideal for binary data, e.g. buy vs non-buy
        nonzero_x = set(np.nonzero(np.asarray(x))[0])
        nonzero_y = set(np.nonzero(np.asarray(y))[0])
        union_size = len(nonzero_x | nonzero_y)
        if union_size == 0:
            return 0
        return len(nonzero_x & nonzero_y) / union_size

    def _cosine_similarity(self, x, y):
        '''returns cosine of angles between x and y; 0.0 when either vector has zero length'''
        x = np.asarray(x, dtype=float)
        y = np.asarray(y, dtype=float)
        denom = np.sqrt(np.dot(x, x) * np.dot(y, y))
        if denom == 0:
            return 0.0
        return float(np.dot(x, y) / denom)
    

NameError: name 'cosine_similarity' is not defined

In [None]:
# NOTE(review): `rec_engine` is not created in any visible cell — a Recommender must be instantiated first.
rec_engine.create_similarity_matrix()


# Recommender

In [None]:
# NOTE(review): unfinished draft. The cell ends in a bare `def`, so it is a syntax
# error as written; it reuses the class name `Recommender` from the earlier cell,
# and create_similarity_matrix reads `self.cf_method`, which this __init__ never sets.
class Recommender:
    def __init__(self, data, search_terms, products, similarity='pearson'):
        self.data = data
        self.search_terms = search_terms
        self.products = products
        self.similarity = similarity
        self.similarity_matrix = []
        self.term_prod_scores = []
        self.recs = []
    
    def create_similarity_matrix(self):
        '''Create correlation/similarity matrix for all items and stores result as self.similarity_matrix'''
        
        self.similarity_matrix = self._create_empty_df(self.cf_method)
        self._fill_similarity_matrix(self.similarity_matrix, self.similarity)
        
    def 

# Model Selection

In [37]:
search_imp.head()

Unnamed: 0,Search_term,Impression
0,coffee,203054703;207061099;305561354;305561469;301692...
1,bag chair,305573411;305608772;301092388;301092383;301092...
2,kitchen wall tile,205140711;302603437;205762409;204923728;204337...
3,mirror tile,305696621;304142073;304142126;304142039;303058...
4,entryway,203532713;203532652;300750153;302042988;302042...


In [38]:

# NOTE(review): `imp_expan` is not defined in any visible cell; the output below comes from an earlier session.
imp_expan.head()

Unnamed: 0,Search_term,L1,L2,L3,L4,L5,L6,L7,L8,L9,...,L48,L49,L50,L51,L52,L53,L54,L55,L56,L57
0,coffee,203054703,207061099,305561354,305561469,301692317,206090043,206090041,300742646,204077166,...,,,,,,,,,,
1,bag chair,305573411,305608772,301092388,301092383,301092391,305676166,305676517,305676196,305676124,...,,,,,,,,,,
2,kitchen wall tile,205140711,302603437,205762409,204923728,204337886,204695051,204065143,204289147,204170143,...,,,,,,,,,,
3,mirror tile,305696621,304142073,304142126,304142039,303058463,304187832,303383559,305696699,100656412,...,,,,,,,,,,
4,entryway,203532713,203532652,300750153,302042988,302042995,302042994,206639873,203532749,300750157,...,,,,,,,,,,


In [26]:
# NOTE(review): `catalog_expanded` is not defined in any visible cell; the output below comes from an earlier session.
catalog_expanded.head()

Unnamed: 0,SKU_ID,L1,L2,L3,L4,L5,L6
0,302087889,Lighting,Sconces,,,,
1,301688935,Lighting,Sconces,,,,
2,206788920,Lighting,Sconces,,,,
3,302087892,Lighting,Sconces,,,,
4,302087891,Lighting,Sconces,,,,


In [1]:
# Define Variables


In [None]:
# Build Hierarchy 


In [None]:
# Save Hierarchy


## Parameter Tuning and Model Selection

## Final Parameters