# The Home Depot Decor Case

Getting Started  |  Data Prep  |  Data Exploration  |  **Preprocessing**  |  Model Tuning  |  Final Model

In [28]:
import pandas as pd
import numpy as np

import gensim
from gensim.parsing.preprocessing import remove_stopwords

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException

from sklearn import preprocessing
from sklearn.metrics.pairwise import linear_kernel

%matplotlib inline

In [2]:
import sys
import inspect
sys.path.insert(0, '../modules')


# now read in new functions
from helpers import read_in_dataset, get_num_of_levels, flatten_categories, search_cons_status

**Read in Data**

In [3]:
# Read in product related data
# One flag controls the verbosity of every loader call in this cell.
verbose_opt = False
# NOTE(review): read_in_dataset comes from ../modules/helpers and is not shown here —
# presumably it resolves the file name against a data directory and returns a DataFrame; confirm.
catalog = read_in_dataset('Decor_catalog.csv', verbose=verbose_opt)
prod_desc = read_in_dataset('Product_name_description.csv', verbose=verbose_opt)
prod_engagement = read_in_dataset('Product_engagement.csv', verbose=verbose_opt)

# Read in search related data
navigations = read_in_dataset('Visual_navigations.csv', verbose=verbose_opt)
search_imp = read_in_dataset('Search_impression.csv', verbose=verbose_opt)

# Preprocess Text

## Preprocess Product Names

In [20]:
# Add Stopword List
# Define List of Stop Words
# Extra tokens to drop on top of NLTK's English list; these appear to be unit and
# dimension abbreviations found in product names — TODO confirm against the catalog.
new_stop_words = ['in', 'sq','ft', 'yd', 'cm', 'mm','gal','lb' ,'lbs','qt','oz', 'h', 'w', 'ii', 'x']

# Stored as a set so duplicates collapse and membership tests are cheap.
stop_words = set(stopwords.words('english') + new_stop_words)

In [21]:
# Convert Product Name to Array
# `.values` takes the column out of the DataFrame as a plain array of raw name strings.
names = prod_desc['Product_name'].values

In [100]:
def preprocess_text(docs, stop_words=stop_words):
    '''
    Lazily turn raw documents into lists of cleaned tokens.

    Each document is tokenized with ``gensim.utils.simple_preprocess``;
    tokens found in ``stop_words`` are dropped, and the survivors are
    lemmatized with NLTK's ``WordNetLemmatizer``. The stop-word check is
    made on the token *before* lemmatization.

    Keyword Arguments:
    ------------------
    * docs - iterable of raw text documents
    * stop_words - collection of tokens to discard (defaults to the
      notebook-level ``stop_words`` as it was when this cell ran)

    Returns:
    --------
    Generator yielding one list of tokens per document

    e.g., clean_docs = list(preprocess_text(docs))
    '''

    lemmatize = WordNetLemmatizer().lemmatize

    for text in docs:
        kept = (word for word in gensim.utils.simple_preprocess(text)
                if word not in stop_words)
        yield [lemmatize(word) for word in kept]
 

In [108]:
# preprocess_text is a generator; list() materializes one token list per product name.
clean_names = list(preprocess_text(names))

In [143]:
def join_tokens(clean_names):
    '''
    Lazily rebuild one space-separated string per tokenized document.

    Keyword Arguments:
    ------------------
    * clean_names - iterable of documents, each an iterable of string tokens

    Returns:
    --------
    Generator of strings (an empty token list yields an empty string)
    '''
    yield from map(' '.join, clean_names)

In [148]:
# Materialize the joined product names as a list of plain strings for the vectorizer cells.
names_corpus = list(join_tokens(clean_names))

## Build Bigram/Trigram Model

In [103]:
# Build the bigram and trigram models
# The bigram detector is trained on the tokenized product names.
bigram = gensim.models.Phrases(clean_names, min_count=25)

# The trigram detector is trained on the bigram-transformed names.
trigram = gensim.models.Phrases(bigram[clean_names], min_count=15) 

# Names as a trigram/bigram
# NOTE(review): the name `trigram_model` is rebound by the `def trigram_model(...)`
# in the following cell, so this Phraser object is lost once that cell runs.
bigram_model = gensim.models.phrases.Phraser(bigram)
trigram_model = gensim.models.phrases.Phraser(trigram)

In [110]:
def trigram_model(corpus, threshholds=(25,15), verbose=False):
    '''
    Detect bigram/trigram phrases in a tokenized corpus and apply them.

    The phrase detectors are trained on ``corpus`` itself. The previous
    version trained on the notebook-global ``clean_names``, so a different
    corpus (e.g. search terms) was silently modelled with product-name
    statistics.

    Keyword Arguments:
    ------------------
    * corpus - iterable of documents, each a list of string tokens
    * threshholds - (bigram_min_count, trigram_min_count), passed as
      ``min_count`` to the two ``gensim.models.Phrases`` passes
    * verbose - when True, print the first five transformed documents

    Returns:
    --------
    List of documents (token lists) after applying the bigram and then the
    trigram model
    '''

    # The corpus is walked several times below; materialize it so one-shot
    # iterables (generators) are not exhausted by the first pass.
    corpus = list(corpus)

    bigram = gensim.models.Phrases(corpus,
                                    min_count=threshholds[0])

    # The second pass sees the bigram-merged tokens, so it can build trigrams.
    trigram = gensim.models.Phrases(bigram[corpus],
                                    min_count=threshholds[1])

    # Local names avoid shadowing this function's own name.
    bigram_phraser = gensim.models.phrases.Phraser(bigram)
    trigram_phraser = gensim.models.phrases.Phraser(trigram)

    corpus_new = [trigram_phraser[bigram_phraser[doc]] for doc in corpus]

    if verbose:
        # Print what is actually returned rather than re-running the models.
        for doc in corpus_new[0:5]:
            print(f'{" ".join(doc) } \n')

    return corpus_new

In [111]:
# Apply phrase detection to the tokenized product names; verbose=True prints the first five results.
names_new = trigram_model(clean_names, threshholds=(25,15), verbose=True)



concrete solid block 

hummingbird stencil 

acrylic clear white dry_erase_board 

clear white boom dry_erase_board 

coconut charcoal 



# Preprocess Search Terms

In [70]:
# Make array
searches = search_imp['Search_term'].values

# Clean Search Terms
# preprocess_text is a generator; list() materializes one token list per search term.
clean_searches = list(preprocess_text(searches))

# Get Status
# NOTE(review): search_cons_status comes from ../modules/helpers and is not shown here —
# presumably it prints the consolidation summary shown below; confirm it accepts token lists.
search_cons_status(clean_searches, searches)

New Number of Searches: 258
Number of Consolidated Searches: 96
Percent Reduction: 27.1%


In [119]:
# tokens
# `preprocess_text` yields token lists, which have no `.split()`. Split only the
# entries that are still plain strings so the cell works with either representation.
search_tokens = [search.split() if isinstance(search, str) else list(search)
                 for search in clean_searches]

In [124]:
# Apply phrase detection to the tokenized search terms; verbose=True prints the first five results.
search_trigrams = trigram_model(search_tokens, threshholds=(25,15), verbose=True)



coffee 

bag chair 

kitchen wall tile 

mirror tile 

entryway 



# Vectorize

In [130]:
def vectorize(corpus):
    '''
    Fit a TF-IDF model on a corpus of text documents.

    Keyword Arguments:
    ------------------
    * corpus - iterable of documents as plain strings

    Returns:
    --------
    (vectorizer, matrix) - the fitted ``TfidfVectorizer`` (built with
    scikit-learn's built-in English stop-word list) and the document-term
    matrix produced by ``fit_transform``
    '''
    tfidf = TfidfVectorizer(stop_words='english')
    return tfidf, tfidf.fit_transform(corpus)

In [131]:
# TfidfVectorizer expects one string per document, while `clean_searches` holds token
# lists; join them first (entries that are already strings pass through unchanged).
search_vec, search_matrix = vectorize(
    [doc if isinstance(doc, str) else ' '.join(doc) for doc in clean_searches]
)

In [133]:
# Display the sparse matrix summary (shape and number of stored elements).
search_matrix

<354x199 sparse matrix of type '<class 'numpy.float64'>'
	with 698 stored elements in Compressed Sparse Row format>

In [150]:
# Fit a separate TF-IDF model on the joined product-name strings.
prod_vec, prod_matrix = vectorize(names_corpus)

# Text Processor

In [169]:
class TextPreprocessor:
    '''
    Text pipeline: tokenize and lemmatize, detect phrases, TF-IDF vectorize.

    Keyword Arguments:
    ------------------
    * stop_words - collection of tokens to drop during ``preprocess``
    '''

    def __init__(self, stop_words):
        self.stop_words = stop_words


    def preprocess(self, corpus):
        '''
        Process documents

        Keyword Arguments:
        ------------------
        * corpus - iterable of raw text documents

        Returns:
        --------
        Each document as a list of tokens (iterator)

        e.g., clean_corpus = list(preprocess(corpus))
        '''

        lemmatizer = WordNetLemmatizer()

        for doc in corpus:
            tokens = gensim.utils.simple_preprocess(doc, deacc=True)
            # Filter with the instance's own stop words. The previous version read
            # the notebook-global `stop_words` and ignored the constructor argument.
            yield [lemmatizer.lemmatize(token) for token in tokens
                   if token not in self.stop_words]


    def trigram_model(self, corpus_tokens, threshholds=(25,15), verbose=False):
        '''
        Build trigram model

        Keyword Arguments:
        ------------------
        * corpus_tokens - iterable of documents (as lists of tokens)
        * threshholds - (bigram_min_count, trigram_min_count), passed as
          ``min_count`` to the two ``gensim.models.Phrases`` passes
        * verbose - when True, print the first five transformed documents

        Returns:
        --------
        List of documents (token lists) after applying the bigram and then
        the trigram model
        '''

        # Iterated several times below; materialize so generators are not exhausted.
        corpus_tokens = list(corpus_tokens)

        bigram = gensim.models.Phrases(corpus_tokens,
                                        min_count=threshholds[0])

        trigram = gensim.models.Phrases(bigram[corpus_tokens],
                                        min_count=threshholds[1])

        # trigram/bigram model (local names avoid shadowing this method's name)
        bigram_phraser = gensim.models.phrases.Phraser(bigram)
        trigram_phraser = gensim.models.phrases.Phraser(trigram)

        corpus_new = [trigram_phraser[bigram_phraser[doc]] for doc in corpus_tokens]

        if verbose:
            # Print the returned documents as-is. Re-applying the models to
            # already-merged tokens could show phrases absent from the result.
            for doc in corpus_new[0:5]:
                print(f'{" ".join(doc) } \n')

        return corpus_new

    def vectorize(self, corpus, format='tokens'):
        '''
        Fit a TF-IDF model on the corpus.

        Keyword Arguments:
        ------------------
        * corpus - documents, either as token lists or as plain strings
        * format - 'tokens' (default) joins each token list into a string
          first; any other value passes ``corpus`` through unchanged

        Returns:
        --------
        (vectorizer, matrix) - the fitted ``TfidfVectorizer`` and the
        document-term matrix from ``fit_transform``
        '''

        if format == 'tokens':
            corpus = self.tokens2corpus(corpus)

        vectorizer = TfidfVectorizer(stop_words='english')
        matrix = vectorizer.fit_transform(corpus)

        return vectorizer, matrix


    def tokens2corpus(self, tokenized_docs):
        '''
        Helper function to convert from tokens to text

        Yields one space-joined string per tokenized document.
        '''
        for doc in tokenized_docs:
            yield ' '.join(doc)

## Testing Preprocessor

In [170]:
# Alias for the class itself (not an instance); it is instantiated in the next cell.
tp = TextPreprocessor

In [171]:
# Build a preprocessor instance configured with the notebook's stop-word set.
search = tp(stop_words)

In [172]:
# Rebinds `clean_searches` (first assigned in the "Preprocess Search Terms" section) to the class-based result.
clean_searches = list(search.preprocess(searches))

In [173]:
# Phrase detection with the default threshholds=(25,15); verbose=True prints a sample of the result.
searches_trigram = search.trigram_model(clean_searches, verbose=True)

coffee 

bag chair 

kitchen wall tile 

mirror tile 

entryway 





In [174]:
# The default format='tokens' joins each token list into a string before TF-IDF fitting.
searches_vec, searches_matrix = search.vectorize(searches_trigram)

# Test

**To Explore**
- Language detection 
- Snowball stemmer vs Lemmatizer 

# Archive

## Rec

In [127]:
#Make names df
# NOTE(review): `prod_desc_cat` is not defined in the visible part of this notebook —
# confirm where it comes from before re-running this archived cell.
names_df= prod_desc_cat.loc[:, ['SKU_ID','Product_name']]
names_df.set_index('Product_name', inplace=True)
# Attach the token lists, then collapse each list into one space-separated string.
names_df['clean_names'] = clean_names
names_df['clean_names'] = names_df['clean_names'].str.join(' ')


#Make queries df
# NOTE(review): `search_prod_levels` and `clean_queries` are not defined in the visible
# part of this notebook — confirm their source and that their lengths match.
queries_df = search_prod_levels.loc[:, ['Search_term']]
queries_df['clean_queries'] = clean_queries
# Keep one row per distinct search term before indexing by it.
queries_df = queries_df.drop_duplicates('Search_term')
queries_df.set_index('Search_term', inplace=True)
queries_df['clean_queries'] = queries_df['clean_queries'].str.join(' ')

In [128]:
# Fit ONE vocabulary (on the product names) and project the queries into it with
# `transform`. Calling `fit_transform` a second time would refit the vectorizer, leaving
# the two matrices with unrelated columns that cannot be compared.
# `min_df=1` matches the old `min_df=0` (every term occurs in at least one document) and
# `stop_words` is passed as a list, both of which current scikit-learn validation requires.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=1, stop_words=sorted(stop_words))
tfidf_matrix_names = tf.fit_transform(names_df['clean_names'])
tfidf_matrix_queries = tf.transform(queries_df['clean_queries'])

In [None]:
#Cosine similarity
# Product-to-product similarity: the names matrix is compared with itself.
# linear_kernel is a plain dot product; it equals cosine similarity here only if the
# TF-IDF rows are L2-normalized — presumably true with TfidfVectorizer's default norm; confirm.
cosine_similarities = linear_kernel(tfidf_matrix_names, tfidf_matrix_names)

In [None]:
# Positional lookup: row position -> product name (names_df is indexed by Product_name).
indices = pd.Series(names_df.index)

In [None]:
def recommendations(name, cosine_similarities = cosine_similarities):
    '''
    Return the names of the products most similar to ``name``.

    Keyword Arguments:
    ------------------
    * name - product name to look up in the notebook-level ``indices`` Series
    * cosine_similarities - square product-by-product similarity matrix whose
      row order matches ``indices``

    Returns:
    --------
    List of up to 10 product names, most similar first, never including the
    queried product's own row

    Raises:
    -------
    IndexError if ``name`` is not present in ``indices``
    '''

    # getting the row position of the product that matches the name
    matches = indices[indices == name].index
    if len(matches) == 0:
        raise IndexError(f'Product name not found: {name!r}')
    idx = matches[0]

    # creating a Series with the similarity scores in descending order
    score_series = pd.Series(cosine_similarities[idx]).sort_values(ascending = False)

    # Drop the product's own row explicitly instead of assuming it sorts first:
    # rows with identical text tie at the top score and may precede it.
    top_10_indexes = list(score_series.drop(idx).iloc[:10].index)

    # map the row positions back to product names
    recommended_products = [indices.iloc[i] for i in top_10_indexes]

    return recommended_products
        

In [None]:
# NOTE(review): smoke-test call with an empty product name — presumably a placeholder;
# replace it with a real `Product_name` value before relying on the output.
recommendations('')

## Create Preprocessor Object

In [None]:
class preprocessor:
    """Skeleton fit/transform preprocessor.

    Parameters
    ----------
    cols_to_filter : optional
        Stored on the instance; not used by ``fit`` or ``transform`` yet.
    """

    def __init__(self, cols_to_filter=None):

        self.cols_to_filter = cols_to_filter

    def fit(self, X, y=None):
        """learn any information from the training data we may need to transform the test data"""

        # learn from the training data and return the class itself.
        # allows you to chain fit and predict methods like

        # > p = preprocessor()
        # > p.fit(X).transform(X)

        return self

    def transform(self, X, y=None):
        """transform the training or test data"""
        # transform the training or test data based on class attributes learned in the `fit` step
        # No transformation is implemented yet. Pass the input through unchanged so the
        # fit/transform chain runs; the previous version returned an undefined name.
        # TODO: apply `self.cols_to_filter` once its intended meaning (keep vs. drop) is decided.
        X_new = X
        return X_new

## Define Imputation Strategy

## Encoding Categorical Features