# The Home Depot Decor Case

Getting Started  |  Data Prep  |  Data Exploration  |  **Preprocessing**  |  Model Tuning  |  Final Model

In [315]:
import pandas as pd
import numpy as np

import gensim

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from nltk.stem.snowball import SnowballStemmer

from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

#from langdetect import detect
#from langdetect.lang_detect_exception import LangDetectException

%matplotlib inline

In [130]:
import sys
import inspect
sys.path.insert(0, '../modules')


# now read in new functions
from helpers import read_in_dataset, get_num_of_levels, flatten_categories, search_cons_status

**Read in Data**

In [211]:
# Load every raw CSV used below through the shared `read_in_dataset` helper
verbose_opt = False

# Product-side tables: catalog, names/descriptions, engagement
catalog, prod_desc, prod_engagement = [
    read_in_dataset(csv_name, verbose=verbose_opt)
    for csv_name in ('Decor_catalog.csv',
                     'Product_name_description.csv',
                     'Product_engagement.csv')
]

# Search-side tables: visual navigations and search impressions
navigations, search_imp = [
    read_in_dataset(csv_name, verbose=verbose_opt)
    for csv_name in ('Visual_navigations.csv', 'Search_impression.csv')
]

In [213]:
# Preview the first rows of the search-impression table
search_imp.head()

Unnamed: 0,Search_term,Impression
0,coffee,203054703;207061099;305561354;305561469;301692...
1,bag chair,305573411;305608772;301092388;301092383;301092...
2,kitchen wall tile,205140711;302603437;205762409;204923728;204337...
3,mirror tile,305696621;304142073;304142126;304142039;303058...
4,entryway,203532713;203532652;300750153;302042988;302042...


# Build Text Preprocessor Class

In [453]:
class TextPreprocessor:
    '''
    Text-cleaning helper for search terms and product text.

    Holds optional references to the search-impression, product-description
    and catalog dataframes, plus the stop-word list applied by
    `preprocess_text`.
    '''

    def __init__(self, search_imp=None, prod_desc=None, catalog=None):
        self.search_imp = search_imp
        self.prod_desc = prod_desc
        self.catalog = catalog

        # Extra stop words applied on top of NLTK's English list
        self.stop_words_lst = ['in', 'sq','ft', 'yd', 'cm', 'mm','gal','lb' ,'lbs','qt','oz', 'h', 'w', 'ii', 'x']
        # Build from the instance attribute: the previous code read a module-level
        # `new_stop_words`, which does not exist on a fresh kernel.
        self.stop_words = list(set(stopwords.words('english') + self.stop_words_lst))

    def read_in_dataset(self, data, data_folder='raw', verbose=False):
        '''
        Read in dataset (csv format) to pandas dataframe

        Keyword Arguments:
        ------------------
        * data - string with the dataset filename WITHOUT the extension
                 ('.csv' is appended when the path is built)
        * data_folder - string with either raw or processed
        * verbose - True will print information about the dataset

        Returns:
        --------
        a pandas dataframe
        '''
        df = pd.read_csv('../data/{}/{}.csv'.format(data_folder, data))

        if verbose:
            print('{}: {} rows x {} columns'.format(data, *df.shape))

        return df

    def clean_text(self, df, corpus_col, method='lemmatizer'):
        '''
        Clean one text column of a dataframe.

        Keyword Arguments:
        ------------------
        * df - pandas dataframe holding the text
        * corpus_col - name of the column to clean
        * method - 'lemmatizer' (default) or 'stemmer', forwarded to `preprocess_text`

        Returns:
        --------
        list of cleaned documents (one space-joined string per row)
        '''
        corpus = df.loc[:, corpus_col].tolist()

        return list(self.preprocess_text(corpus, method=method))

    def preprocess_text(self, docs, method='lemmatizer'):
        '''
        Tokenize, drop stop words and reduce each token to its root form.

        Keyword Arguments:
        ------------------
        * docs - iterable of raw document strings
        * method - 'lemmatizer' (WordNet lemmatizer, default) or
                   'stemmer' (English Snowball stemmer)

        Returns:
        --------
        generator yielding each cleaned document as a space-joined string

        Raises:
        -------
        ValueError if `method` is not recognised (raised when iteration starts,
        because this is a generator)
        '''
        if method == 'lemmatizer':
            root = nltk.stem.WordNetLemmatizer().lemmatize
        elif method == 'stemmer':
            root = nltk.stem.snowball.SnowballStemmer("english").stem
        else:
            raise ValueError(
                "method must be 'lemmatizer' or 'stemmer', got {!r}".format(method))

        # Set membership is O(1); self.stop_words is kept as a list for add_stopword
        stop_words = set(self.stop_words)

        for doc in docs:
            tokens = gensim.utils.simple_preprocess(doc)
            yield ' '.join(root(token) for token in tokens if token not in stop_words)

    @staticmethod
    def doc2tokens(corpus):
        '''Split each document of a pandas Series of strings on whitespace.'''
        return corpus.str.split()

    def compare_clean_searches(self, clean_searches, raw_search):
        '''
        Compare cleaned search queries to raw text

        Keyword Arguments:
        ------------------
        * clean_searches - cleaned search terms, one per row of `raw_search`
        * raw_search - dataframe with a 'Search_term' column

        Returns:
        --------
        pandas Series indexed by cleaned search term whose values are the
        lists of raw search terms that map to it
        '''
        st_compare = pd.DataFrame({'raw_search': raw_search['Search_term'], 'cleaned_search':clean_searches}).sort_values(by='raw_search')
        return st_compare.groupby('cleaned_search')['raw_search'].apply(list)

    def add_stopword(self, new_stopword):
        '''Add `new_stopword` to the stop-word list (no-op if already present).'''
        if new_stopword not in self.stop_words:
            self.stop_words.append(new_stopword)

# Search Term Consolidation

## Preprocess Search Terms

In [None]:
def preprocess_text(docs, method='lemmatizer', stop_words=None):
    '''
    Tokenize, drop stop words and reduce each token to its root form.

    Keyword Arguments:
    ------------------
    * docs - pandas Series of strings, or any other iterable of strings
    * method - 'lemmatizer' (WordNet lemmatizer, default) or
               'stemmer' (English Snowball stemmer)
    * stop_words - optional collection of tokens to drop; defaults to the
                   original hard-coded ['I']

    Returns:
    --------
    generator yielding each cleaned document as a space-joined string

    Raises:
    -------
    ValueError if `method` is not recognised (raised when iteration starts,
    because this is a generator)
    '''
    # Validate first: previously an unknown method left `root` unbound and
    # surfaced as an UnboundLocalError inside the loop.
    if method not in ('lemmatizer', 'stemmer'):
        raise ValueError(
            "method must be 'lemmatizer' or 'stemmer', got {!r}".format(method))

    # Unwrap pandas objects as before, but let plain lists/arrays pass through
    values = getattr(docs, 'values', None)
    if values is not None and not callable(values):
        docs = values

    # NOTE(review): simple_preprocess is documented to return lowercase tokens,
    # so the default 'I' entry probably never matches - confirm and pass a real
    # stop-word collection via `stop_words` if filtering is wanted.
    stop_words = ['I'] if stop_words is None else stop_words

    if method == 'lemmatizer':
        root = nltk.stem.WordNetLemmatizer().lemmatize
    else:
        root = nltk.stem.snowball.SnowballStemmer("english").stem

    for doc in docs:
        tokens = gensim.utils.simple_preprocess(doc)
        yield ' '.join(root(token) for token in tokens if token not in stop_words)
        
        
def doc2tokens(corpus):
    '''
    Split each document of a pandas Series into whitespace-separated tokens.

    Keyword Arguments:
    ------------------
    * corpus - pandas Series of strings

    Returns:
    --------
    pandas Series of token lists
    '''
    # The return statement was indented deeper than the docstring, which made
    # the whole cell fail with an IndentationError.
    return corpus.str.split()
    
    
def cons_search_terms(clean_searches=None, raw_search=None):
    '''
    Compare cleaned search queries to raw text

    Keyword Arguments:
    ------------------
    * clean_searches - cleaned search terms, one per row of `raw_search`;
                       defaults to the notebook-level `clean_searches`
    * raw_search - dataframe with a 'Search_term' column;
                   defaults to the notebook-level `search_imp`

    Returns:
    --------
    pandas Series indexed by cleaned search term whose values are the lists
    of raw search terms that were consolidated into it
    '''
    # Resolve defaults at call time. Binding them in the signature evaluated
    # `clean_searches` while the function was being defined, before the cell
    # that creates it has run, and froze whatever objects existed at that moment.
    if clean_searches is None:
        clean_searches = _notebook_global('clean_searches')
    if raw_search is None:
        raw_search = _notebook_global('search_imp')

    st_compare = pd.DataFrame({'raw_search': raw_search['Search_term'], 'cleaned_search':clean_searches}).sort_values(by='raw_search')
    return st_compare.groupby('cleaned_search')['raw_search'].apply(list)


def _notebook_global(name):
    '''Fetch a notebook-level variable by name, with a clear error if it is missing.'''
    try:
        return globals()[name]
    except KeyError:
        raise NameError(
            "'{}' is not defined yet; run the cell that creates it or pass it explicitly".format(name))

In [410]:
# Load the raw search-impression table
search_imp = read_in_dataset('Search_impression.csv', verbose=verbose_opt)

# Normalise every raw search term and collect the cleaned strings into a Series
clean_searches = pd.Series([*preprocess_text(search_imp['Search_term'])])

# One token list per cleaned search term
search_tokens = doc2tokens(clean_searches)

In [None]:
#ngram model
#search_trigrams = trigram_model(search_tokens, threshholds=(25,15), verbose=True)

In [422]:
# Compare clean searches to raw searches
# Called without arguments, so it relies on the function's defaults
# (the notebook-level cleaned terms and search-impression table).

cons_search_terms().head()

cleaned_search
accent table                [accent table, accent tables]
adhesive backsplash                 [adhesive backsplash]
adhesive tile backsplash       [adhesive tile backsplash]
arm chair                                     [arm chair]
armchair                                       [armchair]
Name: raw_search, dtype: object

# End

In [135]:
# Explore Search Term Similarity through collaborative filtering

# Build the comparison frame with `assign`, which returns a NEW dataframe.
# The previous `search_compare_df = search_imp` only aliased the raw frame, so
# adding/rewriting columns mutated `search_imp` and made this cell
# non-idempotent on re-run.
#
# NOTE(review): the original referenced `clean_searches_series`, which no cell
# in this notebook defines; `clean_searches` (the Series of cleaned terms built
# from search_imp['Search_term']) is assumed to be the intended value - confirm.
search_compare_df = search_imp.assign(
    # add cleaned text (aligned on the row index)
    cleaned_search=clean_searches,
    # split Impressions: turn the ';'-separated ID list into space-separated tokens
    Impression=search_imp['Impression'].str.replace(';', ' ', regex=False),
)

In [138]:
#groupby cleaned search
# Join the per-row ID strings with a space. The previous `apply(sum)`
# concatenated them with no separator, so the last ID of one raw search and the
# first ID of the next were fused into a single bogus token.
search_imp_clean = (
    search_compare_df
    .groupby('cleaned_search')['Impression']
    .agg(' '.join)
    .to_frame()
)

search_imp_clean.head()

Unnamed: 0_level_0,Impression
cleaned_search,Unnamed: 1_level_1
accent table,302192857 303444081 302267969 301285388 305852...
adhesive backsplash,202541460 206876241 204860117 301094937 207210...
adhesive tile backsplash,203601385 204208646 202823734 203601363 206110...
arm chair,205409569 207020841 206374902 304059401 302765...
armchair,205409569 206374902 207020841 203195567 205181...


In [79]:
#search_imp_clean_set = search_imp_clean['Impression'].apply(set)

#search_imp_clean_set.head()

cleaned_search
accent table                {302768792, 301239142, 306775214, 301654365, 2...
adhesive backsplash         {206885906, 301094798, 303617520, 302991748, 2...
adhesive tile backsplash    {205972972, 302681744, 203601363, 100521695, 2...
arm chair                   {206890535, 207020841, 306053407, 205786340, 3...
armchair                    {302796853, 207020841, 204083379, 205786340, 2...
Name: Impression, dtype: object

In [158]:
# One "document" per cleaned search term: its impression string, as a plain list
imp_docs = search_imp_clean['Impression'].tolist()

In [160]:
#Count vec

def dummy(doc):
    '''Identity function: return `doc` unchanged (not passed to the vectorizer below).'''
    return doc

# binary=True: every non-zero count is stored as 1 (presence/absence)
vec_imp = CountVectorizer(binary=True)
# Learn the vocabulary from imp_docs and build the document-term matrix in one step
imp_matrix = vec_imp.fit_transform(imp_docs)

In [176]:
# TFIDF Vectorizer
# binary=True makes the term-frequency part 0/1; the resulting weights are
# still scaled by idf and normalised, so the matrix is not itself binary.

tfvec_imp = TfidfVectorizer(binary=True)
tfimp_matrix = tfvec_imp.fit_transform(imp_docs)

In [161]:
# Dense preview of the count matrix: rows = cleaned search terms, columns = vocabulary tokens
# NOTE(review): get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2
# (replaced by get_feature_names_out()) - confirm the scikit-learn version this notebook targets.
pd.DataFrame(imp_matrix.toarray(), index=search_imp_clean.index, columns=vec_imp.get_feature_names()).head()

Unnamed: 0_level_0,100012014,100022800,100023109,100023973,100036137,100044505,100048075,100051570,100061089,100074869,...,307717049,307717219,307717221,307727052,307833295,307920434,307939445,307939707,307940057,307940314
cleaned_search,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
accent table,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
adhesive backsplash,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
adhesive tile backsplash,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
arm chair,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
armchair,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [177]:
# Dense preview of the TF-IDF matrix: rows = cleaned search terms, columns = vocabulary tokens
# NOTE(review): get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2
# (replaced by get_feature_names_out()) - confirm the scikit-learn version this notebook targets.
pd.DataFrame(tfimp_matrix.toarray(), index=search_imp_clean.index, columns=tfvec_imp.get_feature_names()).head()

Unnamed: 0_level_0,100012014,100022800,100023109,100023973,100036137,100044505,100048075,100051570,100061089,100074869,...,307717049,307717219,307717221,307727052,307833295,307920434,307939445,307939707,307940057,307940314
cleaned_search,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
accent table,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
adhesive backsplash,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
adhesive tile backsplash,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
arm chair,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
armchair,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [178]:
# Pairwise cosine similarity between the rows of each matrix
# (one square matrix for the count features, one for the TF-IDF features)
term_sim = cosine_similarity(imp_matrix,imp_matrix)
term_sim_tf = cosine_similarity(tfimp_matrix,tfimp_matrix)

In [192]:
# Lookup table: cleaned search term -> its row position in search_imp_clean

term2id = {
    cleaned_term: row_pos
    for row_pos, cleaned_term in enumerate(search_imp_clean.index)
}

In [210]:
def topn_terms(term, n=1):
    '''
    Rank search terms by similarity to `term`.

    Keyword Arguments:
    ------------------
    * term - cleaned search term; must be a key of `term2id`
    * n - number of top-ranked entries to return from each ranking

    Returns:
    --------
    tuple (count-based ranking, TF-IDF-based ranking); each is a pandas Series
    of the n highest similarities. `term` itself is not excluded from the ranking.
    '''
    position = term2id[term]

    # Echo the label as stored in the index
    label = search_imp_clean.index[position]
    print(f"Search Term: {label}")

    def _ranked(similarity):
        # Label the square similarity matrix, then take the column for `term`
        labelled = pd.DataFrame(similarity,
                                index=search_imp_clean.index,
                                columns=search_imp_clean.index)
        column = labelled.iloc[:, position]
        return column.sort_values(ascending=False).head(n)

    return _ranked(term_sim), _ranked(term_sim_tf)

In [209]:
# word2vec

# Load Google's pre-trained Word2Vec model from the local binary word2vec file.
# NOTE(review): `model` is not referenced by any other cell visible in this
# notebook - confirm it is still needed before paying the load cost.
model = gensim.models.KeyedVectors.load_word2vec_format('../model/embedding/GoogleNews-vectors-negative300.bin', binary=True)  

## Vectorize

In [83]:
def vectorize(corpus):
    '''
    Fit a TF-IDF model on `corpus`.

    Keyword Arguments:
    ------------------
    * corpus - iterable of document strings

    Returns:
    --------
    tuple (fitted TfidfVectorizer, document-term matrix)
    '''
    tfidf = TfidfVectorizer(stop_words='english')
    return tfidf, tfidf.fit_transform(corpus)

In [131]:
# TF-IDF vectorizer and matrix for the cleaned search terms
search_vec, search_matrix = vectorize(clean_searches)

In [133]:
# Display the sparse matrix summary (shape and number of stored elements)
search_matrix

<354x199 sparse matrix of type '<class 'numpy.float64'>'
	with 698 stored elements in Compressed Sparse Row format>

In [150]:
# TF-IDF vectorizer and matrix for the product-name corpus
# NOTE(review): `names_corpus` is only assigned in a cell further down this
# notebook (Archive section), so this cell fails on a fresh top-to-bottom run.
prod_vec, prod_matrix = vectorize(names_corpus)

# Text Processor

In [169]:
class TextPreprocessor:
    '''
    Tokenize, phrase-merge and TF-IDF-vectorize a text corpus.

    Keyword Arguments:
    ------------------
    * stop_words - collection of tokens dropped by `preprocess`
    '''

    def __init__(self, stop_words):
        self.stop_words = stop_words

    def preprocess(self, corpus):
        '''
        Process documents

        Keyword Arguments:
        ------------------
        * corpus - list of documents

        Returns:
        --------
        Each document as a list of tokens (iterator)

        e.g., clean_corpus = list(preprocess(corpus))
        '''

        lemmatizer = WordNetLemmatizer()

        for doc in corpus:
            tokens = gensim.utils.simple_preprocess(doc, deacc=True)
            # Filter on the instance's stop words; the previous code read a
            # module-level `stop_words` and ignored the constructor argument.
            yield [lemmatizer.lemmatize(token) for token in tokens if token not in self.stop_words]

    def trigram_model(self, corpus_tokens, threshholds=(25,15), verbose=False):
        '''
        Build trigram model

        Keyword Arguments:
        ------------------
        * corpus_tokens - list of documents (as tokens); it is iterated several
                          times, so it must not be a one-shot generator
        * threshholds - (bigram min_count, trigram min_count)
        * verbose - True prints the first five phrase-merged documents

        Returns:
        --------
        list of documents (as tokens) after applying the bigram and then the
        trigram phrase models
        '''

        bigram = gensim.models.Phrases(corpus_tokens,
                                        min_count=threshholds[0])

        trigram = gensim.models.Phrases(bigram[corpus_tokens],
                                        min_count=threshholds[1])

        # trigram/bigram model (locals renamed so they do not shadow this method's name)
        bigram_phraser = gensim.models.phrases.Phraser(bigram)
        trigram_phraser = gensim.models.phrases.Phraser(trigram)

        corpus_new = [trigram_phraser[bigram_phraser[doc]] for doc in corpus_tokens]

        if verbose:
            # Print the merged docs as they are; re-applying the phrase models
            # to already-merged documents could show different output than is returned.
            for doc in corpus_new[0:5]:
                print(f'{" ".join(doc)} \n')

        return corpus_new

    def vectorize(self, corpus, format='tokens'):
        '''
        Fit a TF-IDF model on the corpus.

        Keyword Arguments:
        ------------------
        * corpus - documents, either as token lists or as strings
        * format - 'tokens' (default) joins each token list into a string
                   first; any other value uses `corpus` as given

        Returns:
        --------
        tuple (fitted TfidfVectorizer, document-term matrix)
        '''

        if format == 'tokens':
            corpus = self.tokens2corpus(corpus)

        vectorizer = TfidfVectorizer(stop_words='english')
        matrix = vectorizer.fit_transform(corpus)

        return vectorizer, matrix


    def tokens2corpus(self, tokenized_docs):
        '''
        Helper function to convert from tokens to text

        Yields one space-joined string per tokenized document.
        '''
        for doc in tokenized_docs:
            yield ' '.join(doc)

## Testing Preprocessor

In [170]:
# Alias for the class itself (no instance is created here)
tp = TextPreprocessor

In [171]:
# Instantiate the preprocessor with the stop-word collection
# NOTE(review): `stop_words` is first assigned at notebook level in a cell
# further down (Archive section) - this cell depends on out-of-order execution.
search = tp(stop_words)

In [172]:
# Materialise the preprocess generator; this rebinds `clean_searches`, which
# an earlier cell assigned as a pandas Series of cleaned strings.
# NOTE(review): `searches` is not defined in any cell visible in this notebook.
clean_searches = list(search.preprocess(searches))

In [173]:
# Apply bigram/trigram phrase detection; verbose=True prints the first five documents
searches_trigram = search.trigram_model(clean_searches, verbose=True)

coffee 

bag chair 

kitchen wall tile 

mirror tile 

entryway 





In [174]:
# TF-IDF on the phrase-merged token lists (the default format='tokens' joins each list first)
searches_vec, searches_matrix = search.vectorize(searches_trigram)

# Test

**To Explore**
- Language detection 
- Snowball stemmer vs Lemmatizer 

# Archive

## Build the bigram and trigram models

In [None]:
def trigram_model(corpus, threshholds=(25,15), verbose=False):
    '''
    Build trigram model

    Keyword Arguments:
    ------------------
    * corpus - list of documents (as tokens); it is iterated several times,
               so it must not be a one-shot generator
    * threshholds - (bigram min_count, trigram min_count)
    * verbose - True prints the first five phrase-merged documents

    Returns:
    --------
    list of documents (as tokens) after applying the bigram and then the
    trigram phrase models
    '''

    # Train on the `corpus` argument. The previous code trained on the
    # notebook-level `clean_names`, silently ignoring whatever was passed in.
    bigram = gensim.models.Phrases(corpus,
                                    min_count=threshholds[0])

    trigram = gensim.models.Phrases(bigram[corpus],
                                    min_count=threshholds[1])

    # trigram/bigram model (locals renamed so they do not shadow this function's name)
    bigram_phraser = gensim.models.phrases.Phraser(bigram)
    trigram_phraser = gensim.models.phrases.Phraser(trigram)

    corpus_new = [trigram_phraser[bigram_phraser[doc]] for doc in corpus]

    if verbose:
        # Same text as transforming corpus[0:5] again, without redoing the work
        for doc in corpus_new[0:5]:
            print(f'{" ".join(doc)} \n')

    return corpus_new

In [None]:
# Build the bigram and trigram models
# Pass 1: detect two-token phrases in the cleaned product names
bigram = gensim.models.Phrases(clean_names, min_count=25)

# Pass 2: detect phrases again on the bigram-transformed corpus
trigram = gensim.models.Phrases(bigram[clean_names], min_count=15) 

# Names as a trigram/bigram
# NOTE: assigning `trigram_model` here rebinds the name of the trigram_model()
# function defined in the previous cell.
bigram_model = gensim.models.phrases.Phraser(bigram)
trigram_model = gensim.models.phrases.Phraser(trigram)

## Preprocess Names

In [None]:
# Define the stop-word collection: NLTK's English list plus extra tokens
new_stop_words = ['in', 'sq','ft', 'yd', 'cm', 'mm','gal','lb' ,'lbs','qt','oz', 'h', 'w', 'ii', 'x']
stop_words = set(stopwords.words('english') + new_stop_words)

# Convert Product Name to Array
names = prod_desc['Product_name'].values

# NOTE(review): the notebook-level preprocess_text() defined earlier starts with
# `docs = docs.values`; `names` is already the `.values` array here, so that
# attribute access looks like it would fail - confirm which preprocess_text
# version this cell was written against.
clean_names = list(preprocess_text(names))

# NOTE(review): `join_tokens` is not defined in any cell visible in this notebook.
names_corpus = list(join_tokens(clean_names))

## Text Preprocessor Class

In [399]:
class TextPreprocessor:
    '''
    Text-cleaning helper for search terms and product text (archived version).

    Holds optional references to the search-impression, product-description
    and catalog dataframes, plus the stop-word list applied by
    `preprocess_text`.
    '''

    def __init__(self, search_imp=None, prod_desc=None, catalog=None):
        self.search_imp = search_imp
        self.prod_desc = prod_desc
        self.catalog = catalog

        # Extra stop words applied on top of NLTK's English list
        self.stop_words_lst = ['in', 'sq','ft', 'yd', 'cm', 'mm','gal','lb' ,'lbs','qt','oz', 'h', 'w', 'ii', 'x']
        # Build from the instance attribute rather than a module-level
        # `new_stop_words`, so construction works on a fresh kernel.
        self.stop_words = list(set(stopwords.words('english') + self.stop_words_lst))

    def read_in_dataset(self, data, data_folder='raw', verbose=False):
        '''
        Read in dataset (csv format) to pandas dataframe

        Keyword Arguments:
        ------------------
        * data - string with the dataset filename WITHOUT the extension
                 ('.csv' is appended when the path is built)
        * data_folder - string with either raw or processed
        * verbose - True will print information about the dataset

        Returns:
        --------
        a pandas dataframe
        '''
        df = pd.read_csv('../data/{}/{}.csv'.format(data_folder, data))

        if verbose:
            print('{}: {} rows x {} columns'.format(data, *df.shape))

        return df

    def clean_text(self, data, corpus_col):
        '''
        Clean one text column of a dataframe.

        Keyword Arguments:
        ------------------
        * data - pandas dataframe holding the text
        * corpus_col - name of the column to clean

        Returns:
        --------
        list of cleaned documents (one space-joined string per row)
        '''
        corpus = data.loc[:, corpus_col].tolist()
        clean_corpus = list(self.preprocess_text(corpus))

        return clean_corpus

    def preprocess_text(self, docs, method='lemmatizer'):
        '''
        Tokenize, drop stop words and reduce each token to its root form.

        Keyword Arguments:
        ------------------
        * docs - iterable of raw document strings
        * method - 'lemmatizer' (WordNet lemmatizer, default) or
                   'stemmer' (English Snowball stemmer)

        Returns:
        --------
        generator yielding each cleaned document as a space-joined string

        Raises:
        -------
        ValueError if `method` is not recognised (raised when iteration starts,
        because this is a generator)
        '''
        # `self` was missing from the signature: clean_text's call bound the
        # instance to `docs`, and the body's use of self.stop_words could not resolve.
        if method == 'lemmatizer':
            root = nltk.stem.WordNetLemmatizer().lemmatize
        elif method == 'stemmer':
            root = nltk.stem.snowball.SnowballStemmer("english").stem
        else:
            raise ValueError(
                "method must be 'lemmatizer' or 'stemmer', got {!r}".format(method))

        # Set membership is O(1); self.stop_words is kept as a list for add_stopword
        stop_words = set(self.stop_words)

        for doc in docs:
            tokens = gensim.utils.simple_preprocess(doc)
            yield ' '.join(root(token) for token in tokens if token not in stop_words)

    @staticmethod
    def doc2tokens(corpus):
        '''Split each document of a pandas Series of strings on whitespace.'''
        return corpus.str.split()

    def add_stopword(self, new_stopword):
        '''Add `new_stopword` to the stop-word list (no-op if already present).'''
        if new_stopword not in self.stop_words:
            self.stop_words.append(new_stopword)
    

## Preprocess Product Names

## Rec

In [127]:
# Product names: indexed by the raw name, with the cleaned tokens attached
names_df = (
    prod_desc_cat
    .loc[:, ['SKU_ID', 'Product_name']]
    .set_index('Product_name')
    .assign(clean_names=clean_names)
)
# Collapse each token list into one space-separated string
names_df['clean_names'] = names_df['clean_names'].str.join(' ')


# Search queries: attach the cleaned tokens, keep one row per raw search term,
# then index by that term
queries_df = (
    search_prod_levels
    .loc[:, ['Search_term']]
    .assign(clean_queries=clean_queries)
    .drop_duplicates('Search_term')
    .set_index('Search_term')
)
# Collapse each token list into one space-separated string
queries_df['clean_queries'] = queries_df['clean_queries'].str.join(' ')

In [128]:
# One shared TF-IDF vocabulary, learned from the product names.
# - stop_words is passed as a list: scikit-learn documents 'english', a list or
#   None, and recent releases reject a set.
# - min_df=1 keeps every term, exactly like the former min_df=0 (a term in the
#   vocabulary always occurs in at least one document), and stays within the
#   range recent scikit-learn releases accept.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=1, stop_words=sorted(stop_words))
tfidf_matrix_names = tf.fit_transform(names_df['clean_names'])

# Project the queries into the SAME feature space. Calling fit_transform again
# re-learned the vocabulary from the queries, so the two matrices had unrelated
# columns and `tf` no longer matched tfidf_matrix_names.
tfidf_matrix_queries = tf.transform(queries_df['clean_queries'])

In [None]:
#Cosine similarity between every pair of product names
# `linear_kernel` was never imported in this notebook; `cosine_similarity` is
# already imported at the top. TfidfVectorizer L2-normalises rows by default,
# so both give the same values here.
cosine_similarities = cosine_similarity(tfidf_matrix_names, tfidf_matrix_names)

In [None]:
# Positional lookup: row number -> product name (names_df is indexed by Product_name)
indices = pd.Series(names_df.index)

In [None]:
# NOTE(review): `recommendations` is not defined in any cell visible in this
# notebook, and it is called with an empty string - confirm before running.
recommendations('')

## Create Preprocessor Object

In [None]:
class preprocessor:
    """
    Skeleton fit/transform preprocessor.

    Parameters
    ----------
    cols_to_filter : optional
        Stored on the instance; not used by `fit` or `transform` yet.
    """

    def __init__(self, cols_to_filter=None):

        self.cols_to_filter = cols_to_filter

    def fit(self, X, y=None):
        """learn any information from the training data we may need to transform the test data"""

        # learn from the training data and return the class itself.
        # allows you to chain fit and transform methods like

        # > p = preprocessor()
        # > p.fit(X).transform(X)

        return self

    def transform(self, X, y=None):
        """transform the training or test data

        No transformation is implemented yet, so the input is returned unchanged.
        """
        # `X_new` was returned without ever being assigned (NameError).
        # TODO: apply the attributes learned in `fit` (e.g. cols_to_filter) here.
        X_new = X
        return X_new