# The Home Depot Decor - Visual Navigation

Getting Started | Data Exploration  |  **Preprocessing**  | Benchmark Model

In [1]:
__author__ = 'Jaime Garvey'
__email__ = 'jaimemgarvey@gmail.com'

In [315]:
import pandas as pd
import numpy as np

import gensim

import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

%matplotlib inline

In [130]:
import sys
import inspect
sys.path.insert(0, '../modules')


# now read in new functions
from helpers import read_in_dataset, get_num_of_levels, flatten_categories, search_cons_status

**Read in Data**

In [211]:
# Read in product related data
# `read_in_dataset` here is the helper imported from `helpers` (not the
# TextPreprocessor method defined further down). `verbose_opt` is forwarded
# as the `verbose` argument of every call so output can be toggled in one place.
verbose_opt = False
catalog = read_in_dataset('Decor_catalog.csv', verbose=verbose_opt)
prod_desc = read_in_dataset('Product_name_description.csv', verbose=verbose_opt)
prod_engagement = read_in_dataset('Product_engagement.csv', verbose=verbose_opt)

# Read in search related data
# NOTE(review): `search_imp` is expected to expose a 'Search_term' column,
# which the search-term cleaning cells below read.
navigations = read_in_dataset('Visual_navigations.csv', verbose=verbose_opt)
search_imp = read_in_dataset('Search_impression.csv', verbose=verbose_opt)

In [213]:
# Preview the first rows of the search-impression table
search_imp.head()

Unnamed: 0,Search_term,Impression
0,coffee,203054703;207061099;305561354;305561469;301692...
1,bag chair,305573411;305608772;301092388;301092383;301092...
2,kitchen wall tile,205140711;302603437;205762409;204923728;204337...
3,mirror tile,305696621;304142073;304142126;304142039;303058...
4,entryway,203532713;203532652;300750153;302042988;302042...


# Build Text Preprocessor Class

In [453]:
class TextPreprocessor:
    '''
    Text-cleaning helper for the search and product datasets.

    Stores optional references to the datasets it was given, keeps a mutable
    stop-word list, and turns raw text into space-joined root tokens
    (lemmatized or stemmed).
    '''

    def __init__(self, search_imp=None, prod_desc=None, catalog=None):
        '''
        Keyword Arguments:
        ------------------
        * search_imp - search impression data (stored as-is, optional)
        * prod_desc - product name/description data (stored as-is, optional)
        * catalog - catalog data (stored as-is, optional)
        '''
        self.search_imp = search_imp
        self.prod_desc = prod_desc
        self.catalog = catalog

        # Extra stop words on top of NLTK's English list; mostly unit
        # abbreviations and dimension markers (e.g. 'ft', 'oz', 'x').
        self.stop_words_lst = ['in', 'sq', 'ft', 'yd', 'cm', 'mm', 'gal', 'lb',
                               'lbs', 'qt', 'oz', 'h', 'w', 'ii', 'x']
        # Combine with the custom list defined above (the previous code
        # referenced an undefined name here and raised NameError).
        # Kept as a list so add_stopword() can append to it.
        self.stop_words = list(set(stopwords.words('english') + self.stop_words_lst))

    def read_in_dataset(self, data, data_folder='raw', verbose=False):
        '''
        Read in dataset (csv format) to pandas dataframe

        Keyword Arguments:
        ------------------
        * data - string with dataset filename; a trailing '.csv' is optional
        * data_folder - string with either raw or processed
        * verbose - True will print information about the dataset

        Returns:
        --------
        a pandas dataframe
        '''
        # The path template appends '.csv', so strip it when the caller
        # already supplied the extension (avoids looking for 'name.csv.csv').
        name = data[:-4] if data.lower().endswith('.csv') else data
        df = pd.read_csv('../data/{}/{}.csv'.format(data_folder, name))

        if verbose:
            print('{}: {} rows x {} columns'.format(name, df.shape[0], df.shape[1]))

        return df

    def clean_text(self, df, corpus_col, method='lemmatizer'):
        '''
        Clean every document in one dataframe column.

        Keyword Arguments:
        ------------------
        * df - pandas dataframe holding the text
        * corpus_col - label of the column with the raw documents
        * method - 'lemmatizer' (default) or 'stemmer'

        Returns:
        --------
        list of cleaned strings, one per row of `df`
        '''
        corpus = df.loc[:, corpus_col].tolist()

        # preprocess_text is a generator; materialise it for the caller
        return list(self.preprocess_text(corpus, method=method))

    def preprocess_text(self, docs, method='lemmatizer'):
        '''
        Process docs

        Keyword Arguments:
        ------------------
        * docs - iterable of raw text strings
        * method - 'lemmatizer' (default) or 'stemmer'

        Yields:
        -------
        one string per doc: its root tokens, stop words removed, joined by
        single spaces

        Raises:
        -------
        ValueError if `method` is not recognised. Because this is a
        generator, the error surfaces when iteration starts.
        '''
        if method == 'lemmatizer':
            root = nltk.stem.WordNetLemmatizer().lemmatize
        elif method == 'stemmer':
            root = nltk.stem.snowball.SnowballStemmer("english").stem
        else:
            raise ValueError(
                "method must be 'lemmatizer' or 'stemmer', got {!r}".format(method))

        # Set lookup instead of scanning the stop-word list for every token
        stop_set = set(self.stop_words)

        for doc in docs:
            tokens = gensim.utils.simple_preprocess(doc)
            yield ' '.join(root(token) for token in tokens if token not in stop_set)

    @staticmethod
    def doc2tokens(corpus):
        '''
        Split each document of a pandas Series on whitespace.

        Declared static so it works both as TextPreprocessor.doc2tokens(s)
        and as instance.doc2tokens(s); it uses no instance state.
        '''
        return corpus.str.split()

    def compare_clean_searches(self, clean_searches, raw_search):
        '''
        Compare cleaned search queries to raw text

        Keyword Arguments:
        ------------------
        * clean_searches - cleaned queries, positionally matching `raw_search`
        * raw_search - dataframe with a 'Search_term' column

        Returns:
        --------
        pandas Series indexed by cleaned query; each value is the list of
        raw search terms (sorted) that were cleaned to that query
        '''
        st_compare = pd.DataFrame({
            'raw_search': raw_search['Search_term'],
            'cleaned_search': clean_searches,
        }).sort_values(by='raw_search')
        return st_compare.groupby('cleaned_search')['raw_search'].apply(list)

    def add_stopword(self, new_stopword):
        '''Add a word to the stop-word list (no-op if already present).'''
        if new_stopword not in self.stop_words:
            self.stop_words.append(new_stopword)

# Search Term Consolidation

## Preprocess Search Terms

In [None]:
def preprocess_text(docs, method='stemmer'):
    '''
    Process docs

    Keyword Arguments:
    ------------------
    * docs - iterable of raw text strings (e.g. a pandas Series or a list)
    * method - 'stemmer' (default) or 'lemmatizer'

    Yields:
    -------
    one string per doc: its root tokens joined by single spaces

    Raises:
    -------
    ValueError if `method` is not recognised. Because this is a generator,
    the error surfaces when iteration starts.
    '''
    # NOTE(review): gensim's simple_preprocess lowercases tokens and drops
    # one-character tokens by default, so 'I' can never match here; the
    # filter is kept only to preserve existing output.
    stop_words = ['I']

    if method == 'lemmatizer':
        root = nltk.stem.WordNetLemmatizer().lemmatize
    elif method == 'stemmer':
        root = nltk.stem.snowball.SnowballStemmer("english").stem
    else:
        # Previously an unknown method left `root` unbound and failed later
        # with an obscure UnboundLocalError inside the loop.
        raise ValueError(
            "method must be 'lemmatizer' or 'stemmer', got {!r}".format(method))

    # Iterate the input directly so plain lists work as well as Series
    for doc in docs:
        tokens = gensim.utils.simple_preprocess(doc)
        yield ' '.join(root(token) for token in tokens if token not in stop_words)
        
        
def doc2tokens(corpus):
    '''
    pandas series (docs) to tokens

    Splits every document on whitespace and returns a Series of token lists
    with the same index as `corpus`.
    '''
    # The return used to be indented deeper than the docstring, which made
    # the whole cell fail with an IndentationError.
    return corpus.str.split()
    
    
def cons_search_terms(clean_searches=None, raw_search=None):
    '''
    Compare cleaned search queries to raw text

    Keyword Arguments:
    ------------------
    * clean_searches - cleaned queries, positionally matching `raw_search`;
      defaults to the notebook-level `clean_searches`
    * raw_search - dataframe with a 'Search_term' column; defaults to the
      notebook-level `search_imp`

    Returns:
    --------
    pandas Series indexed by cleaned query; each value is the list of raw
    search terms (sorted) that were cleaned to that query

    Raises:
    -------
    NameError if a default is needed but the notebook-level variable has not
    been created yet.
    '''
    # Defaults are resolved at call time. Binding them in the signature
    # evaluated `clean_searches` when the function was defined, i.e. before
    # the cell that creates it, and froze stale objects on later re-runs.
    scope = globals()
    if clean_searches is None:
        if 'clean_searches' not in scope:
            raise NameError("clean_searches is not defined; pass it explicitly "
                            "or run the cleaning cell first")
        clean_searches = scope['clean_searches']
    if raw_search is None:
        if 'search_imp' not in scope:
            raise NameError("search_imp is not defined; pass raw_search "
                            "explicitly or load the data first")
        raw_search = scope['search_imp']

    st_compare = pd.DataFrame({
        'raw_search': raw_search['Search_term'],
        'cleaned_search': clean_searches,
    }).sort_values(by='raw_search')
    return st_compare.groupby('cleaned_search')['raw_search'].apply(list)

In [410]:
# Clean Search Terms
# preprocess_text is a generator and is called with its default method;
# wrap the result in a Series so the .str accessor is available below.
clean_searches = pd.Series(list(preprocess_text(search_imp['Search_term'])))

# tokens
# one list of whitespace-separated tokens per cleaned query
search_tokens = doc2tokens(clean_searches)

#ngram model
# NOTE(review): `trigram_model` is neither defined nor imported in the cells
# shown here -- confirm where it comes from so Restart & Run All works. The
# keyword spelling `threshholds` must match that function's signature.
search_trigrams = trigram_model(search_tokens, threshholds=(25,15), verbose=True)

In [422]:
# Compare clean searches to raw searches
# Each cleaned query maps to the list of raw search terms that collapse to it.

cons_search_terms().head()

cleaned_search
accent table                [accent table, accent tables]
adhesive backsplash                 [adhesive backsplash]
adhesive tile backsplash       [adhesive tile backsplash]
arm chair                                     [arm chair]
armchair                                       [armchair]
Name: raw_search, dtype: object