# Outline


## Process
1. read data and check language, lengths, BoW
2. process/standardize
3. mannheem data?

## Human in the loop
1. topic model
2. color / size lookups

## Embedding / Prediction Models
1. stacked targets?


# Instructions

a.) If we want to understand which catalog (such as clothing, shoes, accessories, beauty, jewelry etc.) each item belongs to, how would you make that happen?


b.) How can you extract the additional information from the item names, such as the color, style, size, material, gender etc. if there is any?


c.) A plus: write the queries/code, or build a machine learning model, to achieve the goal of a.) or b.) above on the attached dataset. Python is preferred.

In [4]:
import pandas as pd
import string
from nltk.stem import WordNetLemmatizer
from colour import Color
from gensim import models, corpora
import pandas as pd
import pyLDAvis.gensim
import gensim

In [5]:
# Load the product names. header=0 together with names= replaces the file's
# own header row, so the text column is exposed as 'raw'.
data = pd.read_csv(
    'data/ecommerce_product_names.csv',
    header=0,
    names=['raw'],
)

In [6]:
# Preview the first rows to sanity-check the load.
data.head()

Unnamed: 0,raw
0,Alisha Solid Women's Cycling Shorts
1,FabHomeDecor Fabric Double Sofa Bed
2,AW Bellies
3,Sicons All Purpose Arnica Dog Shampoo
4,Eternal Gandhi Super Series Crystal Paper Weig...


In [7]:
# process data functions

# Tokens that get_gender looks for in the lemmatized product names.
# Despite the name, the list also covers age groups and pets.
GENDERS = [
    'boy',
    'girl',
    'kid',
    'baby',
    'infant',
    'child',
    'dog',
    'cat',
    'man',
    'woman',
    'pet',
]


def clean_raw_text(raw_text):
    """
    Clean Raw Text

    Strips every character listed in string.punctuation and lowercases
    what is left. Punctuation is deleted, not replaced by a space, so
    "women's" becomes "womens".

    args:
        raw_text: string of one/multiple tokens to be cleaned

    returns:
        clean_text: cleaned version of raw_text

    """

    # Translation table that maps each punctuation character to "delete".
    punctuation_table = str.maketrans('', '', string.punctuation)
    without_punctuation = raw_text.translate(punctuation_table)

    return without_punctuation.lower()


def get_lemmas(raw_text):
    """
    Get Lemmas

    Lemmatizes a string with nltk's WordNetLemmatizer.
    The lemmatizer works on one word at a time, so the string is split on
    whitespace, each token is lemmatized, and the results are re-joined
    with single spaces.

    args:
        raw_text: string of one/multiple tokens to be lemmatized

    returns:
        lemmas: lemmatized version of raw_text

    """

    lemmatize = WordNetLemmatizer().lemmatize
    tokens = raw_text.split()

    return ' '.join(map(lemmatize, tokens))


def check_if_color(token):
    """
    Check if Color

    Helper that decides whether a single word names a color, by trying to
    build a colour.Color from it.

    args:
        token: an unknown word (string)

    returns:
        True if Color(token) succeeds, False if it raises ValueError

    """

    try:
        Color(token)
    except ValueError:
        # Color rejected the token, so it is not treated as a color.
        return False
    else:
        return True
    
    
def get_colors(raw_text):
    """
    Get Colors

    Calls check_if_color for every distinct token in the string passed to it.
    Removes duplicates (white white) from same description and keeps the
    colors in order of first appearance, so the result is reproducible
    between runs (a plain set() would order them by string hash).

    args:
        raw_text: string to check for colors

    returns:
        colors: list of the distinct colors found in raw_text

    """

    # dict.fromkeys de-duplicates while preserving insertion order.
    unique_tokens = dict.fromkeys(raw_text.split())
    colors = [token for token in unique_tokens if check_if_color(token)]

    return colors


def get_gender(raw_text, genders_list):
    """
    Get Gender

    Uses pre-defined lookup table to find gender.
    Removes duplicates (woman woman) from same description and keeps the
    matches in order of first appearance, so the result is reproducible
    between runs (a plain set() would order them by string hash).

    args:
        raw_text: string to check for gender
        genders_list: list of genders to search for

    returns:
        genders: list of the distinct genders found in raw_text

    """

    # dict.fromkeys de-duplicates while preserving insertion order.
    unique_tokens = dict.fromkeys(raw_text.split())
    genders = [token for token in unique_tokens if token in genders_list]

    return genders
    
    
    

In [8]:
# Derive each column from the previous one: raw -> clean -> lemmas, then
# look up colors and genders in the lemmatized text.
data['clean'] = data['raw'].apply(clean_raw_text)
data['lemmas'] = data['clean'].apply(get_lemmas)
data['color'] = data['lemmas'].apply(get_colors)
data['gender'] = data['lemmas'].apply(get_gender, args=(GENDERS,))





In [9]:
#data[data['color'].str.contains('blue')]
# Display the frame with the derived clean/lemmas/color/gender columns.
data

Unnamed: 0,raw,clean,lemmas,color,gender
0,Alisha Solid Women's Cycling Shorts,alisha solid womens cycling shorts,alisha solid woman cycling short,[],[woman]
1,FabHomeDecor Fabric Double Sofa Bed,fabhomedecor fabric double sofa bed,fabhomedecor fabric double sofa bed,[],[]
2,AW Bellies,aw bellies,aw belly,[],[]
3,Sicons All Purpose Arnica Dog Shampoo,sicons all purpose arnica dog shampoo,sicons all purpose arnica dog shampoo,[],[dog]
4,Eternal Gandhi Super Series Crystal Paper Weig...,eternal gandhi super series crystal paper weig...,eternal gandhi super series crystal paper weig...,[silver],[]
...,...,...,...,...,...
12618,Purple Women Heels,purple women heels,purple woman heel,[purple],[woman]
12619,Uberlyfe Large Vinyl Sticker,uberlyfe large vinyl sticker,uberlyfe large vinyl sticker,[],[]
12620,We Witches Comfy Hues Women Wedges,we witches comfy hues women wedges,we witch comfy hue woman wedge,[],[woman]
12621,Stylistry Women Heels,stylistry women heels,stylistry woman heel,[],[woman]


In [None]:
#TODO one hot encode

# Topic models

In [11]:





def prep_data_for_tm(data):
    """
    Prep Data for Topic Model

    Tokenizes the 'tm_text' column and builds the gensim dictionary and
    bag-of-words corpus from it.

    The column is iterated directly instead of being looked up with
    data.loc[i, ...] for i in range(len(data)), which treats positions as
    index labels and fails on any frame without a default 0..n-1 index
    (e.g. after filtering rows).

    args:
        data: DataFrame with a 'tm_text' column of whitespace-separated text

    returns:
        corpus: list with one bag-of-words (doc2bow output) per row
        dictionary: gensim corpora.Dictionary built from docs
        docs: list of token lists, one per row

    """

    docs = [text.split() for text in data['tm_text']]

    dictionary = corpora.Dictionary(docs)
    corpus = [dictionary.doc2bow(doc) for doc in docs]

    return corpus, dictionary, docs



In [12]:
def sent_to_words(sentences):
    """
    Sentences to Words

    Generator that tokenizes each sentence with gensim's simple_preprocess.

    args:
        sentences: iterable of sentences (each is passed through str())

    returns:
        yields one list of tokens per sentence

    """
    for sentence in sentences:
        # deacc=True strips accent marks from the tokens.
        tokens = gensim.utils.simple_preprocess(str(sentence), deacc=True)
        yield tokens

        
 
# NOTE: needs the 'tm_text' column, which is only created in a later cell
# (the remove_stopwords apply); run that cell first or this raises KeyError.
data1 = data['tm_text'].tolist()
data_words = list(sent_to_words(data1))


KeyError: 'tm_text'

In [None]:
# Display the tokenized product names.
data_words

In [16]:
# Display the frame, now including the 'tm_text' column.
data

Unnamed: 0,raw,clean,lemmas,color,gender,tm_text
0,Alisha Solid Women's Cycling Shorts,alisha solid womens cycling shorts,alisha solid woman cycling short,[],[woman],alisha solid woman cycling short
1,FabHomeDecor Fabric Double Sofa Bed,fabhomedecor fabric double sofa bed,fabhomedecor fabric double sofa bed,[],[],fabhomedecor fabric double sofa bed
2,AW Bellies,aw bellies,aw belly,[],[],aw belly
3,Sicons All Purpose Arnica Dog Shampoo,sicons all purpose arnica dog shampoo,sicons all purpose arnica dog shampoo,[],[dog],sicons purpose arnica dog shampoo
4,Eternal Gandhi Super Series Crystal Paper Weig...,eternal gandhi super series crystal paper weig...,eternal gandhi super series crystal paper weig...,[silver],[],eternal gandhi super series crystal paper weig...
...,...,...,...,...,...,...
12618,Purple Women Heels,purple women heels,purple woman heel,[purple],[woman],purple woman heel
12619,Uberlyfe Large Vinyl Sticker,uberlyfe large vinyl sticker,uberlyfe large vinyl sticker,[],[],uberlyfe large vinyl sticker
12620,We Witches Comfy Hues Women Wedges,we witches comfy hues women wedges,we witch comfy hue woman wedge,[],[woman],witch comfy hue woman wedge
12621,Stylistry Women Heels,stylistry women heels,stylistry woman heel,[],[woman],stylistry woman heel


In [15]:
# Drop stop words from the lemmatized names to get the topic-model text.
data['tm_text'] = data['lemmas'].apply(remove_stopwords, args=(STOP_WORDS,))

In [13]:
from nltk.corpus import stopwords

# NLTK's English stop words plus a few extra words to drop.
STOP_WORDS = stopwords.words('english')
STOP_WORDS += ['from', 'set', 'use']

# Train the phrase detectors on the tokenized names (data_words must already
# exist). A higher threshold yields fewer phrases.
bigram = models.Phrases(data_words, min_count=5, threshold=10)
trigram = models.Phrases(bigram[data_words], threshold=10)

# Wrap the trained detectors in Phraser objects, which are what
# make_bigrams / make_trigrams apply to each document.
bigram_mod = models.phrases.Phraser(bigram)
trigram_mod = models.phrases.Phraser(trigram)





NameError: name 'data_words' is not defined

In [14]:
def remove_stopwords(raw_text, stop_words):
    """
    Remove Stopwords

    Splits the text on whitespace, drops every token found in stop_words
    and re-joins the remaining tokens with single spaces.

    args:
        raw_text: string of one/multiple tokens
        stop_words: collection of tokens to drop

    returns:
        tm_text: raw_text without the stop words

    """

    kept_tokens = []
    for token in raw_text.split():
        if token in stop_words:
            continue
        kept_tokens.append(token)

    return ' '.join(kept_tokens)








def make_bigrams(texts):
    """
    Make Bigrams

    Applies the module-level bigram_mod phraser to every document.

    args:
        texts: iterable of token lists

    returns:
        list with the bigram_mod output for each document

    """
    merged_docs = []
    for tokens in texts:
        merged_docs.append(bigram_mod[tokens])
    return merged_docs

def make_trigrams(texts):
    """
    Make Trigrams

    Applies bigram_mod and then trigram_mod to every document, so it
    expects the plain token lists (not already-bigrammed ones).

    args:
        texts: iterable of token lists

    returns:
        list with the trigram_mod output for each document

    """
    merged_docs = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged_docs.append(trigram_mod[with_bigrams])
    return merged_docs

In [None]:
# Remove Stop Words
#data_words_nostops = remove_stopwords(data_words, STOP_WORDS)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words)

# Form Trigrams: make_trigrams already applies the bigram model to each
# document, so it is given the plain token lists. Passing it the output of
# make_bigrams would run the bigram model twice.
data_words_trigrams = make_trigrams(data_words)
#data_words_bigrams

In [None]:
# Create Dictionary: token/phrase -> integer id mapping
dictionary = corpora.Dictionary(data_words_trigrams)

# Term Document Frequency: one bag-of-words per document
corpus = list(map(dictionary.doc2bow, data_words_trigrams))


In [None]:
# Train the LDA topic model. Every argument is passed by keyword, so the
# grouping below is purely for readability.
lda_model = models.LdaModel(
    # Data and model size
    corpus=corpus,
    id2word=dictionary,
    num_topics=20,
    # Training schedule
    distributed=False,
    chunksize=200,
    passes=5,
    update_every=1,
    iterations=50,
    gamma_threshold=0.001,
    decay=0.5,
    offset=1.0,
    eval_every=10,
    # Priors
    alpha='auto',
    eta=None,
    # Thresholds applied to the reported probabilities
    minimum_probability=0.01,
    minimum_phi_value=0.01,
    per_word_topics=False,
    # Reproducibility and miscellaneous
    random_state=2,
    ns_conf=None,
    callbacks=None,
)

In [None]:
# Prepare the pyLDAvis data for the trained model and render it in the notebook.
lda_visualization = pyLDAvis.gensim.prepare(
    lda_model,
    corpus,
    dictionary,
    sort_topics=False,
    n_jobs=1,
    mds='mmds',
)
pyLDAvis.display(lda_visualization)

# BoW


In [None]:
# Flatten the topic-model text of every product into one list of tokens.
# A single comprehension over the column avoids `words = words + ...` inside
# a loop, which copies the whole list on every row (quadratic in total size).
words = [token for text in data['tm_text'] for token in text.split()]

In [None]:
def CountFrequency(my_list):
    """
    Count Frequency

    Counts how often each item occurs, using collections.Counter instead of
    a hand-written counting loop.

    args:
        my_list: iterable of hashable items (e.g. a list of tokens)

    returns:
        count: plain dict mapping each distinct item to its number of
               occurrences, in order of first appearance

    """
    # Imported locally so the function does not rely on a file-level import.
    from collections import Counter

    # Convert back to a plain dict so missing keys still raise KeyError,
    # exactly as with the dict the function has always returned.
    return dict(Counter(my_list))

In [None]:
# Token -> occurrence count over the whole 'tm_text' column.
wordfrq = CountFrequency(words)

# Keyword Extraction

In [None]:
import spacy
# Load spaCy's "en_core_web_sm" pipeline; the model package must be installed.
nlp = spacy.load("en_core_web_sm")


In [None]:
# NOTE: the product-name assignment is immediately overwritten by the sample
# paragraph below, so the following spaCy cell runs on the paragraph.
text = data.loc[2, 'tm_text']
text = """spaCy is an open-source software library for advanced natural language processing, 
written in the programming languages Python and Cython. The library is published under the MIT license
and its main developers are Matthew Honnibal and Ines Montani, the founders of the software company Explosion."""

In [None]:
# Run the spaCy pipeline on `text` and print the named entities it found.
doc = nlp(text)
print(doc.ents)

In [None]:
# Inspect the topic-model text of the row labelled 1.
data.loc[1, 'tm_text']

In [None]:
# Store spaCy's named entities for every product name. The Doc attribute is
# `ents` (as in `doc.ents`); `ends` does not exist and raises AttributeError.
data['kw_spacy'] = data['tm_text'].apply(lambda x: nlp(x).ents)

In [29]:
# yake
import yake


In [42]:
kw_extractor = yake.KeywordExtractor()
text = """spaCy is an open-source software library for advanced natural language processing, written in the programming languages Python and Cython. The library is published under the MIT license and its main developers are Matthew Honnibal and Ines Montani, the founders of the software company Explosion."""

# Replace the sample paragraph with the first product name.
text = data.loc[0, 'tm_text']

# Extractor settings
language = "en"
max_ngram_size = 2
deduplication_threshold = 0.9
numOfKeywords = 20

custom_kw_extractor = yake.KeywordExtractor(
    lan=language,
    n=max_ngram_size,
    dedupLim=deduplication_threshold,
    top=numOfKeywords,
    features=None,
)
keywords = custom_kw_extractor.extract_keywords(text)

# Each entry prints as a (keyword, score) tuple.
for kw in keywords:
    print(kw)

('alisha solid', 0.04940384002065631)
('cycling short', 0.04940384002065631)
('solid woman', 0.09700399286574239)
('woman cycling', 0.09700399286574239)
('alisha', 0.15831692877998726)
('short', 0.15831692877998726)
('solid', 0.29736558256021506)
('woman', 0.29736558256021506)
('cycling', 0.29736558256021506)


In [27]:
from rake_nltk import Rake
# Rake extractor built with its default settings.
rake_nltk_var = Rake()
text = """spaCy is an open-source software library for advanced natural language processing,
written in the programming languages Python and Cython. The library is published under the MIT license
and its main developers are Matthew Honnibal and Ines Montani, the founders of the software company Explosion."""

# Replace the sample paragraph with the product name of the row labelled 1.
text = data.loc[1, 'tm_text']
# extract_keywords_from_text stores its result on the extractor; the ranked
# phrases are then read back with get_ranked_phrases().
rake_nltk_var.extract_keywords_from_text(text)
keyword_extracted = rake_nltk_var.get_ranked_phrases()
print(keyword_extracted)

['fabhomedecor fabric double sofa bed']
