# Outline


## Process
1. read data and check language, lengths, BoW
2. process/standardize
3. mannheem data?

## Human in loop 
1. topic model
2. color / size lookups

## Embedding / Prediction Models
1. stacked targets?


# Instructions

a.) If we want to understand which catalog (such as clothing, shoes, accessories, beauty, jewelry, etc.) each item belongs to, how would you make that happen?


b.) How can you extract additional information from the item names, such as the color, style, size, material, gender, etc., if any is present?


c.) A plus: write the queries/code, or build a machine learning model, to achieve the goal of a.) or b.) above on the attached dataset. Python is preferred.

In [92]:
import pandas as pd
import string
from nltk.stem import WordNetLemmatizer
from colour import Color
from gensim import models, corpora
import pandas as pd
import pyLDAvis.gensim
import gensim

In [2]:
# Load the product names into a DataFrame whose text column is named 'raw'.
data = pd.read_csv(
    'data/ecommerce_product_names.csv',
    header=0,
    names=['raw'],
)

In [3]:
data.head()

Unnamed: 0,raw
0,Alisha Solid Women's Cycling Shorts
1,FabHomeDecor Fabric Double Sofa Bed
2,AW Bellies
3,Sicons All Purpose Arnica Dog Shampoo
4,Eternal Gandhi Super Series Crystal Paper Weig...


In [45]:
# process data functions

# Lookup terms for who a product is aimed at. Terms are singular because they
# are matched against lemmatized text; animals are included alongside people.
# NOTE(review): the sample output shows "men" surviving lemmatization
# unchanged, so it is not matched by 'man' here.
GENDERS = [
    'boy',
    'girl',
    'kid',
    'baby',
    'infant',
    'child',
    'dog',
    'cat',
    'man',
    'woman',
    'pet',
]


def clean_raw_text(raw_text):
    """
    Clean Raw Text

    Strips every character in string.punctuation from the text and converts
    what remains to lower case. Whitespace is left as it is.

    args:
        raw_text: string of one/multiple tokens to be cleaned

    returns:
        clean_text: cleaned version of raw_text

    """

    # Translation table that maps each punctuation character to "delete".
    punctuation_table = str.maketrans('', '', string.punctuation)
    without_punctuation = raw_text.translate(punctuation_table)

    return without_punctuation.lower()


def get_lemmas(raw_text):
    """
    Get Lemmas

    Lemmatizes a string token by token with nltk's WordNetLemmatizer.
    The lemmatizer works on single words, so the text is split on
    whitespace, each token is lemmatized, and the results are joined
    back together with single spaces.

    args:
        raw_text: string of one/multiple tokens to be lemmatized

    returns:
        lemmas: lemmatized version of raw_text

    """

    lemmatize = WordNetLemmatizer().lemmatize
    lemmatized_tokens = map(lemmatize, raw_text.split())

    return ' '.join(lemmatized_tokens)


def check_if_color(token):
    """
    Check if Color

    Helper that decides whether a single word names a color by trying to
    build a colour.Color from it.

    args:
        token: an unknown word (string)

    returns:
        True if Color accepts the token, False if it raises ValueError

    """

    try:
        Color(token)
    except ValueError:
        # Color rejected the token, so it is not treated as a color.
        return False

    return True
    
    
def get_colors(raw_text):
    """
    Get Colors

    Calls check_if_color for every distinct token in the string passed to it.
    Duplicates (white white) within the same description are reported once,
    and colors are returned in the order they first appear so the result is
    reproducible across runs (a set would give a hash-dependent order).

    args:
        raw_text: string to check for colors

    returns:
        colors: list of the unique colors found in raw_text

    """

    # dict.fromkeys de-duplicates while keeping first-seen order, so each
    # distinct token is checked only once.
    distinct_tokens = dict.fromkeys(raw_text.split())
    colors = [token for token in distinct_tokens if check_if_color(token)]

    return colors


def get_gender(raw_text, genders_list):
    """
    Get Gender

    Uses a pre-defined lookup table to find gender terms in the text.
    Duplicates (woman woman) within the same description are reported once,
    and matches are returned in the order they first appear so the result is
    reproducible across runs (a set would give a hash-dependent order).

    args:
        raw_text: string to check for gender
        genders_list: list of genders to search for

    returns:
        genders: list of the unique genders found in raw_text

    """

    # dict.fromkeys de-duplicates while keeping first-seen order.
    distinct_tokens = dict.fromkeys(raw_text.split())
    genders = [token for token in distinct_tokens if token in genders_list]

    return genders
    
    
    

In [46]:
# Derive the feature columns in sequence: cleaned text -> lemmas -> lookups.
data['clean'] = data['raw'].apply(clean_raw_text)
data['lemmas'] = data['clean'].apply(get_lemmas)
data['color'] = data['lemmas'].apply(get_colors)
data['gender'] = data['lemmas'].apply(get_gender, args=(GENDERS,))





In [121]:
#data[data['color'].str.contains('blue')]
data

Unnamed: 0,raw,clean,lemmas,color,gender,tm_text
0,Alisha Solid Women's Cycling Shorts,alisha solid womens cycling shorts,alisha solid woman cycling short,[],[woman],alisha solid woman cycling short
1,FabHomeDecor Fabric Double Sofa Bed,fabhomedecor fabric double sofa bed,fabhomedecor fabric double sofa bed,[],[],fabhomedecor fabric double sofa bed
2,AW Bellies,aw bellies,aw belly,[],[],aw belly
3,Sicons All Purpose Arnica Dog Shampoo,sicons all purpose arnica dog shampoo,sicons all purpose arnica dog shampoo,[],[dog],sicons purpose arnica dog shampoo
4,Eternal Gandhi Super Series Crystal Paper Weig...,eternal gandhi super series crystal paper weig...,eternal gandhi super series crystal paper weig...,[silver],[],eternal gandhi super series crystal paper weig...
...,...,...,...,...,...,...
12618,Purple Women Heels,purple women heels,purple woman heel,[purple],[woman],purple woman heel
12619,Uberlyfe Large Vinyl Sticker,uberlyfe large vinyl sticker,uberlyfe large vinyl sticker,[],[],uberlyfe large vinyl sticker
12620,We Witches Comfy Hues Women Wedges,we witches comfy hues women wedges,we witch comfy hue woman wedge,[],[woman],witch comfy hue woman wedge
12621,Stylistry Women Heels,stylistry women heels,stylistry woman heel,[],[woman],stylistry woman heel


In [41]:
#TODO one hot encode

# Topic models

In [82]:


# Topic-model text: the lemmatized string with stop words filtered out.
# NOTE: relies on remove_stopwords and STOP_WORDS, which are defined in
# cells further down, so those cells must have been run first.
data['tm_text'] = data['lemmas'].apply(remove_stopwords, args=(STOP_WORDS,))


def prep_data_for_tm(data):
    """
    Prep Data for Topic Model

    Tokenizes the 'tm_text' column and builds a gensim dictionary and
    bag-of-words corpus from the resulting token lists.

    args:
        data: DataFrame with a 'tm_text' column of whitespace-separated tokens

    returns:
        corpus: list with one dictionary.doc2bow result per row
        dictionary: corpora.Dictionary built from docs
        docs: list of token lists, one per row, in row order

    """

    # Iterate the column itself instead of data.loc[i, 'tm_text'] for
    # i in range(len(data)): integer positions are only valid labels on a
    # default RangeIndex, so a filtered or re-indexed frame would raise
    # KeyError or pick the wrong rows.
    docs = [text.split() for text in data['tm_text']]

    dictionary = corpora.Dictionary(docs)
    corpus = [dictionary.doc2bow(doc) for doc in docs]

    return corpus, dictionary, docs



In [126]:
def sent_to_words(sentences):
    """
    Sentences to Words

    Lazily tokenizes each sentence with gensim's simple_preprocess.

    args:
        sentences: iterable of sentences; each one is passed through str()

    returns:
        generator yielding one list of tokens per sentence

    """
    # deacc=True strips accent marks from the tokens. The tokenizer also
    # discards some tokens outright: the sample output shows purely numeric
    # tokens such as '350' being dropped.
    yield from (
        gensim.utils.simple_preprocess(str(text), deacc=True)
        for text in sentences
    )

        
 
# Plain Python list of the stop-word-free strings, one per product ...
data1 = data['tm_text'].tolist()
# ... and the matching list of token lists used to train the phrase models.
data_words = list(sent_to_words(data1))


In [127]:
data_words

[['alisha', 'solid', 'woman', 'cycling', 'short'],
 ['fabhomedecor', 'fabric', 'double', 'sofa', 'bed'],
 ['aw', 'belly'],
 ['sicons', 'purpose', 'arnica', 'dog', 'shampoo'],
 ['eternal',
  'gandhi',
  'super',
  'series',
  'crystal',
  'paper',
  'weight',
  'silver',
  'finish'],
 ['dilli', 'bazaaar', 'belly', 'corporate', 'casuals', 'casuals'],
 ['ladela', 'belly'],
 ['carrel', 'printed', 'woman'],
 ['sicons', 'purpose', 'tea', 'tree', 'dog', 'shampoo'],
 ['freelance', 'vacuum', 'bottle', 'ml', 'bottle'],
 ['style', 'foot', 'belly'],
 ['sicons', 'conditioning', 'conditoner', 'dog', 'shampoo'],
 ['dongli', 'printed', 'boy', 'round', 'neck', 'tshirt'],
 ['swagga', 'woman', 'clog'],
 ['kennel',
  'rubber',
  'dumbell',
  'bell',
  'small',
  'rubber',
  'rubber',
  'toy',
  'dog'],
 ['glus', 'wedding', 'lingerie'],
 ['veelys', 'shiny', 'white', 'quad', 'roller', 'skate', 'size', 'uk'],
 ['bulaky', 'vanity', 'case', 'jewellery', 'vanity', 'case'],
 ['fdt', 'woman', 'legging'],
 ['madca

In [167]:
data1

['alisha solid woman cycling short',
 'fabhomedecor fabric double sofa bed',
 'aw belly',
 'sicons purpose arnica dog shampoo',
 'eternal gandhi super series crystal paper weight silver finish',
 'dilli bazaaar belly corporate casuals casuals',
 'ladela belly',
 'carrel printed woman',
 'sicons purpose tea tree dog shampoo',
 'freelance vacuum bottle 350 ml bottle',
 'style foot belly',
 'sicons conditioning conditoner dog shampoo',
 'dongli printed boy round neck tshirt',
 'swagga woman clog',
 'kennel rubber dumbell bell small rubber rubber toy dog',
 'glus wedding lingerie',
 'veelys shiny white quad roller skate size 45 uk',
 'bulaky vanity case jewellery vanity case',
 'fdt woman legging',
 'madcap c38gr30 men cargo',
 'bengal bloom rose artificial plant pot',
 'indcrown net embroidered semistitched lehenga choli material',
 'shopmania music band a5 notebook spiral bound',
 'tiara diary 20162017 designer la kaarta taking action getting result 3 b5 notebook hard bound',
 'kajci emb

In [160]:
from nltk.corpus import stopwords

# NLTK's English stop words plus a few extra terms.
STOP_WORDS = stopwords.words('english') + ['from', 'set', 'use']

# Phrase models: the bigram model is trained on the token lists, the trigram
# model on the bigram-transformed token lists.
# A higher threshold yields fewer phrases.
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=10)
trigram = gensim.models.Phrases(bigram[data_words], threshold=10)

# Phraser wrappers around the trained models; these are what make_bigrams
# and make_trigrams index into.
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)





In [129]:
def remove_stopwords(raw_text, stop_words):
    """
    Remove Stopwords

    Drops every whitespace-separated token that appears in stop_words and
    joins the remaining tokens with single spaces.

    args:
        raw_text: string of one/multiple tokens to filter
        stop_words: collection of tokens to remove

    returns:
        tm_text: raw_text without the stop words

    """

    kept_tokens = []
    for token in raw_text.split():
        if token in stop_words:
            continue
        kept_tokens.append(token)

    return ' '.join(kept_tokens)








def make_bigrams(texts):
    """
    Make Bigrams

    Applies the module-level bigram_mod to every document.

    args:
        texts: iterable of token lists

    returns:
        list with one bigram_mod-transformed document per input document

    """
    merged_docs = []
    for tokens in texts:
        merged_docs.append(bigram_mod[tokens])

    return merged_docs

def make_trigrams(texts):
    """
    Make Trigrams

    Applies the module-level bigram_mod and then trigram_mod to every
    document, so callers pass in the un-merged token lists.

    args:
        texts: iterable of token lists

    returns:
        list with one transformed document per input document

    """
    merged_docs = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged_docs.append(trigram_mod[with_bigrams])

    return merged_docs

In [161]:
# Stop words were already removed when the 'tm_text' column was built, and
# data_words is tokenized from that column, so no further filtering is needed.

# Form bigrams and trigrams. make_trigrams applies the bigram model itself
# before the trigram model, so it is given the plain token lists; feeding it
# make_bigrams' output would run the bigram model a second time.
data_words_bigrams = make_bigrams(data_words)
data_words_trigrams = make_trigrams(data_words)
#data_words_bigrams

In [162]:
# Token -> integer id mapping over all trigram-merged documents.
dictionary = corpora.Dictionary(data_words_trigrams)

# Bag-of-words form of every document, as produced by dictionary.doc2bow.
corpus = [dictionary.doc2bow(tokens) for tokens in data_words_trigrams]


In [165]:
# Train a 20-topic LDA model on the bag-of-words corpus. Every keyword is
# spelled out so the full training configuration is visible in one place.
lda_model = models.LdaModel(
    # data
    corpus=corpus,
    id2word=dictionary,
    num_topics=20,
    # training schedule
    chunksize=200,
    passes=5,
    update_every=1,
    iterations=50,
    eval_every=10,
    decay=0.5,
    offset=1.0,
    gamma_threshold=0.001,
    # priors
    alpha='auto',
    eta=None,
    # output filtering
    minimum_probability=0.01,
    minimum_phi_value=0.01,
    per_word_topics=False,
    # execution
    random_state=2,  # fixed seed
    distributed=False,
    ns_conf=None,
    callbacks=None,
)

In [166]:
# Build the interactive pyLDAvis view of the trained model and render it.
lda_visualization = pyLDAvis.gensim.prepare(
    lda_model,
    corpus,
    dictionary,
    sort_topics=False,
    n_jobs=1,
    mds='mmds',
)
pyLDAvis.display(lda_visualization)

# BoW


In [178]:
# Flatten every row's 'tm_text' into one list of tokens.
# extend() appends in place; the previous `words = words + ...` copied the
# whole list on every row, which is quadratic in the number of tokens.
# Iterating the column directly also avoids label lookups with .loc.
words = []

for text in data['tm_text']:
    words.extend(text.split())

In [182]:
def CountFrequency(my_list):
    """
    Count Frequency

    Counts how many times each item occurs.

    args:
        my_list: iterable of hashable items (e.g. a list of tokens)

    returns:
        count: plain dict mapping each distinct item to its number of
               occurrences, with keys in first-seen order

    """
    # Local import so this block does not depend on the notebook's import cell.
    from collections import Counter

    # Counter does the tallying; convert back to a plain dict so callers get
    # the same type as before (e.g. a KeyError on missing keys).
    count = dict(Counter(my_list))

    return count

In [183]:
# Map every distinct token in `words` to its number of occurrences.
# NOTE(review): the ValueError recorded after this cell presumably comes from
# passing this dict of scalar counts straight to a DataFrame constructor in
# code that is not shown here - confirm.
wordfrq = CountFrequency(words)

ValueError: If using all scalar values, you must pass an index