## Use NLP to discover the current product trends

### Topic modeling visualization

In [1]:
%pylab inline

import pandas as pd
import numpy as np
import pickle as pk
from scipy import sparse as sp
import re

Populating the interactive namespace from numpy and matplotlib


In [2]:
df = pd.read_csv('all.csv')
df1 = df[df.website =='ASOS']
docs = df1['alltext']
df.sample(5)

Unnamed: 0.1,Unnamed: 0,alltext,brand,category,description,img_2,img_id,img_url,material,price,product_name,website,id
5362,5362,Mamalicious tank top It’s got you covered all ...,,Top,It’s got you covered all the way from bump to ...,,,https://images.asos-media.com/products/mamalic...,"Soft-touch stretch jersey,Why complicate thing...",$23.00,Mamalicious tank top,ASOS,5362
6345,6345,Missguided zig a zig ah slogan t-shirt in blac...,,Top,"Crew neck ,Slogan print to chest ,We all know ...",,,https://images.asos-media.com/products/missgui...,"Basic jersey,Why complicate things?,Main: 100%...",$24.00,Missguided zig a zig ah slogan t-shirt in black,ASOS,6345
6754,6754,Y.A.S Ruffle Detail Cotton Blouse Not for the ...,,Top,"Not for the clumsy ones,Crew neck,Ruffle trims...",,,https://images.asos-media.com/products/yas-ruf...,"Lightweight woven fabric,The kind that doesn't...",$60.00,Y.A.S Ruffle Detail Cotton Blouse,ASOS,6754
7505,7505,Nobody's Child Gingham Cami Dress With Waist B...,,Dress,Everything looks better with a little bit of g...,,,https://images.asos-media.com/products/nobodys...,"Woven fabric,The kind that doesn't stretch,Bel...",$42.00,Nobody's Child Gingham Cami Dress With Waist Belt,ASOS,7505
97,97,One-shoulder Top One-shoulder top in cotton je...,HM,Top,One-shoulder top in cotton jersey with elastic...,,97 One-shoulder Top,https://lp2.hm.com/hmgoepprod?set=source[/ad/3...,Cotton 100%,$14.99,One-shoulder Top,HM,97


In [3]:
print(docs.index)
docs.head()

Int64Index([ 3136,  3137,  3138,  3139,  3140,  3141,  3142,  3143,  3144,
             3145,
            ...
            10205, 10206, 10207, 10208, 10209, 10210, 10211, 10212, 10213,
            10214],
           dtype='int64', length=7079)


3136    ASOS DESIGN blouse with frill shoulder Lightwe...
3137    A Star Is Born Going Out Festival High Neck Bo...
3138    Converse Cons Skate Boarding Long Sleeve T-Shi...
3139    Vans White Finish Line Heritage Long Sleeve T-...
3140    ASOS DESIGN x glaad& Curve t-shirt with high n...
Name: alltext, dtype: object

### Pre-process and vectorize the documents

In [4]:
from nltk.tokenize import RegexpTokenizer

def preprocessDoc(docs):
    docs = np.array(docs)  # no longer have index
    tokenizer = RegexpTokenizer(r'\w+')
    for idx in range(len(docs)):
        docs[idx] = docs[idx].lower()  # Convert to lowercase.
        
        stop = ['ASOS', 'macy', 'bloomingdale', "fashion nova", "YAS", "Ditsy", "Noisy", "May", "Ted","Baker", "River","Island", "Karen","Scott","PrettyLittleThing","Roxy","DESIGN","Chi", \
               "Alfani","Boohoo","Sofie","Schnoor","Ellesse", "Jeannie","TFNC","Sacred", "Hawk","Urban","Bliss","Puma","adidas", "Stella", \
               'cm', 'size', 'web id', 'approx', 'model', 'height', 'is', 'and', 'she', 'wearing', 'small', 'approximate', 'measurements', 'height', 
               'bust', 'waist', 'hips', 'made', 'usa', 'things', 'regular', 'right', 'kind', 'that', 'things']

        lst = r'|'.join([x.lower() for x in stop])

        docs[idx] = re.sub(lst,'',docs[idx])
        
        docs[idx] = tokenizer.tokenize(docs[idx])  # Split into words.

    docs = [[token for token in doc if not token.isdigit()] for doc in docs]  #remove numbers
    docs = [[token for token in doc if len(token) > 3] for doc in docs] #remove words that are short 
  
    return docs

docs = preprocessDoc(docs)

#### Compute bigrams/trigrams and remove less words and very common words:

In [5]:
from gensim.corpora import Dictionary
from gensim.models import Phrases
bigram = Phrases(docs, min_count=10, threshold=10)  # only ones that appear 10 times or more.
trigram = Phrases(bigram[docs])

for i in range(len(docs)):
    for token in bigram[docs[i]]:
        if '_' in token:
            docs[i].append(token)
#            print ("bi:", token)
    for token in trigram[docs[i]]:
        if token.count('_') > 2 :
            docs[i].append(token)
#            print ("tri:", token)

dictionary_ = Dictionary(docs)
print (len(dictionary_))
dictionary_.filter_extremes(no_below=10, no_above=0.2)
print (len(dictionary_))



3133
1518


### Vectorize data

- bag-of-words : frequency of words

In [6]:
corpus = [dictionary_.doc2bow(doc) for doc in docs]
print('unique tokens: %d' % len(dictionary_)) #gensim.corpora.dictionary.Dictionary
print('Number of records: %d' % len(corpus)) #List

unique tokens: 1518
Number of records: 7079


## Train LDA model

In [7]:
from gensim.models import LdaModel

# Set training parameters.
num_topics = 6
chunksize = 500 #important #number of documents to be used in each training chunk. (how many documents to load into memory)
passes = 20 #important #number of passes through the corpus during training
iterations = 400 #maximum number of iterations through the corpus when inferring the topic distribution of a corpus
eval_every = 1  #log perplexity is estimated every that many updates. Setting this to one slows down training by ~2x.

# Make a index to word dictionary.
dictionary_[0]  #this is only to "load" the dictionary. Otherwise, you will get the value error. 
id2word = dictionary_.id2token

model = LdaModel(corpus=corpus, id2word=id2word, chunksize=chunksize, alpha='auto', eta='auto', iterations=iterations, num_topics=num_topics, passes=passes, eval_every=eval_every)


In [8]:
import pyLDAvis.gensim
pyLDAvis.enable_notebook()

In [9]:
pyLDAvis.gensim.prepare(model, corpus, dictionary_)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [10]:
model.save('lda.model')

In [11]:
def explore_topic(lda_model, topic_number, topn, output=True):
    terms = []
    for term, frequency in lda_model.show_topic(topic_number, topn=topn):
        terms += [term]
        if output:
            print(u'{:20} {:.3f}'.format(term, round(frequency, 3)))   
    return terms

In [12]:
print(u'{:20} {}'.format(u'term', u'frequency') + u'\n')
for i in range(num_topics):
    print('Topic '+str(i)+' |---------------------\n')
    tmp = explore_topic(model,topic_number=i, topn=10, output=True )

term                 frequency

Topic 0 |---------------------

fully_lined          0.051
fully                0.051
more                 0.043
sequin               0.031
embellhed            0.025
definitely           0.021
more_definitely      0.021
extra                0.020
days                 0.019
call                 0.018
Topic 1 |---------------------

maxi                 0.056
skirt                0.031
close                0.028
slim                 0.027
little               0.022
length               0.020
narrow               0.019
sits_close           0.019
sits                 0.019
slim_narrow          0.019
Topic 2 |---------------------

lining               0.033
floral               0.030
lace                 0.025
fastening            0.025
midi                 0.021
over                 0.019
wrap                 0.018
ruffle               0.018
lined                0.015
true                 0.014
Topic 3 |---------------------

button               0.046
min

### Based on the above, giving a general name of each cluster.

In [13]:
top_labels = {0: 'floral', 1:'drapes', 2:'shirt', 3:'lace skirt', 4:'black bodysuit', 5:'conscious'}

In [14]:
from collections import OrderedDict
def get_doc_topic_dist(model, corpus, kwords=True):
    
    '''
    LDA transformation, for each doc only returns topics with non-zero weight
    This function makes a matrix transformation of docs in the topic space.
    '''
    top_dist =[]
    keys = []

    for d in corpus:
        tmp = {i:0 for i in range(num_topics)}
        tmp.update(dict(model[d]))
        vals = list(OrderedDict(tmp).values())
        top_dist += [array(vals)]
        if kwords:
            keys += [array(vals).argmax()]

    return array(top_dist), keys

In [15]:
top_dist, lda_keys= get_doc_topic_dist(model, corpus, True)


In [16]:
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2)
X_tsne = tsne.fit_transform(top_dist)

In [17]:
df1['X_tsne'] =X_tsne[:, 0]
df1['Y_tsne'] =X_tsne[:, 1]
df1['clusters'] = lda_keys

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [18]:
cluster_colors = {0: 'blue', 1: 'green', 2: 'yellow', 3: 'red', 4: 'skyblue', 5:'salmon', 6:'orange', 7:'maroon', 8:'crimson', 9:'black', 10:'gray'}

df1['colors'] = df1['clusters'].apply(lambda l: cluster_colors[l])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [19]:
df1

Unnamed: 0.1,Unnamed: 0,alltext,brand,category,description,img_2,img_id,img_url,material,price,product_name,website,id,X_tsne,Y_tsne,clusters,colors
3136,3136,ASOS DESIGN blouse with frill shoulder Lightwe...,,Top,"Lightweight smooth woven fabric,Granded collar...",,,https://images.asos-media.com/products/asos-de...,Main: 100% Polyester.,$40.00,ASOS DESIGN blouse with frill shoulder,ASOS,3136,-1.018314,-66.411041,3,red
3137,3137,A Star Is Born Going Out Festival High Neck Bo...,,Top,"Not just any body,High neck,Tassel trim,Hey, f...",,,https://images.asos-media.com/products/a-star-...,"Smooth lightweight fabric,Soft-touch style,Fab...",$108.00,A Star Is Born Going Out Festival High Neck Bo...,ASOS,3137,-31.656099,-15.553932,4,skyblue
3138,3138,Converse Cons Skate Boarding Long Sleeve T-Shi...,,Top,"Printed design to chest and sleeves,The bigger...",,,https://images.asos-media.com/products/convers...,"Soft-touch jersey ,You can count on me ,Body: ...",$36.00,Converse Cons Skate Boarding Long Sleeve T-Shirt,ASOS,3138,-22.886162,87.558418,5,salmon
3139,3139,Vans White Finish Line Heritage Long Sleeve T-...,,Top,"Nothing beats a fresh tee,Crew neck,Long sleev...",,,https://images.asos-media.com/products/vans-wh...,"Soft-touch jersey,T-shirt fabric, in other wor...",$36.00,Vans White Finish Line Heritage Long Sleeve T-...,ASOS,3139,-38.791687,77.826698,5,salmon
3140,3140,ASOS DESIGN x glaad& Curve t-shirt with high n...,,Top,We’re partnering with GLAAD in support of the ...,,,https://images.asos-media.com/products/asos-de...,"Soft-touch jersey,Why complicate things?,Main:...",$35.00,ASOS DESIGN x glaad& Curve t-shirt with high n...,ASOS,3140,65.291359,0.773236,2,yellow
3141,3141,ASOS DESIGN Tall t-shirt with textured neon ci...,,Top,"You can never have too many,Crew neck,Neon pri...",,,https://images.asos-media.com/products/asos-de...,"Go-with-everything jersey,It’s a soft all-roun...",$19.00,ASOS DESIGN Tall t-shirt with textured neon ci...,ASOS,3141,23.991770,33.961231,5,salmon
3142,3142,adidas Training All Me Vfa Bra In Black Gym mo...,,Top,"Gym motivation starts with a good kit,Quick dr...",,,https://images.asos.com/webcontent/greenroom/v...,"Smooth stretch fabric ,Uses adidas climalite t...",$46.00,adidas Training All Me Vfa Bra In Black,ASOS,3142,1.874848,22.459082,1,green
3143,3143,Converse Long Sleeve T-Shirt In Pink Crew neck...,,Top,"Crew neck,It's classic you,Converse print to c...",,,https://images.asos-media.com/products/convers...,"Soft-touch cotton,Nail the basics,Body: 100% C...",$30.00,Converse Long Sleeve T-Shirt In Pink,ASOS,3143,-29.134439,83.741135,5,salmon
3144,3144,ASOS DESIGN Curve bardot shirred top Off-shoul...,,Top,"Off-shoulder style,Long sleeves,Shirred-stretc...",,,https://images.asos-media.com/products/asos-de...,"Soft-touch jersey,You can count on me,Main: 10...",$29.00,ASOS DESIGN Curve bardot shirred top,ASOS,3144,-9.914086,2.079219,1,green
3145,3145,Calvin Klein Jeans box logo t shirt Wear it ag...,,Top,"Wear it again and again,Crew neck,Box logo to ...",,,https://images.asos-media.com/products/calvin-...,"Basic jersey,Why complicate things?,Body: 100%...",$36.00,Calvin Klein Jeans box logo t shirt,ASOS,3145,11.284685,40.044205,5,salmon


In [20]:
from bokeh.plotting import figure, show, output_notebook, save
from bokeh.models import HoverTool, value, LabelSet, Legend, ColumnDataSource
output_notebook()

In [21]:
source = ColumnDataSource(dict(
    x=df1['X_tsne'],
    y=df1['Y_tsne'],
    color=df1['colors'],
    label=df1['clusters'],
    topic_cluster= df1['clusters'],
    title= df1[u'product_name'],
    img_url = df1['img_url'],
    website = df1['website']
))

In [22]:
title = 'T-SNE visualization of topics'

plot_lda = figure(plot_width=1000, plot_height=600,
                     title=title, tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
                     x_axis_type=None, y_axis_type=None, min_border=1)

plot_lda.scatter(x='x', y='y', legend='label', source=source, color='color', alpha=0.8, size=10)

# hover tools
hover = plot_lda.select(dict(type=HoverTool))
hover.tooltips = {"content": "Prod_name: @title, website: @website - Topic: @topic_cluster "}
plot_lda.legend.location = "top_left"

show(plot_lda)

#save the plot
save(plot_lda, '{}.html'.format(title))

  warn("save() called but no resources were supplied and output_file(...) was never called, defaulting to resources.CDN")
  warn("save() called but no title was supplied and output_file(...) was never called, using default title 'Bokeh Plot'")


'/Users/kellyho/Desktop/capstone/Kelly/T-SNE visualization of topics.html'