In [1]:
# Data handling
import pandas as pd
pd.options.display.max_columns = 30  # display up to 30 columns when rendering a DataFrame
import numpy as np

# cufflinks supplies the .iplot() method used by the plotting cells below
import cufflinks
cufflinks.go_offline()  # presumably switches cufflinks to offline plotting — confirm against cufflinks docs
# NOTE(review): world_readable=True looks like a sharing/visibility setting written to the
# cufflinks config file — confirm it is intended for this project.
cufflinks.set_config_file(world_readable=True, theme='solar')

# Show the value of every expression statement in a cell, not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

# Token / n-gram counting
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Load the processed products file (gzip-compressed, records-oriented JSON) into a DataFrame.
df = pd.read_json('../../data/processed/products.json.gz', orient="records", compression="gzip")

In [3]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,product_id,keywords
0,B00001W0DG,sony mdrv500dj monitor series headphone swivel...
1,B00004TLW2,fujifilm mx2900 23mp digital camera 3x optical...
2,B00004VUM1,sony mvcfd95 mavica 2mp digital camera 10x opt...
3,B00004WFYN,plantronics h141 duoset convertible headset di...
4,B00004XSHN,fujifilm finepix 4900 43mp digital camera 6x o...


In [4]:
def get_top_n_words(corpus, n, remove_stop_words = True, ngram_from = 1, ngram_to = 1):
    """Return the ``n`` most frequent n-grams found in ``corpus``.

    Args:
        corpus: Iterable of text documents.
        n: Maximum number of (term, count) pairs to return.
        remove_stop_words: When truthy, the vectorizer is built with
            ``stop_words='english'``; otherwise no stop-word list is used.
        ngram_from: Lower bound of the n-gram range (1 = single words).
        ngram_to: Upper bound of the n-gram range.

    Returns:
        Whatever ``get_top_n_words_2`` returns for the configured vectorizer.
    """
    # None is CountVectorizer's default, i.e. "do not filter stop words".
    stop_words = 'english' if remove_stop_words else None
    vectorizer = CountVectorizer(stop_words=stop_words, ngram_range=(ngram_from, ngram_to))
    return get_top_n_words_2(vectorizer, corpus, n)


def get_top_n_words_2(vec, corpus, n):
    """Return the ``n`` most frequent vocabulary terms of ``corpus``.

    Args:
        vec: An unfitted vectorizer exposing ``fit_transform`` and, once
            fitted, a ``vocabulary_`` mapping of term -> column index
            (e.g. scikit-learn's ``CountVectorizer``). It is fitted in place.
        corpus: Iterable of text documents.
        n: Maximum number of terms to return.

    Returns:
        A list of ``(term, total_count)`` tuples sorted by descending count,
        truncated to the first ``n`` entries.
    """
    # fit_transform learns the vocabulary and builds the document-term matrix
    # in a single pass; fit() followed by transform() processes the corpus twice.
    bag_of_words = vec.fit_transform(corpus)
    # Summing over axis 0 gives the total count per term (indexed below as row 0).
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]

Unigram (Single-Word) Frequency Distribution

In [5]:
# Top 10 single words across all product keyword strings, English stop words removed.
common_words = get_top_n_words(df['keywords'], n=10, remove_stop_words=True)

df1 = pd.DataFrame(common_words, columns = ['keywords' , 'count'])

# Each term occurs exactly once in common_words, so indexing by term is enough
# (no groupby/sum needed). On a horizontal bar chart the counts run along the
# x-axis, so the axis label is passed as xTitle rather than yTitle.
df1.set_index('keywords')['count'].sort_values().iplot(
    kind='barh', xTitle='Count', linecolor='black', title='Top 10 words in product keywords'
)

Bigrams Frequency Distribution

In [6]:
# Top 10 two-word sequences (bigrams) across all product keyword strings, English stop words removed.
common_words = get_top_n_words(df['keywords'], n=10, remove_stop_words=True, ngram_from=2, ngram_to=2)

df2 = pd.DataFrame(common_words, columns = ['keywords' , 'count'])

# Each bigram occurs exactly once in common_words, so indexing by term is enough
# (no groupby/sum needed). On a horizontal bar chart the counts run along the
# x-axis, so the axis label is passed as xTitle rather than yTitle.
df2.set_index('keywords')['count'].sort_values().iplot(
    kind='barh', xTitle='Count', linecolor='black', title='Top 10 bigrams in product keywords'
)

Trigrams Frequency Distribution

In [7]:
# Top 10 three-word sequences (trigrams) across all product keyword strings, English stop words removed.
common_words = get_top_n_words(df['keywords'], n=10, remove_stop_words=True, ngram_from=3, ngram_to=3)

df3 = pd.DataFrame(common_words, columns = ['keywords' , 'count'])

# Each trigram occurs exactly once in common_words, so indexing by term is enough
# (no groupby/sum needed). On a horizontal bar chart the counts run along the
# x-axis, so the axis label is passed as xTitle rather than yTitle.
df3.set_index('keywords')['count'].sort_values().iplot(
    kind='barh', xTitle='Count', linecolor='black', title='Top 10 trigrams in product keywords'
)

Keyword Length Distribution (Word Count)

In [8]:
# Count the whitespace-separated tokens in each row's keyword string;
# str() lets non-string cells be counted instead of raising.
df['word_count'] = df['keywords'].map(lambda text: len(str(text).split()))

In [9]:
# Summary statistics over the per-row word counts computed above.
desc_lengths = df['word_count'].tolist()

print(
    "Number of keywords:", len(desc_lengths),
    "\nAverage word count", np.average(desc_lengths),
    "\nMinimum word count", min(desc_lengths),
    "\nMaximum word count", max(desc_lengths),
)

Number of keywords: 1798 
Average word count 89.01890989988877 
Minimum word count 3 
Maximum word count 276


In [10]:
# Histogram (50 bins) of the per-row word counts stored in df['word_count'].
df['word_count'].iplot(
    kind='hist',
    bins = 50,
    linecolor='black',
    xTitle='word count',
    yTitle='count',
    title='Word Count Distribution in Product Keywords')

In [11]:
# Print the full keyword text of the rows labelled 1..9 for a manual look at the raw strings.
# .loc fetches each cell directly by index label instead of building a boolean
# mask over the whole frame (and a temporary sub-frame) on every iteration.
for i in range(1, 10):
    print(i, ':', df.loc[i, 'keywords'])

1 : fujifilm mx2900 23mp digital camera 3x optical zoom bundle fujifilm mx2900 digital camera include best feature digital image 23 megapixels pack 117inch ccd camera offer ability create high resolution image large 1800 1200 pixel print photoquality image 10 inch primary color filter auto white balance maintain realistic image want creategrass stay green sky remain blue optical 3x zoom lens introduces versatility image capture delivers coverage equivalent 35105mm range 35mm camera closeups portrait look feel traditional photograph camera boots time interval frame short wont miss shot
2 : sony mvcfd95 mavica 2mp digital camera 10x optical zoom canon selphy cp760 compact photo printer 32 mb multimediacard sd memory card sdhc memory card multimediacardplus f2849product typedigital camera compact width34 depth09 height22 weight
3 : plantronics h141 duoset convertible headset discontinue manufacturer convertible headset quick disconnect plm22 plp10 adapter require convertible overtheear ov