In [94]:
import pandas as pd
pd.options.display.max_columns = 30
import numpy as np
import re

import cufflinks
cufflinks.go_offline()
cufflinks.set_config_file(world_readable=True, theme='solar')

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

from sklearn.feature_extraction.text import CountVectorizer

In [116]:
df = pd.read_json('../../data/interim/final/products.json.gz', orient="records", compression="gzip")

In [117]:
df.head()

Unnamed: 0,product_id,name,description,price,image_url
0,B00001W0DG,Sony MDR-V500DJ Monitor Series Headphones with...,Revel in high-quality audio with the MDR-V500D...,6.61,[https://images-na.ssl-images-amazon.com/image...
1,B00004TLW2,Fujifilm MX2900 2.3MP Digital Camera w/ 3x Opt...,The FujiFilm MX-2900 digital camera includes s...,112.27,[https://images-na.ssl-images-amazon.com/image...
2,B00004VUM1,Sony MVC-FD95 Mavica 2MP Digital Camera with 1...,w/ Canon SELPHY CP760 Compact Photo Printer 32...,99.0,[https://images-na.ssl-images-amazon.com/image...
3,B00004WFYN,Plantronics H141 Duoset Convertible Headset (D...,- Convertible headset<br />- Quick disconnect ...,59.0,[https://images-na.ssl-images-amazon.com/image...
4,B00004XSHN,Fujifilm FinePix 4900 4.3MP Digital Camera w/ ...,Fuji's FinePix 4900 is one of a new style of c...,60.0,[https://images-na.ssl-images-amazon.com/image...


In [118]:
df['keywords'] = df['name'] + ' ' + df['description']

In [120]:
def remove_html_tags(text: str):
    return re.sub(re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});'), '', text)

df['keywords'] = df['keywords'].apply(remove_html_tags)

In [121]:
def get_top_n_words(corpus, n, remove_stop_words = True, ngram_from = 1, ngram_to = 1):
    if remove_stop_words:
        vec = CountVectorizer(stop_words='english', ngram_range=(ngram_from, ngram_to))
    else:
        vec = CountVectorizer(ngram_range=(ngram_from, ngram_to))
    
    return get_top_n_words_2(vec, corpus, n)

def get_top_n_words_2(vec, corpus, n):
    vec = vec.fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]

Visualize Frequency Distribution Before Removing Stop Words

In [122]:
common_words = get_top_n_words(df['keywords'], n=10, remove_stop_words=False)

df1 = pd.DataFrame(common_words, columns = ['keywords' , 'count'])

df1.groupby('keywords').sum()['count'].sort_values().iplot(
    kind='barh', yTitle='Count', linecolor='black', title='Top 10 words in product keywords before removing stop words'
)

Visualize Frequency Distribution After Removing Stop Words

In [123]:
common_words = get_top_n_words(df['keywords'], n=10)

df2 = pd.DataFrame(common_words, columns = ['keywords' , 'count'])

df2.groupby('keywords').sum()['count'].sort_values().iplot(
    kind='barh', yTitle='Count', linecolor='black', title='Top 10 words in product keywords after removing stop words'
)

Bigrams Frequency Distribution Before Removing Stop Word

In [125]:
common_words = get_top_n_words(df['keywords'], n=10, remove_stop_words=False, ngram_from=2, ngram_to=2)

df3 = pd.DataFrame(common_words, columns = ['keywords' , 'count'])

df3.groupby('keywords').sum()['count'].sort_values().iplot(
    kind='barh', yTitle='Count', linecolor='black', title='Top 10 bigrams in product keywords before removing stop words'
)

Bigrams Frequency Distribution After Removing Stop Word

In [126]:
common_words = get_top_n_words(df['keywords'], n=10, remove_stop_words=True, ngram_from=2, ngram_to=2)

df4 = pd.DataFrame(common_words, columns = ['keywords' , 'count'])

df4.groupby('keywords').sum()['count'].sort_values().iplot(
    kind='barh', yTitle='Count', linecolor='black', title='Top 10 bigrams in product keywords after removing stop words'
)

Trigrams Frequency Distribution Before Removing Stop Words

In [127]:
common_words = get_top_n_words(df['keywords'], n=10, remove_stop_words=False, ngram_from=3, ngram_to=3)

df5 = pd.DataFrame(common_words, columns = ['keywords' , 'count'])

df5.groupby('keywords').sum()['count'].sort_values().iplot(
    kind='barh', yTitle='Count', linecolor='black', title='Top 10 trigrams in product keywords before removing stop words'
)

Trigrams Frequency Distribution After Removing Stop Words

In [138]:
common_words = get_top_n_words(df['keywords'], n=10, remove_stop_words=True, ngram_from=3, ngram_to=3)

df6 = pd.DataFrame(common_words, columns = ['keywords' , 'count'])

df6.groupby('keywords').sum()['count'].sort_values().iplot(
    kind='barh', yTitle='Count', linecolor='black', title='Top 10 trigrams in product keywords after removing stop words'
)

Description Length Distribution

In [130]:
df['word_count'] = df['keywords'].apply(lambda x: len(str(x).split()))

In [132]:
desc_lengths = list(df['word_count'])

print(
    "Number of keywords:",len(desc_lengths),
    "\nAverage word count", np.average(desc_lengths),
    "\nMinimum word count", min(desc_lengths),
    "\nMaximum word count", max(desc_lengths)
)

Number of keywords: 1798 
Average word count 121.98998887652948 
Minimum word count 3 
Maximum word count 368


In [134]:
df['word_count'].iplot(
    kind='hist',
    bins = 50,
    linecolor='black',
    xTitle='word count',
    yTitle='count',
    title='Word Count Distribution in Product Keywords')