In [100]:
import pandas as pd
# Show up to 30 columns when a DataFrame is displayed.
pd.options.display.max_columns = 30
import numpy as np

import cufflinks
# Switch cufflinks/plotly `.iplot()` rendering to offline mode.
cufflinks.go_offline()
# NOTE(review): world_readable presumably only matters for online-hosted plots —
# confirm whether it is still needed alongside go_offline().
cufflinks.set_config_file(world_readable=True, theme='solar')

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every expression statement in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'

from sklearn.feature_extraction.text import CountVectorizer

In [101]:
# Load the gzip-compressed, records-oriented JSON file of processed reviews.
df = pd.read_json(
    '../../data/processed/reviews.json.gz',
    orient="records",
    compression="gzip",
)

In [102]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,cleaned_review,sentiment
0,get run dual monitor second time purchase moni...,1
1,good hop music low phone volume pretty good bl...,1
2,appreciate product need buy one promotion need...,1
3,get pay three star,1
4,arrive day work great recommend others work gr...,1


In [103]:
def get_top_n_words(corpus, n, remove_stop_words = True, ngram_from = 1, ngram_to = 1):
    """Return the ``n`` most frequent n-grams in ``corpus`` with their counts.

    Builds a ``CountVectorizer`` covering n-gram sizes ``ngram_from`` through
    ``ngram_to`` and hands counting and ranking over to ``get_top_n_words_2``.

    Parameters
    ----------
    corpus : iterable of str
        Documents whose terms are counted.
    n : int
        How many of the top-ranked terms to return.
    remove_stop_words : bool, default True
        When truthy, the vectorizer is created with ``stop_words='english'``;
        otherwise no stop-word list is passed.
    ngram_from, ngram_to : int, default 1
        Lower and upper bound passed as the vectorizer's ``ngram_range``.

    Returns
    -------
    Whatever ``get_top_n_words_2`` returns for the configured vectorizer:
    a list of ``(term, count)`` tuples, most frequent first.
    """
    # ``None`` is CountVectorizer's own default for stop_words, i.e. no filtering.
    stop_word_list = 'english' if remove_stop_words else None
    vectorizer = CountVectorizer(
        stop_words=stop_word_list,
        ngram_range=(ngram_from, ngram_to),
    )
    return get_top_n_words_2(vectorizer, corpus, n)


def get_top_n_words_2(vec, corpus, n):
    """Rank the vocabulary of ``corpus`` by total occurrence count.

    Parameters
    ----------
    vec : vectorizer
        A scikit-learn style vectorizer (e.g. ``CountVectorizer``). It is
        fitted on ``corpus`` by this call and must provide ``fit_transform``
        and, once fitted, a ``vocabulary_`` mapping of term -> column index.
    corpus : iterable of str
        Documents to vectorize. One-shot iterables (generators) are fine
        because the corpus is traversed only once.
    n : int
        Number of leading entries to keep; applied as the slice ``[:n]``.

    Returns
    -------
    list of (term, total) tuples
        Sorted by descending total. Terms with equal totals keep the order
        in which they appear in ``vec.vocabulary_``. Totals are plain Python
        numbers rather than NumPy scalars.
    """
    # Learn the vocabulary and build the document-term matrix in one pass;
    # calling fit() and then transform() would iterate the corpus twice and
    # yields all-zero counts when the corpus is a generator.
    bag_of_words = vec.fit_transform(corpus)

    # Column sums give the corpus-wide total per vocabulary index. The sum
    # comes back with shape (1, n_terms); flatten it once instead of indexing
    # the 2-D result for every term, and convert to native Python scalars.
    term_totals = np.asarray(bag_of_words.sum(axis=0)).ravel().tolist()

    words_freq = [(word, term_totals[idx]) for word, idx in vec.vocabulary_.items()]
    # list.sort is stable, so ties retain vocabulary order.
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]

In [104]:
# Rename the review-text column; the cells below refer to it as 'keywords'.
df = df.rename(columns={"cleaned_review":"keywords"})

Unigram (Single-Word) Frequency Distribution

In [105]:
# Ten most frequent single words across all reviews, English stop words removed.
common_words = get_top_n_words(df['keywords'], n=10, remove_stop_words=True)

df1 = pd.DataFrame(common_words, columns = ['keywords' , 'count'])

# kind='barh' draws horizontal bars: the terms sit on the y-axis and the counts
# run along the x-axis, so the 'Count' label must be passed as xTitle.
df1.groupby('keywords').sum()['count'].sort_values().iplot(
    kind='barh', xTitle='Count', linecolor='black', title='Top 10 words in review keywords'
)

Bigram Frequency Distribution

In [106]:
# Ten most frequent two-word sequences across all reviews, English stop words removed.
common_words = get_top_n_words(df['keywords'], n=10, remove_stop_words=True, ngram_from=2, ngram_to=2)

df2 = pd.DataFrame(common_words, columns = ['keywords' , 'count'])

# kind='barh' draws horizontal bars: the bigrams sit on the y-axis and the counts
# run along the x-axis, so the 'Count' label must be passed as xTitle.
df2.groupby('keywords').sum()['count'].sort_values().iplot(
    kind='barh', xTitle='Count', linecolor='black', title='Top 10 bigrams in review keywords'
)

Trigram Frequency Distribution

In [107]:
# Ten most frequent three-word sequences across all reviews, English stop words removed.
common_words = get_top_n_words(df['keywords'], n=10, remove_stop_words=True, ngram_from=3, ngram_to=3)

df3 = pd.DataFrame(common_words, columns = ['keywords' , 'count'])

# kind='barh' draws horizontal bars: the trigrams sit on the y-axis and the counts
# run along the x-axis, so the 'Count' label must be passed as xTitle.
df3.groupby('keywords').sum()['count'].sort_values().iplot(
    kind='barh', xTitle='Count', linecolor='black', title='Top 10 trigrams in review keywords'
)

Review Length Distribution (Word Count)

In [108]:
# Word count per row: number of whitespace-separated tokens in the text.
# str() lets non-string cells be counted instead of raising on .split().
df['word_count'] = df['keywords'].map(lambda text: len(str(text).split()))

In [109]:
# Materialise the per-row word counts as a plain list for the summary below.
desc_lengths = list(df['word_count'])

# Report how many rows there are and the mean / min / max word count.
# print() joins its arguments with a space, so each "\n..." label starts a new line.
print(
    "Number of keywords:",len(desc_lengths),
    "\nAverage word count", np.average(desc_lengths),
    "\nMinimum word count", min(desc_lengths),
    "\nMaximum word count", max(desc_lengths)
)

Number of keywords: 59558 
Average word count 36.492310017126165 
Minimum word count 1 
Maximum word count 208


In [110]:
# Histogram of the per-row word counts, bucketed into 50 bins.
df['word_count'].iplot(
    kind='hist', bins = 50, linecolor='black',
    xTitle='word count', yTitle='count',
    title='Word Count Distribution in Product Keywords',
)