In [24]:
import pandas as pd
pd.options.display.max_columns = 30
import numpy as np
import re

import cufflinks
cufflinks.go_offline()
cufflinks.set_config_file(world_readable=True, theme='solar')

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

from sklearn.feature_extraction.text import CountVectorizer

In [25]:
# Load the gzip-compressed, records-oriented JSON export of the reviews.
df = pd.read_json(
    '../../data/interim/final/reviews.json.gz',
    compression="gzip",
    orient="records",
)

In [26]:
# Preview the first five rows to check the loaded columns.
df.head(n=5)

Unnamed: 0,user_id,product_id,ratings,review_text,summary,created_at
0,A0203183BAH3TR08FZGB,B0043T7FHK,5,I got this to run as a dual monitor. This is ...,This is my second time purchasing this monitor...,2015-06-30
1,A0261431Y0V4MHWY4B7W,B00AFH2E8E,4,"Not as good as I had hoped, music is very low,...",Bluetooth headset,2014-08-03
2,A034116598G557EYZ9BC,B0013FRNKG,5,Appreciate if product\nNeed to buy one more if...,great value,2012-11-28
3,A0404374X0HL5T332XSN,B00MNOPS1C,3,You get what you pay for,Three Stars,2016-02-02
4,A0431622H67YR5IPJRN,B0058UUR6E,5,Arrived in 2 days. working great. Recommend to...,working great. Recommend to others,2015-03-11


In [27]:
# Combine the review body and its summary into one text field per review.
# Missing values are replaced with '' first: str + NaN would produce NaN, and
# the regex clean-up applied to this column afterwards only accepts strings.
df['keywords'] = df['review_text'].fillna('') + ' ' + df['summary'].fillna('')

In [28]:
# Compiled once instead of on every call. Matches either
#   * a tag-like span: '<', the shortest run of non-newline characters, '>'
#   * a character entity: '&name;', '&#123;' (decimal) or '&#x1F;' (hex)
# IGNORECASE lets entities with upper-case letters or hex digits
# (e.g. '&Eacute;', '&#x1F600;', '&#X41;') match as well.
_HTML_NOISE_RE = re.compile(
    r'<.*?>|&(?:[a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});',
    re.IGNORECASE,
)


def remove_html_tags(text: str) -> str:
    """Return ``text`` with HTML tags and character entities removed.

    Every match is replaced by the empty string, so the text on either side
    of a removed tag or entity is joined directly (no space is inserted).

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` without tag-like spans and entities.

    Raises:
        TypeError: If ``text`` is not a string (raised by the ``re`` module).
    """
    return _HTML_NOISE_RE.sub('', text)

# Strip HTML markup from every combined review text.
df['keywords'] = df['keywords'].map(remove_html_tags)

In [29]:
def get_top_n_words(corpus, n, remove_stop_words = True, ngram_from = 1, ngram_to = 1):
    """Return the ``n`` most frequent n-grams in ``corpus``.

    Builds a ``CountVectorizer`` for the requested n-gram range and hands it
    to ``get_top_n_words_2`` for counting and ranking.

    Args:
        corpus: Iterable of text documents.
        n: Number of (term, count) pairs to return.
        remove_stop_words: When true, the vectorizer is created with
            ``stop_words='english'``; otherwise no stop-word filter is set.
        ngram_from: Lower bound of the n-gram range.
        ngram_to: Upper bound of the n-gram range.

    Returns:
        Whatever ``get_top_n_words_2`` returns for the configured vectorizer.
    """
    # None is CountVectorizer's default, i.e. no stop-word filtering.
    stop_words = 'english' if remove_stop_words else None
    vectorizer = CountVectorizer(
        stop_words=stop_words,
        ngram_range=(ngram_from, ngram_to),
    )
    return get_top_n_words_2(vectorizer, corpus, n)

def get_top_n_words_2(vec, corpus, n):
    """Fit ``vec`` on ``corpus`` and return its ``n`` most frequent terms.

    Args:
        vec: An unfitted vectorizer exposing ``fit_transform`` and, once
            fitted, a ``vocabulary_`` mapping of term -> column index.
        corpus: Iterable of text documents.
        n: Number of leading (term, count) pairs to keep.

    Returns:
        A list of ``(term, total_count)`` tuples sorted by count, highest
        first, truncated to the first ``n`` entries.
    """
    # One pass learns the vocabulary and builds the document-term matrix;
    # calling fit() and then transform() would tokenise the corpus twice.
    bag_of_words = vec.fit_transform(corpus)
    # Column sums are the total occurrences of each term. Flatten to 1-D so
    # indexing works whether sum() hands back an np.matrix or an ndarray.
    totals = np.asarray(bag_of_words.sum(axis=0)).ravel()
    words_freq = [(word, totals[idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]

Visualize Frequency Distribution Before Removing Stop Words

In [30]:
# Ten most frequent single words, stop words included.
common_words = get_top_n_words(df['keywords'], n=10, remove_stop_words=False)

df1 = pd.DataFrame(common_words, columns = ['keywords' , 'count'])

# Horizontal bars put the counts along the x-axis, so the 'Count' title
# belongs on x rather than on the category (y) axis.
df1.groupby('keywords').sum()['count'].sort_values().iplot(
    kind='barh', xTitle='Count', linecolor='black', title='Top 10 words in review keywords before removing stop words'
)

Visualize Frequency Distribution After Removing Stop Words

In [31]:
# Ten most frequent single words with the default stop-word removal.
common_words = get_top_n_words(df['keywords'], n=10)

df2 = pd.DataFrame(common_words, columns = ['keywords' , 'count'])

# Horizontal bars put the counts along the x-axis, so the 'Count' title
# belongs on x rather than on the category (y) axis.
df2.groupby('keywords').sum()['count'].sort_values().iplot(
    kind='barh', xTitle='Count', linecolor='black', title='Top 10 words in review keywords after removing stop words'
)

Bigrams Frequency Distribution Before Removing Stop Words

In [32]:
# Ten most frequent bigrams, stop words included.
common_words = get_top_n_words(df['keywords'], n=10, remove_stop_words=False, ngram_from=2, ngram_to=2)

df3 = pd.DataFrame(common_words, columns = ['keywords' , 'count'])

# Horizontal bars put the counts along the x-axis, so the 'Count' title
# belongs on x rather than on the category (y) axis.
df3.groupby('keywords').sum()['count'].sort_values().iplot(
    kind='barh', xTitle='Count', linecolor='black', title='Top 10 bigrams in review keywords before removing stop words'
)

Bigrams Frequency Distribution After Removing Stop Words

In [33]:
# Ten most frequent bigrams after stop-word removal.
common_words = get_top_n_words(df['keywords'], n=10, remove_stop_words=True, ngram_from=2, ngram_to=2)

df4 = pd.DataFrame(common_words, columns = ['keywords' , 'count'])

# Horizontal bars put the counts along the x-axis, so the 'Count' title
# belongs on x rather than on the category (y) axis.
df4.groupby('keywords').sum()['count'].sort_values().iplot(
    kind='barh', xTitle='Count', linecolor='black', title='Top 10 bigrams in review keywords after removing stop words'
)

Trigrams Frequency Distribution Before Removing Stop Words

In [34]:
# Ten most frequent trigrams, stop words included.
common_words = get_top_n_words(df['keywords'], n=10, remove_stop_words=False, ngram_from=3, ngram_to=3)

df5 = pd.DataFrame(common_words, columns = ['keywords' , 'count'])

# Horizontal bars put the counts along the x-axis, so the 'Count' title
# belongs on x rather than on the category (y) axis.
df5.groupby('keywords').sum()['count'].sort_values().iplot(
    kind='barh', xTitle='Count', linecolor='black', title='Top 10 trigrams in review keywords before removing stop words'
)

Trigrams Frequency Distribution After Removing Stop Words

In [40]:
# Ten most frequent trigrams after stop-word removal.
common_words = get_top_n_words(df['keywords'], n=10, remove_stop_words=True, ngram_from=3, ngram_to=3)

df6 = pd.DataFrame(common_words, columns = ['keywords' , 'count'])

# Horizontal bars put the counts along the x-axis, so the 'Count' title
# belongs on x rather than on the category (y) axis.
df6.groupby('keywords').sum()['count'].sort_values().iplot(
    kind='barh', xTitle='Count', linecolor='black', title='Top 10 trigrams in review keywords after removing stop words'
)

Review Keywords Length Distribution

In [41]:
# Number of whitespace-separated tokens in each review's keywords text.
df['word_count'] = [len(str(text).split()) for text in df['keywords']]

In [42]:
# Plain Python list of the per-review word counts.
desc_lengths = df['word_count'].tolist()

# Summary statistics of the word counts, one per output line.
print(
    "Number of keywords:", len(desc_lengths),
    "\nAverage word count", np.average(desc_lengths),
    "\nMinimum word count", min(desc_lengths),
    "\nMaximum word count", max(desc_lengths),
)

Number of keywords: 59561 
Average word count 74.17058142072833 
Minimum word count 2 
Maximum word count 407


In [44]:
# Histogram of per-review word counts, split into 50 bins.
df['word_count'].iplot(
    kind='hist',
    bins=50,
    title='Word Count Distribution in Review Keywords',
    xTitle='word count',
    yTitle='count',
    linecolor='black',
)