In [2]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [46]:
data = pd.read_csv("C:/Users/janva/Documents/Git projects/amazon_nfu/gen/output/amazon_usa_clean.csv", sep = ",")

In [49]:
my_stopwords = nltk.corpus.stopwords.words('english')
word_rooter = nltk.stem.snowball.PorterStemmer(ignore_stopwords=False).stem
my_punctuation = '!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~•@'

In [51]:
def clean_review(review, bigrams=False):
    review = str(review)
    review = review.lower() # lower case
    review = re.sub('['+my_punctuation + ']+', ' ', review) # strip punctuation
    review = re.sub('\s+', ' ', review) #remove double spacing
    review = re.sub('([0-9]+)', '', review) # remove numbers
    review_token_list = [word for word in review.split(' ')
                            if word not in my_stopwords] # remove stopwords

    review_token_list = [word_rooter(word) if '#' not in word else word
                        for word in review_token_list] # apply word rooter
    if bigrams:
        review_token_list = review_token_list+[review_token_list[i]+'_'+review_token_list[i+1]
                                            for i in range(len(review_token_list)-1)]
    review = ' '.join(review_token_list)
    return review

In [52]:
data['review'] = data.review.apply(clean_review)

In [14]:
data['review']

0         skeptic get one renew iphon decid tri order ip...
1         definit skeptic come pixel xl og nexu p phone ...
2         appl phone alway great phone phone  day return...
3                                      mom love bought gift
4         surprisingli much faster pixel  best phone eve...
                                ...                        
128484    muy mala compra cada vez q está en una aplicac...
128485                             won’t hold charg charg  
128486    recommend cubot king  rug smartphon list g lte...
128487    phone work  batteri need replac doesnt last lo...
128488                                                  nan
Name: review, Length: 128489, dtype: object

In [54]:
low_nfu = data[data['median_variant_parent'] == 'low NFU']
high_nfu = data[data['median_variant_parent'] == 'high NFU']

In [55]:
print(len(low_nfu))
print(len(high_nfu))

67041
61448


In [56]:
# the vectorizer object will be used to transform text to vector form
vectorizer = CountVectorizer(max_df=0.8, min_df=25, token_pattern='\w+|\$[\d\.]+|\S+')

In [60]:
def display_topics(data, no_top_words):
    tf = vectorizer.fit_transform(data['review']).toarray()

    # tf_feature_names tells us what word each column in the matric represents
    tf_feature_names = vectorizer.get_feature_names()
    
    model = LatentDirichletAllocation(n_components=number_of_topics, random_state=0)
    
    model.fit(tf)
    
    topic_dict = {}
    i = 0
    for topic_idx, topic in enumerate(model.components_):
        topic_dict["Topic %d words" % (topic_idx)]= ['{}'.format(tf_feature_names[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
        topic_dict["Topic %d weights" % (topic_idx)]= ['{:.1f}'.format(topic[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
        print(f"{i}/10")
        i += 1
    return pd.DataFrame(topic_dict)

In [61]:
number_of_topics = 10

low_nfu_topics = display_topics(data = low_nfu, no_top_words = 10)

high_nfu_topics = display_topics(data = high_nfu, no_top_words = 10)

0/10
1/10
2/10
3/10
4/10
5/10
6/10
7/10
8/10
9/10
0/10
1/10
2/10
3/10
4/10
5/10
6/10
7/10
8/10
9/10


In [64]:
type(low_nfu_topics)

pandas.core.frame.DataFrame

In [62]:
low_nfu_topics

Unnamed: 0,Topic 0 words,Topic 0 weights,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights,Topic 3 words,Topic 3 weights,Topic 4 words,Topic 4 weights,Topic 5 words,Topic 5 weights,Topic 6 words,Topic 6 weights,Topic 7 words,Topic 7 weights,Topic 8 words,Topic 8 weights,Topic 9 words,Topic 9 weights
0,batteri,12107.2,phone,14591.0,phone,7846.9,phone,11559.5,phone,10043.7,phone,12621.7,la,1898.1,phone,7320.7,phone,19703.1,phone,9047.3
1,phone,8729.0,work,4495.8,new,6357.1,sim,6280.5,screen,4820.9,call,5389.5,el,1883.1,screen,5406.6,great,8601.6,camera,4388.1
2,life,6289.8,return,3578.2,work,4692.5,work,5590.5,one,3418.9,use,4886.6,de,1875.1,use,2351.3,love,7274.8,android,3328.1
3,good,6241.5,month,2805.2,great,4052.7,unlock,5075.5,get,2212.1,text,3147.0,que,1475.1,speaker,2309.1,price,3507.3,g,2966.0
4,use,4033.4,would,2399.2,came,3509.8,card,4682.6,new,2162.9,app,2699.2,lo,1246.1,like,2219.0,like,2817.9,use,2936.8
5,charg,3940.0,issu,2327.1,condit,3391.6,’t,4261.1,like,1852.2,g,2694.4,en,1164.1,button,2212.7,use,2521.3,pixel,2924.5
6,day,3361.3,amazon,2273.0,look,3123.8,verizon,3811.4,back,1846.1,get,2601.5,es,1080.1,sound,1746.7,work,2380.2,get,2381.4
7,great,2884.5,get,2224.9,good,3052.2,’,3136.7,got,1659.7,work,2305.3,muy,953.1,case,1530.8,good,2147.2,samsung,2344.1
8,last,2792.1,back,2188.6,product,2940.8,mobil,2730.4,case,1596.0,set,2131.4,se,868.6,camera,1454.5,one,2051.7,better,2096.0
9,work,2596.2,day,2183.6,like,2936.1,carrier,2272.8,buy,1536.7,need,1849.2,un,802.1,good,1353.5,bought,2025.4,like,1999.1


In [63]:
high_nfu_topics

Unnamed: 0,Topic 0 words,Topic 0 weights,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights,Topic 3 words,Topic 3 weights,Topic 4 words,Topic 4 weights,Topic 5 words,Topic 5 weights,Topic 6 words,Topic 6 weights,Topic 7 words,Topic 7 weights,Topic 8 words,Topic 8 weights,Topic 9 words,Topic 9 weights
0,’t,5994.2,phone,14708.4,good,6199.2,screen,6339.0,phone,9748.9,phone,10361.8,phone,12701.1,el,2134.1,phone,2693.0,phone,11232.1
1,phone,5497.4,work,4195.3,phone,5488.1,phone,3650.0,new,7293.7,great,7995.6,batteri,5482.7,la,1988.1,use,2368.6,work,3949.0
2,’,5255.5,unlock,3355.5,great,1950.7,charger,2589.0,batteri,3968.1,work,7585.9,use,3728.3,de,1983.1,app,1948.2,call,2334.2
3,it,3986.9,sim,3309.1,nice,1820.3,case,2134.0,like,3628.8,new,4471.6,camera,3674.3,que,1541.1,get,1502.4,charg,2148.8
4,i,3672.7,servic,2584.2,work,1735.7,protector,2132.5,one,3581.4,condit,4098.7,great,2977.0,en,1434.1,updat,1485.4,time,2075.8
5,love,2391.0,card,2567.0,use,1733.3,came,1811.8,buy,3410.4,came,3110.8,life,2857.1,lo,1410.1,set,1367.5,get,1916.1
6,’m,2189.0,verizon,2458.9,love,1577.2,scratch,1756.0,iphon,3242.2,look,2540.6,last,2212.5,muy,1202.1,iphon,1295.0,return,1867.7
7,batteri,2064.5,g,2154.5,like,1481.1,charg,1675.1,look,2808.7,like,2419.4,day,2144.6,es,1090.1,devic,1114.8,month,1840.4
8,work,1993.3,mobil,2108.9,batteri,1238.2,come,1401.2,brand,2648.4,purchas,2206.9,like,1697.4,con,953.9,one,1094.3,back,1664.0
9,came,1694.0,use,2089.5,realli,1193.8,box,1280.8,amazon,2426.9,product,2182.0,good,1642.5,un,890.1,like,1075.3,speaker,1629.9


In [67]:
high_nfu_topics.to_csv("C:/Users/janva/Documents/Git projects/amazon_nfu/gen/paper/high_nfu_topics.csv", sep = ";")
low_nfu_topics.to_csv("C:/Users/janva/Documents/Git projects/amazon_nfu/gen/paper/low_nfu_topics.csv", sep = ";")