In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from datetime import datetime

In [2]:
data = pd.read_csv("C:/Users/janva/Documents/Git projects/amazon_nfu/gen/output/amazon_usa_clean.csv", sep = ",")

In [3]:
my_stopwords = nltk.corpus.stopwords.words('english')
word_rooter = nltk.stem.snowball.PorterStemmer(ignore_stopwords=False).stem
my_punctuation = '!"$%&\'’()*+,-./:;<=>?[\\]^_`{|}~•@'

In [4]:
my_stopwords.append('phone')

In [5]:
def clean_review(review, bigrams=False):
    review = str(review)
    review = review.lower() # lower case
    review = re.sub('['+my_punctuation + ']+', ' ', review) # strip punctuation
    review = re.sub('\s+', ' ', review) #remove double spacing
    review = re.sub('([0-9]+)', '', review) # remove numbers
    review_token_list = [word for word in review.split(' ')
                            if word not in my_stopwords] # remove stopwords

    review_token_list = [word_rooter(word) if '#' not in word else word
                        for word in review_token_list] # apply word rooter
    if bigrams:
        review_token_list = review_token_list+[review_token_list[i]+'_'+review_token_list[i+1]
                                            for i in range(len(review_token_list)-1)]
    review = ' '.join(review_token_list)
    return review

In [6]:
data['review'] = data.review.apply(clean_review)

In [51]:
data = data[data['brand_overall'] == 'apple']
data

Unnamed: 0.1,Unnamed: 0,variant,Unnamed..0.1,asin_url,brand,model,link,product_title,name,rating,...,uniq_brand_overall,uniq_parent,uniq_sku_parent,uniq_variant_parent,uniq_variant_brand,count_variant_parent,conspicuousness,oldest_review,time_since_oldest_review,median_variant_parent
0,1,black,0,https://www.amazon.com/dp/B07ZPKN6YR,Apple,IPhone 11,https://amazon.com/Apple-iPhone-11-64GB-Black/...,"Apple iPhone 11, 64GB, Black - Unlocked (Renew...",wp,4.0,...,0.323818,718,0.005571,0.813370,0.267239,6,1.0,2022-07-20,38,low NFU
2,3,black,2,https://www.amazon.com/dp/B07ZPKN6YR,Apple,IPhone 11,https://amazon.com/Apple-iPhone-11-64GB-Black/...,"Apple iPhone 11, 64GB, Black - Unlocked (Renew...",Dov,4.0,...,0.323818,718,0.069638,0.813370,0.267239,6,1.0,2022-03-30,141,low NFU
8,9,black,8,https://www.amazon.com/dp/B07ZPKN6YR,Apple,IPhone 11,https://amazon.com/Apple-iPhone-11-64GB-Black/...,"Apple iPhone 11, 64GB, Black - Unlocked (Renew...",Salvatore Ferrigno,5.0,...,0.323818,718,0.019499,0.813370,0.267239,6,1.0,2022-05-25,106,low NFU
9,10,black,9,https://www.amazon.com/dp/B07ZPKN6YR,Apple,IPhone 11,https://amazon.com/Apple-iPhone-11-64GB-Black/...,"Apple iPhone 11, 64GB, Black - Unlocked (Renew...",John Franklin,4.0,...,0.323818,718,0.069638,0.813370,0.267239,6,1.0,2022-03-30,153,low NFU
10,11,black,10,https://www.amazon.com/dp/B07ZPKN6YR,Apple,IPhone 11,https://amazon.com/Apple-iPhone-11-64GB-Black/...,"Apple iPhone 11, 64GB, Black - Unlocked (Renew...",Charis Walker,4.0,...,0.323818,718,0.069638,0.813370,0.267239,6,1.0,2022-03-30,164,low NFU
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128342,128343,,122004,https://www.amazon.com/dp/B079H86JGH,Apple,MJVE2LL/A,https://amazon.com/Apple-MacBook-MJVE2LL-13-in...,Apple MacBook Air MJVE2LL/A 13-inch Laptop 1.6...,Luis Pradel,5.0,...,0.323818,39195,,0.083761,0.020718,16,,2011-12-11,3679,high NFU
128343,128344,,122005,https://www.amazon.com/dp/B079H86JGH,Apple,MJVE2LL/A,https://amazon.com/Apple-MacBook-MJVE2LL-13-in...,Apple MacBook Air MJVE2LL/A 13-inch Laptop 1.6...,Tilda,5.0,...,0.323818,39195,,0.083761,0.020718,16,,2011-12-11,3581,high NFU
128476,128477,,25188,https://www.amazon.com/dp/B077H73WSR,Apple,IPhone 7,https://amazon.com/Apple-iPhone-Fully-Unlocked...,"Apple iPhone 7, Fully Unlocked, 128GB (Renewed...",Teresa,5.0,...,0.323818,39195,,0.083761,0.020718,16,,2011-12-11,3865,high NFU
128483,128484,,25186,https://www.amazon.com/dp/B077H73WSR,Apple,IPhone 7,https://amazon.com/Apple-iPhone-Fully-Unlocked...,"Apple iPhone 7, Fully Unlocked, 128GB (Renewed...",Mark T.,5.0,...,0.323818,39195,,0.083761,0.020718,16,,2011-12-11,3908,high NFU


In [8]:
low_nfu = data[data['median_variant_parent'] == 'low NFU']
high_nfu = data[data['median_variant_parent'] == 'high NFU']

In [52]:
asins = [asin for asin in data['version_asin'].unique() if str(asin) != 'nan']


In [53]:
len(asins)

472

In [54]:
reviews = []
i = 0
for asin in asins:
    subset = data[data['version_asin'] == asin]
    reviews_asin = {'asin' : asin}
    reviews_asin['review'] = ' '.join(subset['review'])
    reviews_asin['nfu'] = subset['median_variant_parent'].unique()[0]
    reviews.append(reviews_asin)
    print(f"{i}/{len(asins)}")
    i += 1 

0/472
1/472
2/472
3/472
4/472
5/472
6/472
7/472
8/472
9/472
10/472
11/472
12/472
13/472
14/472
15/472
16/472
17/472
18/472
19/472
20/472
21/472
22/472
23/472
24/472
25/472
26/472
27/472
28/472
29/472
30/472
31/472
32/472
33/472
34/472
35/472
36/472
37/472
38/472
39/472
40/472
41/472
42/472
43/472
44/472
45/472
46/472
47/472
48/472
49/472
50/472
51/472
52/472
53/472
54/472
55/472
56/472
57/472
58/472
59/472
60/472
61/472
62/472
63/472
64/472
65/472
66/472
67/472
68/472
69/472
70/472
71/472
72/472
73/472
74/472
75/472
76/472
77/472
78/472
79/472
80/472
81/472
82/472
83/472
84/472
85/472
86/472
87/472
88/472
89/472
90/472
91/472
92/472
93/472
94/472
95/472
96/472
97/472
98/472
99/472
100/472
101/472
102/472
103/472
104/472
105/472
106/472
107/472
108/472
109/472
110/472
111/472
112/472
113/472
114/472
115/472
116/472
117/472
118/472
119/472
120/472
121/472
122/472
123/472
124/472
125/472
126/472
127/472
128/472
129/472
130/472
131/472
132/472
133/472
134/472
135/472
136/472
137/472
138/47

In [60]:
df = pd.DataFrame(reviews)

In [61]:
low_nfu = df[df['nfu'] == 'low NFU']
high_nfu = df[df['nfu'] == 'high NFU']

In [41]:
high_nfu

Unnamed: 0,asin,review,nfu
4,B079JXY4TJ,best ever wont break sever fall thing deslik f...,high NFU
20,B07T1L2SBK,bought replac iphon plu broke want fix suppos...,high NFU
21,B08FRS8NPP,seem good qualiti like previou spigen case how...,high NFU
23,B07P6Y7954,recommend buy phone amazon lost time wait day ...,high NFU
30,B08PNTHD4N,impress new love size batteri last day proble...,high NFU
...,...,...,...
1342,B08HC5GMK2,unlock descript inaccur husband keept smash on...,high NFU
1343,B07TS37SR1,troubl sim card first sim card use tello sim c...,high NFU
1345,B099W1PQV5,band nice ship promis origin purchas color b...,high NFU
1350,B09P1W6NLN,product say tmobil put sim card told call spri...,high NFU


In [19]:
reviews = []
i = 0
#for asin in asins:
    reviews_asin = {'asin' : asin}
    string_of_reviews = ' '.join([row['review'] for index, row in data.iterrows()
                                if str(row['version_asin']) == str(asin)])
    if string_of_reviews != '':
        reviews_asin['review'] = string_of_reviews
        reviews.append(reviews_asin)
    print(f"{i}/{len(asins)}")
    i += 1
print(len(reviews))

[{'asin': 'B084HM9YWW',
  'review': 'skeptic get one renew iphon decid tri order iphon   gb black receipt  day got upon inspect expect perfect condit give  star littl hair scratch lower side realli see sinc appli screen protector see 😁 around camera len light scratch also see hardli notic batteri life mine say  bad think let see goe heavi daili user could problem peopl connect mobil account work everyth els work great camera speaker sound face recognit great buy save money new iphon would recommend ye everyon experi differ give  day return exchang  amaz came perfect condit cours scratch end expect use case bought cover matter  came like new condit satisfi  advertis excel condit even came screen protector save seller purchas futur '},
 {'asin': 'B08H8VZ6PV',
  'review': 'definit skeptic come pixel xl og nexu p meet need refresh updat secur protect  mom love bought gift surprisingli much faster pixel  best ever far make sure download nova launcher custom everyth want  pixel first foray b

In [44]:
# the vectorizer object will be used to transform text to vector form
vectorizer = CountVectorizer(max_df=0.8, min_df=25, token_pattern='\w+|\$[\d\.]+|\S+')

In [45]:
def display_topics(dataset, no_top_words):
    tf = vectorizer.fit_transform(dataset['review']).toarray()

    # tf_feature_names tells us what word each column in the matric represents
    tf_feature_names = vectorizer.get_feature_names()
    
    model = LatentDirichletAllocation(n_components=number_of_topics, random_state=0)
    
    model.fit(tf)
    
    topic_dict = {}
    i = 0
    for topic_idx, topic in enumerate(model.components_):
        begin = datetime.now()
        topic_dict["Topic %d words" % (topic_idx)]= ['{}'.format(tf_feature_names[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
        topic_dict["Topic %d weights" % (topic_idx)]= ['{:.1f}'.format(topic[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
        end = datetime.now()
        print(f"{i}/{number_of_topics} in {end-begin}")
        i += 1
    return pd.DataFrame(topic_dict)

In [72]:
number_of_topics = 5

low_nfu_topics = display_topics(dataset = low_nfu, no_top_words = 20)

high_nfu_topics = display_topics(dataset = high_nfu, no_top_words = 20)

0/5 in 0:00:00
1/5 in 0:00:00
2/5 in 0:00:00
3/5 in 0:00:00
4/5 in 0:00:00
0/5 in 0:00:00
1/5 in 0:00:00
2/5 in 0:00:00
3/5 in 0:00:00
4/5 in 0:00:00


In [70]:
for row in high_nfu['review']:
    print(len(row))

17389
225699
279
3953
4946
997
1685
277
39872
50
1359
301
1567
173
5647
2713
618
271
400
43
2571
4303
125
3
5819
117
2535
585
245
475
710
571
62
6
22
1354
689
3375
82
103
13
33
77270
1537
1639
20851
3477
3463
995
10126
4780
601
2218
1893
4475
303
3666
456
5737
1344
1077
1202
46
4659
411
21
73
5863
99
1567
88
164
107
65
54712
21952
108655
102566
114720
9563
73
6656
12936
1839
9201
6673
7532
1483
8445
28859
35223
20455
1624
8744
1411
2087
313
7200
1135
25685
6626
467
757
460
205
3106
2382
1496
1924
263
476
2079
3139
387
2230
3230
1101
443
934
7957
37
983
2317
1449
259
675
1943
354
3104
1138
2136
92
194
51
371
1230
54
28
45
45203
15441
118189
66053
2096
9867
25145
15634
14066
1379
12748
3204
4306
2716
15562
19344
14624
3334
4063
9106
366
7380
230
923
229
37195
1216
3227
898
1126
1235
1544
1953
3462
1567
316
742
713
8327
5332
66
665
335
635
1464
310
570
117
313
131
120
42
28963
11235
9758
21339
1345
6319
2841
333
3526
1065
49
838
1035
577
288
1766
327
305
1305
159
94
58
585
18
148
412
143


In [64]:
type(low_nfu_topics)

pandas.core.frame.DataFrame

In [73]:
low_nfu_topics

Unnamed: 0,Topic 0 words,Topic 0 weights,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights,Topic 3 words,Topic 3 weights,Topic 4 words,Topic 4 weights
0,scratch,404.9,issu,121.9,charger,439.1,use,128.8,iphon,993.8
1,iphon,391.4,charg,92.5,appl,425.9,one,106.6,good,777.2
2,good,375.4,purchas,82.6,use,410.5,product,103.7,use,497.3
3,love,273.1,one,81.5,one,391.3,scratch,102.4,one,484.1
4,would,257.6,time,81.2,product,360.5,amazon,97.4,would,435.0
5,use,251.4,get,79.0,purchas,323.1,good,84.3,get,394.8
6,life,234.0,life,76.4,charg,302.9,even,69.2,seller,381.1
7,brand,224.7,use,70.9,would,297.9,brand,62.3,purchas,366.7
8,purchas,216.1,day,66.2,get,294.1,renew,62.2,life,360.7
9,perfect,192.8,turn,64.6,good,293.0,got,61.6,charg,359.2


In [74]:
high_nfu_topics

Unnamed: 0,Topic 0 words,Topic 0 weights,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights,Topic 3 words,Topic 3 weights,Topic 4 words,Topic 4 weights
0,work,3157.4,new,3137.7,work,1545.1,iphon,564.4,batteri,1754.0
1,great,2232.2,came,2808.6,batteri,1422.8,new,429.6,work,1693.1
2,new,1807.9,work,2782.7,new,1284.5,appl,372.7,iphon,1184.8
3,batteri,1694.6,great,2463.7,iphon,1250.2,like,335.7,new,1060.3
4,good,1674.3,batteri,2372.7,screen,1072.3,use,287.3,great,914.2
5,iphon,1520.5,scratch,2368.3,great,1041.6,screen,230.8,use,783.7
6,like,1284.1,screen,2112.7,like,1002.6,would,219.0,like,779.6
7,use,1129.5,like,1940.9,scratch,973.1,batteri,218.9,good,722.1
8,came,1041.4,good,1934.1,good,904.3,one,215.4,buy,632.3
9,look,967.8,iphon,1688.1,came,820.6,great,211.3,get,614.4


In [75]:
high_nfu_topics.to_csv("C:/Users/janva/Documents/Git projects/amazon_nfu/gen/paper/high_nfu_by_asin_iphone_topics.csv", sep = ";")
low_nfu_topics.to_csv("C:/Users/janva/Documents/Git projects/amazon_nfu/gen/paper/low_nfu_by_asin_iphone_topics.csv", sep = ";")

In [30]:
data['review'][0]+data['review'][1]

'skeptic get one renew iphon decid tri order iphon   gb black receipt  day got upon inspect can’t expect perfect condit that’ i’m give  star littl hair scratch lower side can’t realli see sinc appli screen protector can’t see 😁 around camera len light scratch also can’t see hardli notic batteri life mine say  bad think let’ see goe i’m heavi daili user could problem peopl connect mobil account it’ work everyth els work great camera speaker sound face recognit great buy save money new iphon would recommend ye everyone’ experi differ give  day return exchang definit skeptic come pixel xl og nexu p meet need refresh updat secur protect '