### Import  packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re

# Connect to Comprehend to get Sentiment
import boto3

### Demo: Detecting the Dominant Language Using the AWS SDK for Python (Boto)

In [2]:
import json

session = boto3.Session(region_name='us-east-1')
client = session.client('comprehend')
text = "It is raining today in Seattle"

print('Calling DetectDominantLanguage')
print(json.dumps(client.detect_dominant_language(Text = text), sort_keys=True, indent=4))
print("End of DetectDominantLanguage\n")

Calling DetectDominantLanguage
{
    "Languages": [
        {
            "LanguageCode": "en",
            "Score": 0.9963054656982422
        }
    ],
    "ResponseMetadata": {
        "HTTPHeaders": {
            "content-length": "64",
            "content-type": "application/x-amz-json-1.1",
            "date": "Tue, 18 Feb 2020 00:57:26 GMT",
            "x-amzn-requestid": "37574462-8ba1-47f4-a274-796c503aa6d0"
        },
        "HTTPStatusCode": 200,
        "RequestId": "37574462-8ba1-47f4-a274-796c503aa6d0",
        "RetryAttempts": 0
    }
}
End of DetectDominantLanguage



In [3]:
import json

session = boto3.Session(region_name='us-east-1')
client = session.client('comprehend')
text = "今天西雅图下雨了"

print('Calling DetectDominantLanguage')
print(json.dumps(client.detect_dominant_language(Text = text), sort_keys=True, indent=4))
print("End of DetectDominantLanguage\n")

Calling DetectDominantLanguage
{
    "Languages": [
        {
            "LanguageCode": "zh",
            "Score": 1.0
        }
    ],
    "ResponseMetadata": {
        "HTTPHeaders": {
            "content-length": "49",
            "content-type": "application/x-amz-json-1.1",
            "date": "Tue, 18 Feb 2020 00:57:26 GMT",
            "x-amzn-requestid": "dd9ca092-92c6-426f-9418-d65576f80a4d"
        },
        "HTTPStatusCode": 200,
        "RequestId": "dd9ca092-92c6-426f-9418-d65576f80a4d",
        "RetryAttempts": 0
    }
}
End of DetectDominantLanguage



### Detecting Named Entities Using the AWS SDK for Python (Boto)

In [4]:
import boto3
import json

session = boto3.Session(region_name='us-east-1')
client = session.client('comprehend')
text = "I'm an avid photographer, and I'm primarily found shooting with my DSLR \
or my instant film camera that I carry around for casual use. While nothing beats \
my DSLR in power and convenience, there's something magical about my instant film \
camera. Perhaps it's that you're shooting on actual film, or maybe it's that every \
shot you take is a unique physical artifact (which is special in today's world of \
Instagram and Facebook, where photos are a dime a dozen). All I know for sure is \
that they are incredibly fun to use and peoples' eyes light up when you pull one of these out at a party."

print('Calling DetectEntities')
print(json.dumps(client.detect_entities(Text=text, LanguageCode='en'), sort_keys=True, indent=4))
print('End of DetectEntities\n')

Calling DetectEntities
{
    "Entities": [
        {
            "BeginOffset": 313,
            "EndOffset": 318,
            "Score": 0.6073688268661499,
            "Text": "every",
            "Type": "QUANTITY"
        },
        {
            "BeginOffset": 384,
            "EndOffset": 389,
            "Score": 0.8533614277839661,
            "Text": "today",
            "Type": "DATE"
        },
        {
            "BeginOffset": 401,
            "EndOffset": 410,
            "Score": 0.6224591732025146,
            "Text": "Instagram",
            "Type": "ORGANIZATION"
        },
        {
            "BeginOffset": 415,
            "EndOffset": 423,
            "Score": 0.6715020537376404,
            "Text": "Facebook",
            "Type": "ORGANIZATION"
        },
        {
            "BeginOffset": 451,
            "EndOffset": 456,
            "Score": 0.7550779581069946,
            "Text": "dozen",
            "Type": "QUANTITY"
        },
        {
            "Beg

### Keyphrase Extraction

In [5]:
import boto3

session = boto3.Session(region_name='us-east-1')
client = session.client('comprehend')
text = "The MSBA imparts strong technical and quantitative training plus comprehensive business acumen, all within a top 20 business school. This intense program is designed to produce a business data scientist, one who can speak the language of business, technology, and data. In the Capstone project, student teams apply their skills to solve sponsor firms’ business problems using the firm’s proprietary data."

print('Calling DetectKeyPhrases')
print(json.dumps(client.detect_key_phrases(Text=text, LanguageCode='en'), sort_keys=True, indent=4))
print('End of DetectKeyPhrases\n')

Calling DetectKeyPhrases
{
    "KeyPhrases": [
        {
            "BeginOffset": 0,
            "EndOffset": 8,
            "Score": 0.9999974370002747,
            "Text": "The MSBA"
        },
        {
            "BeginOffset": 17,
            "EndOffset": 59,
            "Score": 0.9999995827674866,
            "Text": "strong technical and quantitative training"
        },
        {
            "BeginOffset": 65,
            "EndOffset": 94,
            "Score": 0.9999765753746033,
            "Text": "comprehensive business acumen"
        },
        {
            "BeginOffset": 107,
            "EndOffset": 131,
            "Score": 0.999969482421875,
            "Text": "a top 20 business school"
        },
        {
            "BeginOffset": 133,
            "EndOffset": 153,
            "Score": 0.9999768733978271,
            "Text": "This intense program"
        },
        {
            "BeginOffset": 177,
            "EndOffset": 202,
            "Score": 0.999990284

### Sentiment Analysis

In [6]:
# Try some examples
sentiment = client.detect_sentiment(
    Text="Works awesome for apt size 110 dryer - Works awesome for apt \
    size 110 dryer. Handles load from apt size washer just fine. It does take \
    longer to dry. Electric cost savings over a full size 220 is worth the time. \
    Does not add much humidity unless lint filter is full.",
    LanguageCode='en'
)
sentiment['Sentiment'],sentiment['SentimentScore']

('POSITIVE',
 {'Positive': 0.9983564019203186,
  'Negative': 3.536563235684298e-05,
  'Neutral': 0.0015746206045150757,
  'Mixed': 3.36409175361041e-05})

In [7]:
# Try some examples
sentiment = client.detect_sentiment(
    Text="才刚买的，用了两天就坏了，说换货一直没换，这么大的店，早干嘛呢！",
    LanguageCode='zh'
)
sentiment['Sentiment'],sentiment['SentimentScore']

('NEGATIVE',
 {'Positive': 0.003523502266034484,
  'Negative': 0.9938187003135681,
  'Neutral': 0.00265671918168664,
  'Mixed': 1.0957356835206156e-06})

In [8]:
# Try some examples
sentiment = client.detect_sentiment(
    Text="확실한 점은 영화가 대중에게 다양한 메시지를 전달하고 사회 문제를 \
    적나라하게 드러냈다는 것입니다. 그래서 영화를 보면 매우 불편할 수도 있습\
    니다. 영화를 보고 정말 왜 제목이 조커인지 알 수 있었습니다. 결말도 깔끔하고 만족스러웠습니다.",
    LanguageCode='ko'
)
sentiment['Sentiment'],sentiment['SentimentScore']


('POSITIVE',
 {'Positive': 0.9995538592338562,
  'Negative': 0.00016010676336009055,
  'Neutral': 0.0002690566470846534,
  'Mixed': 1.6909925761865452e-05})

### Get the dataset : use category Major Appliances as example

In [9]:
!aws s3 cp s3://amazon-reviews-pds/tsv/amazon_reviews_us_Major_Appliances_v1_00.tsv.gz .

download: s3://amazon-reviews-pds/tsv/amazon_reviews_us_Major_Appliances_v1_00.tsv.gz to ./amazon_reviews_us_Major_Appliances_v1_00.tsv.gz


In [10]:
df = pd.read_csv('amazon_reviews_us_Major_Appliances_v1_00.tsv.gz',
                 sep='\t',error_bad_lines=False,warn_bad_lines=True)

b'Skipping line 5583: expected 15 fields, saw 22\nSkipping line 22814: expected 15 fields, saw 22\nSkipping line 22883: expected 15 fields, saw 22\nSkipping line 29872: expected 15 fields, saw 22\nSkipping line 37242: expected 15 fields, saw 22\nSkipping line 59693: expected 15 fields, saw 22\n'


### EDA

In [11]:
print('Rows: {0}, Columns: {1}'.format(df.shape[0],df.shape[1]))

Rows: 96834, Columns: 15


In [12]:
df.index.max()

96833

In [13]:
df.columns

Index(['marketplace', 'customer_id', 'review_id', 'product_id',
       'product_parent', 'product_title', 'product_category', 'star_rating',
       'helpful_votes', 'total_votes', 'vine', 'verified_purchase',
       'review_headline', 'review_body', 'review_date'],
      dtype='object')

In [14]:
df.isna().any(axis=0)

marketplace          False
customer_id          False
review_id            False
product_id           False
product_parent       False
product_title        False
product_category     False
star_rating          False
helpful_votes        False
total_votes          False
vine                 False
verified_purchase    False
review_headline       True
review_body           True
review_date           True
dtype: bool

In [15]:
# Look for any rows that have NA
rows_missing_values = df.isna().any(axis=1)

In [16]:
df[rows_missing_values]

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
3254,US,29686651,R1DGB2U8KV9HKP,B00GOFUISY,138120585,"FIREBIRD New 36"" European Style Wall Mount Sta...",Major Appliances,3,2,3,N,Y,Three Stars,,2015-08-06
8640,US,733945,R1QOC0UPHADKFZ,B00GHXU3VA,931824698,"GOLDEN VANTAGE 30"" European Style Ventless/Duc...",Major Appliances,5,9,9,N,Y,Five Stars,,2015-06-20
11556,US,18030318,R36Z529A9SVZ14,B002ROS27U,461806580,Whynter UIM-155 Stainless Steel Built-In Ice M...,Major Appliances,1,54,72,N,Y,One Star,,2015-05-25
15500,US,52655156,RZR2BV8UJXB3J,B00DNSO2UK,316513931,Haier Wine Cellar with Electronic Controls,Major Appliances,5,10,10,N,Y,Working great so far!,,2015-04-16
16453,US,24105158,R15NCTE2RINP6W,B005KT4LK6,236627965,Whirlpool WTW8800YW Cabrio 4.6 Cu. Ft. White T...,Major Appliances,1,1,1,N,N,One Star,,2015-04-07
22583,US,48624154,R223L9DVYCY4J5,B000S0PRNM,136191470,LG : WM2233HW 27 XL Front-Load Washer - White,Major Appliances,1,0,0,N,N,the worse washer and dryer set ever,,2015-02-15
29680,US,51669844,R18VF51XXHU2UE,B00DOHHZHM,221894244,Koolatron Beer Keg Cooler Brown,Major Appliances,1,10,18,N,N,Paperweight,,2014-12-13
36130,US,8711378,R3NXEY6CSAUFR,B0050KKS5C,758706493,316075103 BAKE ELEMENT REPAIR PART FOR FRIGIDA...,Major Appliances,3,1,1,N,Y,,Did not fix problem - my fault.,2014-10-08
95250,US,14267148,R3FCCZQ31S2Z4Q,B000IN22I2,99564707,"Igloo FR28WH 2.8-Cu-Ft Refrigerator, White",Major Appliances,5,0,0,N,N,"It does what it says on the tin""\tThis little ...",2008-07-21,


In [17]:
df['review_headline'] = df['review_headline'].fillna(' ')
df['review_body'] = df['review_body'].fillna(' ')

In [18]:
# Replace embedded new lines, tabs and carriage return
pattern = r'[\n\t\r]+'

In [19]:
# Use Regex module sub method to identify patterns of interest and replace the matching text.
text = 'ab,cd\n\tef'

print('original text:', text)

print('after substituition:', re.sub(pattern,' ', text))

original text: ab,cd
	ef
after substituition: ab,cd ef


In [20]:
df['product_title'] = df['product_title'].map(lambda x: re.sub(pattern,' ',x))
df['review_headline'] = df['review_headline'].map(lambda x: re.sub(pattern,' ',x))
df['review_body'] = df['review_body'].map(lambda x: re.sub(pattern,' ',x))

In [21]:
df.head()

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,US,16199106,R203HPW78Z7N4K,B0067WNSZY,633038551,"FGGF3032MW Gallery Series 30"" Wide Freestandin...",Major Appliances,5,0,0,N,Y,"If you need a new stove, this is a winner.",What a great stove. What a wonderful replacem...,2015-08-31
1,US,16374060,R2EAIGVLEALSP3,B002QSXK60,811766671,Best Hand Clothes Wringer,Major Appliances,5,1,1,N,Y,Five Stars,worked great,2015-08-31
2,US,15322085,R1K1CD73HHLILA,B00EC452R6,345562728,Supco SET184 Thermal Cutoff Kit,Major Appliances,5,0,0,N,Y,Fast Shipping,Part exactly what I needed. Saved by purchasi...,2015-08-31
3,US,32004835,R2KZBMOFRMYOPO,B00MVVIF2G,563052763,Midea WHS-160RB1 Compact Single Reversible Doo...,Major Appliances,5,1,1,N,Y,Five Stars,Love my refrigerator! ! Keeps everything cold...,2015-08-31
4,US,25414497,R6BIZOZY6UD01,B00IY7BNUW,874236579,Avalon Bay Portable Ice Maker,Major Appliances,5,0,0,N,Y,Five Stars,No more running to the store for ice! Works p...,2015-08-31


In [22]:
# Examine the head of reviews
df['review_body'].head()

0    What a great stove.  What a wonderful replacem...
1                                         worked great
2    Part exactly what I needed.  Saved by purchasi...
3    Love my refrigerator! ! Keeps everything  cold...
4    No more running to the store for ice!  Works p...
Name: review_body, dtype: object

In [23]:
# Some examples of review title and body
for i in range(10):
    print(df.iloc[i]['review_headline'] + ' - ' + df.iloc[i]['review_body'])
    print()

If you need a new stove, this is a winner. - What a great stove.  What a wonderful replacement for my sort of antique.  Enjoy it every day.

Five Stars - worked great

Fast Shipping - Part exactly what I needed.  Saved by purchasing myself.

Five Stars - Love my refrigerator! ! Keeps everything  cold..will recommend!

Five Stars - No more running to the store for ice!  Works perfectly.

Piece of Junk - It would not cool below 55 degrees and has now stopped working all together.  I would NOT recommend this piece of junk to anyone.

Works awesome for apt size 110 dryer - Works awesome for apt size 110 dryer. Handles load from apt size washer just fine. It does take longer to dry. Electric cost savings over a full size 220 is worth the time. Does not add much humidity unless lint filter is full.

Five Stars - exactly what I wanted!

Four Stars - AS advertised

but has poor insulation in the top - It works as advertised, but has poor insulation in the top. Like the 3rd shelf, it comes in h

In [27]:
df.shape

(96834, 15)

In [28]:
# Take the first 10000 rows to try out sentiment analysis
df_first_10000 = df.iloc[0:10000,]

In [36]:
# Get sentiment for all reviews (25 reviews at a time) 
# To avoid the costs, please don't run this code. I have uploaded the exported csv file for your convenience.
step = 25 
results = [] 
for i in range(0,df_first_10000.shape[0],step): 
    print('***',i,i+step) 
    review = list((df_first_10000.iloc[i:i+step]['review_headline'] + ' - ' + df_first_10000.iloc[i:i+step]['review_body'].str.slice(0,4000)).values) 
    # initialize place holder for return values 
    temp_results = ['']*len(review) 
    sentiment = client.batch_detect_sentiment(TextList=review,LanguageCode='en') 
    # Get the sentiment 
    for s in sentiment['ResultList']: 
        print(s['Index']+i,s['Sentiment']) 
        temp_results[s['Index']] = s['Sentiment'] 
        results.extend(temp_results)

*** 0 25
0 POSITIVE
1 POSITIVE
2 POSITIVE
3 POSITIVE
4 POSITIVE
5 NEGATIVE
6 POSITIVE
7 POSITIVE
8 POSITIVE
9 MIXED
10 NEGATIVE
11 POSITIVE
12 MIXED
13 POSITIVE
14 POSITIVE
15 POSITIVE
16 NEGATIVE
17 POSITIVE
18 NEGATIVE
19 POSITIVE
20 POSITIVE
21 NEGATIVE
22 POSITIVE
23 POSITIVE
24 POSITIVE
*** 25 50
25 NEGATIVE
26 POSITIVE
27 NEGATIVE
28 POSITIVE
29 NEGATIVE
30 POSITIVE
31 POSITIVE
32 POSITIVE
33 MIXED
34 NEGATIVE
35 POSITIVE
36 POSITIVE
37 MIXED
38 NEGATIVE
39 POSITIVE
40 POSITIVE
41 POSITIVE
42 MIXED
43 POSITIVE
44 POSITIVE
45 NEGATIVE
46 MIXED
47 MIXED
48 MIXED
49 POSITIVE
*** 50 75
50 POSITIVE
51 NEGATIVE
52 NEGATIVE
53 POSITIVE
54 POSITIVE
55 MIXED
56 MIXED
57 POSITIVE
58 POSITIVE
59 POSITIVE
60 POSITIVE
61 POSITIVE
62 NEGATIVE
63 POSITIVE
64 POSITIVE
65 MIXED
66 POSITIVE
67 NEGATIVE
68 POSITIVE
69 MIXED
70 POSITIVE
71 MIXED
72 POSITIVE
73 MIXED
74 NEGATIVE
*** 75 100
75 POSITIVE
76 NEGATIVE
77 POSITIVE
78 POSITIVE
79 NEGATIVE
80 POSITIVE
81 POSITIVE
82 NEGATIVE
83 NEGATIVE
84 P

In [69]:
# Export the result to csv
df_first_10000.to_csv('customer_reviews_with_sentiment_compressed.txt.gz',index=False,header=True,compression='gzip',sep='\t')

In [25]:
# Read the results back for examination
results = pd.read_csv('Results.csv',header=None)

In [29]:
df_first_10000['sentiment']=results.iloc[:,1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [30]:
# In the last column, we can see the appended sentiment for each review
df_first_10000.head()

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date,sentiment
0,US,16199106,R203HPW78Z7N4K,B0067WNSZY,633038551,"FGGF3032MW Gallery Series 30"" Wide Freestandin...",Major Appliances,5,0,0,N,Y,"If you need a new stove, this is a winner.",What a great stove. What a wonderful replacem...,2015-08-31,POSITIVE
1,US,16374060,R2EAIGVLEALSP3,B002QSXK60,811766671,Best Hand Clothes Wringer,Major Appliances,5,1,1,N,Y,Five Stars,worked great,2015-08-31,POSITIVE
2,US,15322085,R1K1CD73HHLILA,B00EC452R6,345562728,Supco SET184 Thermal Cutoff Kit,Major Appliances,5,0,0,N,Y,Fast Shipping,Part exactly what I needed. Saved by purchasi...,2015-08-31,POSITIVE
3,US,32004835,R2KZBMOFRMYOPO,B00MVVIF2G,563052763,Midea WHS-160RB1 Compact Single Reversible Doo...,Major Appliances,5,1,1,N,Y,Five Stars,Love my refrigerator! ! Keeps everything cold...,2015-08-31,POSITIVE
4,US,25414497,R6BIZOZY6UD01,B00IY7BNUW,874236579,Avalon Bay Portable Ice Maker,Major Appliances,5,0,0,N,Y,Five Stars,No more running to the store for ice! Works p...,2015-08-31,POSITIVE
