In [1]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import boto3

In [2]:
# Load the Boston Airbnb reviews CSV into a DataFrame
boston_reviews = pd.read_csv("boston_airbnb_data/reviews.csv")

In [3]:
# Preview the first rows of the reviews data
boston_reviews.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,1178162,4724140,2013-05-21,4298113,Olivier,My stay at islam's place was really cool! Good...
1,1178162,4869189,2013-05-29,6452964,Charlotte,Great location for both airport and city - gre...
2,1178162,5003196,2013-06-06,6449554,Sebastian,We really enjoyed our stay at Islams house. Fr...
3,1178162,5150351,2013-06-15,2215611,Marine,The room was nice and clean and so were the co...
4,1178162,5171140,2013-06-16,6848427,Andrew,Great location. Just 5 mins walk from the Airp...


In [6]:
# Date range covered by the Boston reviews.
# min()/max() are applied to the "date" column exactly as it was loaded. The
# printed values are YYYY-MM-DD text; presumably the column is still plain strings
# (no date parsing is requested at load time), in which case the comparison is
# lexicographic, which agrees with chronological order for that format -- TODO confirm dtype.
min_date_boston_reviews = boston_reviews["date"].min()
max_date_boston_reviews = boston_reviews["date"].max()

# NOTE(review): the message says "calendar", but the values come from the reviews data.
print("Boston calendar goes from: {} to {} ".format(min_date_boston_reviews,max_date_boston_reviews))

Boston calendar goes from: 2009-03-21 to 2016-09-06 


In [4]:
# Number of review rows loaded
len(boston_reviews)

68275

In [14]:
# boto3 client for the 'comprehend' service; it is handed to
# sentiment_comments_todict, which calls its detect_sentiment method once per review.
client = boto3.client('comprehend')

In [15]:
def sentiment_comments_todict(df,client):
    """Collect per-listing sentiment scores for every comment in ``df``.

    Each row's comment is sent to ``client.detect_sentiment`` with
    ``LanguageCode='en'``; the four values found under the response's
    ``"SentimentScore"`` entry are appended to the lists kept for that row's
    listing id.

    Parameters
    ----------
    df : pandas.DataFrame
        Columns are read by position: column 0 is used as the listing id and
        column 5 as the comment text.
    client : object
        Anything exposing ``detect_sentiment(Text=..., LanguageCode=...)`` that
        returns a mapping whose ``"SentimentScore"`` entry holds the keys
        ``"Positive"``, ``"Negative"``, ``"Neutral"`` and ``"Mixed"``.

    Returns
    -------
    dict
        ``{listing_id: {"Positive": [...], "Negative": [...], "Neutral": [...],
        "Mixed": [...]}}`` with one list element per successfully scored
        comment. A listing whose comments all failed does not appear.

    Notes
    -----
    The extraction is best effort: a row whose call or response parsing raises
    an ``Exception`` is counted as a failure and skipped. ``KeyboardInterrupt``
    and ``SystemExit`` are deliberately not swallowed, so a long run can still
    be stopped. After each row a progress line is printed, terminated by a
    carriage return so that it overwrites the previous one.
    """
    score_labels = ("Positive", "Negative", "Neutral", "Mixed")
    # Column positions read from each row (listing id, comment text).
    listing_id_pos, comment_pos = 0, 5

    result_dict = {}
    size = len(df)
    # Counters start at zero so the percentages describe rows actually processed.
    correct_count, fail_count = 0, 0

    # Plain tuples give explicit positional access without building a Series per row.
    rows = df.itertuples(index=False, name=None)
    for general_count, row in enumerate(rows, start=1):
        listing_id = row[listing_id_pos]
        comment = row[comment_pos]
        # Use aws comprehend to extract the sentiment of the comment
        try:
            response = client.detect_sentiment(Text=comment,LanguageCode='en')
            sentiment_score = response["SentimentScore"]
            # Read all four scores before touching result_dict, so a malformed
            # response can never leave a listing with lists of unequal length.
            scores = [sentiment_score[label] for label in score_labels]
        except Exception:
            fail_count += 1
        else:
            if listing_id not in result_dict:
                result_dict[listing_id] = {label: [] for label in score_labels}
            for label, score in zip(score_labels, scores):
                result_dict[listing_id][label].append(score)
            correct_count += 1

        # Report after the row is handled, so the last line reflects the full run.
        general_percentage = round(general_count*100/size,2)
        correct_percentage = round(correct_count*100/general_count,2)
        fail_percentage = round(fail_count*100/general_count,2)
        print("Progress:{}%, Correct:{}%, Fail:{}% ............".format(general_percentage,
                                                                        correct_percentage,fail_percentage),end='\r')

    return result_dict

In [None]:
def mean_sentiment_comments(result_dict):
    """Summarise each listing's score lists as mean values scaled by 100.

    Parameters
    ----------
    result_dict : dict
        Mapping of listing id to a dict holding the lists ``"Positive"``,
        ``"Negative"``, ``"Neutral"`` and ``"Mixed"``.

    Returns
    -------
    dict
        ``{listing_id: {"mean_positive", "mean_negative", "mean_neutral",
        "mean_mixed", "number_reviews"}}``. Every mean is
        ``sum(values) * 100 / len(values)`` rounded to two decimals, and
        ``number_reviews`` is the length of the ``"Positive"`` list.

    Raises
    ------
    ZeroDivisionError
        If any of the four lists of a listing is empty.
    """
    def _mean_percent(values):
        # Average of the list, multiplied by 100 and rounded to 2 decimals.
        return round(sum(values)*100/len(values),2)

    resume_dict = {}
    for listing_id in result_dict.keys():
        listing_scores = result_dict[listing_id]
        summary = {}
        for label in ("Positive", "Negative", "Neutral", "Mixed"):
            summary["mean_" + label.lower()] = _mean_percent(listing_scores[label])
        # The review count is taken from the "Positive" list only.
        summary["number_reviews"] = len(listing_scores["Positive"])
        resume_dict[listing_id] = summary
    return resume_dict

In [16]:
# Score every review: the function issues one detect_sentiment call per row of
# boston_reviews, so expect this cell to run for a long time on the full data.
result_dict = sentiment_comments_todict(boston_reviews,client)

Progress:100.0%, Correct:99.92%, Fail:0.08% ............

In [18]:
# Back up the raw per-listing scores: one row per listing id, one column per
# sentiment label, each cell holding that listing's list of scores.
df = pd.DataFrame.from_dict(result_dict,orient='index')
# NOTE(review): the cells are lists, so the CSV presumably stores them as text;
# the JSON copy is likely the easier one to load back -- verify before relying on either.
df.to_csv("df_sentiment_comments_backup.csv")
df.to_json('df_sentiment_comments_backup.json', orient='index')

In [40]:
# Build the per-listing summary in this cell so it does not depend on a variable
# left over from an earlier kernel session (Restart & Run All safe).
resume_dict = mean_sentiment_comments(result_dict)

# One row per listing id with the mean_* columns and number_reviews.
mean_df = pd.DataFrame.from_dict(resume_dict,orient='index')
mean_df.index.name = 'listing_id'
# Export the summary in both CSV and JSON form.
mean_df.to_csv("mean_sentiment_comments.csv")
mean_df.to_json("mean_sentiment_comments.json", orient='index')