In [1]:
# import AWS credentials
# import config.py ##for local
%run "/dbfs/FileStore/tables/config" ##for databricks

In [2]:
import pandas as pd
import numpy as np

from pyspark.sql import SparkSession
from pyspark import SparkFiles
from pyspark.sql import DataFrame
from pyspark.sql.types import *
import pyspark.sql.functions as F

In [3]:
# get or create Spark session
app_name = "spark-airbnb-sentiment"
spark = SparkSession.builder.appName(app_name).getOrCreate()

In [4]:
import boto3

secret_name = my_secret_name
region_name = my_region_name
access_key  = my_access_key
secret_key  = my_secret_key

session      = boto3.session.Session(aws_access_key_id=access_key, aws_secret_access_key=secret_key, region_name=region_name)
client       = session.client('secretsmanager')
secret_value = client.get_secret_value(SecretId=secret_name)

In [5]:
import json
def get_connection(secret_value):
    return json.loads(secret_value['SecretString'])

In [6]:
connection = get_connection(secret_value)

# Postgres credentials
jdbcHostname = connection['host']
jdbcPort     = connection['port']
jdbcDatabase = "postgres"
dialect      = "postgresql"
jdbcUsername = connection['username']
jdbcPassword = connection['password']

jdbcUrl = f"jdbc:{dialect}://{jdbcHostname}:{jdbcPort}/{jdbcDatabase}"
connectionProperties = {
  "user"     : jdbcUsername,
  "password" : jdbcPassword,
  "driver"   : "org.postgresql.Driver" 
}

In [7]:
# Read from reviews_full table

table = "reviews_full"

reviews_df = spark.read.jdbc(url=jdbcUrl, table=table, properties=connectionProperties)
reviews_df.printSchema()

In [8]:
from pyspark.ml import Pipeline
import sparknlp
sparknlp.start()
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.pretrained import PretrainedPipeline

pipeline = PretrainedPipeline("analyze_sentiment", lang="en")

In [9]:
reviews_df = reviews_df.withColumnRenamed("comments", "text").filter("text IS NOT NULL")

In [10]:
annotations_df = pipeline.transform(reviews_df)

In [11]:
# return the overall sentiment by votes
@F.udf()
def voteSentiment(sentClassifications):
  sentConf = [
      [x["result"], x["metadata"]["confidence"]]
    for x in sentClassifications
  ]
  posSent = np.mean([float(x[1]) for x in sentConf if x[0] == "positive"])
  negSent = np.mean([float(x[1]) for x in sentConf if x[0] == "negative"])
  return "positive" if posSent>=negSent else "negative"


# return the sentiment with the highest confidence value
@F.udf()
def highestConfidence(sentClassifications):
  try:
    idx = np.argmax([x["metadata"]["confidence"] for x in sentClassifications if x is not None])
    return sentClassifications[idx]["result"]
  except:
    return "unknown"


# return sentiment of the longest sentence
@F.udf()
def longestSentiment(sentClassifications):
  sentLengths = [
      [x["result"], x["end"] - x["begin"]]
    for x in sentClassifications
  ]
  posSent = sum([x[1] for x in sentLengths if x[0] == "positive"])
  negSent = sum([x[1] for x in sentLengths if x[0] == "negative"])
  return "positive" if posSent>negSent else "negative"


# return the overall average sentiment after defining a 'distance' function
@F.udf()
def classifySentiment(sentClassifications):
  
  sentConf = [
      [x["result"], x["metadata"]["confidence"]]
    for x in sentClassifications
  ]
  posSentConf = np.mean([float(x[1]) for x in sentConf if x[0] == "positive"])
  negSentConf = np.mean([float(x[1]) for x in sentConf if x[0] == "negative"])
  
  sentLengths = [
      [x["result"], x["end"] - x["begin"]]
    for x in sentClassifications
  ]
  posSentLengths = sum([x[1] for x in sentLengths if x[0] == "positive"])
  negSentLengths = sum([x[1] for x in sentLengths if x[0] == "negative"])
  
  # set to NaN if we have null comments
  if posSentLengths == 0 and negSentLengths == 0:
    return np.nan
  
  if abs(posSentLengths - negSentLengths)/(posSentLengths + negSentLengths) > 0.2:
    return "positive" if posSentLengths>negSentLengths else "negative"
  
  else:
    return "positive" if posSentConf>negSentConf else "negative"

In [12]:
sentiment_df = annotations_df.select("listing_id", "id", "date", "reviewer_id", "reviewer_name", "text", "sentiment",
                                     voteSentiment("sentiment").alias("vote_sentiment"),
                                     highestConfidence("sentiment").alias("high_conf_sentiment"),
                                     longestSentiment("sentiment").alias("long_conf_sentiment"),
                                     classifySentiment("sentiment").alias("classified_sentiment")
                                    )
display(sentiment_df)

listing_id,id,date,reviewer_id,reviewer_name,text,sentiment,vote_sentiment,high_conf_sentiment,long_conf_sentiment,classified_sentiment
5456,16489,2009-11-07,46119,Kevin,"Very accommodating, great space.","List(List(sentiment, 0, 31, negative, Map(confidence -> 0.5582), List()))",negative,negative,negative,negative
5456,18215,2009-11-24,54243,Jane,"6th Street is a bit of a walk but location is central to lots of shopping, nightlife and eateries. Place is clean, private and offers an array of videos and dcs. Great coffee, coffee pot and Fiesta Ware in the kitchen.","List(List(sentiment, 0, 97, negative, Map(confidence -> 0.5202), List()), List(sentiment, 99, 160, positive, Map(confidence -> 0.4819), List()), List(sentiment, 162, 217, positive, Map(confidence -> 0.4907), List()))",negative,negative,positive,negative
5456,20136,2009-12-13,50357,Gerald,"Great cozy/modern space, nice location, free wifi, and accommodating hostess made this place perfect for our stay.","List(List(sentiment, 0, 113, positive, Map(confidence -> 0.4509), List()))",negative,positive,positive,positive
5456,20640,2009-12-17,45412,Hannah,Sylvia's place was lovely! Perfect for me and my friend who stayed just before christmas 2009. Sylvia is super lovely and extra helpful! Thanks Sylvia!,"List(List(sentiment, 0, 25, positive, Map(confidence -> 0.7493), List()), List(sentiment, 27, 93, positive, Map(confidence -> 0.4745), List()), List(sentiment, 95, 135, negative, Map(confidence -> 0.4042), List()), List(sentiment, 137, 150, positive, Map(confidence -> 0.7150), List()))",positive,positive,positive,positive
5456,21240,2009-12-23,46775,Rochelle,"Sylvia was very kind and accommodating. The space itself was a very quaint home away from home. I work online so having the internet is essential. Sylvia's place work out great! Also, the location was good - not far from downtown Austin and there are a couple of great coffee shops in the neighborhood itself.","List(List(sentiment, 0, 38, positive, Map(confidence -> 0.5150), List()), List(sentiment, 40, 94, negative, Map(confidence -> 0.4519), List()), List(sentiment, 96, 145, negative, Map(confidence -> 0.4250), List()), List(sentiment, 147, 176, positive, Map(confidence -> 0.6271), List()), List(sentiment, 178, 308, positive, Map(confidence -> 0.4969), List()))",positive,positive,positive,positive
5456,24587,2010-01-22,64841,Vinisha,"The apt was clean, comfortable and cosy. Sylvia was very accommodating.","List(List(sentiment, 0, 39, positive, Map(confidence -> 0.5097), List()), List(sentiment, 41, 70, positive, Map(confidence -> 0.5332), List()))",negative,positive,positive,positive
5456,24721,2010-01-24,62019,Christine,"I'm late in posting this but that is no indication of how my stay went. it was fantastic! Sylvia is more than accommodating, just like everyone else has mentioned. it's all true. not only did she pick me up downtown, she took me to the airport for $10, and then when i realized i left my iphone in the apartment, she went to GREAT lengths to send it back to me. i almost had a panic attack when i realized i left it there, but was only consoled in the fact that at least it was in very good hands. the apartment itself was perfect, with plenty of windows, free coffee, etc. i would definitely stay again. Thanks Sylvia! sorry for the late review.","List(List(sentiment, 0, 70, positive, Map(confidence -> 0.5307), List()), List(sentiment, 73, 89, positive, Map(confidence -> 0.6489), List()), List(sentiment, 92, 164, negative, Map(confidence -> 0.5141), List()), List(sentiment, 167, 180, negative, Map(confidence -> 0.5147), List()), List(sentiment, 182, 363, positive, Map(confidence -> 0.4912), List()), List(sentiment, 366, 500, negative, Map(confidence -> 0.4834), List()), List(sentiment, 503, 577, positive, Map(confidence -> 0.5020), List()), List(sentiment, 580, 609, positive, Map(confidence -> 0.5408), List()), List(sentiment, 611, 624, positive, Map(confidence -> 0.7150), List()), List(sentiment, 627, 652, negative, Map(confidence -> 0.4677), List()))",positive,positive,positive,positive
5456,27248,2010-02-22,18328,Jeremy,Another fantastic stay! Sylvia is a great host.,"List(List(sentiment, 0, 22, positive, Map(confidence -> 0.7145), List()), List(sentiment, 24, 46, positive, Map(confidence -> 0.5256), List()))",negative,positive,positive,positive
5456,27738,2010-02-27,21519,Jessica,"The studio is charming and cheerful and Sylvia and her four dogs (who stayed mostly in the main house but were there to greet us when we arrived :))were very welcoming. The location is convenient and Sylvia provided every ammenity we could need including not just a space heater in the main room but, thoughtfully, one in the bathroom too. Thanks, Sylvia!","List(List(sentiment, 0, 167, positive, Map(confidence -> 0.5505), List()), List(sentiment, 169, 338, negative, Map(confidence -> 0.4845), List()), List(sentiment, 340, 354, positive, Map(confidence -> 0.4579), List()))",positive,positive,positive,positive
5456,28538,2010-03-07,24075,Michael,Cozy and clean place. It was close to I-35 so that was convenient.,"List(List(sentiment, 0, 20, positive, Map(confidence -> 0.4789), List()), List(sentiment, 23, 66, positive, Map(confidence -> 0.5287), List()))",negative,positive,positive,positive


In [13]:
vote_sentiment_df = sentiment_df.groupBy("listing_id").pivot("vote_sentiment", ["positive", "negative"]).count().orderBy("listing_id")
vote_sentiment_df = vote_sentiment_df.withColumnRenamed("positive", "vote_sentiment_positive_count") \
                                   .withColumnRenamed("negative", "vote_sentiment_negative_count") \
                                   .withColumn("vote_sentiment_positive_percent", F.expr("CASE WHEN vote_sentiment_negative_count is null and vote_sentiment_positive_count > 0 THEN 1 WHEN vote_sentiment_positive_count is null and vote_sentiment_negative_count > 0 THEN 0 ELSE vote_sentiment_positive_count/(vote_sentiment_positive_count + vote_sentiment_negative_count) END")) \
                                   .withColumn("vote_sentiment_positivity", F.expr("CASE WHEN vote_sentiment_positive_percent > 0.7 THEN 1 ELSE 0 END"))

In [14]:
high_conf_sentiment_df = sentiment_df.groupBy("listing_id").pivot("high_conf_sentiment", ["positive", "negative"]).count().orderBy("listing_id")
high_conf_sentiment_df = high_conf_sentiment_df.withColumnRenamed("positive", "high_conf_sentiment_positive_count") \
                                               .withColumnRenamed("negative", "high_conf_sentiment_negative_count") \
                                               .withColumn("high_conf_sentiment_positive_percent", F.expr("CASE WHEN high_conf_sentiment_negative_count is null and high_conf_sentiment_positive_count > 0 THEN 1 WHEN high_conf_sentiment_positive_count is null and high_conf_sentiment_negative_count > 0 THEN 0 ELSE high_conf_sentiment_positive_count/(high_conf_sentiment_positive_count + high_conf_sentiment_negative_count) END")) \
                                               .withColumn("high_conf_sentiment_positivity", F.expr("CASE WHEN high_conf_sentiment_positive_percent > 0.7 THEN 1 ELSE 0 END"))

In [15]:
long_conf_sentiment_df = sentiment_df.groupBy("listing_id").pivot("long_conf_sentiment", ["positive", "negative"]).count().orderBy("listing_id")
long_conf_sentiment_df = long_conf_sentiment_df.withColumnRenamed("positive", "long_conf_sentiment_positive_count") \
                                               .withColumnRenamed("negative", "long_conf_sentiment_negative_count") \
                                               .withColumn("long_conf_sentiment_positive_percent", F.expr("CASE WHEN long_conf_sentiment_negative_count is null and long_conf_sentiment_positive_count > 0 THEN 1 WHEN long_conf_sentiment_positive_count is null and long_conf_sentiment_negative_count > 0 THEN 0 ELSE long_conf_sentiment_positive_count/(long_conf_sentiment_positive_count + long_conf_sentiment_negative_count) END")) \
                                               .withColumn("long_conf_sentiment_positivity", F.expr("CASE WHEN long_conf_sentiment_positive_percent > 0.7 THEN 1 ELSE 0 END"))

In [16]:
class_sentiment_df = sentiment_df.groupBy("listing_id").pivot("classified_sentiment", ["positive", "negative"]).count().orderBy("listing_id")
class_sentiment_df = class_sentiment_df.withColumnRenamed("positive", "classified_sentiment_positive_count") \
                                       .withColumnRenamed("negative", "classified_sentiment_negative_count") \
                                       .withColumn("classified_sentiment_positive_percent", F.expr("CASE WHEN classified_sentiment_negative_count is null and classified_sentiment_positive_count> 0 THEN 1 WHEN classified_sentiment_positive_count is null and classified_sentiment_negative_count > 0 THEN 0 ELSE classified_sentiment_positive_count/(classified_sentiment_positive_count + classified_sentiment_negative_count) END")) \
                                       .withColumn("classified_sentiment_positivity", F.expr("CASE WHEN classified_sentiment_positive_percent > 0.7 THEN 1 ELSE 0 END"))

In [17]:
all_sentiment_df = vote_sentiment_df.join(high_conf_sentiment_df, ["listing_id"], how="inner") \
                                   .join(long_conf_sentiment_df, ["listing_id"], how="inner") \
                                   .join(class_sentiment_df, ["listing_id"], how="inner")

In [18]:
listings_df = spark.read.jdbc(url=jdbcUrl, table="listings_full", properties=connectionProperties)
listings_df = listings_df.select("id", "latitude", "longitude", "review_scores_rating", "host_name", "listing_url", "name", "price")

In [19]:
final_df = all_sentiment_df.join(listings_df, all_sentiment_df.listing_id == listings_df.id, how="inner").drop(F.col("id"))
display(final_df)

listing_id,vote_sentiment_positive_count,vote_sentiment_negative_count,vote_sentiment_positive_percent,vote_sentiment_positivity,high_conf_sentiment_positive_count,high_conf_sentiment_negative_count,high_conf_sentiment_positive_percent,high_conf_sentiment_positivity,long_conf_sentiment_positive_count,long_conf_sentiment_negative_count,long_conf_sentiment_positive_percent,long_conf_sentiment_positivity,classified_sentiment_positive_count,classified_sentiment_negative_count,classified_sentiment_positive_percent,classified_sentiment_positivity,latitude,longitude,review_scores_rating,host_name,listing_url,name,price
32396,8.0,1.0,0.8888888888888888,1,8.0,1.0,0.8888888888888888,1,5.0,4.0,0.5555555555555556,0,8.0,1.0,0.8888888888888888,1,30.2027,-97.8444,97.0,Tiffany,https://www.airbnb.com/rooms/32396,South Austin Comfort,55.0
69352,227.0,163.0,0.5820512820512821,0,321.0,68.0,0.8251928020565553,1,268.0,122.0,0.6871794871794872,0,297.0,92.0,0.7634961439588689,1,30.2577,-97.7635,95.0,Particular Properties,https://www.airbnb.com/rooms/69352,Enchanting Zilker Cottage~Downtown,159.0
145210,106.0,76.0,0.5824175824175825,0,154.0,28.0,0.8461538461538461,1,142.0,40.0,0.7802197802197802,1,155.0,27.0,0.8516483516483516,1,30.3022,-97.7414,98.0,Dan,https://www.airbnb.com/rooms/145210,Maiden Lane Guesthouse,129.0
300574,193.0,114.0,0.6286644951140065,0,266.0,41.0,0.8664495114006515,1,225.0,82.0,0.7328990228013029,1,245.0,61.0,0.8006535947712419,1,30.2572,-97.7556,97.0,Veronica,https://www.airbnb.com/rooms/300574,Hip Retro Groovy LOFT South Austin Style,124.0
364121,1.0,,1.0,1,1.0,,1.0,1,,1.0,0.0,0,1.0,,1.0,1,30.3941,-97.9191,100.0,Bruce,https://www.airbnb.com/rooms/364121,Private romantic getaway 150' above Lake Travis,210.0
681118,5.0,5.0,0.5,0,7.0,3.0,0.7,0,5.0,5.0,0.5,0,6.0,4.0,0.6,0,30.2498,-97.7216,98.0,Jules & Lance,https://www.airbnb.com/rooms/681118,East-Central Craftsman on Hike & Bike Trail,1200.0
953608,3.0,1.0,0.75,1,4.0,,1.0,1,1.0,3.0,0.25,0,2.0,2.0,0.5,0,30.1595,-97.8454,100.0,Patricia,https://www.airbnb.com/rooms/953608,Nice house in great south neighborh,150.0
969258,1.0,,1.0,1,1.0,,1.0,1,1.0,,1.0,1,1.0,,1.0,1,30.2536,-97.7784,100.0,Daris And Tom,https://www.airbnb.com/rooms/969258,Barton Hills Abode 2/2,349.0
4334621,7.0,6.0,0.5384615384615384,0,12.0,1.0,0.9230769230769232,1,9.0,4.0,0.6923076923076923,0,11.0,2.0,0.8461538461538461,1,30.2629,-97.7637,93.0,Valentina,https://www.airbnb.com/rooms/4334621,House on Cliff ~ 3 minutes from ACL entrance,120.0
4516994,7.0,3.0,0.7,0,9.0,1.0,0.9,1,5.0,5.0,0.5,0,7.0,3.0,0.7,0,30.2377,-97.7574,73.0,Jes,https://www.airbnb.com/rooms/4516994,5 MIN FROM ZILKER Huge 1BR SOCO Luxury FOR SXSW,170.0


In [20]:
final_df.count()