In [1]:
# import AWS credentials
# import config.py ##for local
%run "/dbfs/FileStore/tables/config" ##for databricks

In [2]:
import pandas as pd
import numpy as np

from pyspark.sql import SparkSession
from pyspark import SparkFiles
from pyspark.sql import DataFrame
from pyspark.sql.types import *
import pyspark.sql.functions as F

In [3]:
# Obtain the Spark session for this notebook: getOrCreate() hands back an
# already-running session when there is one, otherwise it builds a new one.
app_name = "spark-airbnb-sentiment"
session_builder = SparkSession.builder.appName(app_name)
spark = session_builder.getOrCreate()

In [4]:
import boto3

# The my_* names are not defined in this cell; presumably they come from the
# config file run in the first cell -- confirm before running on a fresh kernel.
secret_name = my_secret_name
region_name = my_region_name
access_key  = my_access_key
secret_key  = my_secret_key

# Open an explicitly-credentialed boto3 session and fetch the secret from
# AWS Secrets Manager; the raw response is decoded in a later cell.
session = boto3.session.Session(
    aws_access_key_id=access_key,
    aws_secret_access_key=secret_key,
    region_name=region_name,
)
client = session.client('secretsmanager')
secret_value = client.get_secret_value(SecretId=secret_name)

In [5]:
import json
def get_connection(secret_value):
    """Decode the JSON document stored under ``secret_value['SecretString']``.

    Args:
        secret_value: Mapping with a ``'SecretString'`` entry holding JSON text.

    Returns:
        The object produced by parsing that JSON text.

    Raises:
        KeyError: If ``'SecretString'`` is missing.
        json.JSONDecodeError: If the text is not valid JSON.
    """
    secret_string = secret_value['SecretString']
    parsed = json.loads(secret_string)
    return parsed

In [6]:
# Turn the Secrets Manager response into a plain dict of connection settings.
connection = get_connection(secret_value)

# Postgres credentials
jdbcHostname, jdbcPort = connection['host'], connection['port']
jdbcDatabase = "postgres"
dialect      = "postgresql"
jdbcUsername, jdbcPassword = connection['username'], connection['password']

# JDBC URL plus the properties dict that the spark.read.jdbc calls take.
jdbcUrl = f"jdbc:{dialect}://{jdbcHostname}:{jdbcPort}/{jdbcDatabase}"
connectionProperties = {
    "user": jdbcUsername,
    "password": jdbcPassword,
    "driver": "org.postgresql.Driver",
}

In [7]:
# Load the reviews_full table over JDBC and print the schema Spark inferred.
table = "reviews_full"

reviews_df = spark.read.jdbc(
    url=jdbcUrl,
    table=table,
    properties=connectionProperties,
)
reviews_df.printSchema()

In [8]:
from pyspark.ml import Pipeline
import sparknlp
# NOTE(review): the value returned by sparknlp.start() is discarded; the rest
# of the notebook keeps using the `spark` session created in an earlier cell.
sparknlp.start()
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.pretrained import PretrainedPipeline

# Pretrained English "analyze_sentiment" pipeline; it is applied to the
# reviews with pipeline.transform(...) in a later cell.
# NOTE(review): presumably this downloads the model on first use -- confirm
# the cluster has network access.
pipeline = PretrainedPipeline("analyze_sentiment", lang="en")

In [9]:
# Rename "comments" to "text", then drop two kinds of rows: reviews with no
# text (they break the John Snow Labs pipeline) and automated cancellation
# postings.
is_automated_posting = F.col("text").contains("This is an automated posting.")
reviews_df = (
    reviews_df
    .withColumnRenamed("comments", "text")
    .filter("text IS NOT NULL")
    .filter(~is_automated_posting)
)

In [10]:
# Run the pretrained pipeline over the filtered reviews. The "sentiment"
# column of the result is what the sentiment UDFs are applied to later on.
annotations_df = pipeline.transform(reviews_df)

In [11]:
# return the overall sentiment by comparing the mean confidence of the
# positive sentences against the mean confidence of the negative sentences
@F.udf()
def voteSentiment(sentClassifications):
  """Label a review by the mean confidence of its sentence classifications.

  Args:
    sentClassifications: Iterable of annotations, each indexable by
      "result" and by "metadata" (which holds a "confidence" entry that
      float() can parse). May be None or contain None entries.

  Returns:
    "positive" when the mean confidence of the positive sentences is at
    least that of the negative sentences, "negative" otherwise, and None
    when there is no positive or negative sentence at all.
  """
  posConf = []
  negConf = []
  for x in sentClassifications or []:
    if x is None:
      continue
    if x["result"] == "positive":
      posConf.append(float(x["metadata"]["confidence"]))
    elif x["result"] == "negative":
      negConf.append(float(x["metadata"]["confidence"]))

  # Nothing to vote on: leave the review unclassified instead of guessing.
  if not posConf and not negConf:
    return None

  # A class with no sentences scores 0.0. Taking np.mean of an empty list
  # yields NaN, and every comparison against NaN is False, which used to make
  # reviews containing only positive sentences come out as "negative".
  posSent = float(np.mean(posConf)) if posConf else 0.0
  negSent = float(np.mean(negConf)) if negConf else 0.0
  return "positive" if posSent >= negSent else "negative"


# return the sentiment with the highest confidence value
@F.udf()
def highestConfidence(sentClassifications):
  """Return the "result" of the sentence classified with the highest confidence.

  Confidences are compared as floats (they are parsed with float(), so string
  values such as "0.91" are accepted). Entries that are None, lack a
  confidence, or carry a confidence that cannot be parsed are skipped. On a
  tie the earliest entry wins.

  Args:
    sentClassifications: Iterable of annotations, each indexable by
      "result" and by "metadata" (with a "confidence" entry). May be None.

  Returns:
    The winning entry's "result", or "unknown" when no entry has a usable
    confidence.
  """
  bestResult = "unknown"
  bestConf = None
  for x in sentClassifications or []:
    if x is None:
      continue
    try:
      conf = float(x["metadata"]["confidence"])
      result = x["result"]
    except (KeyError, IndexError, TypeError, ValueError):
      # Best effort: an unusable entry must not discard the whole review.
      continue
    if np.isnan(conf):
      continue
    # The result is taken from the same entry the confidence came from, so
    # skipped entries can never shift the selection onto a different sentence.
    if bestConf is None or conf > bestConf:
      bestResult, bestConf = result, conf
  return bestResult


# return the sentiment whose sentences cover the most text in total
@F.udf()
def longestSentiment(sentClassifications):
  """Label a review by which sentiment accounts for more sentence length.

  Each annotation contributes ``end - begin`` to the total of its "result"
  when that result is "positive" or "negative"; other results are ignored.

  NOTE(review): if "end" is an inclusive offset, every sentence is
  undercounted by one character -- confirm against the annotation format.

  Args:
    sentClassifications: Iterable of annotations indexable by "result",
      "begin" and "end".

  Returns:
    "positive" if the positive total is strictly greater than the negative
    total, otherwise "negative" (ties and empty input included).
  """
  spanTotals = {"positive": 0, "negative": 0}
  for annotation in sentClassifications:
    label = annotation["result"]
    span = annotation["end"] - annotation["begin"]
    if label in spanTotals:
      spanTotals[label] += span

  if spanTotals["positive"] > spanTotals["negative"]:
    return "positive"
  return "negative"


# return the overall sentiment: decided by total sentence length when one
# sentiment clearly dominates, otherwise by mean confidence
@F.udf()
def classifySentiment(sentClassifications):
  """Label a review using sentence lengths first and confidences as a fallback.

  The summed ``end - begin`` of the positive and of the negative sentences
  are compared. If their relative difference
  ``abs(pos - neg) / (pos + neg)`` exceeds 0.2 the larger total wins;
  otherwise the sentiment with the higher mean confidence wins.

  Args:
    sentClassifications: Iterable of annotations indexable by "result",
      "begin", "end" and "metadata" (with a "confidence" entry that float()
      can parse). May be None or contain None entries.

  Returns:
    "positive" or "negative", or None when both length totals are zero
    (for example when there are no positive or negative sentences).
  """
  posConf = []
  negConf = []
  posSentLengths = 0
  negSentLengths = 0
  for x in sentClassifications or []:
    if x is None:
      continue
    if x["result"] == "positive":
      posConf.append(float(x["metadata"]["confidence"]))
      posSentLengths += x["end"] - x["begin"]
    elif x["result"] == "negative":
      negConf.append(float(x["metadata"]["confidence"]))
      negSentLengths += x["end"] - x["begin"]

  # Nothing measurable: return a real null rather than a float NaN, since
  # this UDF is declared with the default (string) return type.
  if posSentLengths == 0 and negSentLengths == 0:
    return None

  # One sentiment clearly dominates by length.
  if abs(posSentLengths - negSentLengths)/(posSentLengths + negSentLengths) > 0.2:
    return "positive" if posSentLengths > negSentLengths else "negative"

  # Lengths are close: fall back to mean confidence. An empty class scores
  # 0.0 so that np.mean is never asked for the mean of an empty list (NaN).
  posSentConf = float(np.mean(posConf)) if posConf else 0.0
  negSentConf = float(np.mean(negConf)) if negConf else 0.0
  return "positive" if posSentConf > negSentConf else "negative"

In [12]:
# Keep the review columns and add one label column per sentiment UDF, each
# computed from the 'sentiment' annotations.
review_columns = ["listing_id", "id", "date", "reviewer_id", "reviewer_name", "text", "sentiment"]
derived_sentiments = [
    voteSentiment("sentiment").alias("vote_sentiment"),
    highestConfidence("sentiment").alias("high_conf_sentiment"),
    longestSentiment("sentiment").alias("long_conf_sentiment"),
    classifySentiment("sentiment").alias("classified_sentiment"),
]
sentiment_df = annotations_df.select(*review_columns, *derived_sentiments)
display(sentiment_df)

In [13]:
# Per listing: number of reviews labelled positive / negative by
# vote_sentiment, the positive share, and a 0/1 flag for shares above 0.7.
# A label that never occurs for a listing pivots to a null count, which the
# CASE expression maps to a share of 1 or 0.
vote_sentiment_df = (
    sentiment_df
    .groupBy("listing_id")
    .pivot("vote_sentiment", ["positive", "negative"])
    .count()
    .orderBy("listing_id")
    .withColumnRenamed("positive", "vote_sentiment_positive_count")
    .withColumnRenamed("negative", "vote_sentiment_negative_count")
    .withColumn(
        "vote_sentiment_positive_percent",
        F.expr("CASE WHEN vote_sentiment_negative_count is null and vote_sentiment_positive_count > 0 THEN 1 WHEN vote_sentiment_positive_count is null and vote_sentiment_negative_count > 0 THEN 0 ELSE vote_sentiment_positive_count/(vote_sentiment_positive_count + vote_sentiment_negative_count) END"),
    )
    .withColumn(
        "vote_sentiment_positivity",
        F.expr("CASE WHEN vote_sentiment_positive_percent > 0.7 THEN 1 ELSE 0 END"),
    )
)

In [14]:
# Per listing: number of reviews labelled positive / negative by
# high_conf_sentiment, the positive share, and a 0/1 flag for shares above
# 0.7. A label that never occurs for a listing pivots to a null count, which
# the CASE expression maps to a share of 1 or 0.
high_conf_sentiment_df = (
    sentiment_df
    .groupBy("listing_id")
    .pivot("high_conf_sentiment", ["positive", "negative"])
    .count()
    .orderBy("listing_id")
    .withColumnRenamed("positive", "high_conf_sentiment_positive_count")
    .withColumnRenamed("negative", "high_conf_sentiment_negative_count")
    .withColumn(
        "high_conf_sentiment_positive_percent",
        F.expr("CASE WHEN high_conf_sentiment_negative_count is null and high_conf_sentiment_positive_count > 0 THEN 1 WHEN high_conf_sentiment_positive_count is null and high_conf_sentiment_negative_count > 0 THEN 0 ELSE high_conf_sentiment_positive_count/(high_conf_sentiment_positive_count + high_conf_sentiment_negative_count) END"),
    )
    .withColumn(
        "high_conf_sentiment_positivity",
        F.expr("CASE WHEN high_conf_sentiment_positive_percent > 0.7 THEN 1 ELSE 0 END"),
    )
)

In [15]:
# Per listing: number of reviews labelled positive / negative by
# long_conf_sentiment, the positive share, and a 0/1 flag for shares above
# 0.7. A label that never occurs for a listing pivots to a null count, which
# the CASE expression maps to a share of 1 or 0.
long_conf_sentiment_df = (
    sentiment_df
    .groupBy("listing_id")
    .pivot("long_conf_sentiment", ["positive", "negative"])
    .count()
    .orderBy("listing_id")
    .withColumnRenamed("positive", "long_conf_sentiment_positive_count")
    .withColumnRenamed("negative", "long_conf_sentiment_negative_count")
    .withColumn(
        "long_conf_sentiment_positive_percent",
        F.expr("CASE WHEN long_conf_sentiment_negative_count is null and long_conf_sentiment_positive_count > 0 THEN 1 WHEN long_conf_sentiment_positive_count is null and long_conf_sentiment_negative_count > 0 THEN 0 ELSE long_conf_sentiment_positive_count/(long_conf_sentiment_positive_count + long_conf_sentiment_negative_count) END"),
    )
    .withColumn(
        "long_conf_sentiment_positivity",
        F.expr("CASE WHEN long_conf_sentiment_positive_percent > 0.7 THEN 1 ELSE 0 END"),
    )
)

In [16]:
# Per listing: number of reviews labelled positive / negative by
# classified_sentiment, the positive share, and a 0/1 flag for shares above
# 0.7. A label that never occurs for a listing pivots to a null count, which
# the CASE expression maps to a share of 1 or 0.
class_sentiment_df = (
    sentiment_df
    .groupBy("listing_id")
    .pivot("classified_sentiment", ["positive", "negative"])
    .count()
    .orderBy("listing_id")
    .withColumnRenamed("positive", "classified_sentiment_positive_count")
    .withColumnRenamed("negative", "classified_sentiment_negative_count")
    .withColumn(
        "classified_sentiment_positive_percent",
        F.expr("CASE WHEN classified_sentiment_negative_count is null and classified_sentiment_positive_count> 0 THEN 1 WHEN classified_sentiment_positive_count is null and classified_sentiment_negative_count > 0 THEN 0 ELSE classified_sentiment_positive_count/(classified_sentiment_positive_count + classified_sentiment_negative_count) END"),
    )
    .withColumn(
        "classified_sentiment_positivity",
        F.expr("CASE WHEN classified_sentiment_positive_percent > 0.7 THEN 1 ELSE 0 END"),
    )
)

In [17]:
# Combine the four per-listing summaries into one frame, inner-joining each
# onto the running result by listing_id.
all_sentiment_df = vote_sentiment_df
for summary_df in (high_conf_sentiment_df, long_conf_sentiment_df, class_sentiment_df):
    all_sentiment_df = all_sentiment_df.join(summary_df, ["listing_id"], how="inner")

In [18]:
# Load the listings_full table over JDBC and keep only the columns that are
# attached to the sentiment summary.
listings_df = spark.read.jdbc(url=jdbcUrl, table="listings_full", properties=connectionProperties)
listing_columns = [
    "id", "latitude", "longitude", "review_scores_rating",
    "host_name", "listing_url", "name", "accommodates",
    "price", "property_type", "room_type", "number_of_reviews",
    "minimum_nights", "maximum_nights",
]
select_listings_df = listings_df.select(*listing_columns)

In [19]:
# Attach the listing attributes to the sentiment summary. The join matches
# listing_id against the listing's id, and the redundant "id" column is
# dropped afterwards.
listing_match = all_sentiment_df.listing_id == select_listings_df.id
joined_df = all_sentiment_df.join(select_listings_df, listing_match, how="inner")
final_df = joined_df.drop(F.col("id"))

# display final output
# THIS TAKES A LONG TIME
display(final_df)

In [20]:
# Write the result as CSV with a header row, replacing any previous export.
# coalesce(1) reduces the frame to one partition before writing.
csv_writer = (
    final_df.coalesce(1)
    .write
    .format("com.databricks.spark.csv")
    .mode('overwrite')
    .option("header", "true")
)
csv_writer.save("dbfs:/FileStore/Tables/sentiment_results")

# List the written files so the part file to download via the CLI is known.
display(dbutils.fs.ls("/FileStore/Tables/sentiment_results"))

path,name,size
dbfs:/FileStore/Tables/sentiment_results/_SUCCESS,_SUCCESS,0
dbfs:/FileStore/Tables/sentiment_results/_committed_5068235423027364206,_committed_5068235423027364206,115
dbfs:/FileStore/Tables/sentiment_results/_started_5068235423027364206,_started_5068235423027364206,0
dbfs:/FileStore/Tables/sentiment_results/part-00000-tid-5068235423027364206-40ea3852-e4ed-44c9-88c3-ebaf6c0df36b-15013-1-c000.csv,part-00000-tid-5068235423027364206-40ea3852-e4ed-44c9-88c3-ebaf6c0df36b-15013-1-c000.csv,2423031
