In [1]:
# import AWS credentials
# import config.py ##for local
%run "/dbfs/FileStore/tables/config" ##for databricks

In [2]:
import pandas as pd
import numpy as np

from pyspark.sql import SparkSession
from pyspark import SparkFiles
from pyspark.sql import DataFrame
from pyspark.sql.types import *
import pyspark.sql.functions as F

In [3]:
# reuse the active Spark session if there is one, otherwise build it under this app name
app_name = "spark-airbnb-sentiment"
spark = (
    SparkSession.builder
    .appName(app_name)
    .getOrCreate()
)

In [4]:
import boto3

# the my_* names are not defined in this notebook; presumably they come from the
# config run in the first cell — TODO confirm
secret_name = my_secret_name
region_name = my_region_name
access_key = my_access_key
secret_key = my_secret_key

# open a boto3 session with the static key pair, then read the secret from Secrets Manager
session = boto3.session.Session(
    aws_access_key_id=access_key,
    aws_secret_access_key=secret_key,
    region_name=region_name,
)
client = session.client('secretsmanager')
secret_value = client.get_secret_value(SecretId=secret_name)

In [5]:
import json
def get_connection(secret_value):
    """Decode the JSON text stored under ``secret_value['SecretString']``.

    Args:
        secret_value: mapping with a ``'SecretString'`` entry holding JSON text.

    Returns:
        The Python object produced by parsing that JSON text.
    """
    secret_payload = secret_value['SecretString']
    connection_info = json.loads(secret_payload)
    return connection_info

In [6]:
# parse the fetched secret into the connection settings used below
connection = get_connection(secret_value)

# Postgres credentials: fixed values first, then the values read from the secret
dialect = "postgresql"
jdbcDatabase = "postgres"
jdbcHostname = connection['host']
jdbcPort = connection['port']
jdbcUsername = connection['username']
jdbcPassword = connection['password']

# JDBC URL and the properties passed along with it to spark.read.jdbc
jdbcUrl = f"jdbc:{dialect}://{jdbcHostname}:{jdbcPort}/{jdbcDatabase}"
connectionProperties = {
    "user": jdbcUsername,
    "password": jdbcPassword,
    "driver": "org.postgresql.Driver",
}

In [7]:
# Load the reviews_full table over JDBC and print the schema Spark reports for it
table = "reviews_full"

reviews_df = spark.read.jdbc(
    url=jdbcUrl,
    table=table,
    properties=connectionProperties,
)
reviews_df.printSchema()

In [8]:
from pyspark.ml import Pipeline
import sparknlp
# NOTE(review): start() runs between the imports and its return value is discarded;
# presumably it initialises Spark NLP for the Spark session — confirm against the sparknlp docs.
sparknlp.start()
# NOTE(review): wildcard imports; presumably the source of StopWordsCleaner / NGramGenerator
# used in a later cell — verify.
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.pretrained import PretrainedPipeline

# Load the pretrained English "explain_document_ml" pipeline; it is applied to reviews_df in a later cell.
pipeline = PretrainedPipeline("explain_document_ml", lang="en")

In [9]:
# expose the `comments` column as `text` and keep only rows where it is not null
reviews_df = (
    reviews_df
    .withColumnRenamed("comments", "text")
    .filter("text IS NOT NULL")
)

In [10]:
# apply the pretrained pipeline to the filtered reviews
annotations_df = pipeline.transform(reviews_df)

In [11]:
# preview the output of the pretrained pipeline
display(annotations_df)

In [12]:
# create pipeline for tokenized words in review

# tokens removed from the `token` column (matched case-insensitively, see setCaseSensitive below)
custom_stop_words = [
    "this", "is", "and", ",", "!", ".", "\n", "\t", "ourselves", "hers",
    "between", "yourself", "but", "again", "there", "about", "once", "during",
    "out", "very", "having", "with", "they", "own", "an", "be", "some", "for",
    "do", "its", "yours", "such", "into", "of", "most", "itself", "other",
    "off", "is", "s", "am", "or", "who", "as", "from", "him", "each", "the",
    "themselves", "until", "below", "are", "we", "these", "your", "his",
    "through", "don", "nor", "me", "were", "her", "more", "himself", "this",
    "down", "should", "our", "their", "while", "above", "both", "up", "to",
    "ours", "had", "she", "all", "no", "when", "at", "any", "before", "them",
    "same", "and", "been", "have", "in", "will", "on", "does", "yourselves",
    "then", "that", "because", "what", "over", "why", "so", "can", "did",
    "not", "now", "under", "he", "you", "herself", "has", "just", "where",
    "too", "only", "myself", "which", "those", "i", "after", "few", "whom",
    "t", "being", "if", "theirs", "my", "against", "a", "by", "doing", "it",
    "how", "further", "was", "here", "than", "(", ")", "-", "/", ";",
]

stop_words_cleaner = (
    StopWordsCleaner()
    .setInputCols(["token"])
    .setOutputCol("cleanTokens")
    .setCaseSensitive(False)
    .setStopWords(custom_stop_words)
)


def _ngram_stage(n, output_col):
    """Build a non-cumulative NGramGenerator of size ``n`` that reads ``cleanTokens``."""
    return (
        NGramGenerator()
        .setInputCols(["cleanTokens"])
        .setOutputCol(output_col)
        .setN(n)
        .setEnableCumulative(False)
    )


unigrams = _ngram_stage(1, "unigrams")
bigrams = _ngram_stage(2, "bigrams")

# stage order matters: both n-gram stages read the cleaner's output column
extra_pipeline = Pipeline(stages=[stop_words_cleaner, unigrams, bigrams])

In [13]:
# fit the stop-word / n-gram stages and apply them to the annotated reviews
# NOTE(review): annotations_df is overwritten with its own transform, so re-running this
# cell operates on the already-transformed frame.
model = extra_pipeline.fit(annotations_df)
annotations_df = model.transform(annotations_df)

In [14]:
# preview the frame after the stop-word / n-gram stages were applied
display(annotations_df)

In [15]:
# load listings_full over JDBC, keeping only the join key and the neighbourhood column
listings_df = (
    spark.read
    .jdbc(url=jdbcUrl, table="listings_full", properties=connectionProperties)
    .select("id", "neighbourhood_cleansed")
)

In [16]:
# join to get zipcodes (the listings' neighbourhood_cleansed column, renamed to zipcode in a later cell)
reviews_with_review_id = annotations_df.withColumnRenamed("id", "review_id")
listing_match = annotations_df.listing_id == listings_df.id
labeled_df = (
    reviews_with_review_id
    .join(listings_df, listing_match, how="inner")
    .drop(F.col("id"))
)

In [17]:
# preview the reviews joined with the listings columns
display(labeled_df)

In [18]:
# pull the "result" entry out of every n-gram annotation
# NOTE(review): F.udf() is called without a return type, which defaults to a string
# column even though a Python list is returned — confirm this is intended.
@F.udf()
def extract_results(ngram):
  """Return the ``"result"`` entry of each item in ``ngram``, in order, as a list."""
  results = []
  for annotation in ngram:
    results.append(annotation["result"])
  return results

In [19]:
# keep the listing id and neighbourhood, add the extracted n-gram columns,
# then expose the neighbourhood under the name "zipcode"
unigram_column = extract_results("unigrams").alias("unigram_list")
bigram_column = extract_results("bigrams").alias("bigram_list")
output_df = (
    labeled_df
    .select("listing_id", "neighbourhood_cleansed", unigram_column, bigram_column)
    .withColumnRenamed("neighbourhood_cleansed", "zipcode")
)

In [20]:
# preview the per-review unigram / bigram columns
display(output_df)

In [21]:
# one row per zipcode, collecting every review's unigram and bigram values into lists
agg_output_df = (
    output_df
    .groupBy("zipcode")
    .agg(
        F.collect_list('unigram_list').alias("unigram_list"),
        F.collect_list('bigram_list').alias("bigram_list"),
    )
)

In [22]:
# NOTE: an error occurred at this step because the aggregated columns are string arrays,
# a complex data type that cannot be saved to CSV as-is; they need to be flattened to
# plain strings first.
display(agg_output_df)

In [23]:
# flatten string arrays to strings
def array_to_string(my_list):
    """Render ``my_list`` as one bracketed, comma-separated string.

    Every element is passed through ``str`` and the pieces are joined with
    ``,`` (no space), e.g. ``['a', 'b']`` becomes ``'[a,b]'``.
    """
    joined = ','.join(map(str, my_list))
    return f"[{joined}]"

# Wrap array_to_string as a Spark UDF that returns a string column.
# `udf` is never imported explicitly in this notebook (it would only resolve through a
# wildcard import or a runtime-provided global), so go through the `F` alias, which is
# also how the extract_results UDF is declared.
array_to_string_udf = F.udf(array_to_string, StringType())

# add the flattened *_str columns, then drop the original array columns
final_agg_output_df = (
    agg_output_df
    .withColumn('unigram_list_str', array_to_string_udf(F.col("unigram_list")))
    .withColumn('bigram_list_str', array_to_string_udf(F.col("bigram_list")))
    .drop("unigram_list")
    .drop("bigram_list")
)

In [24]:
# display final output
# NOTE(review): nothing in this cell writes the result anywhere; if it must be saved,
# that has to happen outside this code — TODO confirm how the export is done.
display(final_agg_output_df)