In [1]:
from pyspark.sql import SparkSession, Row
from pyspark.ml.feature import PCA, StopWordsRemover, Normalizer
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import *
from pyspark.sql.types import *
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import datetime
start_time=datetime.datetime.now()
spark = SparkSession \
    .builder \
    .appName("Assignment2 - COMP5349-Stage4") \
    .getOrCreate()
awsData = "s3://amazon-reviews-pds/tsv/amazon_reviews_us_Music_v1_00.tsv.gz"

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
1,application_1558512418166_0002,pyspark3,idle,Link,Link,✔


SparkSession available as 'spark'.


In [2]:
from pyspark.ml.feature import Word2Vec
from pyspark.ml.feature import Word2VecModel
"""Loading data into a dataframe"""
musicData = spark.read.csv(awsData,header=True,sep='\t')
musicData.printSchema()

root
 |-- marketplace: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- review_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- product_parent: string (nullable = true)
 |-- product_title: string (nullable = true)
 |-- product_category: string (nullable = true)
 |-- star_rating: string (nullable = true)
 |-- helpful_votes: string (nullable = true)
 |-- total_votes: string (nullable = true)
 |-- vine: string (nullable = true)
 |-- verified_purchase: string (nullable = true)
 |-- review_headline: string (nullable = true)
 |-- review_body: string (nullable = true)
 |-- review_date: string (nullable = true)

In [3]:
"""Selecting required columns"""
requiredData = musicData.select('customer_id','product_id','product_title','star_rating','review_id','review_body')
requiredData.cache()
requiredData.count()

4751577

In [4]:
"""Identifying top 10 products based on number of reviews received."""
"""Picking 10th product from the top 10 products identified"""
top_10_product_ids=requiredData.filter(col("review_id").isNotNull()).groupBy("product_id").count().sort(col("count").desc()).limit(10)
top_10_product_ids=top_10_product_ids.sort(col("count")).limit(1)

In [5]:
"""Separating positive and negative reviews based on star rating"""
positive_reviews=requiredData.join(top_10_product_ids.select("product_id"),"product_id","inner").filter(col("star_rating")>=4)
negative_reviews=requiredData.join(top_10_product_ids.select("product_id"),"product_id","inner").filter(col("star_rating")<=2)

In [6]:
positive_reviews.cache()
positive_reviews.count()

1323

In [7]:
negative_reviews.cache()
negative_reviews.count()

415

In [8]:
"""Splitting review body into sentences based on . or ? and saving as list"""
positive_review_sentences=positive_reviews.filter(col("review_body").isNotNull()).select("review_id","review_body")
positive_review_sentences=positive_review_sentences.withColumn("review_body_sentences",split(col("review_body"), r"\.|\?"))
negative_review_sentences=negative_reviews.filter(col("review_body").isNotNull()).select("review_id","review_body")
negative_review_sentences=negative_review_sentences.withColumn("review_body_sentences",split(col("review_body"), r"\.|\?"))
#positive_review_sentences.show(2)

In [9]:
"""Exploding review sentences to have one to many mapping for review_id vs review sentences. Also, cleaning review sentences 
of any special characters/trim additional spaces. Later on, we split sentence into words to remove any stop words in next step"""
positive_review_sentences=positive_review_sentences.withColumn("review_body_sentences",explode("review_body_sentences"))
positive_review_sentences=positive_review_sentences.withColumn("review_body_sentences_joined",col("review_body_sentences"))
positive_review_sentences=positive_review_sentences.withColumn("review_body_sentences",regexp_replace("review_body_sentences","<br />|\s+|$|,|!|#|@|<|>|/|&|#|;|:|[0-9]|-"," ")) \
.withColumn("review_body_sentences",regexp_replace("review_body_sentences","\s+"," ")) \
.withColumn("review_body_sentences",trim(col("review_body_sentences"))) \
.filter(col("review_body_sentences").isNotNull()) \
.filter(col("review_body_sentences")!='') \
.withColumn("review_body_sentences",lower(col("review_body_sentences"))) \
.withColumn("review_body_sentences",split(trim(col("review_body_sentences")), "\s+"))
#positive_review_sentences.show(2,truncate=False)
negative_review_sentences=negative_review_sentences.withColumn("review_body_sentences",explode("review_body_sentences"))
negative_review_sentences=negative_review_sentences.withColumn("review_body_sentences_joined",col("review_body_sentences"))
negative_review_sentences=negative_review_sentences.withColumn("review_body_sentences",regexp_replace("review_body_sentences","<br />|\s+|$|,|!|#|@|<|>|/|&|#|;|:|[0-9]|-"," ")) \
.withColumn("review_body_sentences",regexp_replace("review_body_sentences","\s+"," ")) \
.withColumn("review_body_sentences",trim(col("review_body_sentences"))) \
.filter(col("review_body_sentences").isNotNull()) \
.filter(col("review_body_sentences")!='') \
.withColumn("review_body_sentences",lower(col("review_body_sentences"))) \
.withColumn("review_body_sentences",split(trim(col("review_body_sentences")), "\s+"))
#positive_review_sentences.show(2,truncate=False)


In [10]:
"""Stop words remover will remove stop words. Stop words list is retreived from nltk using below:
import nltk
from nltk.corpus import stopwords
 set(stopwords.words('english'))
"""
stopwords_list=['a', 'ah','br',  'about', 'quot', 'above',  'after',  'again',  'against',  'ain',  'all',  'am',  'an',  'and',  'any',  'are',  'aren',  "aren't",  'as',  'at',  'be',  'because',  'been',  'before',  'being',  'below',  'between',  'both',  'but',  'by',  'can',  'couldn',  "couldn't",  'd',  'did',  'didn',  "didn't",  'do',  'does',  'doesn',  "doesn't",  'doing',  'don',  "don't",  'down',  'during',  'each',  'few',  'for',  'from',  'further',  'had',  'hadn',  "hadn't",  'has',  'hasn',  "hasn't",  'have',  'haven',  "haven't",  'having',  'he',  'her',  'here',  'hers',  'herself',  'him',  'himself',  'his',  'how',  'i',  'if',  'in',  'into',  'is',  'isn',  "isn't",  'it',  "it's",  'its',  'itself',  'just',  'll',  'm',  'ma',  'me',  'mightn',  "mightn't",  'more',  'most',  'mustn',  "mustn't",  'my',  'myself',  'needn',  "needn't",  'no',  'nor',  'not',  'now',  'o',  'of',  'off',  'on',  'once',  'only',  'or',  'other',  'our',  'ours',  'ourselves',  'out',  'over',  'own',  're',  's',  'same',  'shan',  "shan't",  'she',  "she's",  'should',  "should've",  'shouldn',  "shouldn't",  'so',  'some',  'such',  't',  'than',  'that',  "that'll",  'the',  'their',  'theirs',  'them',  'themselves',  'then',  'there',  'these',  'thanks',  'they',  'this',  'those',  'through',  'to',  'too',  'under',  'until',  'up',  've',  'very',  'was',  'wasn',  "wasn't",  'we',  'were',  'weren',  "weren't",  'what',  'when',  'where',  'which',  'while',  'who',  'whom',  'why',  'will',  'with',  'won',  "won't",  'wouldn',  "wouldn't",  'y',  'you',  "you'd",  "you'll",  "you're",  "you've",  'your',  'yours',  'yourself',  'yourselves']
removerSW=StopWordsRemover(inputCol="review_body_sentences",outputCol="filtered", stopWords=stopwords_list)
positive_review_sentences_new=removerSW.transform(positive_review_sentences)
negative_review_sentences_new=removerSW.transform(negative_review_sentences)

In [11]:
"""removing any sentences with less than two words in them"""
positive_review_sentences_new=positive_review_sentences_new.filter(size(col("filtered"))>2)
positive_review_sentences_new.cache()
positive_review_sentences_new.count()

8667

In [12]:
#from pyspark.sql.functions import concat, length
negative_review_sentences_new=negative_review_sentences_new.filter(size(col("filtered"))>2)
negative_review_sentences_new.cache()
negative_review_sentences_new.count()

3048

In [13]:
"""Setting word2vec model with vector size of 100 and iterations 100. We fit the model first and then transform the sentences
to vectors by calling transform function on dataframe"""
vsize=100
word2Vec_model = Word2Vec(vectorSize=vsize, windowSize=2,minCount=0, inputCol="filtered", outputCol="features",seed=42, maxIter=100, stepSize=0.1)
model = word2Vec_model.fit(positive_review_sentences_new)
result = model.transform(positive_review_sentences_new)
model_n = word2Vec_model.fit(negative_review_sentences_new)
result_n = model_n.transform(negative_review_sentences_new)
#result.show(2)

In [14]:
"""Implementing PCA to convert 1x100 dimension vectors into 1x2 dimension vectors by picking 2 principal components."""
"""Setting up normaliser to allow taking dot product for cosine similarity directly"""
pca = PCA(k=2, inputCol="features", outputCol="pca_features")
model = pca.fit(result)
pca_result = model.transform(result).select("review_id","pca_features","review_body_sentences_joined","filtered")
normalizer = Normalizer(inputCol="pca_features", outputCol="features_norm", p=2.0)
pca_result = normalizer.transform(pca_result)
#pca_result.show(2)
pca_n = PCA(k=2, inputCol="features", outputCol="pca_features")
model_n = pca_n.fit(result_n)
pca_result_n = model_n.transform(result_n).select("review_id","pca_features","review_body_sentences_joined","filtered")
normalizer_n = Normalizer(inputCol="pca_features", outputCol="features_norm", p=2.0)
pca_result_n = normalizer_n.transform(pca_result_n)

In [15]:
"""Appending row number with index column name."""
pca_result_withIndex = pca_result.rdd.zipWithUniqueId().map(lambda row: (row[1],row[0][0],row[0][1],row[0][2],row[0][3],row[0][4])).toDF()
pca_result_withIndex = pca_result_withIndex.select(col("_1").alias("index"), col("_2").alias("review_id"), col("_3").alias("pca_features"), col("_4").alias("review_body_sentences_joined"), col("_5").alias("filtered"), col("_6").alias("features_norm"))
#pca_result_withIndex.show(2)
pca_result_withIndex_n = pca_result_n.rdd.zipWithUniqueId().map(lambda row: (row[1],row[0][0],row[0][1],row[0][2],row[0][3],row[0][4])).toDF()
pca_result_withIndex_n = pca_result_withIndex_n.select(col("_1").alias("index"), col("_2").alias("review_id"), col("_3").alias("pca_features"), col("_4").alias("review_body_sentences_joined"), col("_5").alias("filtered"), col("_6").alias("features_norm"))
#pca_result_withIndex_n.show(2)

----------------------------------------
Exception happened during processing of request from ('127.0.0.1', 54810)
Traceback (most recent call last):
  File "/usr/lib64/python3.6/socketserver.py", line 320, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/usr/lib64/python3.6/socketserver.py", line 351, in process_request
    self.finish_request(request, client_address)
  File "/usr/lib64/python3.6/socketserver.py", line 364, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/usr/lib64/python3.6/socketserver.py", line 724, in __init__
    self.handle()
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/accumulators.py", line 266, in handle
    poll(authenticate_and_accum_updates)
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/accumulators.py", line 241, in poll
    if func():
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/accumulators.py", line 254, in authenticate_and_accum_updates
    received_to

In [16]:
"""Collecting all positive normalised vectors into numpy array and reshaping to have each row to contain 
vector from corresponding review sentence """
all_vectors_array=np.array(pca_result_withIndex.select("features_norm").collect())
all_vectors_array=all_vectors_array.reshape((all_vectors_array.shape[0],2))
#all_vectors_array.shape
"""Similarly for negative"""
all_vectors_array_n=np.array(pca_result_withIndex_n.select("features_norm").collect())
all_vectors_array_n=all_vectors_array_n.reshape((all_vectors_array_n.shape[0],2))
#all_vectors_array.shape

In [17]:
"""Defining function to take intra class similarity. It takes dot product between normalised features with the array formed 
in previous steps. We exclude the vector itself from the main array as we aim to find similarity between other vectors. This 
is achieved by droping the column at index of the row feature as took transpose for dot product.Once dot product is done, 
we take 1-dot product to get cosine distance and take average using numpy.mean"""
"""Applying UDF defined for both negative and positive cases"""
def intra_class_similarity(features_vec,index):
    current_vec=np.array(features_vec)[np.newaxis,:]
    all_vec_norm=all_vectors_array.T
    all_vec_norm=np.delete(all_vec_norm, index, axis=1)
    avg_dist=np.mean(1-np.dot(current_vec,all_vec_norm))
    return float(avg_dist)

def intra_class_similarity_n(features_vec,index):
    current_vec=np.array(features_vec)[np.newaxis,:]
    all_vec_norm=all_vectors_array_n.T
    all_vec_norm=np.delete(all_vec_norm, index, axis=1)
    avg_dist=np.mean(1-np.dot(current_vec,all_vec_norm))
    return float(avg_dist)

intra_class_similarity_udf = udf(intra_class_similarity, FloatType())
intra_class_similarity_n_udf = udf(intra_class_similarity_n, FloatType())
pca_result_withIndex=pca_result_withIndex.withColumn("avg_distance", intra_class_similarity_udf(pca_result_withIndex["features_norm"],pca_result_withIndex["index"]))
pca_result_withIndex_n=pca_result_withIndex_n.withColumn("avg_distance", intra_class_similarity_n_udf(pca_result_withIndex_n["features_norm"],pca_result_withIndex_n["index"]))

In [18]:
pca_result_upd=pca_result_withIndex.select("index","review_id","review_body_sentences_joined","filtered","features_norm","avg_distance").sort(col("avg_distance"))
pca_result_upd_n=pca_result_withIndex_n.select("index","review_id","review_body_sentences_joined","filtered","features_norm","avg_distance").sort(col("avg_distance"))
#pca_result_upd.show(2, truncate=False)

In [19]:
"""Extracting case center by sorting in ascending based on average distance"""
positive_center=pca_result_upd.select("index","review_id","features_norm","avg_distance",col("review_body_sentences_joined").alias("center_sentence_text")).limit(1)
negative_center=pca_result_upd_n.select("index","review_id","features_norm","avg_distance",col("review_body_sentences_joined").alias("center_sentence_text")).limit(1)

In [20]:
#print("Maximum value of avg_distance for positive is ",pca_result_upd.agg({"avg_distance": "max"}).collect()[0])

In [21]:
"""Using center identified, we apply the function similar to previous one, to identify distance and sort in ascending order. 
The first element is considered to be iself as vector will have 0 distance(least) with itself. """

def ten_most_similar(features_vec,index):
    current_vec=np.array(features_vec)[np.newaxis,:]
    all_vec_norm=all_vectors_array.T
    #all_vec_norm=np.delete(all_vec_norm, index, axis=1)
    index_list=np.argsort(1-np.dot(current_vec,all_vec_norm))[:,1:11][0]
    return index_list.tolist()

def ten_most_similar_n(features_vec,index):
    current_vec=np.array(features_vec)[np.newaxis,:]
    all_vec_norm=all_vectors_array_n.T
    #all_vec_norm=np.delete(all_vec_norm, index, axis=1)
    index_list=np.argsort(1-np.dot(current_vec,all_vec_norm))[:,1:11][0]
    return index_list.tolist()

ten_most_similar_udf = udf(ten_most_similar, ArrayType(IntegerType()))
ten_most_similar_n_udf = udf(ten_most_similar_n, ArrayType(IntegerType()))
ten_most_similar_df=positive_center.withColumn("10_most_similar_index",ten_most_similar_udf(col("features_norm"),col("index")))
ten_most_similar_df=ten_most_similar_df.drop(col("index")).withColumn("index",explode("10_most_similar_index"))
ten_most_similar_n_df=negative_center.withColumn("10_most_similar_index",ten_most_similar_n_udf(col("features_norm"),col("index")))
ten_most_similar_n_df=ten_most_similar_n_df.drop(col("index")).withColumn("index",explode("10_most_similar_index"))

In [22]:
"""Based on index identified in previous step, we make a join with main dataframe and extract results in required format."""
final_result=ten_most_similar_df.select(col("review_id").alias("center_review_id"),col("avg_distance").alias("center_avg_dist"),col("index"), col("center_sentence_text")).join(pca_result_upd,"index", "inner")
final_result_n=ten_most_similar_n_df.select(col("review_id").alias("center_review_id"),col("avg_distance").alias("center_avg_dist"),col("index"), col("center_sentence_text")).join(pca_result_upd_n,"index", "inner")
#final_result.show()

In [23]:
positive_final=final_result.select("center_review_id","center_sentence_text",col("review_id").alias("10_nearestneighbour_rv_id"), col("review_body_sentences_joined").alias("corresponding_sentence"))
negative_final=final_result_n.select("center_review_id","center_sentence_text",col("review_id").alias("10_nearestneighbour_rv_id"), col("review_body_sentences_joined").alias("corresponding_sentence"))
#df.write.parquet("s3a://bucket-name/shri/test.parquet",mode="overwrite")

In [29]:
"""Result for positive case. It includes center review id, center sentence text, 
10 nearest neighbours review id and their corresponding sentences"""
positive_final.show()

+----------------+--------------------+-------------------------+----------------------+
|center_review_id|center_sentence_text|10_nearestneighbour_rv_id|corresponding_sentence|
+----------------+--------------------+-------------------------+----------------------+
|  R380QK1PRRA9V5|\\" It's a song w...|            RTEW7ZYKXX2TB|   I mean at least ...|
|  R380QK1PRRA9V5|\\" It's a song w...|           R191KI5DAR1DNF|   Just because bec...|
|  R380QK1PRRA9V5|\\" It's a song w...|           R1X96BRSNMO8OU|   And i hope that ...|
|  R380QK1PRRA9V5|\\" It's a song w...|           R3BT24PSNC8E7J|    Inside there is...|
|  R380QK1PRRA9V5|\\" It's a song w...|            R962EPIVN1H8J|   They seem like t...|
|  R380QK1PRRA9V5|\\" It's a song w...|           R2F2ESD4D4VHRD|  I usually dislike...|
|  R380QK1PRRA9V5|\\" It's a song w...|           R3PPZZTVMCTOGD|    not all music c...|
|  R380QK1PRRA9V5|\\" It's a song w...|           R3EJV8X5FA1MUC|    Still Joel, Ben...|
|  R380QK1PRRA9V5|\\"

In [30]:
"""Result for negative case. It includes center review id, center sentence text, 
10 nearest neighbours review id and their corresponding sentences"""
negative_final.show()

+----------------+--------------------+-------------------------+----------------------+
|center_review_id|center_sentence_text|10_nearestneighbour_rv_id|corresponding_sentence|
+----------------+--------------------+-------------------------+----------------------+
|  R24EC70OSKLM9S|Yes! Another anti...|           R1JV7HP59NBLEF|   it must really s...|
|  R24EC70OSKLM9S|Yes! Another anti...|            R2C39UD7SM4S7|    It's a [weak] e...|
|  R24EC70OSKLM9S|Yes! Another anti...|            R1H0G8SIFXMTV|   With this new cd...|
|  R24EC70OSKLM9S|Yes! Another anti...|            RPRKZI8RFOP4Q|   White Strpes are...|
|  R24EC70OSKLM9S|Yes! Another anti...|           R1RC2DNAX46F37|  What ever happene...|
|  R24EC70OSKLM9S|Yes! Another anti...|           R2ENY6FUCP8901|  Well looking at t...|
|  R24EC70OSKLM9S|Yes! Another anti...|           R1J1X1BIVUG431|   Just keep tellin...|
|  R24EC70OSKLM9S|Yes! Another anti...|           R1SFOZZMB69VDS|   Just keep tellin...|
|  R24EC70OSKLM9S|Yes

In [31]:
positive_center.show(truncate=False)

+-----+--------------+----------------------------------------+------------+---------------------------------------------------------------------------------------------------------------------------------------------------------+
|index|review_id     |features_norm                           |avg_distance|center_sentence_text                                                                                                                                     |
+-----+--------------+----------------------------------------+------------+---------------------------------------------------------------------------------------------------------------------------------------------------------+
|7728 |R380QK1PRRA9V5|[0.39348735810627605,0.9193300272538384]|0.9360323   |\\" It's a song where you're feeling depressed and no one cares and they are saying Hold On things will get better and we all have [things] to go through|
+-----+--------------+----------------------------------------+------------+

In [27]:
negative_center.show()

+-----+--------------+--------------------+------------+--------------------+
|index|     review_id|       features_norm|avg_distance|center_sentence_text|
+-----+--------------+--------------------+------------+--------------------+
|  313|R24EC70OSKLM9S|[0.16685070289897...|   0.8396373|Yes! Another anti...|
+-----+--------------+--------------------+------------+--------------------+

In [28]:
end_time=datetime.datetime.now()
print("Execution time:",(end_time-start_time))

Execution time: 0:06:05.498508