In [1]:
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql import Row

In [2]:
spark = SparkSession.builder.appName("Recommendation System").getOrCreate()

24/05/20 14:28:32 WARN Utils: Your hostname, Shofiyyahs-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 192.168.0.17 instead (on interface en0)
24/05/20 14:28:32 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/20 14:29:02 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
post = spark.read.orc("data/post-20240407.orc")
post.show()

+----------+--------------------+------+--------------------+--------------+
|      time|                text|rating|          place_name|place_category|
+----------+--------------------+------+--------------------+--------------+
|2011-12-30|You need to pay f...|     4|             ALIMARA| accommodation|
|2012-09-26|For &quot;free&qu...|     4|             ALIMARA| accommodation|
|2013-11-11|A wonderful place...|     5|           Agora BCN| accommodation|
|2013-02-28|Very simply furni...|     4|           Agora BCN| accommodation|
|2013-07-26|Nice place for yo...|     4|           Agora BCN| accommodation|
|2011-11-19|It was far from t...|     4|        Inout Hostel| accommodation|
|2011-06-27|This was easily o...|     5|        Inout Hostel| accommodation|
|2011-06-16|Okay, it&#39;s no...|     5|Gran Hotel La Flo...| accommodation|
|2012-08-09|A bit far from ci...|     4|Catalonia Park Güell| accommodation|
|2013-09-20|I had a lovely vi...|     5|Bed and Breakfast...| accommodation|

In [4]:
event = spark.read.orc("data/meetup-20240407.orc")
event.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+---------------------+--------------------+--------------------+
|                 _id|               title|           hosted_by|          event_time|          gmaps_link|      vanue_location|vanue_location_detail|         description|              topics|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+---------------------+--------------------+--------------------+
|{65f8d6b76c9cc986...|Coffee Walk & Bea...|             Mary G.|Friday, April 5, ...|https://www.googl...|          Itnig Café| C. de Pujades, 10...|🗓️ FRI, APR 5 • ...|[Coffee, Social, ...|
|{65f8d6b76c9cc986...|🎧 SILENT ECSTATI...|             Juli B.|Saturday, March 2...|https://www.googl...|      Espigó del Gas|      · Barcelona, CT|Do you imagine da...|[Dance Movement T...|
|{65f8d6b76c9cc986...|Barcelona SRE Mee...

In [5]:
new_post = [Row(time="2023-11-28", text="The hotel is conveniently located in the city center, making it easy to access popular attractions and dining options. The staff were friendly and helpful, always ready to assist with any inquiries or requests.",
               rating=5, place_name="Ibis Hotel", place_category="accommodation")]
new_post = spark.createDataFrame(new_post)

In [6]:
post = post.union(new_post)

In [7]:
post.show()

+----------+--------------------+------+--------------------+--------------+
|      time|                text|rating|          place_name|place_category|
+----------+--------------------+------+--------------------+--------------+
|2011-12-30|You need to pay f...|     4|             ALIMARA| accommodation|
|2012-09-26|For &quot;free&qu...|     4|             ALIMARA| accommodation|
|2013-11-11|A wonderful place...|     5|           Agora BCN| accommodation|
|2013-02-28|Very simply furni...|     4|           Agora BCN| accommodation|
|2013-07-26|Nice place for yo...|     4|           Agora BCN| accommodation|
|2011-11-19|It was far from t...|     4|        Inout Hostel| accommodation|
|2011-06-27|This was easily o...|     5|        Inout Hostel| accommodation|
|2011-06-16|Okay, it&#39;s no...|     5|Gran Hotel La Flo...| accommodation|
|2012-08-09|A bit far from ci...|     4|Catalonia Park Güell| accommodation|
|2013-09-20|I had a lovely vi...|     5|Bed and Breakfast...| accommodation|

## Modelling

In [8]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import pos_tag

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
VERB_CODES = {'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'}

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/shofiyyahnadhiroh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/shofiyyahnadhiroh/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/shofiyyahnadhiroh/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shofiyyahnadhiroh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
def preprocess_sentences(text):
    text = text.lower()
    temp_sent = []
    words = word_tokenize(text)
    tags = pos_tag(words)
    
    for i, word in enumerate(words):
        if tags[i][1] in VERB_CODES:
            lemmatized = lemmatizer.lemmatize(word, 'v')
        else:
            lemmatized = lemmatizer.lemmatize(word)
        if lemmatized not in stop_words and lemmatized.isalpha():
            temp_sent.append(lemmatized)
    
    finalsent = ' '.join(temp_sent)
    finalsent = finalsent.replace("n't", " not")
    finalsent = finalsent.replace("'m", " am")
    finalsent = finalsent.replace("'s", " is")
    finalsent = finalsent.replace("'re", " are")
    finalsent = finalsent.replace("'ll", " will")
    finalsent = finalsent.replace("'ve", " have")
    finalsent = finalsent.replace("'d", " would")
    
    return finalsent

In [10]:
preprocess_udf = udf(preprocess_sentences, StringType())

In [11]:
post = post.withColumn("text_proc", preprocess_udf(post["text"]))
post.show()

[Stage 3:>                                                          (0 + 1) / 1]

+----------+--------------------+------+--------------------+--------------+--------------------+
|      time|                text|rating|          place_name|place_category|           text_proc|
+----------+--------------------+------+--------------------+--------------+--------------------+
|2011-12-30|You need to pay f...|     4|             ALIMARA| accommodation|need pay internet...|
|2012-09-26|For &quot;free&qu...|     4|             ALIMARA| accommodation|quot free quot wi...|
|2013-11-11|A wonderful place...|     5|           Agora BCN| accommodation|wonderful place s...|
|2013-02-28|Very simply furni...|     4|           Agora BCN| accommodation|simply furnished ...|
|2013-07-26|Nice place for yo...|     4|           Agora BCN| accommodation|nice place young ...|
|2011-11-19|It was far from t...|     4|        Inout Hostel| accommodation|far center barcel...|
|2011-06-27|This was easily o...|     5|        Inout Hostel| accommodation|easily one best h...|
|2011-06-16|Okay, it

                                                                                

In [12]:
from pyspark.ml.feature import Tokenizer, CountVectorizer
from pyspark.ml.linalg import Vectors, VectorUDT
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [13]:
tokenizer = Tokenizer(inputCol="text_proc", outputCol="text_proc_tokens")
post = tokenizer.transform(post)

In [14]:
cv = CountVectorizer(inputCol="text_proc_tokens", outputCol="features")

cv_model = cv.fit(post)
post = cv_model.transform(post)

                                                                                

In [15]:
def sparse_to_dense(sparse_vector):
    return Vectors.dense(sparse_vector.toArray())

In [16]:
sparse_to_dense_udf = udf(sparse_to_dense, VectorUDT())
post = post.withColumn("dense_features", sparse_to_dense_udf(post["features"]))

In [17]:
dense_features = post.select("dense_features").collect()
dense_feature_array = np.array([row["dense_features"].toArray() for row in dense_features])
cosine_sim = cosine_similarity(dense_feature_array)
print(cosine_sim)

                                                                                

[[1.         0.         0.10112998 ... 0.07009996 0.09090909 0.14301939]
 [0.         1.         0.04225771 ... 0.04393748 0.11396058 0.05976143]
 [0.10112998 0.04225771 1.         ... 0.2079501  0.13483997 0.03535534]
 ...
 [0.07009996 0.04393748 0.2079501  ... 1.         0.45564976 0.07352146]
 [0.09090909 0.11396058 0.13483997 ... 0.45564976 1.         0.04767313]
 [0.14301939 0.05976143 0.03535534 ... 0.07352146 0.04767313 1.        ]]


In [18]:
latest_post_similarities = cosine_sim[len(dense_feature_array) - 1]

In [19]:
similarity_list = list(enumerate(latest_post_similarities))

In [20]:
sorted_similarities = sorted(similarity_list, key=lambda x: x[1], reverse=True)

In [21]:
sorted_similarities

[(150, 1.0),
 (111, 0.25819888974716115),
 (74, 0.23312620206007845),
 (136, 0.22934123614693147),
 (145, 0.2201927530252721),
 (20, 0.21516574145596762),
 (79, 0.2091650066335189),
 (23, 0.20412414523193148),
 (106, 0.20225995873897265),
 (24, 0.19999999999999998),
 (71, 0.19999999999999998),
 (66, 0.1978141420187361),
 (17, 0.19518001458970666),
 (34, 0.19462473604038075),
 (65, 0.1892057519422899),
 (19, 0.18650096164806276),
 (117, 0.18633899812498247),
 (110, 0.17928429140015906),
 (62, 0.17837651700316892),
 (108, 0.1732050807568877),
 (94, 0.1690308509457033),
 (67, 0.16484511834894675),
 (32, 0.16269784336399212),
 (138, 0.15811388300841897),
 (89, 0.15811388300841894),
 (131, 0.1556997888323046),
 (30, 0.1538967528127731),
 (99, 0.14907119849998596),
 (0, 0.14301938838683884),
 (129, 0.13987572123604708),
 (47, 0.1341640786499874),
 (123, 0.1341640786499874),
 (76, 0.13333333333333333),
 (46, 0.1318760946791574),
 (69, 0.13176156917368248),
 (44, 0.12909944487358055),
 (139, 0

In [22]:
sorted_indices = [idx for idx, similarity in sorted_similarities]
data_list = post.collect()
sorted_data_list = [data_list[idx] for idx in sorted_indices]
sorted_post = spark.createDataFrame(sorted_data_list, schema=post.schema)
sorted_post.select("time", "text", "rating", "place_name", "place_category").show(truncate=False)

                                                                                

+----------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

24/05/20 14:29:21 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
