In [1]:
# importing required packages 
from pyspark.sql import SparkSession
from pyspark.ml.feature import HashingTF, IDF, Normalizer, Word2Vec 
from pyspark.ml.linalg import DenseVector, Vectors, VectorUDT
from pyspark.sql.functions import col, explode, udf, concat_ws, collect_list, split
from pyspark.ml.recommendation import ALS
from pyspark.sql.types import DoubleType

In [2]:
# Setting a spark session 
spark = SparkSession.builder.appName("Workload-1").getOrCreate()
sc = spark.sparkContext
sc.defaultParallelism
sc.getConf().getAll()
spark.conf.set('spark.sql.adaptive.enabled',True)
# spark.conf.set('spark.sql.adaptive.coalescePartitions.enabled',True)
# spark.conf.set("spark.sql.shuffle.partitions", 100)

In [3]:
# Reading data
data = spark.read.option("multiline","true").json('tweets.json')
# data.cache()
# data.show(truncate=False)

In [4]:
# Formatting data in required format for tfidf and word2vec
data_required_w1 = data.select('id','replyto_id','retweet_id','user_id').cache()
# data_required_w1.show(3)
data1_with_dr = data_required_w1.withColumn("Document representation",concat_ws(",",col("replyto_id"),col("retweet_id")))
# data1_with_dr.show(5)
data1_grouped_dr = data1_with_dr.groupBy("user_id").agg(concat_ws(',', collect_list("Document representation"))).withColumnRenamed("concat_ws(,, collect_list(Document representation))","Document_representation")
# data1_grouped_dr.show(truncate = False)
data1_required_format = data1_grouped_dr.select(col('user_id'),split(col("Document_representation"),",").alias("document_representation"))
# print(data1_required_format.dtypes)
# data1_required_format.show(truncate=False)

In [5]:
# UDF to calculate cosine similarity 
cosine_similarity = udf(lambda x,y: float(x.dot(y)/(x.norm(2)*y.norm(2))), DoubleType())

In [6]:
# Building a tf-idf model 
hashingTF = HashingTF(inputCol="document_representation", outputCol="term_frequency",numFeatures=16384)
tf_model = hashingTF.transform(data1_required_format)
tf_model.cache()
idf_model = IDF(inputCol="term_frequency", outputCol="converted_features").fit(tf_model)
tfidf_matrix = idf_model.transform(tf_model)

# tfidf_matrix.show(truncate = False)

In [7]:
# Selecting a random row 
random_row = tfidf_matrix.rdd.takeSample(False, 1, seed=42)
# random_row
uid = random_row[0][0]
uid_feature = random_row[0][3]
feature_of_selected_uid = udf(lambda: uid_feature, VectorUDT())
data1_tfidf_for_cosine_similarity = tfidf_matrix.withColumn("feature_of_selected_uid", feature_of_selected_uid())

In [8]:
# Calculating cosine similarity for tf-idf
data_tfidf_cosine_similarity = data1_tfidf_for_cosine_similarity.withColumn('cosine_similarity',cosine_similarity(data1_tfidf_for_cosine_similarity.converted_features,data1_tfidf_for_cosine_similarity.feature_of_selected_uid))

In [9]:
# Final required result
print('Top 5 users with similar interest to user id :',uid,' with tfidf feature extractor')
data_tfidf_cosine_similarity.sort(col("cosine_similarity").desc()).select('user_id').show(5,truncate=False)

Top 5 users with similar interest to user id : 9223062  with tfidf feature extractor
+-------------------+
|user_id            |
+-------------------+
|48698695           |
|3345780705         |
|17105179           |
|546060993          |
|1000400513219768321|
+-------------------+
only showing top 5 rows



In [10]:
# Building the word2vec model
word2vec = Word2Vec(vectorSize=15, minCount=1, inputCol="document_representation", outputCol="word2vec_feature")
word2vec_model = word2vec.fit(data1_required_format)
word2vec_matrix = word2vec_model.transform(data1_required_format)
# word2vec_matrix.show(10,truncate=False)

In [11]:
# selecting a random row
random_row_w2v = word2vec_matrix.rdd.takeSample(False, 1, seed=42)
# random_row_w2v
user_id_w2v = random_row_w2v[0][1]
uid_feature_w2v = random_row_w2v[0][2]
feature_of_selected_uid_w2v = udf(lambda: uid_feature_w2v, VectorUDT())
data_w2v_for_cosine_similarity = word2vec_matrix.withColumn("feature_of_selected_uid", feature_of_selected_uid_w2v())
# data_w2v_for_cosine_similarity.show(truncate = False)
# data_w2v_for_cosine_similarity.dtypes

In [12]:
# Calculating cosine similaity for word2vec
data_w2v_cosine_similarity = data_w2v_for_cosine_similarity.withColumn('cosine_similarity',cosine_similarity(data_w2v_for_cosine_similarity.word2vec_feature ,data_w2v_for_cosine_similarity.feature_of_selected_uid ))
# data_w2v_cosine_similarity.dtypes
# data_w2v_cosine_similarity.show(truncate=False)

In [13]:
print('Top 5 users with similar interest to user id :',user_id_w2v[0],' with word2vec feature extractor')
data_w2v_cosine_similarity.sort(col("cosine_similarity").desc()).select('user_id').show(5,truncate=False)

Top 5 users with similar interest to user id : 1390027514332991489  with word2vec feature extractor
+--------+
|user_id |
+--------+
|14880939|
|17547658|
|14997273|
|15866004|
|17473883|
+--------+
only showing top 5 rows

