In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit, col, row_number, udf, monotonically_increasing_id
from pyspark.sql.types import DoubleType, IntegerType
from geopy.distance import geodesic
from pyspark.sql.window import Window
import json

In [2]:
# Start (or reuse) the Spark session that every later cell relies on.
spark = (
    SparkSession.builder
    .appName("Nearest Place Data")
    .getOrCreate()
)

24/06/04 23:49:43 WARN Utils: Your hostname, Shofiyyahs-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 192.168.0.17 instead (on interface en0)
24/06/04 23:49:43 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/06/04 23:50:14 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/06/04 23:50:14 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
24/06/04 23:50:14 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [3]:
def load_json_to_df(file_path, category):
    """Load a places JSON file into a Spark DataFrame tagged with a category.

    The file must hold a top-level ``results`` list whose entries carry
    ``place_id`` and ``geometry.location.lat`` / ``geometry.location.lng``.

    Args:
        file_path: Path to the JSON file on the driver's local filesystem.
        category: Label stored in the ``category`` column of every row.

    Returns:
        A DataFrame with the columns ``latitude`` (double), ``longitude``
        (double), ``place_id`` (string) and ``category``, in that order.
        An empty ``results`` list yields an empty DataFrame with that schema.

    Raises:
        KeyError: If ``results`` or one of the per-place fields is missing.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Coerce coordinates to float so whole-number values (which json parses
    # as int) still fit the double columns declared below.
    rows = [
        (
            float(result['geometry']['location']['lat']),
            float(result['geometry']['location']['lng']),
            result['place_id'],
        )
        for result in data['results']
    ]

    # An explicit schema avoids the deprecated dict-based schema inference,
    # keeps column types identical across files, and works for empty input
    # (inference fails when there is no first row to inspect).
    df = spark.createDataFrame(
        rows,
        schema="latitude: double, longitude: double, place_id: string",
    )

    # Tag every row with the category it was loaded for.
    return df.withColumn("category", lit(category))

In [5]:
# Load one DataFrame per place category; each file is tagged with its own
# category label by load_json_to_df.
df_gym, df_museum, df_restaurant = (
    load_json_to_df(path, label)
    for path, label in (
        ('data/gym.json', 'gym'),
        ('data/museum.json', 'museum'),
        ('data/restaurant.json', 'restaurant'),
    )
)

                                                                                

In [6]:
# Stack the three category frames into a single places table.
# unionByName matches columns by name (plain union matches by position), so a
# column-order difference between the inputs cannot silently mix up fields.
df_places = (
    df_gym
    .unionByName(df_museum)
    .unionByName(df_restaurant)
)

In [7]:
# Read the meetup posts; later cells use their post_id, lat and lon columns.
df_meetup = spark.read.format("json").load("data/post.json")

In [8]:
def calculate_distance(lat1, lon1, lat2, lon2):
    """Return the geodesic distance in kilometres between two points.

    Args:
        lat1, lon1: Latitude and longitude of the first point.
        lat2, lon2: Latitude and longitude of the second point.

    Returns:
        The distance in kilometres as computed by ``geopy``'s ``geodesic``,
        or ``None`` when any coordinate is ``None``.
    """
    # Spark passes SQL NULLs to Python UDFs as None; propagate the null
    # instead of letting geodesic raise and fail the whole job.
    if lat1 is None or lon1 is None or lat2 is None or lon2 is None:
        return None
    return geodesic((lat1, lon1), (lat2, lon2)).kilometers

In [9]:
# Wrap the Python distance function as a Spark UDF that returns a double.
distance_udf = udf(f=calculate_distance, returnType=DoubleType())

In [10]:
# Prefix the coordinate columns with "place_" so they stay distinguishable
# from the meetup coordinates once the two tables are combined.
# (The former place_id -> place_id rename was a no-op and has been removed.)
df_places_renamed = (
    df_places
    .withColumnRenamed("latitude", "place_latitude")
    .withColumnRenamed("longitude", "place_longitude")
)

In [11]:
# Add the same constant "key" column (value 1) to both frames; an equi-join
# on a constant key pairs every row of one frame with every row of the other.
df_meetup_with_index, df_places_with_index = (
    frame.withColumn("key", lit(1))
    for frame in (df_meetup, df_places_renamed)
)

In [12]:
# Pair every meetup post with every place.
# crossJoin states the Cartesian product directly, replacing the equi-join on
# a constant dummy key followed by dropping that key. The resulting columns
# are the meetup columns followed by the place columns.
df_cross = df_meetup.crossJoin(df_places_renamed)

In [13]:
# Compute the meetup-to-place distance (kilometres) for every pair.
df_cross = df_cross.withColumn(
    "distance",
    distance_udf("lat", "lon", "place_latitude", "place_longitude"),
)

In [14]:
# Keep only the columns needed downstream: the meetup coordinates become
# from_latitude / from_longitude and the place coordinates go back to plain
# latitude / longitude.
df_cross = df_cross.select(
    "post_id",
    col("lat").alias("from_latitude"),
    col("lon").alias("from_longitude"),
    "place_id",
    col("place_latitude").alias("latitude"),
    col("place_longitude").alias("longitude"),
    "distance",
)

In [15]:
# Number of closest places to keep for each meetup post.
NEAREST_PLACES_PER_POST = 4

# Rank the places of each post from nearest to farthest.
# - place_id breaks distance ties so row_number() gives the same ranking every
#   time the plan is re-evaluated (show() and write each recompute it).
# - Null distances sort last so they can never outrank a real distance.
window = Window.partitionBy("post_id").orderBy(
    col("distance").asc_nulls_last(),
    col("place_id"),
)

df_nearest_places = (
    df_cross
    .withColumn("rank", row_number().over(window))
    .filter(col("rank") <= NEAREST_PLACES_PER_POST)
    .drop("rank")
)

In [16]:
# Assign a consecutive surrogate id (1, 2, 3, ...) to every kept row.
# The id order is spelled out (post, then distance, then place_id) instead of
# relying on monotonically_increasing_id(), whose values depend on how the
# data happens to be partitioned and so are not stable between evaluations.
# The window is still unpartitioned, so Spark moves all rows to a single
# partition and logs the WindowExec warning when this runs.
df_nearest_places = df_nearest_places.withColumn(
    "nearest_places_id",
    row_number().over(Window.orderBy("post_id", "distance", "place_id")),
)

In [17]:
# Final output: surrogate id, the meetup post, and the nearby place.
final_df = df_nearest_places.select("nearest_places_id", "post_id", "place_id")

In [18]:
# Preview the first 20 rows without truncating the long place_id strings.
final_df.show(n=20, truncate=False)

24/06/04 23:51:55 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/04 23:51:55 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/04 23:51:55 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/04 23:51:55 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/04 23:51:55 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/04 23:51:57 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/04 2

+-----------------+-------+---------------------------+
|nearest_places_id|post_id|place_id                   |
+-----------------+-------+---------------------------+
|1                |1      |ChIJ12d3ZlmYpBIRStDXELkkdJE|
|2                |1      |ChIJNbGKpPyipBIRWOE2AiC82hc|
|3                |1      |ChIJC6CEtf2ipBIRkFQaF95uL1Q|
|4                |1      |ChIJeYwrA_uipBIRtLLknGLSRQA|
|5                |2      |ChIJBTfBbwCjpBIRK-jjm940utc|
|6                |2      |ChIJa-K8mf6ipBIRY8wAB3EeASc|
|7                |2      |ChIJC6CEtf2ipBIRkFQaF95uL1Q|
|8                |2      |ChIJveK7vP6ipBIRiegu-jsxGX8|
|9                |3      |ChIJMSlpOfGipBIRf5cyHKdpsHg|
|10               |3      |ChIJ4aTqxfaipBIRi9Kz759j6GI|
|11               |3      |ChIJ7z9uFveipBIRzDCxOaCzzSU|
|12               |3      |ChIJfx2I8vaipBIR2bIdtf-roQE|
|13               |4      |ChIJ12d3ZlmYpBIRStDXELkkdJE|
|14               |4      |ChIJNbGKpPyipBIRWOE2AiC82hc|
|15               |4      |ChIJC6CEtf2ipBIRkFQaF

                                                                                

In [20]:
# Persist the result as CSV with a header row. Spark writes a directory of
# part files at this path, not a single file.
# mode="overwrite" replaces any previous output; the default mode raises when
# the path already exists, which would break a second Run All.
final_df.write.csv("data/nearest_places.csv", mode="overwrite", header=True)

24/06/04 23:53:28 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/04 23:53:28 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/04 23:53:28 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/04 23:53:28 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/04 23:53:28 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/04 23:53:29 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/06/04 2