In [1]:
import json
from urllib.parse import unquote

from geopy.distance import geodesic
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit, col, row_number, udf
from pyspark.sql.types import DoubleType
from pyspark.sql.window import Window

In [2]:
# Create (or reuse) the Spark session shared by the rest of the notebook
spark = (
    SparkSession.builder
    .appName("Place Data")
    .getOrCreate()
)

def load_json_to_df(file_path, category):
    """Load a places JSON file into a Spark DataFrame tagged with a category.

    The file must hold an object with a ``results`` list whose items each carry
    ``name`` and ``geometry.location.lat`` / ``geometry.location.lng``.

    Args:
        file_path: Path of the JSON file to read.
        category: Label written to the ``category`` column of every row.

    Returns:
        DataFrame with columns ``latitude`` (double), ``longitude`` (double),
        ``name`` (string) and ``category`` (string), in that order.

    Raises:
        KeyError: If ``results`` or one of the per-place fields is missing.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default
    # encoding, which could garble accented place names on some systems.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # One (latitude, longitude, name) tuple per place. float() normalises
    # coordinates that JSON encodes as integers so they fit the double columns.
    places = [
        (
            float(result['geometry']['location']['lat']),
            float(result['geometry']['location']['lng']),
            result['name'],
        )
        for result in data['results']
    ]

    # An explicit schema keeps column order and types stable and still works
    # when the file has no results (schema inference fails on empty input).
    df = spark.createDataFrame(
        places,
        schema="latitude double, longitude double, name string",
    )

    # Add category column
    return df.withColumn("category", lit(category))

24/05/28 23:20:06 WARN Utils: Your hostname, Shofiyyahs-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 192.168.0.17 instead (on interface en0)
24/05/28 23:20:06 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/28 23:20:36 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Build one DataFrame per place category (gyms, museums, restaurants),
# pairing each source file with the label stored in its `category` column
df_gym, df_museum, df_restaurant = (
    load_json_to_df(path, category)
    for path, category in (
        ('gym.json', 'gym'),
        ('museum.json', 'museum'),
        ('restaurant.json', 'restaurant'),
    )
)

                                                                                

In [4]:
# Stack the three category DataFrames into a single one
# (union matches columns by position, and all three share the same layout)
df_gym_museum = df_gym.union(df_museum)
df_places = df_gym_museum.union(df_restaurant)

# Preview the first row only
df_places.show(n=1)

+----------+---------+--------------------+--------+
|  latitude|longitude|                name|category|
+----------+---------+--------------------+--------+
|41.3996418|2.1197347|Body Lab Fitness ...|     gym|
+----------+---------+--------------------+--------+
only showing top 1 row



## Meetup

In [5]:
# Load the meetup JSON file
file_path_meetup = 'meetup.json'
# JSON is UTF-8 by specification; pass the encoding explicitly so event titles
# decode the same way regardless of the platform's default encoding.
with open(file_path_meetup, 'r', encoding='utf-8') as file:
    meetup_data = json.load(file)

In [6]:
# Extract event names and coordinates from the Google Maps links
meetup_places = []
for event in meetup_data:
    name = event['title']
    gmaps_link = event['gmaps_link']
    # Take the value of the `query` parameter (dropping any parameters after it)
    # and percent-decode it, so "lat%2C%20lng", "lat%2Clng" and "lat,lng" all parse.
    query_value = unquote(gmaps_link.split('query=', 1)[1].split('&', 1)[0])
    lat, lng = query_value.split(',')
    # float() ignores the whitespace left around each number after decoding
    meetup_places.append({'name': name, 'latitude': float(lat), 'longitude': float(lng)})

In [7]:
# Convert to RDD and DataFrame
rdd_meetup = spark.sparkContext.parallelize(meetup_places)
# Pass the schema explicitly instead of inferring it from the first row: this
# keeps column order and types fixed and also works when no events were parsed
# (inference fails on an empty RDD). Dict rows are matched to fields by key.
df_meetup = spark.createDataFrame(
    rdd_meetup,
    schema="latitude double, longitude double, name string",
)

df_meetup.show(1)

+--------+---------+--------------------+
|latitude|longitude|                name|
+--------+---------+--------------------+
|41.39655| 2.194162|Coffee Walk & Bea...|
+--------+---------+--------------------+
only showing top 1 row



## Nearest Places

In [8]:
# Function to calculate distance
def calculate_distance(lat1, lon1, lat2, lon2):
    """Return the geodesic distance in kilometres between two points.

    Args:
        lat1, lon1: Latitude and longitude of the first point.
        lat2, lon2: Latitude and longitude of the second point.

    Returns:
        The distance in kilometres, or ``None`` when any coordinate is ``None``.
    """
    # This function is wrapped as a Spark UDF, where null column values arrive
    # as None; passing them on to geodesic() would raise and abort the job, so
    # yield a null distance for such rows instead.
    if lat1 is None or lon1 is None or lat2 is None or lon2 is None:
        return None
    return geodesic((lat1, lon1), (lat2, lon2)).kilometers

# Register UDF
distance_udf = udf(calculate_distance, DoubleType())

# Number of nearest places to keep for each meetup location
NEAREST_PLACES_PER_MEETUP = 4

# Rename columns in df_places to avoid ambiguity with the meetup columns
df_places_renamed = (
    df_places
    .withColumnRenamed("latitude", "place_latitude")
    .withColumnRenamed("longitude", "place_longitude")
    .withColumnRenamed("name", "place_name")
)

# Pair every meetup with every place (Cartesian product)
df_cross = df_meetup.crossJoin(df_places_renamed)

# Calculate distances
df_cross = df_cross.withColumn(
    "distance",
    distance_udf(
        col("latitude"),
        col("longitude"),
        col("place_latitude"),
        col("place_longitude"),
    ),
)

# Select the relevant columns
df_cross = df_cross.select(
    col("name").alias("from_place"),
    col("latitude").alias("from_latitude"),
    col("longitude").alias("from_longitude"),
    col("place_name").alias("name"),
    col("place_latitude").alias("latitude"),
    col("place_longitude").alias("longitude"),
    col("distance")
)

# Remove duplicates BEFORE ranking. The select above drops `category`, so a
# place listed under several categories appears as identical rows; if they were
# removed only after the top-N filter, the copies would use up rank slots and
# leave fewer than N distinct places for that meetup.
df_cross_unique = df_cross.dropDuplicates(["from_place", "name", "latitude", "longitude", "distance"])

# Find the nearest places for each meetup location; `name` is a tie-breaker so
# equal distances are ranked the same way on every run
window = (
    Window
    .partitionBy("from_place", "from_latitude", "from_longitude")
    .orderBy("distance", "name")
)
df_nearest_places = (
    df_cross_unique
    .withColumn("rank", row_number().over(window))
    .filter(col("rank") <= NEAREST_PLACES_PER_MEETUP)
    .drop("rank")
)

# Sort by from_place and distance
df_nearest_places_sorted = df_nearest_places.orderBy(col("from_place"), col("distance"))

# Show the DataFrame
df_nearest_places_sorted.show()

24/05/28 23:20:47 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors

+--------------------+-------------+--------------+--------------------+-----------------+---------+-------------------+
|          from_place|from_latitude|from_longitude|                name|         latitude|longitude|           distance|
+--------------------+-------------+--------------+--------------------+-----------------+---------+-------------------+
|#12 Meetup - UX S...|    41.397804|      2.159702|         Eurofitness|        41.399545|2.1617254| 0.2569419896415083|
|#12 Meetup - UX S...|    41.397804|      2.159702|La Pedrera-Casa Milà|       41.3952155|2.1619024|0.34133375576165587|
|#12 Meetup - UX S...|    41.397804|      2.159702|VivaGym Vía Augus...|       41.3969074|2.1544011|0.45434930668777357|
|#12 Meetup - UX S...|    41.397804|      2.159702|           Yesterday|       41.3996209|2.1545211|0.47794496916730517|
|(FRESH)    NETWOR...|     41.40046|      2.196119|Design Museum of ...|       41.4024531|2.1880815| 0.7076297832326779|
|(FRESH)    NETWOR...|     41.40

                                                                                