In [1]:
import warnings
warnings.filterwarnings('ignore')
import math
from pyspark import SparkContext

In [3]:
sc = SparkContext.getOrCreate()
capitalsRDD = sc.textFile("/Users/halilergul/Desktop/master/fall-23_24/datasets-20231023/Capitals.txt")
print(capitalsRDD.take(1))

['Afghanistan Flag Icon \tAfghanistan \tKabul \t34,53 \t69,17\t']


In [4]:
# This will extract capital name, latitude and longitude
def lineparser(line):
    parts = line.split("\t")
    country = parts[1]
    city = parts[2]
    lat, long = parts[3].split(","), parts[4].split(",")
    return (country, city, float(lat[0]) + float(lat[1]), float(long[0]) + float(long[1])) 

#A new RDD with the parsed data
cities = capitalsRDD.map(lineparser)
cities.take(5)

[('Afghanistan ', 'Kabul ', 87.0, 86.0),
 ('Albania ', 'Tirana ', 74.0, 101.0),
 ('Algeria ', 'Algiers ', 111.0, 7.0),
 ('American Samoa ', 'Pago Pago ', 14.0, -100.0),
 ('Andorra ', 'Andorra la Vella ', 93.0, 53.0)]

In [5]:
city_pairs = cities.cartesian(cities).filter(lambda x: x[0][1] != x[1][1]) #This creates an RDD of all possible combinations (pairs) of cities
# then ,lambda x: x[0][1] != x[1][1] is used to filter out pairs where the second element (the city name) of the first tuple is the same as the second element of the second tuple.
print(city_pairs.take(1))
print(city_pairs.count()) # there are 57838 combinations of cities

                                                                                

[(('Afghanistan ', 'Kabul ', 87.0, 86.0), ('Albania ', 'Tirana ', 74.0, 101.0))]
57838


                                                                                

In [6]:
# Getting cartesian product
city_pairs = cities.cartesian(cities).filter(lambda x: x[0][1] != x[1][1])
# This is the haversine formula to calculate the distance between two points on a sphere (I took it from the internet)
def haversine(lon1, lat1, lon2, lat2): 
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = math.sin(dlat/2)**2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon/2)**2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1-a))
    km = 6371 * c # 6371 is the radius of the Earth
    return km

def distance(pair):
    city1, city2 = pair
    lat1, lon1 = city1[2], city1[3]
    lat2, lon2 = city2[2], city2[3]
    dist = haversine(lon1, lat1, lon2, lat2)
    return ((city1[1], city2[1]), dist)

distances = city_pairs.map(distance)
closest_cities = distances.min(key=lambda x: x[1]) # Finding the closest cities by minimum distance
print(f"I found {closest_cities[0][0]} and {closest_cities[0][1]} with a distance of {closest_cities[1]:.2f}.")


I found Jerusalem  and East Jerusalem  with a distance of 0.00.


                                                                                