In [0]:
import pyspark.sql.functions as F
from elasticsearch import Elasticsearch, helpers
from pyspark.sql.types import *
from pyspark.sql.functions import col, split, randn
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import BinaryClassificationMetrics, MulticlassMetrics
import matplotlib.pyplot as plt
import pandas as pd 
import numpy as np 
from pyspark.sql.window import Window

In [0]:
# Schema for the bus-lines CSV: both columns are read as nullable strings.
schema = StructType([
    StructField("line_num", StringType(), True),
    StructField("sourceAndDest", StringType(), True),
])

lines_df = spark.read.csv('/data/bus_lines_source_destination.csv', header=True, schema=schema)

# Remove exact duplicate rows. The result must be assigned back to lines_df:
# previously it was bound to a misspelled, never-read variable, so the
# de-duplication had no effect on anything downstream.
lines_df = lines_df.dropDuplicates()

display(lines_df)

In [0]:
## Add source and dest cols.
# The splits below assume sourceAndDest looks like "From <source> To <dest>"
# (inferred from the delimiters used here -- confirm against the CSV).
#
# Both columns are derived from the same split on the whole word "To"
# (\bTo\b), so a place name that merely starts with "To" (e.g. "Townsend")
# is not cut in half, and both values are trimmed so source and dest carry
# no leading/trailing whitespace.
to_parts = split(F.col("sourceAndDest"), r"\s*\bTo\b\s*")
lines_df = lines_df.withColumn("source", F.trim(split(to_parts[0], r"\bFrom\b\s*")[1]))
lines_df = lines_df.withColumn("dest", F.trim(to_parts[1]))
lines_df = lines_df.withColumn("line_num", F.upper(F.col("line_num")))
# direction 0 = the direction as written in the file.
lines_df = lines_df.withColumn('direction', F.lit(0))

## Duplicate the frame with source and dest swapped (direction 1).
# The column order matches lines_df exactly because union() below is positional.
other_direction_df = lines_df.select(
    "line_num",
    "sourceAndDest",
    F.col("dest").alias("source"),
    F.col("source").alias("dest"),
    F.lit(1).alias("direction"),
)

## Concat both source-dest directions.
source_dest_df = lines_df.union(other_direction_df)
display(source_dest_df)

In [0]:
# Persist the two-direction source/destination table as CSV with a header row.
# No save mode is set, so Spark's default applies (the write fails if the
# target path already exists).
source_dest_df.write.option("header", True).csv('source_dest_df_1.csv')

In [0]:
# Load the stops file and reduce it to coordinates plus a "busStop" token.
# No schema is supplied, so every column (including the coordinates) is read
# as a string.
# busStop = third space-separated token of the text after the first comma in
# stop_name. Presumably stop_name looks like "<name>, stop <number>", which
# would make busStop the stop number -- TODO confirm against the data.
stops_df = (
    spark.read.csv('/data/stops.txt', header=True)
    .withColumn("busStop", split(split(F.col("stop_name"), ",")[1], " ")[2])
    .select(
        F.col("stop_lat").alias("latitude"),
        F.col("stop_lon").alias("longitude"),
        "busStop",
    )
)
display(stops_df)

In [0]:
# Schema for the attractions CSV: coordinates are doubles, everything else is
# a string; all fields are nullable.
schema = (
    StructType()
    .add("Name", StringType(), True)
    .add("Url", StringType(), True)
    .add("Telephone", StringType(), True)
    .add("Longitude", DoubleType(), True)
    .add("Latitude", DoubleType(), True)
    .add("AddressRegion", StringType(), True)
    .add("AddressLocality", StringType(), True)
    .add("AddressCountry", StringType(), True)
)

# Read the file with the explicit schema and drop rows that are exact duplicates.
attractions_df = (
    spark.read.csv('/data/Attractions.csv', header=True, schema=schema)
    .dropDuplicates()
)
display(attractions_df)

In [0]:
# Loc_A: the coordinates as array<double>, in [Longitude, Latitude] order.
# It is built by joining both values into a comma-separated string and
# splitting it back. concat_ws skips nulls, so a row with a missing coordinate
# yields a shorter array rather than one containing a null.
attractions_df = attractions_df.withColumn(
    "Loc_A", F.concat_ws(",", F.col("Longitude"), F.col("Latitude"))
)
# Raw string for the regex: the plain ",\s*" literal relied on an invalid
# escape sequence, which Python 3 warns about. The pattern itself is unchanged.
attractions_df = attractions_df.withColumn(
    "Loc_A", split(F.col("Loc_A"), r",\s*").cast(ArrayType(DoubleType()))
)

# Keep only attractions tagged with the Dublin region, then drop the region
# and country columns.
attractions_df = attractions_df.filter(F.col("AddressRegion") == 'Dublin')
attractions_df = attractions_df.drop('AddressCountry', 'AddressRegion')

# One row per attraction name (which duplicate survives is not specified).
attractions_df = attractions_df.dropDuplicates(['Name'])

# Explicitly excluded entries -- presumably not actually located in Dublin
# despite the region tag; confirm with the data owner.
attractions_df = attractions_df.filter(
    ~F.col("Name").isin(
        'Keel Blue Flag Beach 2019',
        'Cliffs of Moher Tour - Dublin Tour Company',
    )
)

attractions_df = attractions_df.select(
    "Name", "Longitude", "Latitude", "Loc_A", "AddressLocality"
)
display(attractions_df)

In [0]:
# Expose both frames to Spark SQL under the table names used by the join query.
stops_df.createOrReplaceTempView(name='bus_stops_names')
attractions_df.createOrReplaceTempView(name='attractions_df')

In [0]:
# Pair every attraction with the bus stops that fall inside a +/-0.00120
# (coordinate units) box around it on both axes. It is a LEFT join, so an
# attraction with no stop in range is kept with a null busStop.
# long_dist / lat_dist are the absolute coordinate differences per axis.
attractions_with_busStops = sqlContext.sql(
    "select A.Name, A.Longitude, A.Latitude,A.Loc_A,A.AddressLocality,B.busStop, "
    "abs(A.Longitude-B.longitude) as long_dist, "
    "abs(A.Latitude-B.latitude) as lat_dist "
    "from attractions_df as A "
    "left join bus_stops_names as B "
    "on abs(A.Longitude-B.longitude)<=0.00120 "
    "and abs(A.Latitude-B.latitude)<=0.00120"
)

In [0]:
# Straight-line distance in raw coordinate units: sqrt(long_dist^2 + lat_dist^2).
attractions_with_busStops = attractions_with_busStops.withColumn(
    "distance",
    F.sqrt(F.pow(F.col("long_dist"), 2) + F.pow(F.col("lat_dist"), 2)),
)

# Rank the candidate stops of each attraction from nearest to farthest and
# keep ranks 1-3. rank() gives tied distances the same rank, so an attraction
# can end up with more than three rows.
window = Window.partitionBy("Name").orderBy("distance")
closest_stops_df = (
    attractions_with_busStops
    .withColumn('rank', F.rank().over(window))
    .filter(F.col('rank') <= 3)
)

In [0]:
# Discard every row that contains a null in any column.
closest_stops_df = closest_stops_df.na.drop()

In [0]:
# Final table: remove the helper columns (coordinate array and per-axis
# differences) and show the result.
att_with_stops_df = closest_stops_df.drop(
    "Loc_A",
    "long_dist",
    "lat_dist",
)
display(att_with_stops_df)

In [0]:
# Persist the attraction/bus-stop table as CSV with a header row.
# The file name spelling is kept exactly as it was, since existing readers may
# depend on it. No save mode is set, so Spark's default applies (the write
# fails if the target path already exists).
att_with_stops_df.write.option("header", True).csv('atrractions_with_busStops.csv')