In [2]:
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session that every later cell refers to as `spark`.
spark = (
    SparkSession.builder
    .appName("Calculate Distances")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

In [71]:
import numpy as np
import string
from pyspark.sql.functions import concat_ws

def deg2rad(deg):
    """Convert an angle from degrees to radians.

    Only ``/`` and ``*`` are applied to ``deg``, so any operand supporting
    those operators with plain numbers works; this notebook passes Spark
    columns.

    Returns ``deg / 360 * (2 * pi)`` in the same type the operators yield.
    """
    radians_per_turn = 2 * np.pi
    turns = deg / 360
    return turns * radians_per_turn

def append_m_prefix(row):
    """Pair a metro id with its 'm'-prefixed string form.

    ``row`` is anything indexable by the key ``'m_id'``.

    Returns a dict with ``'mid'`` (``'m'`` followed by ``str`` of the id) and
    ``'m_id'`` (the id unchanged), in that insertion order.
    """
    metro_id = row['m_id']
    return {'mid': 'm' + str(metro_id), 'm_id': metro_id}

# Load the urban-area gazetteer: tab-delimited CSV with a header row and inferred column types.
dfMetroAreasRaw = spark.read.load("../rawdata/Gaz_ua_national.txt", format="csv", delimiter="\t", header=True, inferSchema=True)

# Strip surrounding whitespace from the last header name so that column can be
# selected by its clean name below (presumably INTPTLONG -- confirm against the raw file).
last_metro_col = dfMetroAreasRaw.columns[-1]
dfMetroAreasRaw = dfMetroAreasRaw.withColumnRenamed(last_metro_col, last_metro_col.strip(string.whitespace))

# Keep only the columns used downstream and give them the notebook's m_* names.
dfMetroAreas = (
    dfMetroAreasRaw
    .select('GEOID', 'NAME', 'UATYPE', 'POP10', 'HU10', 'ALAND_SQMI',
            'AWATER_SQMI', 'INTPTLAT', 'INTPTLONG')
    .withColumnRenamed('GEOID', 'm_id')
    .withColumnRenamed('NAME', 'name')
    .withColumnRenamed('POP10', 'm_pop')
    .withColumnRenamed('HU10', 'm_house_unit')
    .withColumnRenamed('ALAND_SQMI', 'm_land')
    .withColumnRenamed('AWATER_SQMI', 'm_water')
    .withColumnRenamed('INTPTLAT', 'm_lat_d')
    .withColumnRenamed('INTPTLONG', 'm_long_d')
)

# Add radian versions of the coordinates next to the degree columns.
dfMetroAreas = dfMetroAreas.withColumn('m_lat_r', deg2rad(dfMetroAreas.m_lat_d)).withColumn('m_long_r', deg2rad(dfMetroAreas.m_long_d))

# Prepend the 'm'-prefixed string id as the first column. It is built as a column
# expression inside Spark, so no per-row Python mapping and no join on m_id are
# needed. Rows whose m_id is null are excluded, as an inner join on m_id would do.
dfMetroAreas = (
    dfMetroAreas
    .where("m_id IS NOT NULL")
    .selectExpr("concat('m', cast(m_id AS string)) AS mid", "*")
)

# coalesce(1) makes Spark write a single part file inside the output directory.
# NOTE(review): absolute, machine-specific output path -- consider a configurable base directory.
dfMetroAreas.coalesce(1).write.csv("/media/keir/Data/workspace/project/metroareas.csv",header=True,mode='overwrite')



In [41]:
def append_z_prefix(row):
    """Pair a ZCTA id with its 'z'-prefixed string form.

    ``row`` is anything indexable by the key ``'z_id'``.

    Returns a dict with ``'zid'`` (``'z'`` followed by ``str`` of the id) and
    ``'z_id'`` (the id unchanged), in that insertion order.
    """
    zcta_id = row['z_id']
    return {'zid': 'z' + str(zcta_id), 'z_id': zcta_id}

# Load the ZCTA gazetteer: tab-delimited CSV with a header row and inferred column types.
dfZCTAsRaw = spark.read.load("../rawdata/Gaz_zcta_national.txt", format="csv", delimiter="\t", header=True, inferSchema=True)

# Strip surrounding whitespace from the last header name so that column can be
# selected by its clean name below (presumably INTPTLONG -- confirm against the raw file).
last_zcta_col = dfZCTAsRaw.columns[-1]
dfZCTAsRaw = dfZCTAsRaw.withColumnRenamed(last_zcta_col, last_zcta_col.strip(string.whitespace))

# Keep only the columns used downstream and give them the notebook's z_* names.
dfZCTAs = (
    dfZCTAsRaw
    .select('GEOID', 'POP10', 'HU10', 'ALAND_SQMI', 'AWATER_SQMI', 'INTPTLAT', 'INTPTLONG')
    .withColumnRenamed('GEOID', 'z_id')
    .withColumnRenamed('POP10', 'z_pop')
    .withColumnRenamed('HU10', 'z_house_unit')
    .withColumnRenamed('ALAND_SQMI', 'z_land')
    .withColumnRenamed('AWATER_SQMI', 'z_water')
    .withColumnRenamed('INTPTLAT', 'z_lat_d')
    .withColumnRenamed('INTPTLONG', 'z_long_d')
)

# Add radian versions of the coordinates next to the degree columns.
dfZCTAs = dfZCTAs.withColumn('z_lat_r', deg2rad(dfZCTAs.z_lat_d)).withColumn('z_long_r', deg2rad(dfZCTAs.z_long_d))

# Prepend the 'z'-prefixed string id as the first column. It is built as a column
# expression inside Spark, so no per-row Python mapping and no join on z_id are
# needed. Rows whose z_id is null are excluded, as an inner join on z_id would do.
dfZCTAs = (
    dfZCTAs
    .where("z_id IS NOT NULL")
    .selectExpr("concat('z', cast(z_id AS string)) AS zid", "*")
)

dfZCTAs.show(5)

# coalesce(1) makes Spark write a single part file inside the output directory.
# NOTE(review): absolute, machine-specific output path -- consider a configurable base directory.
dfZCTAs.coalesce(1).write.csv("/media/keir/Data/workspace/project/zctas.csv",header=True,mode='overwrite')



+----+----+-----+------------+------+-------+---------+----------+-------------------+-------------------+
| zid|z_id|z_pop|z_house_unit|z_land|z_water|  z_lat_d|  z_long_d|            z_lat_r|           z_long_r|
+----+----+-----+------------+------+-------+---------+----------+-------------------+-------------------+
|z601| 601|18570|        7744|64.348|  0.309|18.180555|-66.749961|0.31731054458991764|-1.1650065950278068|
|z602| 602|41520|       18073|30.613|  1.717|18.362268| -67.17613| 0.3204820347335941|-1.1724446472477383|
|z603| 603|54689|       25653|31.614|  0.071|18.455183|-67.119887| 0.3221037074080847|-1.1714630217165392|
|z606| 606| 6615|        2877|42.309|  0.005|18.158345|-66.932911|0.31692290696304976|-1.1681996748943304|
|z610| 610|29016|       12618|35.916|  1.611|18.290955|-67.125868| 0.3192373880841194| -1.171567409859101|
+----+----+-----+------------+------+-------+---------+----------+-------------------+-------------------+
only showing top 5 rows



In [66]:
# Wikipedia, "Great-circle distance," https://en.wikipedia.org/wiki/Great-circle_distance retrieved 12/9/2016
# NOTE: this import rebinds the name `abs` to the Spark column function for the
# rest of the kernel session, shadowing Python's builtin abs().
from pyspark.sql.functions import acos, cos, sin, abs, isnan, when

# The join below has no join condition (every ZCTA paired with every metro area);
# Spark 2.x refuses such a Cartesian product unless this flag is enabled.
spark.conf.set("spark.sql.crossJoin.enabled", True)

# Optional filter that was tried on the metro side and left disabled:
#.where(dfMetroAreas.m_pop>20000)

dfDist = dfZCTAs.select('zid','z_lat_r','z_long_r').join(dfMetroAreas.select('mid','m_lat_r','m_long_r'))

RMETERS = 6371000 #meters
RMILES = RMETERS*0.000621371  # same radius converted from meters to miles

# Spherical law of cosines: cosine of the central angle between the two points.
cos_central_angle = (
    sin(dfDist.z_lat_r)*sin(dfDist.m_lat_r)+
    cos(dfDist.z_lat_r)*cos(dfDist.m_lat_r)*cos(abs(dfDist.z_long_r-dfDist.m_long_r))
)

# Floating-point rounding can push the cosine a hair outside [-1, 1] for
# (near-)coincident or antipodal points, and acos() then returns NaN instead of
# 0 or pi. Clamp it back into range. Null inputs fall through to otherwise()
# and stay null; NaN is passed through explicitly because Spark orders NaN
# above every number, so `NaN > 1.0` would otherwise match the upper clamp.
cos_clamped = (
    when(isnan(cos_central_angle), cos_central_angle)
    .when(cos_central_angle > 1.0, 1.0)
    .when(cos_central_angle < -1.0, -1.0)
    .otherwise(cos_central_angle)
)

# Arc length = central angle (radians) * sphere radius, here in miles.
dfDist = dfDist.withColumn('dist', acos(cos_clamped)*RMILES)
dfDist = dfDist.select('zid','mid','dist')
# NOTE: count() evaluates the whole cross join once and the write below evaluates it again.
print(dfDist.count())
# NOTE(review): absolute, machine-specific output path -- consider a configurable base directory.
dfDist.coalesce(1).write.csv("/media/keir/Data/workspace/project/gc_distance.csv",header=True,mode='overwrite')

118967040


In [74]:
# Read the node table; without inferSchema every column is loaded as a string.
dfZData = spark.read.csv("../systemg/nodes.csv",header=True)

# Prepend the 'z'-prefixed id as the first column. It is built as a column
# expression inside Spark, so no per-row Python mapping and no join on z_id are
# needed. Rows whose z_id is null are excluded, as an inner join on z_id would do.
dfZData = (
    dfZData
    .where("z_id IS NOT NULL")
    .selectExpr("concat('z', cast(z_id AS string)) AS zid", "*")
)
dfZData.printSchema()
# coalesce(1) makes Spark write a single part file inside the output directory.
# NOTE(review): absolute, machine-specific output path -- consider a configurable base directory.
dfZData.coalesce(1).write.csv("/media/keir/Data/workspace/project/nodes_2.csv",header=True,mode='overwrite')



root
 |-- zid: string (nullable = true)
 |-- z_id: string (nullable = true)
 |-- z_lat_d: string (nullable = true)
 |-- z_long_d: string (nullable = true)
 |-- z_land: string (nullable = true)
 |-- z_water: string (nullable = true)
 |-- z_pop: string (nullable = true)
 |-- z_households: string (nullable = true)
 |-- z_comm: string (nullable = true)
 |-- z_med_inc: string (nullable = true)
 |-- z_house_unit: string (nullable = true)
 |-- z_comm_miles: string (nullable = true)
 |-- z_comm_miles_ph: string (nullable = true)
 |-- z_carb_ton_ph: string (nullable = true)
 |-- z_pov: string (nullable = true)
 |-- z_per_comm: string (nullable = true)



In [75]:
# Number of rows in dfZData; as the cell's last expression the value is displayed as output.
dfZData.count()

26170

In [5]:
def append_m_prefix(row):
    """Return ``{'mid': 'm' + str(id), 'm_id': id}`` for ``id = row['m_id']``.

    ``row`` only needs to support lookup by the key ``'m_id'``; the original
    id is carried through unchanged next to its prefixed string form.
    """
    original_id = row['m_id']
    prefixed_id = 'm' + str(original_id)
    return dict(mid=prefixed_id, m_id=original_id)

def append_z_prefix(row):
    """Return ``{'zid': 'z' + str(id), 'z_id': id}`` for ``id = row['z_id']``.

    ``row`` only needs to support lookup by the key ``'z_id'``; the original
    id is carried through unchanged next to its prefixed string form.
    """
    original_id = row['z_id']
    prefixed_id = 'z' + str(original_id)
    return dict(zid=prefixed_id, z_id=original_id)

# Driving-distance table; the code below reads its z_id, m_id and ddist columns.
dfDDist = spark.read.parquet("../processeddata/driv_dist.parquet")

# Replace the raw ids with their prefixed string forms ('z...' / 'm...') in one
# projection. This needs no distinct-id lookup tables and no joins back onto the
# distance table. Rows with a null z_id or m_id are excluded, as inner joins on
# those keys would do.
# NOTE(review): cast(... AS string) matches Python str() for integer and string
# ids -- confirm the parquet id columns are one of those types.
dfDDist = (
    dfDDist
    .where("z_id IS NOT NULL AND m_id IS NOT NULL")
    .selectExpr(
        "concat('z', cast(z_id AS string)) AS zid",
        "concat('m', cast(m_id AS string)) AS mid",
        "ddist"
    )
)

# coalesce(1) makes Spark write a single part file inside the output directory.
dfDDist.coalesce(1).write.csv("../systemg/driv_dist.csv",header=True,mode='overwrite')

