In [1]:
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("Calculate Distances") \
    .getOrCreate()

In [2]:
import string
PATH_RAWDATA = '../rawdata/'
PATH_PROCESSEDDATA = '../processeddata/'
DRIV_DIST_FN='driv_dist.parquet'
PATH_BING = '../rawdata/bing_results/'
INCOME_FN = 'ACS_15_5YR_S1903b.csv'
FINANCIAL_FN = 'ACS_15_5YR_DP03b.csv'
ZCARBON_FN = 'zcarbon.csv'


In [3]:
dfDist = spark.read.parquet(PATH_PROCESSEDDATA+DRIV_DIST_FN)
dfDist.printSchema()
dfDist.count()

root
 |-- UATYPE: string (nullable = true)
 |-- ddist: double (nullable = true)
 |-- dist: double (nullable = true)
 |-- id: string (nullable = true)
 |-- m_house_unit: long (nullable = true)
 |-- m_id: long (nullable = true)
 |-- m_land: double (nullable = true)
 |-- m_lat_d: double (nullable = true)
 |-- m_lat_r: double (nullable = true)
 |-- m_long_d: double (nullable = true)
 |-- m_long_r: double (nullable = true)
 |-- m_pop: long (nullable = true)
 |-- m_water: double (nullable = true)
 |-- name: string (nullable = true)
 |-- z_house_unit: long (nullable = true)
 |-- z_id: long (nullable = true)
 |-- z_land: double (nullable = true)
 |-- z_lat_d: double (nullable = true)
 |-- z_lat_r: double (nullable = true)
 |-- z_long_d: double (nullable = true)
 |-- z_long_r: double (nullable = true)
 |-- z_pop: long (nullable = true)
 |-- z_water: double (nullable = true)



111727

In [4]:
dfIncomesRaw = spark.read.load(PATH_RAWDATA+INCOME_FN, format="csv", delimiter=",", header=True, inferSchema=True)
dfIncomesRaw.count()
dfIncomesRaw.printSchema()

dfIncomes = dfIncomesRaw.select('GEOid2','HC01_EST_VC02','HC02_EST_VC02')\
                            .withColumnRenamed('GEOid2','z_id')\
                            .withColumnRenamed('HC01_EST_VC02','z_households')\
                            .withColumnRenamed('HC02_EST_VC02', 'z_med_inc')
dfIncomes = dfIncomes.select(dfIncomes.z_id.cast('long'),\
                             dfIncomes.z_households.cast('long'),\
                             dfIncomes.z_med_inc.cast('long'))
dfIncomes.printSchema()

root
 |-- GEOid: string (nullable = true)
 |-- GEOid2: integer (nullable = true)
 |-- GEOdisplaylabel: string (nullable = true)
 |-- HC01_EST_VC02: integer (nullable = true)
 |-- HC01_MOE_VC02: integer (nullable = true)
 |-- HC02_EST_VC02: integer (nullable = true)
 |-- HC02_MOE_VC02: integer (nullable = true)
 |-- HC01_EST_VC04: double (nullable = true)
 |-- HC01_MOE_VC04: double (nullable = true)
 |-- HC02_EST_VC04: integer (nullable = true)
 |-- HC02_MOE_VC04: integer (nullable = true)
 |-- HC01_EST_VC05: double (nullable = true)
 |-- HC01_MOE_VC05: double (nullable = true)
 |-- HC02_EST_VC05: integer (nullable = true)
 |-- HC02_MOE_VC05: integer (nullable = true)
 |-- HC01_EST_VC06: double (nullable = true)
 |-- HC01_MOE_VC06: double (nullable = true)
 |-- HC02_EST_VC06: integer (nullable = true)
 |-- HC02_MOE_VC06: integer (nullable = true)
 |-- HC01_EST_VC07: double (nullable = true)
 |-- HC01_MOE_VC07: double (nullable = true)
 |-- HC02_EST_VC07: integer (nullable = true)
 |-- H

In [5]:
dfFinancialsRaw = spark.read.load(PATH_RAWDATA+FINANCIAL_FN, format="csv", delimiter=",", header=True, inferSchema=True)
dfFinancialsRaw.count()
dfFinancialsRaw.printSchema()

dfFinancials = dfFinancialsRaw.select('Geoid2','HC03_VC161','HC03_VC28')
dfFinancials = dfFinancials.withColumnRenamed('Geoid2','z_id')\
                            .withColumnRenamed('HC03_VC161','z_pov')\
                            .withColumnRenamed('HC03_VC28','z_per_comm')

root
 |-- GEOid: string (nullable = true)
 |-- GEOid2: integer (nullable = true)
 |-- GEOdisplay-label: string (nullable = true)
 |-- HC01_VC03: integer (nullable = true)
 |-- HC02_VC03: integer (nullable = true)
 |-- HC03_VC03: integer (nullable = true)
 |-- HC04_VC03: string (nullable = true)
 |-- HC01_VC04: integer (nullable = true)
 |-- HC02_VC04: integer (nullable = true)
 |-- HC03_VC04: double (nullable = true)
 |-- HC04_VC04: double (nullable = true)
 |-- HC01_VC05: integer (nullable = true)
 |-- HC02_VC05: integer (nullable = true)
 |-- HC03_VC05: double (nullable = true)
 |-- HC04_VC05: double (nullable = true)
 |-- HC01_VC06: integer (nullable = true)
 |-- HC02_VC06: integer (nullable = true)
 |-- HC03_VC06: double (nullable = true)
 |-- HC04_VC06: double (nullable = true)
 |-- HC01_VC07: integer (nullable = true)
 |-- HC02_VC07: integer (nullable = true)
 |-- HC03_VC07: double (nullable = true)
 |-- HC04_VC07: double (nullable = true)
 |-- HC01_VC08: integer (nullable = true

In [6]:
MAX_DDIST = 65
COMM_PER = .25
BUS_DAYS = 250

dfFull=dfDist.join(dfIncomes,dfIncomes.z_id==dfDist.z_id).drop(dfIncomes.z_id).where(dfDist.ddist<MAX_DDIST)
dfCommuteTo = dfFull.groupBy(dfFull.z_id).agg({"m_pop":"sum"}).orderBy(dfFull.z_id)\
                .withColumnRenamed('sum(m_pop)','z_m_tot')
dfFull2 = dfFull.join(dfCommuteTo,dfFull.z_id==dfCommuteTo.z_id).drop(dfCommuteTo.z_id)
dfFull3 = dfFull2.withColumn('z_comm',(dfFull2.z_pop*COMM_PER).cast('long'))\
                .withColumn('z_m_comm',(dfFull2.z_pop*COMM_PER*dfFull2.m_pop/dfFull2.z_m_tot).cast('long'))
dfFull3 = dfFull3.withColumn('z_m_tot_miles_yr',dfFull3.ddist*2*BUS_DAYS*dfFull3.z_m_comm)

dfFull3.printSchema()

root
 |-- UATYPE: string (nullable = true)
 |-- ddist: double (nullable = true)
 |-- dist: double (nullable = true)
 |-- id: string (nullable = true)
 |-- m_house_unit: long (nullable = true)
 |-- m_id: long (nullable = true)
 |-- m_land: double (nullable = true)
 |-- m_lat_d: double (nullable = true)
 |-- m_lat_r: double (nullable = true)
 |-- m_long_d: double (nullable = true)
 |-- m_long_r: double (nullable = true)
 |-- m_pop: long (nullable = true)
 |-- m_water: double (nullable = true)
 |-- name: string (nullable = true)
 |-- z_house_unit: long (nullable = true)
 |-- z_land: double (nullable = true)
 |-- z_lat_d: double (nullable = true)
 |-- z_lat_r: double (nullable = true)
 |-- z_long_d: double (nullable = true)
 |-- z_long_r: double (nullable = true)
 |-- z_pop: long (nullable = true)
 |-- z_water: double (nullable = true)
 |-- z_households: long (nullable = true)
 |-- z_med_inc: long (nullable = true)
 |-- z_id: long (nullable = true)
 |-- z_m_tot: long (nullable = true)
 |--

In [7]:
FUEL_ECON = 21.4 #MPG
CARB_PER_GAL = 0.008887
MILES2CARB = 1/FUEL_ECON*CARB_PER_GAL
PERSONSPERHH = 4

# Normalize Data
dfZData = dfFull3.select('z_id','z_lat_d','z_long_d','z_land','z_water','z_pop','z_households','z_comm','z_med_inc','z_house_unit')\
                    .distinct()
dfTemp = dfFull3.groupBy('z_id').agg({'z_m_tot_miles_yr':'sum'})
dfZData = dfZData.join(dfTemp,dfZData.z_id==dfTemp.z_id).drop(dfZData.z_id).withColumnRenamed('sum(z_m_tot_miles_yr)','z_comm_miles')

dfZData = dfZData.withColumn('z_comm_miles_ph',dfZData.z_comm_miles/dfZData.z_pop*PERSONSPERHH)
dfZData = dfZData.withColumn('z_carb_ton_ph',dfZData.z_comm_miles_ph*MILES2CARB)
dfZData = dfZData.join(dfFinancials,dfZData.z_id==dfFinancials.z_id).drop(dfZData.z_id)

dfZData = dfZData.select('z_id', 'z_lat_d', 'z_long_d', 'z_land', 'z_water', \
                         'z_pop', 'z_households', 'z_comm', 'z_med_inc', 'z_house_unit', \
                         'z_comm_miles', 'z_comm_miles_ph', 'z_carb_ton_ph', 'z_pov', 'z_per_comm')

In [8]:
dfZData.coalesce(1).write.csv(PATH_PROCESSEDDATA + ZCARBON_FN,header=True, mode='overwrite')