In [1]:
# Standard library
import warnings
from os import makedirs
from os import path
from shutil import rmtree

# Silence library warnings for the rest of the notebook
warnings.filterwarnings("ignore")

# Third-party
import folium
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyspark.sql.functions as F
import seaborn as sns
from bokeh.io import save, reset_output, output_notebook
from bokeh.plotting import figure, show
from bokeh.tile_providers import get_provider, Vendors
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.functions import col, datediff, round, to_date, lit, desc, date_format, to_timestamp
from pyspark.sql.types import *

# Render matplotlib figures at high resolution
plt.rcParams['figure.dpi'] = 300

# Create (or reuse) the Spark session that runs this notebook's Spark jobs.
spark = SparkSession.builder.getOrCreate()

# `swan_spark_conf` is not defined anywhere in this notebook; presumably the hosting
# environment injects it (TODO confirm). Fall back to Spark's default configuration
# when it is absent so a fresh kernel elsewhere does not stop here with a NameError.
try:
    spark_conf = swan_spark_conf
except NameError:
    spark_conf = None

# NOTE(review): the session created above has presumably already started a context,
# in which case getOrCreate returns that one and `conf` is not applied — confirm.
sc = SparkContext.getOrCreate(conf=spark_conf)  # Start (or fetch) the spark context

# Use Arrow for Spark <-> pandas conversions (toPandas is called later on).
spark.conf.set('spark.sql.execution.arrow.pyspark.enabled', True)
# Render a Spark DataFrame as a table when it is the last expression of a cell.
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)



Traffic Data Source: https://data.cityofnewyork.us/Transportation/Traffic-Volume-Counts-2014-2019-/ertz-hr4r

API: https://data.cityofnewyork.us/resource/ertz-hr4r.json

## Generate Daily Traffic Data With Weekday Label

In [2]:
# Load the raw traffic-volume counts; the first CSV row supplies the column names.
# No schema is supplied and schema inference is not requested.
sdf_traffic = (
    spark.read
    .option("header", True)
    .csv("../raw_data/Traffic_Volume_Counts__2014-2019_.csv")
)

# Preview the first three rows.
sdf_traffic.limit(3)

ID,Segment ID,Roadway Name,From,To,Direction,Date,12:00-1:00 AM,1:00-2:00AM,2:00-3:00AM,3:00-4:00AM,4:00-5:00AM,5:00-6:00AM,6:00-7:00AM,7:00-8:00AM,8:00-9:00AM,9:00-10:00AM,10:00-11:00AM,11:00-12:00PM,12:00-1:00PM,1:00-2:00PM,2:00-3:00PM,3:00-4:00PM,4:00-5:00PM,5:00-6:00PM,6:00-7:00PM,7:00-8:00PM,8:00-9:00PM,9:00-10:00PM,10:00-11:00PM,11:00-12:00AM
2,70376,3 Avenue,East 154 Street,East 155 Street,NB,09/13/2014,204,177,133,126,141,134,121,180,223,272,386,339,513,506,520,611,573,546,582,528,432,328,282,240
2,70376,3 Avenue,East 155 Street,East 154 Street,SB,09/13/2014,140,51,128,116,144,146,153,219,226,273,317,325,403,414,379,376,329,362,418,335,282,247,237,191
56,176365,Bedford Park Boul...,Grand Concourse,Valentine Avenue,EB,09/13/2014,94,73,65,61,64,73,65,113,169,210,182,245,244,233,280,272,264,236,213,190,199,183,147,103


In [3]:



# Parse the "MM/dd/yyyy" text in "Date" into a date column named "date".
# (The frame displayed in the next cell shows a single lowercase `date` column,
# i.e. the parsed column takes the place of the original one.)
sdf_traffic = sdf_traffic.withColumn("date", to_date(col("Date"), "MM/dd/yyyy"))

# Keep only counts taken from 2018-01-01 through 2018-05-31, both ends inclusive.
sdf_traffic = sdf_traffic.filter(col("date").between("2018-01-01", "2018-05-31"))

# Collect to pandas and add the per-row total of the hourly columns.
# Positional layout relied on here: columns 0-5 are the identifier/location fields,
# column 6 is the date, and everything from column 7 onward is an hourly count.
traffic_df = sdf_traffic.toPandas()
traffic_df = traffic_df.assign(
    Volume=traffic_df.iloc[:, 7:].astype(int).sum(axis=1)
)

# Drop the identifier and hourly columns, leaving `date` and `Volume`, ordered by date.
cols = list(traffic_df.columns)
traffic_df = (
    traffic_df
    .drop(columns=cols[0:6] + cols[7:-1])
    .sort_values("date")
)

In [4]:
# Inspect the Spark frame after the date parsing and the 2018-01-01 to 2018-05-31
# filter applied in the previous cell.
sdf_traffic

ID,Segment ID,Roadway Name,From,To,Direction,date,12:00-1:00 AM,1:00-2:00AM,2:00-3:00AM,3:00-4:00AM,4:00-5:00AM,5:00-6:00AM,6:00-7:00AM,7:00-8:00AM,8:00-9:00AM,9:00-10:00AM,10:00-11:00AM,11:00-12:00PM,12:00-1:00PM,1:00-2:00PM,2:00-3:00PM,3:00-4:00PM,4:00-5:00PM,5:00-6:00PM,6:00-7:00PM,7:00-8:00PM,8:00-9:00PM,9:00-10:00PM,10:00-11:00PM,11:00-12:00AM
317,69696,River Avenue,East 150 Street,East 151 Street,NB,2018-01-20,88,62,41,27,37,30,46,58,147,174,211,251,311,323,310,362,309,319,296,258,203,177,153,127
317,69696,River Avenue,East 150 Street,East 151 Street,SB,2018-01-20,50,37,25,15,21,30,50,78,99,115,146,174,216,186,205,237,240,237,247,211,175,142,104,99
318,70607,East 163 Street,EAGLE Avenue,CAULDWELL Avenue,EB,2018-01-20,274,220,158,139,153,120,168,275,356,439,461,480,507,570,616,605,600,589,617,528,501,446,437,347
318,70607,East 163 Street,EAGLE Avenue,CAULDWELL Avenue,WB,2018-01-20,243,180,152,138,164,119,153,194,276,350,368,420,436,523,536,544,556,546,522,516,439,466,384,331
319,78940,East 174 Street,Hoe Avenue,Vyse Avenue,EB,2018-01-20,97,57,38,45,35,63,73,119,196,265,322,346,369,395,394,363,347,360,283,287,221,235,152,175
319,78940,East 174 Street,Hoe Avenue,Vyse Avenue,WB,2018-01-20,140,84,64,56,38,44,50,77,145,202,279,331,366,325,324,363,426,351,318,305,239,210,173,170
320,69836,East 149 Street,Park Avenue,Morris Avenue,EB,2018-01-20,196,96,66,46,40,55,77,207,307,341,395,433,431,522,515,538,554,498,560,434,449,356,334,279
320,69836,East 149 Street,Park Avenue,Morris Avenue,WB,2018-01-20,249,216,165,162,160,127,196,238,368,404,442,491,495,572,535,563,713,582,536,522,442,398,399,388
321,111203,Exterior Street/1...,3 Avenue,East 138 Street,SB,2018-01-20,833,605,431,374,427,579,751,884,952,983,1017,1112,1171,1266,1455,1464,1454,1419,1358,1275,1166,1057,1032,1038
322,151663,Woodhaven Bouleva...,97 Avenue,101 Avenue,NB,2018-01-20,488,325,246,219,266,255,359,480,667,797,940,998,1205,1230,1350,1351,1396,1241,1175,1225,1083,915,814,654


In [5]:
## Aggregate by Date
# Sum the volume over all rows sharing a date, convert `date` to a pandas datetime,
# and attach the three-letter day name (e.g. "Sat") as `DayOfWeek`.
traffic_daily = (
    traffic_df
    .groupby("date")
    .sum()
    .reset_index()
    .assign(date=lambda daily: pd.to_datetime(daily["date"], format='%Y-%m-%d'))
    .assign(DayOfWeek=lambda daily: daily["date"].dt.day_name().astype(str).str.slice(0, 3))
)
traffic_daily.head(3)

Unnamed: 0,date,Volume,DayOfWeek
0,2018-01-20,166016,Sat
1,2018-01-21,132531,Sun
2,2018-01-22,173544,Mon


In [6]:
fpath = '../preprocessed_data/traffic_daily.parquet'

# Create the output directory first: writing to a path whose parent directory does
# not exist fails, which would break a run on a fresh checkout.
makedirs(path.dirname(fpath), exist_ok=True)

# Persist the daily aggregate (date, Volume, DayOfWeek) as parquet.
traffic_daily.to_parquet(fpath)