# Data Retrieval / Download

In [1]:
from urllib.request import urlretrieve

## TLC Datasets

In [5]:
# Years of TLC trip data to fetch. Kept as strings because they are spliced
# directly into directory names and download URLs.
YEARS = (
    '2022',
    '2023',
)

# Calendar months, 1 through 12 inclusive.
MONTHS = range(1, 13)

# Vehicle categories to fetch; used verbatim in directory names and URLs.
VEHICLE_TYPES = (
    'yellow',
    'green',
    'fhvhv',
)

In [6]:
import os

# Base directory for the raw TLC downloads, relative to this notebook.
output_relative_dir = '../data/raw/tlc'

# exist_ok=True makes every call below idempotent, so no separate
# os.path.exists() check is needed before creating a directory.
os.makedirs(output_relative_dir, exist_ok=True)

# One directory per vehicle type and year.
# NOTE(review): no path separator is inserted between the base directory and
# the vehicle type, so these resolve to e.g. '../data/raw/tlcyellow/2022'
# rather than '../data/raw/tlc/yellow/2022'. The TLC download cell builds its
# target the same way (base + vehicle type + '/' + year), so the layout is
# left unchanged here to keep the two in sync - confirm whether it is intended.
for vehicle_type in VEHICLE_TYPES:  # not `type`: avoids shadowing the builtin
    for year in YEARS:
        os.makedirs(output_relative_dir + vehicle_type + '/' + year, exist_ok=True)

# The taxi-zone download cell writes to f"{output_relative_dir}/taxi_zones/...",
# so this directory has to be created with the separator included.
os.makedirs(output_relative_dir + '/taxi_zones', exist_ok=True)

# Destination of the motor vehicle collisions download.
os.makedirs('../data/raw/mv_collisions', exist_ok=True)

In [4]:
# Base URL of the TLC trip-data files, as of 07/2023. The file name
# '<vehicleType>_tripdata_<year>-<month>.parquet' is appended to it.
URL_TEMPLATE = 'https://d37ci6vzurychx.cloudfront.net/trip-data/'

In [5]:
# Download the monthly TLC trip-record parquet files, one per vehicle type,
# year and month, to
# '<output_relative_dir><vehicle type>/<year>/<year>-<month>.parquet'.
# NOTE(review): no separator is inserted between `output_relative_dir` and the
# vehicle type, so with '../data/raw/tlc' the files land under e.g.
# '../data/raw/tlcyellow/2022'. The location is left unchanged here; confirm
# whether it is intended.

# Last month to download for specific years; years not listed are downloaded
# for every month in MONTHS.
# NOTE(review): presumably later 2023 files were not yet published when this
# was written - confirm and extend as needed.
LAST_MONTH_BY_YEAR = {'2023': 4}

for vehicle_type in VEHICLE_TYPES:  # not `type`: keeps the builtin usable in later cells
    print(f"Begin {vehicle_type}")
    for year in YEARS:
        tlc_output_dir = output_relative_dir + vehicle_type + '/' + year
        # Make sure the target exists, so this cell does not depend on another
        # cell having created exactly this path.
        os.makedirs(tlc_output_dir, exist_ok=True)
        print(f"Begin year {year}")
        last_month = LAST_MONTH_BY_YEAR.get(year)
        for month in MONTHS:
            # MONTHS is ascending, so nothing after the cutoff needs visiting
            if last_month is not None and month > last_month:
                break
            # 0-fill i.e 1 -> 01, 2 -> 02, etc (separate name: the loop
            # variable stays an int)
            month_str = f"{month:02d}"
            print(f"Begin month {month_str}")

            # generate url
            url = f'{URL_TEMPLATE}{vehicle_type}_tripdata_{year}-{month_str}.parquet'

            # generate output location and filename
            output_dir = f"{tlc_output_dir}/{year}-{month_str}.parquet"
            # download
            urlretrieve(url, output_dir)

            print(f"Completed month {month_str}")
        print(f"Completed year {year}")
    print(f"Completed {vehicle_type}")
    print("##########################################################")

Begin yellow
Begin year 2022
Begin month 01
Completed month 01
Begin month 02
Completed month 02
Begin month 03
Completed month 03
Begin month 04
Completed month 04
Begin month 05
Completed month 05
Begin month 06
Completed month 06
Begin month 07
Completed month 07
Begin month 08
Completed month 08
Begin month 09
Completed month 09
Begin month 10
Completed month 10
Begin month 11
Completed month 11
Begin month 12
Completed month 12
Completed year 2022
Begin year 2023
Begin month 01
Completed month 01
Begin month 02
Completed month 02
Begin month 03
Completed month 03
Begin month 04
Completed month 04
Completed year 2023
Completed yellow
##########################################################
Begin green
Begin year 2022
Begin month 01
Completed month 01
Begin month 02
Completed month 02
Begin month 03
Completed month 03
Begin month 04
Completed month 04
Begin month 05
Completed month 05
Begin month 06
Completed month 06
Begin month 07
Completed month 07
Begin month 08
Completed mont

In [7]:
# download taxi zones csv and shapefile

import zipfile

# Every taxi-zone file goes into this directory. Make sure it exists before
# writing, so this cell does not depend on another cell having created
# exactly this path.
taxi_zones_dir = f"{output_relative_dir}/taxi_zones"
os.makedirs(taxi_zones_dir, exist_ok=True)

# zone lookup table (csv)
url = 'https://d37ci6vzurychx.cloudfront.net/misc/taxi+_zone_lookup.csv'
output_dir = f"{taxi_zones_dir}/taxi+_zone_lookup.csv"
urlretrieve(url, output_dir)

# zone shapefile (zip archive)
url = 'https://d37ci6vzurychx.cloudfront.net/misc/taxi_zones.zip'
output_dir = f"{taxi_zones_dir}/taxi_zones.zip"
urlretrieve(url, output_dir)

# unzip shapefile next to the downloaded archive
with zipfile.ZipFile(output_dir, 'r') as zip_ref:
    zip_ref.extractall(f"{taxi_zones_dir}/")



## External Datasets

### Motor Vehicle Collisions NYC

In [7]:
from urllib.request import urlretrieve

# Destination file; the following cell reads it back through `output_dir`.
output_dir = '../data/raw/mv_collisions/mv_collisions.csv'

# CSV download link for the collisions dataset on data.cityofnewyork.us.
url = 'https://data.cityofnewyork.us/api/views/h9gi-nx95/rows.csv?accessType=DOWNLOAD'

# Kept as the last expression so the returned (filename, headers) pair is
# shown as the cell output.
urlretrieve(url=url, filename=output_dir)





('../data/raw/mv_collisions/mv_collisions.csv',
 <http.client.HTTPMessage at 0x7f5c41786e90>)

In [15]:
import pandas as pd 

# Re-save the downloaded collisions CSV as parquet. `output_dir` must still
# hold the CSV path set by the download cell above. 'ZIP CODE' is read with
# dtype=object instead of letting pandas infer a type for it.
(
    pd.read_csv(output_dir, dtype={'ZIP CODE': object})
    .to_parquet('../data/raw/mv_collisions/mv_collisions.parquet')
)

#### Testing mv_collisions parquet file

In [16]:
from pyspark.sql import SparkSession

# Session options, applied to the builder in the order listed.
spark_options = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.sql.session.timeZone": "Etc/UTC",  # fix timestamps loaded by spark
}

# Create a spark session (which will run spark jobs)
spark_builder = SparkSession.builder.appName("MAST30034 Project 1")
for option_name, option_value in spark_options.items():
    spark_builder = spark_builder.config(option_name, option_value)
spark = spark_builder.getOrCreate()

your 131072x1 screen size is bogus. expect trouble
23/07/28 23:02:19 WARN Utils: Your hostname, DESKTOP-SATV84A resolves to a loopback address: 127.0.1.1; using 172.26.254.29 instead (on interface eth0)
23/07/28 23:02:19 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/07/28 23:02:21 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [17]:
# Read the parquet copy back through Spark to check that it loads.
mv_collisions_parquet_path = '../data/raw/mv_collisions/mv_collisions.parquet'
mv_collisions = spark.read.parquet(mv_collisions_parquet_path)



In [21]:
# First five 'ZIP CODE' values; the bare name on the last line is what the
# cell renders.
zip_code_preview = mv_collisions.select('ZIP CODE').limit(5)
zip_code_preview

ZIP CODE
""
""
""
11208.0
11233.0


### Bus Breakdowns / Delays NYC

In [30]:
from urllib.request import urlretrieve

# Destination for the bus breakdowns / delays CSV; the following cell reads it
# back through `output_dir`.
bus_breakdowns_dir = '../data/raw/bus_breakdowns_delays'
output_dir = f'{bus_breakdowns_dir}/bus_breakdowns_delays.csv'

# Create the directory the file is actually written to. Building it from
# `output_relative_dir` ('../data/raw/tlc') would give
# '../data/raw/tlcbus_breakdowns_delays', which is not where the download goes.
os.makedirs(bus_breakdowns_dir, exist_ok=True)

# CSV download link for the bus breakdowns dataset on data.cityofnewyork.us.
url = 'https://data.cityofnewyork.us/api/views/ez4e-fazm/rows.csv?accessType=DOWNLOAD'

# Kept as the last expression so the returned (filename, headers) pair is
# shown as the cell output.
urlretrieve(url, output_dir)

('../data/raw/bus_breakdowns_delays/bus_breakdowns_delays.csv',
 <http.client.HTTPMessage at 0x7f5b9e0dfe80>)

In [31]:
import pandas as pd 

# Re-save the downloaded bus breakdowns CSV as parquet. `output_dir` must
# still hold the CSV path set by the download cell above. 'Incident_Number'
# is read with dtype=object instead of letting pandas infer a type for it.
(
    pd.read_csv(output_dir, dtype={'Incident_Number': object})
    .to_parquet('../data/raw/bus_breakdowns_delays/bus_breakdowns_delays.parquet')
)

#### Testing bus_breakdowns parquet file

In [32]:
from pyspark.sql import SparkSession

# Session options, applied to the builder in the order listed.
spark_options = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.sql.session.timeZone": "Etc/UTC",  # fix timestamps loaded by spark
}

# Create a spark session (which will run spark jobs)
spark_builder = SparkSession.builder.appName("MAST30034 Project 1")
for option_name, option_value in spark_options.items():
    spark_builder = spark_builder.config(option_name, option_value)
spark = spark_builder.getOrCreate()

In [38]:
# Read the parquet copy back through Spark to check that it loads.
bus_breakdowns = spark.read.parquet('../data/raw/bus_breakdowns_delays/bus_breakdowns_delays.parquet')

# Only a cell's final expression is rendered automatically, so the preview has
# to be displayed explicitly; as a bare mid-cell expression it was discarded.
display(bus_breakdowns.limit(5))

# Row count, shown as the cell output.
bus_breakdowns.count()

596085