# Data Retrieval / Download

In [2]:
from urllib.request import urlretrieve

import os

# Root of the landing area; all raw downloads in this notebook are written beneath it.
output_relative_dir = "../data/landing/"

## TLC Datasets

In [2]:
# Coverage of the TLC trip-record download: taxi services, years (kept as strings
# because they are interpolated into URLs and paths) and the month numbers 1..12.
VEHICLE_TYPES = ("yellow", "green")
YEARS = ("2021", "2022", "2023")
MONTHS = range(1, 13)

In [8]:
import os

# Root of the TLC landing area.
# NOTE: this rebinds the notebook-wide `output_relative_dir`; every later cell that
# reads it sees '../data/landing/tlc/' rather than '../data/landing/'.
output_relative_dir = '../data/landing/tlc/'

# `exist_ok=True` makes the cell safe to re-run (no error when the folder is already
# there) and replaces the separate exists()/makedirs() pairs.
os.makedirs(output_relative_dir, exist_ok=True)

# One folder per vehicle type and year. The loop variable is `vehicle_type`, not
# `type`, so the builtin `type` is not overwritten for the rest of the kernel.
for vehicle_type in VEHICLE_TYPES:
    for year in YEARS:
        os.makedirs(output_relative_dir + vehicle_type + '/' + year, exist_ok=True)

# Folder for the taxi-zone lookup table and shapefile.
os.makedirs(output_relative_dir + 'taxi_zones', exist_ok=True)

In [9]:
# Base URL of the TLC trip-record files (valid as of 07/2023). The file name appended
# to it follows the pattern vehicleType_tripdata_year-month.parquet
URL_TEMPLATE = "https://d37ci6vzurychx.cloudfront.net/trip-data/"

In [10]:
# Download the monthly TLC trip-record parquet files.
# Each file is written to
#   <output_relative_dir><vehicle type>/<year>/<year>-<month>.parquet
# where `output_relative_dir` is whatever value is in effect when this cell runs
# (the TLC folder-creation cell sets it to '../data/landing/tlc/').
for vehicle_type in VEHICLE_TYPES:  # not `type`: avoids clobbering the builtin
    print(f"Begin {vehicle_type}")
    for year in YEARS:
        tlc_output_dir = output_relative_dir + vehicle_type + '/' + year
        print(f"Begin year {year}")
        for month_number in MONTHS:
            if year == '2023' and month_number > 4:  # stop at april for 2023
                break
            if year == '2021' and month_number != 12:  # only grab december for 2021
                continue
            # 0-fill i.e 1 -> 01, 2 -> 02, etc
            month = str(month_number).zfill(2)
            print(f"Begin month {month}")

            # generate url
            url = f'{URL_TEMPLATE}{vehicle_type}_tripdata_{year}-{month}.parquet'

            # generate output location and filename (a file path, despite the name)
            output_dir = f"{tlc_output_dir}/{year}-{month}.parquet"
            # download
            urlretrieve(url, output_dir)

            print(f"Completed month {month}")
        print(f"Completed year {year}")
    print(f"Completed {vehicle_type}")
    print("##########################################################")

Begin yellow
Begin year 2021
Begin month 12
Completed month 12
Completed year 2021
Begin year 2022
Begin month 01
Completed month 01
Begin month 02
Completed month 02
Begin month 03
Completed month 03
Begin month 04
Completed month 04
Begin month 05
Completed month 05
Begin month 06
Completed month 06
Begin month 07
Completed month 07
Begin month 08
Completed month 08
Begin month 09
Completed month 09
Begin month 10
Completed month 10
Begin month 11
Completed month 11
Begin month 12
Completed month 12
Completed year 2022
Begin year 2023
Begin month 01
Completed month 01
Begin month 02
Completed month 02
Begin month 03
Completed month 03
Begin month 04
Completed month 04
Completed year 2023
Completed yellow
##########################################################
Begin green
Begin year 2021
Begin month 12
Completed month 12
Completed year 2021
Begin year 2022
Begin month 01
Completed month 01
Begin month 02
Completed month 02
Begin month 03
Completed month 03
Begin month 04
Completed 

In [7]:
# download taxi zones csv and shapefile

import zipfile

# Folder holding the taxi-zone files. os.path.join is used because
# `output_relative_dir` already ends in '/', so the former
# f"{output_relative_dir}/taxi_zones/..." produced paths with a doubled separator.
taxi_zones_dir = os.path.join(output_relative_dir, 'taxi_zones')
os.makedirs(taxi_zones_dir, exist_ok=True)  # no-op when the folder already exists

# zone lookup table (csv)
output_dir = os.path.join(taxi_zones_dir, 'taxi+_zone_lookup.csv')

url = 'https://d37ci6vzurychx.cloudfront.net/misc/taxi+_zone_lookup.csv'

urlretrieve(url, output_dir)

# zone shapefile (zip archive)
output_dir = os.path.join(taxi_zones_dir, 'taxi_zones.zip')

url = 'https://d37ci6vzurychx.cloudfront.net/misc/taxi_zones.zip'

urlretrieve(url, output_dir)

# unzip shapefile next to the archive
with zipfile.ZipFile(output_dir, 'r') as zip_ref:
    zip_ref.extractall(taxi_zones_dir)



## External Datasets

### Motor Vehicle Collisions NYC

In [40]:
from urllib.request import urlretrieve
import os

# Download the motor vehicle collisions csv export into its own landing folder.
url = 'https://data.cityofnewyork.us/api/views/h9gi-nx95/rows.csv?accessType=DOWNLOAD'

output_dir = '../data/landing/mv_collisions/mv_collisions.csv'  # target file path

# Derive the folder from the target path so the two cannot drift apart;
# exist_ok=True keeps the cell re-runnable.
os.makedirs(os.path.dirname(output_dir), exist_ok=True)

urlretrieve(url, output_dir)





('../data/landing/mv_collisions/mv_collisions.csv',
 <http.client.HTTPMessage at 0x7fb22822ce50>)

In [41]:
import pandas as pd 

# Convert the collisions csv to parquet. The 'ZIP CODE' column is read with dtype
# object instead of an inferred dtype.
# The input path is spelled out rather than taken from `output_dir`: that name is
# rebound by every download cell, so its value depends on which cell ran last.
mv_collisions_csv = '../data/landing/mv_collisions/mv_collisions.csv'

pd.read_csv(mv_collisions_csv, dtype={'ZIP CODE': object}).to_parquet(
    '../data/landing/mv_collisions/mv_collisions.parquet'
)

#### Testing mv_collisions parquet file

In [42]:
from pyspark.sql import SparkSession

# Build the Spark session (which will run spark jobs) one setting at a time;
# getOrCreate() hands back the session for the configured builder.
spark_builder = SparkSession.builder.appName("MAST30034 Project 1")
spark_builder = spark_builder.config("spark.sql.repl.eagerEval.enabled", True)
spark_builder = spark_builder.config("spark.sql.parquet.cacheMetadata", "true")
# fix timestamps loaded by spark
spark_builder = spark_builder.config("spark.sql.session.timeZone", "Etc/UTC")
spark = spark_builder.getOrCreate()

In [43]:
# Read the converted collisions parquet file back through Spark.
mv_collisions_parquet = '../data/landing/mv_collisions/mv_collisions.parquet'
mv_collisions = spark.read.parquet(mv_collisions_parquet)



In [44]:
# Only the final expression of a cell is rendered, so the schema has to be shown
# explicitly; as a bare mid-cell expression it was evaluated and then discarded.
display(mv_collisions.schema)

# row count (rendered as the cell output)
mv_collisions.count()

2013583

### Transit Subway Entrances and Exits

In [20]:
from urllib.request import urlretrieve

# Download the subway entrances/exits extract (csv).
url = 'https://data.ny.gov/resource/i9wp-a4ja.csv?$query=SELECT%0A%20%20median(%60division%60)%20AS%20%60division%60%2C%0A%20%20median(%60line%60)%20AS%20%60line%60%2C%0A%20%20%60station_location%60%0AGROUP%20BY%20%60station_location%60'

output_dir = '../data/landing/subway_entr_exit/subway_entr_exit.csv'  # target file path

# Create the folder the file is actually written to. The folder used to be built from
# `output_relative_dir`, which the TLC cell rebinds to '../data/landing/tlc/'; on a
# top-to-bottom run that created tlc/subway_entr_exit while the download below targets
# ../data/landing/subway_entr_exit, which would not exist.
os.makedirs(os.path.dirname(output_dir), exist_ok=True)

urlretrieve(url, output_dir)

('../data/landing/subway_entr_exit/subway_entr_exit.csv',
 <http.client.HTTPMessage at 0x7fb2285f4eb0>)

In [21]:
import pandas as pd 

# Convert the subway entrances/exits csv to parquet.
# The input path is spelled out rather than taken from `output_dir`: that name is
# rebound by every download cell, so its value depends on which cell ran last.
subway_entr_exit_csv = '../data/landing/subway_entr_exit/subway_entr_exit.csv'

pd.read_csv(subway_entr_exit_csv).to_parquet('../data/landing/subway_entr_exit/subway_entr_exit.parquet')


#### Testing subway_entr_exit parquet file

In [22]:
from pyspark.sql import SparkSession

# Build the Spark session (which will run spark jobs) one setting at a time;
# getOrCreate() hands back the session for the configured builder.
spark_builder = SparkSession.builder.appName("MAST30034 Project 1")
spark_builder = spark_builder.config("spark.sql.repl.eagerEval.enabled", True)
spark_builder = spark_builder.config("spark.sql.parquet.cacheMetadata", "true")
# fix timestamps loaded by spark
spark_builder = spark_builder.config("spark.sql.session.timeZone", "Etc/UTC")
spark = spark_builder.getOrCreate()

In [38]:
# Read the converted parquet file back through Spark as a sanity check.
subway_entr_exit = spark.read.parquet('../data/landing/subway_entr_exit/subway_entr_exit.parquet')

# Only the final expression of a cell is rendered, so the preview has to be shown
# explicitly; as a bare mid-cell expression `.limit(5)` was evaluated and discarded.
display(subway_entr_exit.limit(5))

# row count (rendered as the cell output)
subway_entr_exit.count()

472

### Transit Subway Hourly Ridership

In [14]:
from urllib.request import urlretrieve

# Download the subway hourly ridership data set in two formats (GeoJSON and csv).
# The folder is created from the literal target location: `output_relative_dir` is
# rebound to '../data/landing/tlc/' by the TLC cell, so building the folder from it
# left ../data/landing/subway_hourly missing on a top-to-bottom run.
os.makedirs('../data/landing/subway_hourly', exist_ok=True)

# GeoJSON export
url = 'https://data.ny.gov/api/geospatial/wujg-7c2s?accessType=DOWNLOAD&method=export&format=GeoJSON'

output_dir = '../data/landing/subway_hourly/subway_hourly.geojson'

urlretrieve(url, output_dir)

# csv export (`output_dir` is left pointing at this file)
url = 'https://data.ny.gov/api/views/wujg-7c2s/rows.csv?accessType=DOWNLOAD&sorting=true'

output_dir = '../data/landing/subway_hourly/subway_hourly.csv'

urlretrieve(url, output_dir)



('../data/landing/subway_hourly/subway_hourly.csv',
 <http.client.HTTPMessage at 0x7f7045dc8550>)

In [15]:
import pandas as pd 

# Convert the subway hourly ridership csv to parquet.
# The input path is spelled out rather than taken from `output_dir`: that name is
# rebound by every download cell, so its value depends on which cell ran last.
subway_hourly_csv = '../data/landing/subway_hourly/subway_hourly.csv'

pd.read_csv(subway_hourly_csv).to_parquet('../data/landing/subway_hourly/subway_hourly.parquet')


#### Testing subway_hourly parquet file

In [16]:
from pyspark.sql import SparkSession

# Build the Spark session (which will run spark jobs) one setting at a time;
# getOrCreate() hands back the session for the configured builder.
spark_builder = SparkSession.builder.appName("MAST30034 Project 1")
spark_builder = spark_builder.config("spark.sql.repl.eagerEval.enabled", True)
spark_builder = spark_builder.config("spark.sql.parquet.cacheMetadata", "true")
# fix timestamps loaded by spark
spark_builder = spark_builder.config("spark.sql.session.timeZone", "Etc/UTC")
spark = spark_builder.getOrCreate()

In [37]:
# Read the converted parquet file back through Spark as a sanity check.
subway_hourly = spark.read.parquet('../data/landing/subway_hourly/subway_hourly.parquet')

# Only the final expression of a cell is rendered, so the preview has to be shown
# explicitly; as a bare mid-cell expression `.limit(5)` was evaluated and discarded.
display(subway_hourly.limit(5))

# row count (rendered as the cell output)
subway_hourly.count()

5364324

### Transit Bus Hourly Ridership

In [45]:
from urllib.request import urlretrieve

# Download the bus hourly ridership csv export.
url = 'https://data.ny.gov/api/views/kv7t-n8in/rows.csv?accessType=DOWNLOAD&sorting=true'

output_dir = '../data/landing/bus_hourly/bus_hourly.csv'  # target file path

# Create the folder the file is actually written to. Building it from
# `output_relative_dir` (rebound to '../data/landing/tlc/' by the TLC cell) created
# tlc/bus_hourly instead, leaving the download target folder missing on a
# top-to-bottom run.
os.makedirs(os.path.dirname(output_dir), exist_ok=True)

urlretrieve(url, output_dir)

('../data/landing/bus_hourly/bus_hourly.csv',
 <http.client.HTTPMessage at 0x7fb22822e440>)

In [46]:
import pandas as pd 

# Convert the bus hourly ridership csv to parquet.
# The input path is spelled out rather than taken from `output_dir`: that name is
# rebound by every download cell, so its value depends on which cell ran last.
bus_hourly_csv = '../data/landing/bus_hourly/bus_hourly.csv'

pd.read_csv(bus_hourly_csv).to_parquet('../data/landing/bus_hourly/bus_hourly.parquet')


#### Testing bus_hourly parquet file

In [47]:
from pyspark.sql import SparkSession

# Build the Spark session (which will run spark jobs) one setting at a time;
# getOrCreate() hands back the session for the configured builder.
spark_builder = SparkSession.builder.appName("MAST30034 Project 1")
spark_builder = spark_builder.config("spark.sql.repl.eagerEval.enabled", True)
spark_builder = spark_builder.config("spark.sql.parquet.cacheMetadata", "true")
# fix timestamps loaded by spark
spark_builder = spark_builder.config("spark.sql.session.timeZone", "Etc/UTC")
spark = spark_builder.getOrCreate()

In [48]:
# Read the converted parquet file back through Spark as a sanity check.
bus_hourly = spark.read.parquet('../data/landing/bus_hourly/bus_hourly.parquet')

# Only the final expression of a cell is rendered, so the preview has to be shown
# explicitly; as a bare mid-cell expression `.limit(20)` was evaluated and discarded.
display(bus_hourly.limit(20))

# row count (rendered as the cell output)
bus_hourly.count()

3199937

### Hotel Properties Citywide

In [33]:
from urllib.request import urlretrieve

# Download the hotel properties csv export.
url = 'https://data.cityofnewyork.us/api/views/tjus-cn27/rows.csv?accessType=DOWNLOAD'

output_dir = '../data/landing/hotels/hotels.csv'  # target file path

# Create the folder the file is actually written to. Building it from
# `output_relative_dir` (rebound to '../data/landing/tlc/' by the TLC cell) created
# tlc/hotels instead, leaving the download target folder missing on a
# top-to-bottom run.
os.makedirs(os.path.dirname(output_dir), exist_ok=True)

urlretrieve(url, output_dir)

('../data/landing/hotels/hotels.csv',
 <http.client.HTTPMessage at 0x7fa0350df940>)

In [34]:
import pandas as pd 

# Convert the hotels csv to parquet.
# The input path is spelled out rather than taken from `output_dir`: that name is
# rebound by every download cell, so its value depends on which cell ran last.
hotels_csv = '../data/landing/hotels/hotels.csv'

pd.read_csv(hotels_csv).to_parquet('../data/landing/hotels/hotels.parquet')


#### Testing hotels parquet file

In [35]:
from pyspark.sql import SparkSession

# Build the Spark session (which will run spark jobs) one setting at a time;
# getOrCreate() hands back the session for the configured builder.
spark_builder = SparkSession.builder.appName("MAST30034 Project 1")
spark_builder = spark_builder.config("spark.sql.repl.eagerEval.enabled", True)
spark_builder = spark_builder.config("spark.sql.parquet.cacheMetadata", "true")
# fix timestamps loaded by spark
spark_builder = spark_builder.config("spark.sql.session.timeZone", "Etc/UTC")
spark = spark_builder.getOrCreate()

In [36]:
# Read the converted parquet file back through Spark as a sanity check.
hotels = spark.read.parquet('../data/landing/hotels/hotels.parquet')

# Only the final expression of a cell is rendered, so the preview has to be shown
# explicitly; as a bare mid-cell expression `.limit(5)` was evaluated and discarded.
display(hotels.limit(5))

# row count (rendered as the cell output)
hotels.count()

5519

### NYC Airbnb Data


In [14]:
from urllib.request import urlretrieve

# Download the Airbnb listings csv (2022-12-04 snapshot, per the URL).
# NOTE(review): this is fetched over plain http; confirm whether an https endpoint exists.
url = 'http://data.insideairbnb.com/united-states/ny/new-york-city/2022-12-04/visualisations/listings.csv'

output_dir = '../data/landing/airbnb/airbnb.csv'  # target file path

# Create the folder the file is actually written to. Building it from
# `output_relative_dir` (rebound to '../data/landing/tlc/' by the TLC cell) created
# tlc/airbnb instead, leaving the download target folder missing on a
# top-to-bottom run.
os.makedirs(os.path.dirname(output_dir), exist_ok=True)

urlretrieve(url, output_dir)

('../data/landing/airbnb/airbnb.csv',
 <http.client.HTTPMessage at 0x7fa036f80310>)

In [25]:
import pandas as pd 

# Convert the Airbnb listings csv to parquet; the "license" column is read as str
# instead of an inferred dtype.
# The input path is spelled out rather than taken from `output_dir`: that name is
# rebound by every download cell, so its value depends on which cell ran last.
airbnb_csv = '../data/landing/airbnb/airbnb.csv'

pd.read_csv(airbnb_csv, dtype={"license": str}).to_parquet('../data/landing/airbnb/airbnb.parquet')




In [26]:
# `airbnb` is only assigned further down in this section, so on a fresh
# top-to-bottom run a bare `airbnb.schema` here raised NameError. Load the parquet
# file written by the previous cell before inspecting its schema.
# (Relies on the `spark` session created in the earlier sections.)
airbnb = spark.read.parquet('../data/landing/airbnb/airbnb.parquet')

airbnb.schema

StructType([StructField('id', LongType(), True), StructField('name', StringType(), True), StructField('host_id', LongType(), True), StructField('host_name', StringType(), True), StructField('neighbourhood_group', StringType(), True), StructField('neighbourhood', StringType(), True), StructField('latitude', DoubleType(), True), StructField('longitude', DoubleType(), True), StructField('room_type', StringType(), True), StructField('price', LongType(), True), StructField('minimum_nights', LongType(), True), StructField('number_of_reviews', LongType(), True), StructField('last_review', StringType(), True), StructField('reviews_per_month', DoubleType(), True), StructField('calculated_host_listings_count', LongType(), True), StructField('availability_365', LongType(), True), StructField('number_of_reviews_ltm', LongType(), True), StructField('license', StringType(), True)])

#### Testing airbnb parquet file

In [27]:
from pyspark.sql import SparkSession

# Build the Spark session (which will run spark jobs) one setting at a time;
# getOrCreate() hands back the session for the configured builder.
spark_builder = SparkSession.builder.appName("MAST30034 Project 1")
spark_builder = spark_builder.config("spark.sql.repl.eagerEval.enabled", True)
spark_builder = spark_builder.config("spark.sql.parquet.cacheMetadata", "true")
# fix timestamps loaded by spark
spark_builder = spark_builder.config("spark.sql.session.timeZone", "Etc/UTC")
spark = spark_builder.getOrCreate()

In [31]:
# Read the converted parquet file back through Spark as a sanity check.
airbnb = spark.read.parquet('../data/landing/airbnb/airbnb.parquet')

# Only the final expression of a cell is rendered, so the preview has to be shown
# explicitly; as a bare mid-cell expression `.limit(5)` was evaluated and discarded.
display(airbnb.limit(5))

# row count (rendered as the cell output)
airbnb.count()

41533

### Census Data


In [43]:
import zipfile

# Unpack the census archive into the census landing folder.
census_zip = '../data/landing/census/census.zip'

# No cell visible in this notebook downloads census.zip, so a missing archive is
# reported with an actionable message instead of a bare path error.
if not os.path.exists(census_zip):
    raise FileNotFoundError(
        f"{census_zip} not found: this notebook does not download it; "
        "place the census archive there before running this cell"
    )

# unzip zip
with zipfile.ZipFile(census_zip, 'r') as zip_ref:
    zip_ref.extractall('../data/landing/census/')

In [41]:
import pandas as pd 

# Convert each extracted census csv to parquet: (source csv, destination parquet).
census_conversions = (
    ('../data/landing/census/census_block_loc.csv', '../data/landing/census/census_block_loc.parquet'),
    ('../data/landing/census/nyc_census_tracts.csv', '../data/landing/census/census_tracts.parquet'),
)

for census_csv, census_parquet in census_conversions:
    pd.read_csv(census_csv).to_parquet(census_parquet)




In [None]:
# NOTE(review): this inspects the Airbnb frame inside the Census section and looks
# like a leftover copied from the Airbnb section; confirm and remove if unintended.
airbnb.schema

StructType([StructField('id', LongType(), True), StructField('name', StringType(), True), StructField('host_id', LongType(), True), StructField('host_name', StringType(), True), StructField('neighbourhood_group', StringType(), True), StructField('neighbourhood', StringType(), True), StructField('latitude', DoubleType(), True), StructField('longitude', DoubleType(), True), StructField('room_type', StringType(), True), StructField('price', LongType(), True), StructField('minimum_nights', LongType(), True), StructField('number_of_reviews', LongType(), True), StructField('last_review', StringType(), True), StructField('reviews_per_month', DoubleType(), True), StructField('calculated_host_listings_count', LongType(), True), StructField('availability_365', LongType(), True), StructField('number_of_reviews_ltm', LongType(), True), StructField('license', StringType(), True)])

#### Testing census parquet files

In [None]:
from pyspark.sql import SparkSession

# Build the Spark session (which will run spark jobs) one setting at a time;
# getOrCreate() hands back the session for the configured builder.
spark_builder = SparkSession.builder.appName("MAST30034 Project 1")
spark_builder = spark_builder.config("spark.sql.repl.eagerEval.enabled", True)
spark_builder = spark_builder.config("spark.sql.parquet.cacheMetadata", "true")
# fix timestamps loaded by spark
spark_builder = spark_builder.config("spark.sql.session.timeZone", "Etc/UTC")
spark = spark_builder.getOrCreate()

In [42]:
# Read the converted parquet file back through Spark as a sanity check.
census_block_loc = spark.read.parquet('../data/landing/census/census_block_loc.parquet')

# Only the final expression of a cell is rendered, so the preview has to be shown
# explicitly; as a bare mid-cell expression `.limit(5)` was evaluated and discarded.
display(census_block_loc.limit(5))

# row count (rendered as the cell output)
census_block_loc.count()

38396

In [None]:
# Read the converted parquet file back through Spark as a sanity check.
census_tracts = spark.read.parquet('../data/landing/census/census_tracts.parquet')

# Only the final expression of a cell is rendered, so the preview has to be shown
# explicitly; as a bare mid-cell expression `.limit(5)` was evaluated and discarded.
display(census_tracts.limit(5))

# row count (rendered as the cell output)
census_tracts.count()

41533