# Data Retrieval / Download

In [3]:
from urllib.request import urlretrieve

import os

output_relative_dir = '../data/landing/'

## TLC Datasets

In [4]:
YEARS = ['2022']

MONTHS = range(5, 12)

VEHICLE_TYPES = ['yellow']

In [5]:
import os

# navigate to the raw data directory
output_relative_dir = '../data/landing/tlc/'

# check if it exists as it makedir will raise an error if it does exist
if not os.path.exists(output_relative_dir):
    os.makedirs(output_relative_dir)
    
# now, for each type of data set we will need, we will create the paths
for type in VEHICLE_TYPES:
    for year in YEARS:
        if not os.path.exists(output_relative_dir + type):
            os.makedirs(output_relative_dir + type)

if not os.path.exists(output_relative_dir + 'taxi_zones'):
    os.makedirs(output_relative_dir + 'taxi_zones')

In [6]:
# this is the URL template as of 07/2023
URL_TEMPLATE = "https://d37ci6vzurychx.cloudfront.net/trip-data/"#vehicleType_tripdata_year-month.parquet

In [7]:
# data output relative directory is `../data/landing/`

# download tlc datasets
for type in VEHICLE_TYPES:
    print(f"Begin {type}")
    for year in YEARS:
        tlc_output_dir = output_relative_dir + type
        print(f"Begin year {year}")
        for month in MONTHS:
            # 0-fill i.e 1 -> 01, 2 -> 02, etc
            month = str(month).zfill(2) 
            print(f"Begin month {month}")
            
            # generate url
            url = f'{URL_TEMPLATE}{type}_tripdata_{year}-{month}.parquet'

            # generate output location and filename
            output_dir = f"{tlc_output_dir}/{year}-{month}.parquet"
            # download
            urlretrieve(url, output_dir) 
            
            print(f"Completed month {month}")
        print(f"Completed year {year}")
    print(f"Completed {type}")
    print(f"##########################################################")

Begin yellow
Begin year 2022
Begin month 05


Completed month 05
Begin month 06
Completed month 06
Begin month 07
Completed month 07
Begin month 08
Completed month 08
Begin month 09
Completed month 09
Begin month 10
Completed month 10
Begin month 11
Completed month 11
Completed year 2022
Completed yellow
##########################################################


In [8]:
# download taxi zones csv and shapefile

import zipfile

output_dir = f"{output_relative_dir}/taxi_zones/taxi+_zone_lookup.csv"

url = 'https://d37ci6vzurychx.cloudfront.net/misc/taxi+_zone_lookup.csv'

urlretrieve(url, output_dir) 

output_dir = f"{output_relative_dir}/taxi_zones/taxi_zones.zip"

url = 'https://d37ci6vzurychx.cloudfront.net/misc/taxi_zones.zip'

urlretrieve(url, output_dir) 


('../data/landing/tlc//taxi_zones/taxi_zones.zip',
 <http.client.HTTPMessage at 0x7faf08bda1a0>)

## External Datasets

In [9]:
output_relative_dir = '../data/landing/'

### Transit Subway Entrances and Exits

In [10]:
'''
from urllib.request import urlretrieve

if not os.path.exists(output_relative_dir + 'subway_entr_exit'):
    os.makedirs(output_relative_dir + 'subway_entr_exit')

url = 'https://data.ny.gov/resource/i9wp-a4ja.csv?$query=SELECT%0A%20%20median(%60division%60)%20AS%20%60division%60%2C%0A%20%20median(%60line%60)%20AS%20%60line%60%2C%0A%20%20%60station_location%60%0AGROUP%20BY%20%60station_location%60'

output_dir = '../data/landing/subway_entr_exit/subway_entr_exit.csv'

urlretrieve(url, output_dir) 
'''

# Download link doesn't work anymore

"\nfrom urllib.request import urlretrieve\n\nif not os.path.exists(output_relative_dir + 'subway_entr_exit'):\n    os.makedirs(output_relative_dir + 'subway_entr_exit')\n\nurl = 'https://data.ny.gov/resource/i9wp-a4ja.csv?$query=SELECT%0A%20%20median(%60division%60)%20AS%20%60division%60%2C%0A%20%20median(%60line%60)%20AS%20%60line%60%2C%0A%20%20%60station_location%60%0AGROUP%20BY%20%60station_location%60'\n\noutput_dir = '../data/landing/subway_entr_exit/subway_entr_exit.csv'\n\nurlretrieve(url, output_dir) \n"

In [11]:
import pandas as pd 

output_dir = '../data/landing/subway_entr_exit/subway_entr_exit.csv'

pd.read_csv(output_dir).to_parquet('../data/landing/subway_entr_exit/subway_entr_exit.parquet')


#### Testing subway_entr_exit parquet file

In [12]:
from pyspark.sql import SparkSession

# Create a spark session (which will run spark jobs)
spark = (
    SparkSession.builder.appName("MAST30034 Project 1")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC") # fix timestamps loaded by spark
    .getOrCreate()
)

your 131072x1 screen size is bogus. expect trouble
23/08/21 05:18:16 WARN Utils: Your hostname, DESKTOP-SATV84A resolves to a loopback address: 127.0.1.1; using 172.26.254.29 instead (on interface eth0)
23/08/21 05:18:16 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/08/21 05:18:18 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [13]:
subway_entr_exit = spark.read.parquet('../data/landing/subway_entr_exit/subway_entr_exit.parquet')


subway_entr_exit.limit(5)

subway_entr_exit.count()

472

### Hotel Properties Citywide

In [14]:
from urllib.request import urlretrieve

if not os.path.exists(output_relative_dir + 'hotels'):
    os.makedirs(output_relative_dir + 'hotels')

url = 'https://data.cityofnewyork.us/api/views/tjus-cn27/rows.csv?accessType=DOWNLOAD'

output_dir = '../data/landing/hotels/hotels.csv'

urlretrieve(url, output_dir) 

('../data/landing/hotels/hotels.csv',
 <http.client.HTTPMessage at 0x7faf08bd91b0>)

In [15]:
import pandas as pd 

pd.read_csv(output_dir).to_parquet('../data/landing/hotels/hotels.parquet')


#### Testing hotels parquet file

In [16]:
from pyspark.sql import SparkSession

# Create a spark session (which will run spark jobs)
spark = (
    SparkSession.builder.appName("MAST30034 Project 1")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC") # fix timestamps loaded by spark
    .getOrCreate()
)

In [17]:
hotels = spark.read.parquet('../data/landing/hotels/hotels.parquet')


hotels.limit(5)

hotels.count()

5519

### NYC Airbnb Data


In [18]:
from urllib.request import urlretrieve

if not os.path.exists(output_relative_dir + 'airbnb'):
    os.makedirs(output_relative_dir + 'airbnb')

url = 'http://data.insideairbnb.com/united-states/ny/new-york-city/2022-12-04/visualisations/listings.csv'

output_dir = '../data/landing/airbnb/airbnb.csv'

urlretrieve(url, output_dir) 

('../data/landing/airbnb/airbnb.csv',
 <http.client.HTTPMessage at 0x7faf00113340>)

In [19]:
import pandas as pd 

pd.read_csv(output_dir, dtype={"license": str}).to_parquet('../data/landing/airbnb/airbnb.parquet')




#### Testing airbnb parquet file

In [20]:
from pyspark.sql import SparkSession

# Create a spark session (which will run spark jobs)
spark = (
    SparkSession.builder.appName("MAST30034 Project 1")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC") # fix timestamps loaded by spark
    .getOrCreate()
)

In [21]:
airbnb = spark.read.parquet('../data/landing/airbnb/airbnb.parquet')


airbnb.limit(5)

airbnb.count()

41533

### Census Data


In [22]:
import zipfile

# unzip zip
with zipfile.ZipFile('../data/landing/census/census.zip', 'r') as zip_ref:
    zip_ref.extractall('../data/landing/census/')

In [23]:
import pandas as pd 

pd.read_csv('../data/landing/census/census_block_loc.csv').to_parquet('../data/landing/census/census_block_loc.parquet')

pd.read_csv('../data/landing/census/nyc_census_tracts.csv').to_parquet('../data/landing/census/census_tracts.parquet')




In [24]:
airbnb.schema

StructType([StructField('id', LongType(), True), StructField('name', StringType(), True), StructField('host_id', LongType(), True), StructField('host_name', StringType(), True), StructField('neighbourhood_group', StringType(), True), StructField('neighbourhood', StringType(), True), StructField('latitude', DoubleType(), True), StructField('longitude', DoubleType(), True), StructField('room_type', StringType(), True), StructField('price', LongType(), True), StructField('minimum_nights', LongType(), True), StructField('number_of_reviews', LongType(), True), StructField('last_review', StringType(), True), StructField('reviews_per_month', DoubleType(), True), StructField('calculated_host_listings_count', LongType(), True), StructField('availability_365', LongType(), True), StructField('number_of_reviews_ltm', LongType(), True), StructField('license', StringType(), True)])

#### Testing census parquet files

In [25]:
from pyspark.sql import SparkSession

# Create a spark session (which will run spark jobs)
spark = (
    SparkSession.builder.appName("MAST30034 Project 1")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC") # fix timestamps loaded by spark
    .getOrCreate()
)

In [26]:
census_block_loc = spark.read.parquet('../data/landing/census/census_block_loc.parquet')


census_block_loc.limit(5)

census_block_loc.count()

38396

In [27]:
census_tracts = spark.read.parquet('../data/landing/census/census_tracts.parquet')


census_tracts.limit(5)

census_tracts.count()

2167

### Parking Munimeters

In [28]:
from urllib.request import urlretrieve

if not os.path.exists(output_relative_dir + 'parking'):
    os.makedirs(output_relative_dir + 'parking')
    
url = 'https://data.cityofnewyork.us/api/geospatial/693u-uax6?accessType=DOWNLOAD&method=export&format=GeoJSON'

output_dir = '../data/landing/parking/parking.geojson'

urlretrieve(url, output_dir) 

url = 'https://data.cityofnewyork.us/api/views/693u-uax6/rows.csv?accessType=DOWNLOAD'

output_dir = '../data/landing/parking/parking.csv'

urlretrieve(url, output_dir) 


('../data/landing/parking/parking.csv',
 <http.client.HTTPMessage at 0x7faf0009a230>)

In [29]:
import pandas as pd 

pd.read_csv(output_dir).to_parquet('../data/landing/parking/parking.parquet')




#### Testing parking parquet file

In [30]:
from pyspark.sql import SparkSession

# Create a spark session (which will run spark jobs)
spark = (
    SparkSession.builder.appName("MAST30034 Project 1")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC") # fix timestamps loaded by spark
    .getOrCreate()
)

In [31]:
parking = spark.read.parquet('../data/landing/parking/parking.parquet')


parking.limit(5)

#parking.count()

ObjectID,Meter Number,Status,Pay By Cell Number,Meter_Hours,Parking_Facility_Name,Facility,Borough,On_Street,Side_of_Street,From_Street,To_Street,Latitude,Longitude,X,Y,Location
11750,1083076,Active,113091,2HR Pas Mon-Sat 0...,,On Street,Manhattan,West 85 Street,N,Amsterdam Avenue,Broadway,40.7875051180999,-73.9765950135923,990731.235713214,226187.112964138,POINT (-73.976595...
9857,1443404,Active,106172,3HR Com Mon-Fri 0...,,On Street,Manhattan,West 30 Street,N,Broadway,5 Avenue,40.7461997160805,-73.9869404709221,987868.647606388,211137.590897888,POINT (-73.986940...
11780,1193006,Active,113823,3HR Com Mon-Fri 0...,,On Street,Manhattan,Lexington Avenue,W,East 85 Street,East 84 Street,40.7784865293823,-73.9564275260737,996317.58740139,222903.46992515,POINT (-73.956427...
1111,4512493,Active,427918,6HR Pas Mon-Fri 0...,,On Street,Queens,Kissena Boulevard,W,Horace Harding Ex...,Melbourne Avenue,40.7354393855834,-73.8148380161902,1035564.53745064,207271.225271061,POINT (-73.814838...
10122,1443694,Active,106557,2HR Pas Mon-Sat 0...,,On Street,Manhattan,3 Avenue,W,East 24 Street,East 25 Street,40.7399045161829,-73.9825104553208,989096.612670138,208844.263135478,POINT (-73.982510...
