In [2]:
# Importing the libraries
from urllib.request import urlretrieve
import os
from zipfile import ZipFile

### Making data directories

In [2]:
# Code adapted from MAST30034 Tutorial 1
# Build the raw-data directory tree, relative to the notebook's working directory
# (one level up, then into data/raw/):
#   ../data/raw/raw_train/{yellow,FHVHV}
#   ../data/raw/raw_test/{yellow,FHVHV}
output_relative_dir = '../data/raw/'

# Define the directory names
main_dirs = ['raw_train', 'raw_test']
taxi_type_dir = ['yellow', 'FHVHV']

# Construct the path strings (the trailing '/' is kept because later cells
# append the taxi-type directory name directly to these)
final_output_train_dir = output_relative_dir + main_dirs[0] + '/'
final_output_test_dir = output_relative_dir + main_dirs[1] + '/'

# Create every <split>/<taxi type> leaf directory.
# os.makedirs also creates any missing parents (data/raw, raw_train, raw_test),
# and exist_ok=True makes re-running this cell a no-op instead of an error.
for main_dir in main_dirs:
    for sub_target_dir in taxi_type_dir:
        os.makedirs(output_relative_dir + main_dir + '/' + sub_target_dir, exist_ok=True)

 

### Downloading the raw training data

In [None]:
# Define the train data range
YEAR_TRAIN = '2021'
MONTHS_TRAIN_DATA = range(1, 13)

# Yellow and FHVHV URL prefixes; "_<year>-<month>.parquet" is appended per file,
# e.g. <prefix>_2021-01.parquet
URL_YELLOW_TAXI = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata"
URL_FHVHV = "https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata"

# Making the directory paths
# (final_output_*_dir and taxi_type_dir come from the directory-setup cell)
output_dir_raw_train_yellow = final_output_train_dir + taxi_type_dir[0]
output_dir_raw_train_FHVHV = final_output_train_dir + taxi_type_dir[1]
output_dir_raw_test_yellow = final_output_test_dir + taxi_type_dir[0]
output_dir_raw_test_FHVHV = final_output_test_dir + taxi_type_dir[1]

# Downloading raw train data: one pass per (URL prefix, destination directory) pair
for URL, train_dir in (
    (URL_YELLOW_TAXI, output_dir_raw_train_yellow),
    (URL_FHVHV, output_dir_raw_train_FHVHV),
):
    for month in MONTHS_TRAIN_DATA:
        # 0-fill, i.e. 1 -> 01, 2 -> 02, ..., 12 -> 12
        month = f"{month:02d}"

        # generate url
        url = f'{URL}_{YEAR_TRAIN}-{month}.parquet'

        # generate output location and filename (a file path, despite the name)
        output_dir = f"{train_dir}/{YEAR_TRAIN}-{month}.parquet"

        # download
        urlretrieve(url, output_dir)

### Downloading the raw testing data

In [None]:
# Define the test data range
YEAR_TEST = '2022'
MONTHS_TEST_DATA = range(1, 5)

# Making the directory paths
# (final_output_test_dir and taxi_type_dir come from the directory-setup cell)
output_dir_raw_test_yellow = final_output_test_dir + taxi_type_dir[0]
output_dir_raw_test_FHVHV = final_output_test_dir + taxi_type_dir[1]

# Downloading raw test data: one pass per (URL prefix, destination directory) pair.
# URL_YELLOW_TAXI and URL_FHVHV are defined in the training-data download cell.
for URL, test_dir in (
    (URL_YELLOW_TAXI, output_dir_raw_test_yellow),
    (URL_FHVHV, output_dir_raw_test_FHVHV),
):
    for month in MONTHS_TEST_DATA:
        # 0-fill, i.e. 1 -> 01, 2 -> 02, etc.
        month = f"{month:02d}"

        # generate url
        url = f'{URL}_{YEAR_TEST}-{month}.parquet'

        # generate output location and filename (a file path, despite the name)
        output_dir = f"{test_dir}/{YEAR_TEST}-{month}.parquet"

        # download
        urlretrieve(url, output_dir)

### Downloading the geospatial data

In [8]:
# Fetch the taxi-zone reference files: the zone lookup CSV and the zipped zone shapes

# Zone lookup table -> ../data/raw/nyc.csv
URL_LOOKUP = "https://d37ci6vzurychx.cloudfront.net/misc/taxi+_zone_lookup.csv"
output_csv = "../data/raw/nyc.csv"
urlretrieve(URL_LOOKUP, output_csv)

# Zipped taxi-zone archive -> ../data/raw/nyc.zip
# (left as the last expression so the cell still displays urlretrieve's return value)
URL_SHAPE = "https://d37ci6vzurychx.cloudfront.net/misc/taxi_zones.zip"
output_zip = "../data/raw/nyc.zip"
urlretrieve(URL_SHAPE, output_zip)

('../data/raw/nyc.zip', <http.client.HTTPMessage at 0x7fb150dcbeb0>)

In [10]:
# Extracting the zip file of the geospatial data

# specifying the zip file name
file_name = "../data/raw/nyc.zip"

# Opening the zip file in READ mode. The handle is named `zip_file` rather than
# `zip` so the built-in zip() is not shadowed for every later cell in the kernel.
with ZipFile(file_name, 'r') as zip_file:
    # extracting all the files
    zip_file.extractall(path="../data/raw/")

### Downloading the Covid-19 data

In [None]:
# Fetch the Covid-19 CSV export from health.data.ny.gov

# local destination file (a file path, despite the variable name)
output_dir = "../data/raw/covid_county.csv"

# remote CSV export endpoint
url_covid = "https://health.data.ny.gov/api/views/xdss-u53e/rows.csv?accessType=DOWNLOAD"

# download; left as the last expression so the cell displays urlretrieve's return value
urlretrieve(url_covid, output_dir)

In [3]:
# Downloading the geospatial data for the NYC's 5 boroughs

# Define the URL (Shapefile export, delivered as a zip archive)
borough_boundaries_url = "https://data.cityofnewyork.us/api/geospatial/tqmj-j8zm?method=export&format=Shapefile"

# generate output location and filename
output_dir = "../data/raw/borough_boundaries.zip"

# download
urlretrieve(borough_boundaries_url, output_dir)

# Extracting the zip file that was just downloaded
file_name = output_dir

# The handle is named `zip_file` rather than `zip` so the built-in zip() is not
# shadowed for every later cell in the kernel.
with ZipFile(file_name, 'r') as zip_file:
    # extracting all the files
    zip_file.extractall(path="../data/raw/")