# Data Retrieval / Download

In [3]:
from urllib.request import urlretrieve

In [23]:
# Years of trip data to download. Kept as strings because they are
# interpolated directly into the download URL and the output file name.
YEARS = ('2022', '2023')
# Calendar months as integers (1 = Jan, 2 = Feb, ..., 12 = Dec).
# Adjust the range bounds to download a different span of months.
MONTHS = range(1, 13)

# Vehicle types to download. Each value is used both as the file-name prefix
# in the download URL and as the name of a sub-directory of the output directory.
VEHICLE_TYPES = ('yellow', 'green', 'fhv', 'fhvhv')

In [6]:
import os

# Raw-data output directory, relative to the current working directory
# (`..` goes up one level). The trailing slash matters: sub-directory paths
# are built below by plain string concatenation.
output_relative_dir = '../data/raw/'

# `exist_ok=True` makes this safe to re-run: an existing directory is left
# untouched instead of raising, and there is no check-then-create race.
os.makedirs(output_relative_dir, exist_ok=True)

# now, for each type of data set we will need, create its own sub-directory
for target_dir in VEHICLE_TYPES:  # taxi_zones should already exist
    os.makedirs(output_relative_dir + target_dir, exist_ok=True)

In [8]:
# Base URL of the trip-data files (this is the URL layout as of 07/2023).
# The file name appended to it has the form:
#   <vehicleType>_tripdata_<year>-<month>.parquet
URL_TEMPLATE = "https://d37ci6vzurychx.cloudfront.net/trip-data/"

In [25]:
# Download one parquet file per (vehicle type, year, month) and save it as
# `<output_relative_dir><vehicle type>/<year>-<month>.parquet`.
# A failed download raises and stops the whole loop.

# NOTE: the loop variable is deliberately not called `type`; at notebook top
# level that name would shadow the builtin `type()` for every later cell.
for vehicle_type in VEHICLE_TYPES:
    tlc_output_dir = output_relative_dir + vehicle_type
    print(f"Begin {vehicle_type}")
    for year in YEARS:
        print(f"Begin year {year}")
        for month in MONTHS:
            # For 2023 only months 01-04 are requested (presumably the latest
            # data available when this was written -- confirm before extending).
            # MONTHS is ascending, so we can stop at the first month past the cut-off.
            if year == '2023' and month > 4:
                break
            # 0-fill i.e 1 -> 01, 2 -> 02, etc
            # (stored separately so the loop variable `month` stays an int)
            month_str = str(month).zfill(2)
            print(f"Begin month {month_str}")

            # generate url
            url = f'{URL_TEMPLATE}{vehicle_type}_tripdata_{year}-{month_str}.parquet'
            # generate output location and filename
            output_path = f"{tlc_output_dir}/{year}-{month_str}.parquet"
            # download (overwrites any existing file at output_path)
            urlretrieve(url, output_path)

            print(f"Completed month {month_str}")
        print(f"Completed year {year}")
    print(f"Completed {vehicle_type}")
    print("##########################################################")

Begin fhvhv
Begin year 2022
Begin month 01


Completed month 01
Begin month 02
Completed month 02
Begin month 03
Completed month 03
Begin month 04
Completed month 04
Begin month 05
Completed month 05
Begin month 06
Completed month 06
Begin month 07
Completed month 07
Begin month 08
Completed month 08
Begin month 09
Completed month 09
Begin month 10
Completed month 10
Begin month 11
Completed month 11
Begin month 12
Completed month 12
Completed year 2022
Begin year 2023
Begin month 01
Completed month 01
Begin month 02
Completed month 02
Begin month 03
Completed month 03
Begin month 04
Completed month 04
Completed year 2023
Completed fhvhv
##########################################################
Begin fhv
Begin year 2022
Begin month 01
Completed month 01
Begin month 02
Completed month 02
Begin month 03
Completed month 03
Begin month 04
Completed month 04
Begin month 05
Completed month 05
Begin month 06
Completed month 06
Begin month 07
Completed month 07
Begin month 08
Completed month 08
Begin month 09
Completed month 09
Begin mo