# Notebook 1: Extracting Data

In [1]:
# Importing required libraries and packages
import csv
import os
import zipfile
from os.path import getsize
from urllib.request import urlopen
from urllib.request import urlretrieve

import pandas as pd

### TLC Yellow and Green Taxi Data (2019 and 2020)

In [2]:
# Downloads requested taxi data
def taxi_data(output_dir,
              taxi_color,
              years,
              months=range(1, 13)):
    """Download monthly taxi trip CSV files into ``output_dir``.

    One file is fetched for every (year, month) combination. Each file is
    named ``<taxi_color>_tripdata_<year>-<MM>.csv`` and is saved under that
    same name inside ``output_dir``.

    Parameters
    ----------
    output_dir : str
        Directory the files are written to. Created if it does not exist.
    taxi_color : str
        File-name prefix, e.g. ``"yellow"`` or ``"green"``.
    years : iterable of int
        Years to download.
    months : iterable of int, optional
        Month numbers to download for each year. Defaults to all twelve
        months, which is what the function always did before this
        parameter existed.

    Returns
    -------
    None

    Raises
    ------
    Any exception raised by ``urlretrieve`` (e.g. on a failed request) is
    propagated unchanged.
    """
    base_url = "https://s3.amazonaws.com/nyc-tlc/trip+data"

    # urlretrieve opens the destination file directly, so a missing
    # directory would make the very first download fail.
    os.makedirs(output_dir, exist_ok=True)

    # Materialise once so a one-shot iterator can be reused for every year.
    months = list(months)

    # Iterates through the years requested
    for year in years:
        # Iterates through the requested months
        for m in months:
            # Month is zero-padded to two digits in the remote file name.
            out = f"{taxi_color}_tripdata_{year}-{str(m).zfill(2)}.csv"
            urlretrieve(f"{base_url}/{out}", os.path.join(output_dir, out))

In [None]:
# Folder that receives every raw download
output_dir = "../raw_data"
# Calendar years to fetch
years = [2019, 2020]

# Fetch the trip files for both fleets: yellow first, then green
for color in ("yellow", "green"):
    taxi_data(output_dir, color, years)

### TLC Taxi Zone Data

In [5]:
# Download the zone lookup file
url = "https://s3.amazonaws.com/nyc-tlc/misc/taxi+_zone_lookup.csv"
urlretrieve(url, f"{output_dir}/taxi+_zone_lookup.csv")

# Download the zone shape file
zip_url = "https://s3.amazonaws.com/nyc-tlc/misc/taxi_zones.zip"
zip_path = f"{output_dir}/taxi_zones.zip"
urlretrieve(zip_url, zip_path)

# Unzip the zone shape file into the same directory it was downloaded to.
# Both paths are derived from output_dir so that changing the output
# directory cannot leave the extraction step pointing at a different folder.
with zipfile.ZipFile(zip_path, 'r') as zf:
    zf.extractall(output_dir)

### Unemployment Data

In [6]:
# Refer to the README.md file for the dataset source

# Borough names become the row labels; the index name is written as the
# header of the first CSV column.
boroughs = pd.Index(['Bronx', 'Brooklyn',
                     'Manhattan', 'Queens',
                     'Staten Island'], name="Borough")

# One list of values per year, in the same borough order as above
rates_by_year = {
    "2019": [5.3, 4.0, 3.4, 3.4, 3.8],
    "2020": [16.0, 12.5, 9.5, 12.5, 10.6],
}

# Assemble the table in a single step, already indexed by borough
unemployment = pd.DataFrame(rates_by_year, index=boroughs)

# Save the table alongside the other raw data files
unemployment.to_csv("../raw_data/unemployment.csv")