In [26]:
!pip install tqdm




In [27]:
import os
import pandas as pd
import requests
import gzip
import shutil
from tqdm import tqdm  # For progress bar

# Define constants
BASE_URL = "https://github.com/DataTalksClub/nyc-tlc-data/releases/download/"

# Function to download files with progress bar
def download_csv(taxi, year, month):
    filename = f"{taxi}_tripdata_{year}-{month:02d}.csv.gz"
    url = f"{BASE_URL}{taxi}/{filename}"
    
    print(f"Downloading: {url}")
    
    # Stream the request with timeout
    try:
        response = requests.get(url, stream=True, timeout=15)
        response.raise_for_status()  # Raise an error for bad status codes

        # Get the total file size
        total_size = int(response.headers.get('content-length', 0))
        block_size = 1024  # 1 Kibibyte

        # Download with progress bar
        with open(filename, "wb") as file, tqdm(
            total=total_size, unit='iB', unit_scale=True, desc=filename
        ) as bar:
            for data in response.iter_content(block_size):
                bar.update(len(data))
                file.write(data)
        
        print(f"Downloaded: {filename}")
        return filename
    except requests.exceptions.Timeout:
        print("Download timed out!")
        return None
    except requests.exceptions.RequestException as e:
        print(f"Error downloading file: {e}")
        return None

# Function to extract .gz files using Python
def extract_gz(gz_file):
    if gz_file and os.path.exists(gz_file):
        csv_file = gz_file.replace(".gz", "")
        with gzip.open(gz_file, 'rb') as f_in:
            with open(csv_file, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
        print(f"Extracted: {csv_file}")
        return csv_file
    else:
        print(f"File not found: {gz_file}")
        return None

# Download & extract Yellow Taxi December 2020 dataset
yellow_dec_2020_file = download_csv("yellow", 2020, 12)
csv_file = extract_gz(yellow_dec_2020_file)

# Compute file size if extraction is successful
if csv_file:
    file_size = os.path.getsize(csv_file) / (1024 * 1024)  # Convert bytes to MB
    print(f"Uncompressed file size of {csv_file}: {file_size:.1f} MB")

# Function to load and count rows
def count_rows(taxi, year, months):
    total_rows = 0
    for month in months:
        file = download_csv(taxi, year, month)
        csv_file = extract_gz(file)
        if csv_file:
            df = pd.read_csv(csv_file)
            total_rows += df.shape[0]
    return total_rows

# Get row count for Yellow Taxi 2020
yellow_2020_rows = count_rows("yellow", 2020, list(range(1, 13)))
print(f"Total Yellow Taxi rows for 2020: {yellow_2020_rows}")

# Get row count for Green Taxi 2020
green_2020_rows = count_rows("green", 2020, list(range(1, 13)))
print(f"Total Green Taxi rows for 2020: {green_2020_rows}")

# Get row count for Yellow Taxi March 2021
yellow_march_2021_rows = count_rows("yellow", 2021, [3])
print(f"Yellow Taxi rows for March 2021: {yellow_march_2021_rows}")

# Verify file name rendering
rendered_file_name = f"green_tripdata_2020-04.csv"
print(f"Rendered file name: {rendered_file_name}")

# Verify timezone setting
timezone_setting = "America/New_York"
print(f"Correct timezone configuration: timezone = '{timezone_setting}'")


Downloading: https://github.com/DataTalksClub/nyc-tlc-data/releases/download/yellow/yellow_tripdata_2020-12.csv.gz


yellow_tripdata_2020-12.csv.gz: 100%|██████████| 26.5M/26.5M [00:02<00:00, 11.9MiB/s]


Downloaded: yellow_tripdata_2020-12.csv.gz
Extracted: yellow_tripdata_2020-12.csv
Uncompressed file size of yellow_tripdata_2020-12.csv: 128.3 MB
Downloading: https://github.com/DataTalksClub/nyc-tlc-data/releases/download/yellow/yellow_tripdata_2020-01.csv.gz


yellow_tripdata_2020-01.csv.gz: 100%|██████████| 116M/116M [00:03<00:00, 38.2MiB/s] 


Downloaded: yellow_tripdata_2020-01.csv.gz
Extracted: yellow_tripdata_2020-01.csv


  df = pd.read_csv(csv_file)


Downloading: https://github.com/DataTalksClub/nyc-tlc-data/releases/download/yellow/yellow_tripdata_2020-02.csv.gz


yellow_tripdata_2020-02.csv.gz: 100%|██████████| 115M/115M [00:05<00:00, 21.5MiB/s] 


Downloaded: yellow_tripdata_2020-02.csv.gz
Extracted: yellow_tripdata_2020-02.csv


  df = pd.read_csv(csv_file)


Downloading: https://github.com/DataTalksClub/nyc-tlc-data/releases/download/yellow/yellow_tripdata_2020-03.csv.gz


yellow_tripdata_2020-03.csv.gz: 100%|██████████| 55.0M/55.0M [00:04<00:00, 13.3MiB/s]


Downloaded: yellow_tripdata_2020-03.csv.gz
Extracted: yellow_tripdata_2020-03.csv


  df = pd.read_csv(csv_file)


Downloading: https://github.com/DataTalksClub/nyc-tlc-data/releases/download/yellow/yellow_tripdata_2020-04.csv.gz


yellow_tripdata_2020-04.csv.gz: 100%|██████████| 4.43M/4.43M [00:00<00:00, 7.64MiB/s]


Downloaded: yellow_tripdata_2020-04.csv.gz
Extracted: yellow_tripdata_2020-04.csv


  df = pd.read_csv(csv_file)


Downloading: https://github.com/DataTalksClub/nyc-tlc-data/releases/download/yellow/yellow_tripdata_2020-05.csv.gz


yellow_tripdata_2020-05.csv.gz: 100%|██████████| 6.53M/6.53M [00:00<00:00, 10.8MiB/s]


Downloaded: yellow_tripdata_2020-05.csv.gz
Extracted: yellow_tripdata_2020-05.csv


  df = pd.read_csv(csv_file)


Downloading: https://github.com/DataTalksClub/nyc-tlc-data/releases/download/yellow/yellow_tripdata_2020-06.csv.gz


yellow_tripdata_2020-06.csv.gz: 100%|██████████| 10.2M/10.2M [00:01<00:00, 7.88MiB/s]


Downloaded: yellow_tripdata_2020-06.csv.gz
Extracted: yellow_tripdata_2020-06.csv


  df = pd.read_csv(csv_file)


Downloading: https://github.com/DataTalksClub/nyc-tlc-data/releases/download/yellow/yellow_tripdata_2020-07.csv.gz


yellow_tripdata_2020-07.csv.gz: 100%|██████████| 14.7M/14.7M [00:00<00:00, 16.1MiB/s]


Downloaded: yellow_tripdata_2020-07.csv.gz
Extracted: yellow_tripdata_2020-07.csv


  df = pd.read_csv(csv_file)


Downloading: https://github.com/DataTalksClub/nyc-tlc-data/releases/download/yellow/yellow_tripdata_2020-08.csv.gz


yellow_tripdata_2020-08.csv.gz: 100%|██████████| 18.5M/18.5M [00:00<00:00, 20.4MiB/s]


Downloaded: yellow_tripdata_2020-08.csv.gz
Extracted: yellow_tripdata_2020-08.csv


  df = pd.read_csv(csv_file)


Downloading: https://github.com/DataTalksClub/nyc-tlc-data/releases/download/yellow/yellow_tripdata_2020-09.csv.gz


yellow_tripdata_2020-09.csv.gz: 100%|██████████| 24.5M/24.5M [00:00<00:00, 26.3MiB/s]


Downloaded: yellow_tripdata_2020-09.csv.gz
Extracted: yellow_tripdata_2020-09.csv


  df = pd.read_csv(csv_file)


Downloading: https://github.com/DataTalksClub/nyc-tlc-data/releases/download/yellow/yellow_tripdata_2020-10.csv.gz


yellow_tripdata_2020-10.csv.gz: 100%|██████████| 30.7M/30.7M [00:01<00:00, 15.4MiB/s]


Downloaded: yellow_tripdata_2020-10.csv.gz
Extracted: yellow_tripdata_2020-10.csv


  df = pd.read_csv(csv_file)


Downloading: https://github.com/DataTalksClub/nyc-tlc-data/releases/download/yellow/yellow_tripdata_2020-11.csv.gz


yellow_tripdata_2020-11.csv.gz: 100%|██████████| 27.5M/27.5M [00:01<00:00, 24.5MiB/s]


Downloaded: yellow_tripdata_2020-11.csv.gz
Extracted: yellow_tripdata_2020-11.csv


  df = pd.read_csv(csv_file)


Downloading: https://github.com/DataTalksClub/nyc-tlc-data/releases/download/yellow/yellow_tripdata_2020-12.csv.gz


yellow_tripdata_2020-12.csv.gz: 100%|██████████| 26.5M/26.5M [00:02<00:00, 12.5MiB/s]


Downloaded: yellow_tripdata_2020-12.csv.gz
Extracted: yellow_tripdata_2020-12.csv


  df = pd.read_csv(csv_file)


Total Yellow Taxi rows for 2020: 24648499
Downloading: https://github.com/DataTalksClub/nyc-tlc-data/releases/download/green/green_tripdata_2020-01.csv.gz


green_tripdata_2020-01.csv.gz: 100%|██████████| 7.54M/7.54M [00:00<00:00, 12.8MiB/s]


Downloaded: green_tripdata_2020-01.csv.gz
Extracted: green_tripdata_2020-01.csv


  df = pd.read_csv(csv_file)


Downloading: https://github.com/DataTalksClub/nyc-tlc-data/releases/download/green/green_tripdata_2020-02.csv.gz


green_tripdata_2020-02.csv.gz: 100%|██████████| 6.89M/6.89M [00:00<00:00, 9.23MiB/s]


Downloaded: green_tripdata_2020-02.csv.gz
Extracted: green_tripdata_2020-02.csv


  df = pd.read_csv(csv_file)


Downloading: https://github.com/DataTalksClub/nyc-tlc-data/releases/download/green/green_tripdata_2020-03.csv.gz


green_tripdata_2020-03.csv.gz: 100%|██████████| 3.92M/3.92M [00:00<00:00, 6.02MiB/s]


Downloaded: green_tripdata_2020-03.csv.gz
Extracted: green_tripdata_2020-03.csv


  df = pd.read_csv(csv_file)


Downloading: https://github.com/DataTalksClub/nyc-tlc-data/releases/download/green/green_tripdata_2020-04.csv.gz


green_tripdata_2020-04.csv.gz: 100%|██████████| 670k/670k [00:00<00:00, 2.31MiB/s]


Downloaded: green_tripdata_2020-04.csv.gz
Extracted: green_tripdata_2020-04.csv


  df = pd.read_csv(csv_file)


Downloading: https://github.com/DataTalksClub/nyc-tlc-data/releases/download/green/green_tripdata_2020-05.csv.gz


green_tripdata_2020-05.csv.gz: 100%|██████████| 1.06M/1.06M [00:00<00:00, 2.03MiB/s]


Downloaded: green_tripdata_2020-05.csv.gz
Extracted: green_tripdata_2020-05.csv


  df = pd.read_csv(csv_file)


Downloading: https://github.com/DataTalksClub/nyc-tlc-data/releases/download/green/green_tripdata_2020-06.csv.gz


green_tripdata_2020-06.csv.gz: 100%|██████████| 1.15M/1.15M [00:00<00:00, 3.08MiB/s]


Downloaded: green_tripdata_2020-06.csv.gz
Extracted: green_tripdata_2020-06.csv
Downloading: https://github.com/DataTalksClub/nyc-tlc-data/releases/download/green/green_tripdata_2020-07.csv.gz


green_tripdata_2020-07.csv.gz: 100%|██████████| 1.31M/1.31M [00:00<00:00, 3.39MiB/s]


Downloaded: green_tripdata_2020-07.csv.gz
Extracted: green_tripdata_2020-07.csv


  df = pd.read_csv(csv_file)


Downloading: https://github.com/DataTalksClub/nyc-tlc-data/releases/download/green/green_tripdata_2020-08.csv.gz


green_tripdata_2020-08.csv.gz: 100%|██████████| 1.47M/1.47M [00:00<00:00, 3.79MiB/s]


Downloaded: green_tripdata_2020-08.csv.gz
Extracted: green_tripdata_2020-08.csv


  df = pd.read_csv(csv_file)


Downloading: https://github.com/DataTalksClub/nyc-tlc-data/releases/download/green/green_tripdata_2020-09.csv.gz


green_tripdata_2020-09.csv.gz: 100%|██████████| 1.58M/1.58M [00:00<00:00, 3.33MiB/s]


Downloaded: green_tripdata_2020-09.csv.gz
Extracted: green_tripdata_2020-09.csv


  df = pd.read_csv(csv_file)


Downloading: https://github.com/DataTalksClub/nyc-tlc-data/releases/download/green/green_tripdata_2020-10.csv.gz
Error downloading file: ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))
File not found: None
Downloading: https://github.com/DataTalksClub/nyc-tlc-data/releases/download/green/green_tripdata_2020-11.csv.gz


green_tripdata_2020-11.csv.gz: 100%|██████████| 1.57M/1.57M [00:00<00:00, 3.76MiB/s]


Downloaded: green_tripdata_2020-11.csv.gz
Extracted: green_tripdata_2020-11.csv


  df = pd.read_csv(csv_file)


Downloading: https://github.com/DataTalksClub/nyc-tlc-data/releases/download/green/green_tripdata_2020-12.csv.gz


green_tripdata_2020-12.csv.gz: 100%|██████████| 1.47M/1.47M [00:00<00:00, 3.01MiB/s]


Downloaded: green_tripdata_2020-12.csv.gz
Extracted: green_tripdata_2020-12.csv


  df = pd.read_csv(csv_file)


Total Green Taxi rows for 2020: 1638931
Downloading: https://github.com/DataTalksClub/nyc-tlc-data/releases/download/yellow/yellow_tripdata_2021-03.csv.gz


yellow_tripdata_2021-03.csv.gz: 100%|██████████| 35.2M/35.2M [00:01<00:00, 24.9MiB/s]


Downloaded: yellow_tripdata_2021-03.csv.gz
Extracted: yellow_tripdata_2021-03.csv


  df = pd.read_csv(csv_file)


Yellow Taxi rows for March 2021: 1925152
Rendered file name: green_tripdata_2020-04.csv
Correct timezone configuration: timezone = 'America/New_York'
