In [1]:
#!/usr/bin/env python3
"""
Downloader Template - Functional Version
"""

# import libraries for use
import urllib.request
import urllib.error
import zipfile
import json
import os
import sys
from pathlib import Path
from datetime import datetime, timedelta
import argparse  

In [2]:
# setup project directories and return paths 

def setup_directories(project_root=None):
    """Ensure the ``data/raw`` and ``data/external`` folders exist.

    Args:
        project_root: Base folder (str or Path). ``None`` means the current
            working directory.

    Returns:
        tuple: ``(raw_data_dir, external_data_dir)`` as ``Path`` objects.
    """
    base = Path.cwd() if project_root is None else Path(project_root)
    data_root = base / "data"

    # Build both targets first, then create them (parents included); folders
    # that already exist are left untouched.
    targets = tuple(data_root / leaf for leaf in ("raw", "external"))
    for folder in targets:
        folder.mkdir(parents=True, exist_ok=True)

    return targets



In [3]:
# Create data/raw and data/external under the current working directory
# (project_root=None falls back to Path.cwd()) and show the two resulting paths.

setup_directories(project_root=None)

(PosixPath('/Users/jordan/Documents/GitHub/citibike-data-analysis/data/raw'),
 PosixPath('/Users/jordan/Documents/GitHub/citibike-data-analysis/data/external'))

In [4]:
# actual data download 

def download_file(year_month, raw_data_dir):
    """Download the trip-data zip for a specific year-month.

    The archive is streamed to a temporary ``<filename>.part`` file and only
    renamed to its final name once the transfer has finished. A failed or
    interrupted download therefore never leaves a truncated zip behind under
    the real filename, and a previously downloaded copy is not clobbered by a
    failed retry.

    Args:
        year_month (str): Month identifier used as the filename prefix,
            e.g. ``"202401"``.
        raw_data_dir (str | Path): Existing directory to save the zip into.

    Returns:
        Path | None: Path of the downloaded zip, or ``None`` when the download
        failed (the reason is printed).
    """
    filename = f"{year_month}-citibike-tripdata.csv.zip"  # modify this line with filenames based on ingest
    url = f"https://s3.amazonaws.com/tripdata/{filename}"  # modify this line with correct URL
    filepath = Path(raw_data_dir) / filename
    partial_path = Path(str(filepath) + ".part")

    print(f"Downloading {filename}...")

    # this is a nice to have -- updates as the files download
    class ProgressHook:
        """Report hook for urlretrieve that prints progress in ~5% steps."""

        def __init__(self):
            self.last_percent = 0

        def __call__(self, block_num, block_size, total_size):
            if total_size <= 0:
                return  # total size not reported, so no percentage can be computed
            downloaded = block_num * block_size
            percent = min(100, (downloaded / total_size) * 100)
            # Without this the 5% throttle can swallow the last update and the
            # display would stop short of 100% (e.g. at 95.2%).
            reached_end = percent >= 100 and self.last_percent < 100
            if reached_end or percent - self.last_percent >= 5:  # Update every 5%
                print(f"\rProgress: {percent:.1f}%", end="", flush=True)
                self.last_percent = percent

    try:
        urllib.request.urlretrieve(url, partial_path, reporthook=ProgressHook())
        # Publish under the real name only after a complete transfer.
        partial_path.replace(filepath)
        file_size = filepath.stat().st_size
        print(f"\n✓ Downloaded {filename} ({file_size:,} bytes)")
        return filepath

    except urllib.error.HTTPError as e:
        if e.code == 404:
            print(f"\n✗ File not found - {filename} may not exist for this month")
        else:
            print(f"\n✗ HTTP Error {e.code}: {e.reason}")
        return None
    except urllib.error.URLError as e:
        print(f"\n✗ URL Error: {e.reason}")
        return None
    except Exception as e:
        print(f"\n✗ Failed to download {filename}: {e}")
        return None
    finally:
        # Runs on every exit path, including Ctrl-C. After a successful rename
        # the partial file is already gone and this is a no-op.
        try:
            partial_path.unlink()
        except OSError:
            pass



In [5]:
# extract zip files
def extract_zip(zip_path, raw_data_dir):
    """Extract the first data CSV from a zip archive.

    Members that only look like CSVs are skipped: anything inside a
    ``__MACOSX`` folder and any hidden file whose base name starts with a dot
    (e.g. ``._trips.csv``). The ``.csv`` extension is matched
    case-insensitively.

    Args:
        zip_path (str | Path): Archive to read.
        raw_data_dir (str | Path): Directory to extract into.

    Returns:
        Path | None: Path of the extracted CSV, or ``None`` when the archive
        has no data CSV or cannot be read (the reason is printed).
    """
    def is_data_csv(member_name):
        # Zip member names always use "/" as the separator.
        parts = member_name.split("/")
        base_name = parts[-1]
        return (
            base_name.lower().endswith(".csv")
            and not base_name.startswith(".")
            and "__MACOSX" not in parts
        )

    try:
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            # get the CSV filename from the zip
            csv_files = [name for name in zip_ref.namelist() if is_data_csv(name)]
            if not csv_files:
                print("✗ No CSV file found in zip")
                return None

            csv_filename = csv_files[0]
            # extract() returns the path it actually wrote, which can differ
            # from raw_data_dir / member name when zipfile sanitises the name.
            csv_path = Path(zip_ref.extract(csv_filename, raw_data_dir))
            print(f"✓ Extracted {csv_filename}")

            skipped = len(csv_files) - 1
            if skipped:
                # Only the first CSV is extracted; say so instead of dropping
                # the others silently.
                print(f"  Note: {skipped} additional CSV file(s) in the archive were not extracted")
            return csv_path
    except Exception as e:
        print(f"✗ Failed to extract {zip_path}: {e}")
        return None



In [6]:
def get_recent_months(num_months=3, reference_date=None):
    """Get recent month strings in YYYYMM format.

    Args:
        num_months (int): How many months to return, counting backwards from
            and including the reference month. Zero or a negative number
            gives an empty list.
        reference_date (date | datetime | None): Any object with ``year`` and
            ``month`` attributes that marks the most recent month. Defaults to
            ``datetime.now()``. Pass a fixed date to get reproducible results
            or to fetch an older window of months.

    Returns:
        list[str]: Month strings in chronological order (oldest first).
    """
    if reference_date is None:
        reference_date = datetime.now()

    # Put months on one linear axis (year * 12 + zero-based month) so stepping
    # back across any number of year boundaries is plain subtraction.
    latest_index = reference_date.year * 12 + (reference_date.month - 1)

    months = []
    for offset in range(num_months - 1, -1, -1):  # largest offset = oldest month
        year, month_zero_based = divmod(latest_index - offset, 12)
        months.append(f"{year:04d}{month_zero_based + 1:02d}")

    return months



In [7]:
# create a summary of downloaded data -- not essential,  
# but helpful when there's a lot of files 

def create_summary(downloaded_files, external_data_dir):
    """Create a summary of downloaded data and save it as JSON.

    For every existing file the summary records its name and size. For CSV
    files (extension matched case-insensitively) it also records the number
    of data rows and the first five column names.

    Args:
        downloaded_files (iterable): Paths (str or Path) to summarise.
            ``None`` or empty entries and paths that do not exist are skipped.
        external_data_dir (str | Path): Existing directory that receives
            ``download_summary.json``.

    Returns:
        dict: The summary that was written, with keys ``download_timestamp``
        and ``files``.
    """
    summary = {
        "download_timestamp": datetime.now().isoformat(),
        "files": []
    }

    for file_path in downloaded_files:
        if not file_path:
            continue
        file_path = Path(file_path)
        if not file_path.exists():
            continue

        file_size = file_path.stat().st_size

        # if it's a CSV, try to get basic info -- not essential, but nice to have
        # remove this if the files are big or you're paying for compute
        if file_path.suffix.lower() == '.csv':
            try:
                # One pass over the file. utf-8-sig drops a leading BOM so it
                # does not end up in the first column name; errors="replace"
                # keeps odd bytes from aborting the count.
                with open(file_path, 'r', encoding='utf-8-sig', errors='replace') as f:
                    first_line = f.readline().strip()
                    header = first_line.split(',') if first_line else []
                    # The header is already consumed, so this counts data rows
                    # only and an empty file gives 0 rather than -1.
                    line_count = sum(1 for _ in f)
            except (OSError, UnicodeError):
                line_count = "unknown"
                header = []
        else:
            line_count = "N/A (zip file)"
            header = []

        summary["files"].append({
            "filename": file_path.name,
            "size_mb": round(file_size / (1024 * 1024), 2),
            "rows": line_count,
            "columns": header[:5] if header else []  # First 5 columns only
        })

    # Save summary
    summary_path = Path(external_data_dir) / "download_summary.json"
    with open(summary_path, 'w', encoding='utf-8') as f:
        json.dump(summary, f, indent=2)

    print(f"\n📋 Summary saved to {summary_path}")
    return summary



In [8]:
# download recent X months of data 

def download_recent_data(num_months=3, extract=True, raw_data_dir=None):  # change num_months if needed
    """Fetch the zip for each of the most recent months, optionally unpacking it.

    Args:
        num_months (int): Number of months passed to ``get_recent_months``.
        extract (bool): When true, each downloaded zip is also extracted.
        raw_data_dir: Target directory; ``None`` uses the raw-data directory
            returned by ``setup_directories()``.

    Returns:
        tuple: ``(downloaded_files, extracted_files)``, two lists holding the
        zip paths and the CSV paths that were obtained. Months that failed to
        download or extract are left out.
    """
    target_dir = setup_directories()[0] if raw_data_dir is None else raw_data_dir

    month_ids = get_recent_months(num_months)
    print(f"📅 Downloading {num_months} months: {', '.join(month_ids)}")

    zips, csvs = [], []
    for month_id in month_ids:
        print(f"\n--- Processing {month_id} ---")

        archive = download_file(month_id, target_dir)
        if not archive:
            continue  # download failed; move on to the next month
        zips.append(archive)

        if not extract:
            continue  # caller asked for zip files only
        unpacked = extract_zip(archive, target_dir)
        if unpacked:
            csvs.append(unpacked)

    return zips, csvs



In [9]:
# notebook-friendly entry point: download one specific month, or the most recent N months


def download_data(month=None, recent=12, no_extract=False, summary=False):  # change recent for number of months to download
    """
    Download trip data without going through argparse.

    Either a single month is fetched (when ``month`` is given) or the most
    recent ``recent`` months are fetched.

    Args:
        month (str): Specific month in YYYYMM format
        recent (int): Number of recent months to download
        no_extract (bool): Keep zip files only
        summary (bool): Create data summary

    Returns:
        list: Paths of every zip and CSV obtained (zips first, then CSVs, in
        the recent-months case).
    """
    raw_data_dir, external_data_dir = setup_directories()
    want_extract = not no_extract

    if not month:
        # No explicit month: fall back to the most recent ones.
        zips, csvs = download_recent_data(
            recent,
            extract=want_extract,
            raw_data_dir=raw_data_dir,
        )
        files = zips + csvs
    else:
        print(f"📥 Downloading data for {month}")
        files = []
        zip_path = download_file(month, raw_data_dir)
        if zip_path:
            files.append(zip_path)
            if want_extract:
                csv_path = extract_zip(zip_path, raw_data_dir)
                if csv_path:
                    files.append(csv_path)

    # A summary is only written when something was actually obtained.
    if summary and files:
        create_summary(files, external_data_dir)

    print(f"\n🎉 Download complete!")
    print(f"📁 Data location: {raw_data_dir}")
    return files

In [10]:
# command-line entry point; the Jupyter check inside avoids the argparse error you get when main() is called from a notebook
def main():
    """Command-line entry point; inside Jupyter/IPython it only prints a usage hint."""
    # A loaded ipykernel or IPython module is taken as the sign that we are in
    # a notebook session, where download_data() should be called directly.
    in_notebook = any(name in sys.modules for name in ('ipykernel', 'IPython'))
    if in_notebook:
        hints = (
            "Running in Jupyter/IPython environment.",
            "Use the download_data() function directly instead of main().",
            "Example: download_data(month='202401', summary=True)",
        )
        for hint in hints:
            print(hint)
        return

    cli = argparse.ArgumentParser(description='Download Citi Bike trip data')
    option_specs = (
        ('--month', dict(type=str, help='Specific month (YYYYMM format)')),
        ('--recent', dict(type=int, default=3, help='Number of recent months')),
        ('--no-extract', dict(action='store_true', help='Keep zip files only')),
        ('--summary', dict(action='store_true', help='Create data summary')),
    )
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)

    options = cli.parse_args()

    # The parsed namespace holds exactly month, recent, no_extract and summary,
    # which are the keyword arguments download_data() takes.
    download_data(**vars(options))

# Entry-point guard for running this file as a script. In a notebook cell
# __name__ is "__main__" as well, so main() also runs there and prints its
# usage hint (see the cell output below).
if __name__ == "__main__":
    main()

Running in Jupyter/IPython environment.
Use the download_data() function directly instead of main().
Example: download_data(month='202401', summary=True)


In [11]:
# Example run: download the 202401 zip, extract its CSV (no_extract defaults
# to False) and write download_summary.json to data/external.
download_data(month='202401', summary=True)

📥 Downloading data for 202401
Downloading 202401-citibike-tripdata.csv.zip...
Progress: 95.2%
✓ Downloaded 202401-citibike-tripdata.csv.zip (84,234,943 bytes)
✓ Extracted 202401-citibike-tripdata.csv

📋 Summary saved to /Users/jordan/Documents/GitHub/citibike-data-analysis/data/external/download_summary.json

🎉 Download complete!
📁 Data location: /Users/jordan/Documents/GitHub/citibike-data-analysis/data/raw


[PosixPath('/Users/jordan/Documents/GitHub/citibike-data-analysis/data/raw/202401-citibike-tripdata.csv.zip'),
 PosixPath('/Users/jordan/Documents/GitHub/citibike-data-analysis/data/raw/202401-citibike-tripdata.csv')]