# 1. Setup and Drive Mount

Mounts Google Drive for persistent storage, creates the workspace directory on Drive, and clones the project repository into it (or pulls the latest changes if it has already been cloned).

In [1]:
from google.colab import drive
from pathlib import Path
import sys
import os

# 1. Mount Drive
drive.mount('/content/drive')

# 2. Set workspace
WORKSPACE_DIR = Path('/content/drive/MyDrive/temp-data-pipeline')
WORKSPACE_DIR.mkdir(parents=True, exist_ok=True)
os.chdir(WORKSPACE_DIR)

# 3. Clone repository
# TODO: REPLACE 'YOUR_USERNAME' WITH YOUR ACTUAL GITHUB USERNAME BELOW
if not (WORKSPACE_DIR / '.git').exists():
    print("Cloning repository...")
    !git clone https://github.com/kyler505/temp-data-pipeline.git .
else:
    print("Repository already exists, pulling latest changes...")
    !git pull

# Add workspace to python path
sys.path.insert(0, str(WORKSPACE_DIR))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Repository already exists, pulling latest changes...
Already up to date.


# 2. Install Dependencies

Installs the necessary Python libraries and the current project in editable mode so that changes to the code are immediately reflected.

In [2]:
print("Installing dependencies...")
!pip install -e .

Installing dependencies...
Obtaining file:///content/drive/MyDrive/temp-data-pipeline
  Installing build dependencies ... done
  Checking if build backend supports build_editable ... done
  Getting requirements to build editable ... done
  Preparing editable metadata (pyproject.toml) ... done
Building wheels for collected packages: temp-data-pipeline
  Building editable for temp-data-pipeline (pyproject.toml) ... done
  Created wheel for temp-data-pipeline: filename=temp_data_pipeline-0.1.0-0.editable-py3-none-any.whl size=1374 sha256=ab96e7ed975a571116f2e3937f9a6dc9d36b21e16af9a33f15635b862f243765
  Stored in directory: /tmp/pip-ephem-wheel-cache-0h7s5qel/wheels/78/a9/9c/fcbaf7e053bef092e59418d008e31e1346c7b345f02b6ae767
Successfully built temp-data-pipeline
Installing collected packages: temp-data-pipeline
  Attempting uninstall: temp-data-pipeline
    Found existing installation: temp-data-pipeline 0.1.0
    Uninstalling te

# 3. Configure Data Directories

Sets up the path variables for where data will be stored in Google Drive, ensuring the directories exist before running the pipeline.

In [3]:
from pathlib import Path

# Root folder on Google Drive under which the pipeline's data is stored.
DRIVE_DATA_DIR = Path('/content/drive/MyDrive/temp-data-pipeline-data')

# Create the folder (and any missing parents) only when it is not there yet,
# so the cell can be re-run safely.
if not DRIVE_DATA_DIR.is_dir():
    DRIVE_DATA_DIR.mkdir(parents=True, exist_ok=True)

print(f"Data directory: {DRIVE_DATA_DIR}")

Data directory: /content/drive/MyDrive/temp-data-pipeline-data


In [None]:
# Diagnostic: test the package imports one step at a time, so that a failure
# can be attributed to a specific module.
import sys
import traceback
from pathlib import Path

# Ensure we can import the package
# Sometimes Colab needs the src directory explicitly in the path
src_path = Path(WORKSPACE_DIR) / 'src'
if str(src_path) not in sys.path:
    sys.path.insert(0, str(src_path))
    print(f"Added {src_path} to sys.path")

print("Testing imports...")
# Updated before each import so the failure message can name the step that broke.
current_step = "setup"
try:
    current_step = "1 (tempdata)"
    print("1. Testing tempdata import...")
    import tempdata
    # The package may not define __version__; a missing attribute must not be
    # reported as an import failure.
    tempdata_version = getattr(tempdata, "__version__", "unknown")
    print(f"   ✓ tempdata imported (version: {tempdata_version})")

    current_step = "2 (tempdata.fetch)"
    print("2. Testing tempdata.fetch import...")
    import tempdata.fetch
    print("   ✓ tempdata.fetch imported")

    current_step = "3 (tempdata.fetch.noaa_hourly)"
    print("3. Testing tempdata.fetch.noaa_hourly import...")
    from tempdata.fetch.noaa_hourly import fetch_noaa_hourly
    print("   ✓ fetch_noaa_hourly imported")

    print("\n✓ All imports successful!")
except Exception:
    # Best-effort diagnostic: report which step failed and show the traceback
    # instead of aborting the notebook.
    print(f"\n✗ Import failed at step {current_step}:")
    traceback.print_exc()

# 4. Run Pipeline

Runs the main data fetching function `fetch_noaa_hourly`. It downloads data for the specified station and date range, saving the results as Parquet files in the configured Drive directory.

In [None]:
# @title
import traceback
import sys
from pathlib import Path

# Fetch parameters, defined once so that the output paths and the progress
# message always agree with what is actually requested.
STATION_ID = 'KLGA'
START_DATE = '2024-01-01'
# NOTE(review): whether the end date is inclusive is decided by
# fetch_noaa_hourly - confirm against its documentation.
END_DATE = '2024-02-01'

# Ensure we can import the package
# Sometimes Colab needs the src directory explicitly in the path
src_path = Path(WORKSPACE_DIR) / 'src'
if str(src_path) not in sys.path:
    sys.path.insert(0, str(src_path))

try:
    from tempdata.fetch.noaa_hourly import fetch_noaa_hourly

    print(f"\nRunning pipeline for {STATION_ID} ({START_DATE} to {END_DATE})...")
    written = fetch_noaa_hourly(
        station_id=STATION_ID,
        start_date=START_DATE,
        end_date=END_DATE,
        out_dir=str(DRIVE_DATA_DIR / 'raw' / 'noaa_hourly' / STATION_ID),
        cache_dir=str(DRIVE_DATA_DIR / 'cache' / 'isd_csv' / STATION_ID),
    )

    print(f"\n✓ Pipeline completed. Wrote {len(written)} files.")

except ImportError:
    # The package could not be imported: show the traceback and tell the user
    # how to recover rather than stopping the notebook.
    print("\n[ERROR] Import failed. Full traceback:")
    traceback.print_exc()
    print("\n[WARNING] Package not installed or path not setup correctly yet.")
    print("If the pip install command above succeeded, you may need to Restart the Runtime and run this cell again.")
except Exception as e:
    # Any other failure is reported with its traceback; the cell itself does not raise.
    print(f"\nAn error occurred: {e}")
    traceback.print_exc()


If the pip install command above succeeded, you may need to Restart the Runtime and run this cell again.


# 5. Verify Results

Scans the output directory for the generated Parquet files and loads the first one using pandas to verify that the data is readable and correct.

In [5]:
import pandas as pd

# Verify Results
# Gather the Parquet files in the KLGA raw-data folder, in sorted order.
klga_raw_dir = DRIVE_DATA_DIR / 'raw' / 'noaa_hourly' / 'KLGA'
parquet_files = sorted(klga_raw_dir.glob('*.parquet'))

if not parquet_files:
    print("No parquet files found.")
else:
    # Only the first file is loaded: this is a readability spot check,
    # not a validation of every file.
    df = pd.read_parquet(parquet_files[0])
    print(f"Loaded {len(df)} rows from {parquet_files[0].name}")
    print("\nFirst few rows:")
    print(df.head())
    # Relies on the file having a 'ts_utc' column.
    print(f"\nDate range: {df['ts_utc'].min()} to {df['ts_utc'].max()}")

No parquet files found.
