<a href="https://colab.research.google.com/github/ipeirotis-org/datasets/blob/main/Citibike/Copy_Citibike_Trips.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import auth

# Authenticate the current Colab user. This runs before the Cloud Storage and
# BigQuery clients are created in the cells below.
auth.authenticate_user()

In [None]:
import os
import requests
from google.cloud import storage

# Destination in Google Cloud Storage: the bucket name and the object-name
# prefix ("folder") under which uploaded files are stored.
# Replace these variables with your details.
gcs_bucket_name = 'citibike-archive'
gcs_folder = 'tripdata'

# Initialize the Google Cloud Storage client and a handle to the destination
# bucket. Both are module-level objects created once when this cell runs.
storage_client = storage.Client()
bucket = storage_client.bucket(gcs_bucket_name)

# Function to download a file from a URL
def download_file(url, local_filename, timeout=(10, 60)):
    """Stream the content at ``url`` into the file ``local_filename``.

    Args:
        url: Address to fetch with an HTTP GET request.
        local_filename: Path of the file to write (opened in binary mode,
            overwriting any existing file).
        timeout: Value passed to ``requests.get(timeout=...)``. Defaults to
            a 10 second connect / 60 second read timeout so that a stalled
            server cannot block the caller indefinitely.

    Returns:
        ``local_filename``, unchanged.

    Raises:
        requests.exceptions.RequestException: On connection errors, timeouts,
            or when ``raise_for_status()`` rejects the response status.
        OSError: If the local file cannot be written.
    """
    with requests.get(url, stream=True, timeout=timeout) as r:
        # Check the status before opening the file so that a failed request
        # does not create an empty local file.
        r.raise_for_status()
        try:
            with open(local_filename, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        except BaseException:
            # The transfer was cut short (network error, full disk, user
            # interrupt): do not leave a truncated archive behind.
            if os.path.exists(local_filename):
                os.remove(local_filename)
            raise
    return local_filename

# Function to upload a file to Google Cloud Storage
def upload_to_gcs(local_file, bucket_name, destination_blob_name):
    """Upload a local file to a Google Cloud Storage bucket.

    Args:
        local_file: Path of the file on local disk to upload.
        bucket_name: Name of the destination bucket.
        destination_blob_name: Object name (including any "folder" prefix)
            to create inside the bucket.
    """
    # Resolve the bucket from the argument. Previously ``bucket_name`` was
    # ignored and the module-level ``bucket`` was always used, so passing a
    # different bucket name had no effect.
    target_bucket = storage_client.bucket(bucket_name)
    blob = target_bucket.blob(destination_blob_name)
    blob.upload_from_filename(local_file)
    print(f"File {local_file} uploaded to {destination_blob_name}.")

# Main function to handle the download and upload
def _jc_file_name(year, month):
    """Return the "JC-" archive file name to request for ``year``/``month``."""
    # Two months deviate from the regular naming pattern and are special-cased.
    # NOTE(review): these irregular names (a "citbike" spelling and a
    # URL-encoded space) presumably mirror the names as published in the
    # source bucket -- verify before "correcting" them.
    if year == 2022 and month == 7:
        return "JC-202207-citbike-tripdata.csv.zip"
    if year == 2017 and month == 8:
        return "JC-201708%20citibike-tripdata.csv.zip"
    return f"JC-{year:04d}{month:02d}-citibike-tripdata.csv.zip"


def main():
    """Copy the monthly "JC-" trip archives from the S3 URL into GCS.

    For every month from 2015-09 through 2024-03 the archive is downloaded to
    the current directory, uploaded to ``gs://{gcs_bucket_name}/{gcs_folder}/``
    and the local copy is deleted. A failure for one month is reported with
    ``print`` and the loop continues with the next month (best effort).
    """
    base_url = 'https://s3.amazonaws.com/tripdata/'
    years = range(2015, 2024 + 1)  # Adjust the range as needed
    months = range(1, 12 + 1)

    for year in years:
        for month in months:
            # Only 2015-09 .. 2024-03 are requested.
            if year == 2015 and month <= 8:
                continue
            if year == 2024 and month >= 4:
                continue

            file_name = _jc_file_name(year, month)
            url = base_url + file_name
            local_file = file_name

            try:
                print(f"Downloading {url}...")
                download_file(url, local_file)

                gcs_destination = f"{gcs_folder}/{file_name}"
                print(f"Uploading {local_file} to gs://{gcs_bucket_name}/{gcs_destination}...")
                upload_to_gcs(local_file, gcs_bucket_name, gcs_destination)
            except requests.exceptions.RequestException as e:
                print(f"Failed to download {url}: {e}")
            except Exception as e:
                # Deliberate best effort: report and move on to the next month.
                print(f"An error occurred: {e}")
            finally:
                # Remove the local copy whether or not the upload succeeded;
                # previously a failed upload left the archive on disk.
                try:
                    os.remove(local_file)
                except FileNotFoundError:
                    pass  # The download never created the file.
                except OSError as e:
                    print(f"An error occurred: {e}")

# Run the download-and-upload loop when this cell/script is executed directly.
if __name__ == "__main__":
    main()


In [None]:
from google.cloud import bigquery

def load_csv_from_gcs(project_id, dataset_id, table_id, gcs_path):
    """Load CSV data from Cloud Storage into a BigQuery table and wait for it.

    Args:
        project_id: Google Cloud project that owns the client and the table.
        dataset_id: BigQuery dataset containing the destination table.
        table_id: Name of the destination table.
        gcs_path: ``gs://`` URI of the source data (may contain a wildcard).

    Raises:
        Whatever ``load_job.result()`` raises if the load job fails.
    """
    client = bigquery.Client(project=project_id)

    # Build the table reference explicitly; ``Client.dataset()`` is deprecated
    # in google-cloud-bigquery in favour of ``DatasetReference``.
    table_ref = bigquery.DatasetReference(project_id, dataset_id).table(table_id)

    job_config = bigquery.LoadJobConfig(
        source_format=bigquery.SourceFormat.CSV,
        skip_leading_rows=1,  # Skip the header row of each CSV file
        autodetect=True,  # Let BigQuery infer the table schema
    )

    load_job = client.load_table_from_uri(
        gcs_path, table_ref, job_config=job_config
    )

    print(f"Starting job {load_job.job_id}")

    load_job.result()  # Waits for the job to complete.

    print(f"Job finished. Loaded {load_job.output_rows} rows into {dataset_id}:{table_id}")


# Load every archive under the tripdata/ prefix into the citibike.trips table
# when this cell/script is executed directly.
if __name__ == "__main__":
    project_id = 'nyu-datasets'
    dataset_id = 'citibike'
    table_id = 'trips'
    # NOTE(review): the wildcard matches .zip archives. BigQuery's CSV load
    # documentation lists gzip as the supported compression -- confirm that
    # these files load, or decompress them before loading.
    gcs_path = 'gs://citibike-archive/tripdata/*.csv.zip'

    load_csv_from_gcs(project_id, dataset_id, table_id, gcs_path)
