<a href="https://colab.research.google.com/github/ipeirotis-org/datasets/blob/main/Citibike/Copy_Citibike_Trips_After_2021.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import auth

# Authenticate the current Colab user; this runs before the Google Cloud
# Storage and BigQuery clients are created further down.
auth.authenticate_user()

In [None]:
import os
import requests
from google.cloud import storage
from google.cloud import bigquery

import pandas as pd

import pyarrow as pa
import pyarrow.parquet as pq


In [None]:
# Where the data lives: the Google Cloud project, the archive bucket and the
# folder inside it, and the BigQuery dataset/table names.
project_id = "nyu-datasets"
gcs_bucket_name = "citibike-archive"
gcs_folder = "rides"
bq_dataset_id = "citibike"
bq_table_id = "rides"

# Both clients are created once here and shared by the functions below.
storage_client = storage.Client(project=project_id)
bigquery_client = bigquery.Client(project=project_id)

# Handle to the archive bucket on the shared storage client.
bucket = storage_client.bucket(gcs_bucket_name)

In [None]:
# Function to download a file from a URL
def download_file(url, local_filename, timeout=60):
    """Stream the content at *url* into *local_filename*.

    Args:
        url: Address fetched with an HTTP GET request.
        local_filename: Path of the file to write (opened in ``'wb'`` mode,
            so an existing file is overwritten).
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``; without it a stalled connection
            would block forever.

    Returns:
        ``local_filename``, unchanged.

    Raises:
        requests.HTTPError: If the response status is an error
            (via ``raise_for_status``).
    """
    with requests.get(url, stream=True, timeout=timeout) as r:
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            # Write in 8 KiB chunks so the whole body is never held in memory.
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    return local_filename

# Function to upload a file to Google Cloud Storage
def upload_to_gcs(local_file, bucket_name, destination_blob_name):
    """Upload a local file to a Google Cloud Storage bucket.

    Args:
        local_file: Path of the file on local disk.
        bucket_name: Name of the destination bucket.
        destination_blob_name: Object name (path inside the bucket) to write.
    """
    # Resolve the bucket from the argument so that bucket_name is honored
    # instead of always writing to the module-level `bucket`.
    target_bucket = storage_client.bucket(bucket_name)
    blob = target_bucket.blob(destination_blob_name)
    blob.upload_from_filename(local_file)
    print(f"File {local_file} uploaded to {destination_blob_name}.")

# Function to normalize column names
def normalize_columns(df):
    """Rename the columns of *df* to the shared snake_case ride schema.

    Column names are stripped, lower-cased, spaces become underscores and
    parentheses are dropped; then known legacy and post-2021 names are mapped
    to their standard equivalents.

    Args:
        df: DataFrame whose column labels are strings.

    Returns:
        The same DataFrame object; it is modified in place.
    """
    # regex=False makes every replacement a literal one, so '(' and ')' are
    # never interpreted as regex syntax regardless of the pandas default.
    df.columns = (
        df.columns.str.strip()
        .str.lower()
        .str.replace(' ', '_', regex=False)
        .str.replace('(', '', regex=False)
        .str.replace(')', '', regex=False)
    )

    column_mapping = {
        "tripduration": "trip_duration",
        "starttime": "start_time",
        "stoptime": "stop_time",
        "bikeid": "bike_id",
        "usertype": "user_type",
        # After 2021
        "started_at": "start_time",
        "ended_at": "stop_time",
        "start_lat": "start_station_latitude",
        "start_lng": "start_station_longitude",
        "end_lat": "end_station_latitude",
        "end_lng": "end_station_longitude",
    }

    # Names that are not in the mapping are left as they are.
    df.rename(columns=column_mapping, inplace=True)

    return df

In [None]:
# Function to create a BigQuery table on top of Parquet files in GCS
def create_bigquery_table_from_gcs_parquet(dataset_id, table_id, gcs_folder):
    """Create an external BigQuery table over the ride Parquet files in GCS.

    The table reads every ``*.parquet`` object under
    ``gs://<gcs_bucket_name>/<gcs_folder>/parquet/``, where the bucket name
    comes from the module-level ``gcs_bucket_name``.

    Args:
        dataset_id: BigQuery dataset (in the client's project) to create
            the table in.
        table_id: Name of the table to create.
        gcs_folder: Folder inside the bucket that holds the ``parquet/``
            subfolder.
    """
    # Build the reference explicitly; Client.dataset() is deprecated.
    dataset_ref = bigquery.DatasetReference(bigquery_client.project, dataset_id)
    table_ref = dataset_ref.table(table_id)

    # Column names and BigQuery types of the standardized ride table.
    columns = [
        ("ride_id", "STRING"),
        ("rideable_type", "STRING"),
        ("start_time", "TIMESTAMP"),
        ("stop_time", "TIMESTAMP"),
        ("start_station_id", "STRING"),
        ("start_station_name", "STRING"),
        ("start_station_latitude", "FLOAT64"),
        ("start_station_longitude", "FLOAT64"),
        ("end_station_id", "STRING"),
        ("end_station_name", "STRING"),
        ("end_station_latitude", "FLOAT64"),
        ("end_station_longitude", "FLOAT64"),
        ("member_casual", "STRING"),
    ]
    schema = [bigquery.SchemaField(name, field_type) for name, field_type in columns]

    external_config = bigquery.ExternalConfig("PARQUET")
    external_config.source_uris = [f"gs://{gcs_bucket_name}/{gcs_folder}/parquet/*.parquet"]
    external_config.schema = schema

    table = bigquery.Table(table_ref)
    table.external_data_configuration = external_config

    table = bigquery_client.create_table(table)
    print(f"Table {table_id} created in dataset {dataset_id}.")


In [None]:


# Target pandas dtype for each column of the standardized ride table,
# listed in the order the columns are processed.
standard_schema = dict(
    ride_id="str",
    rideable_type="str",
    start_time="datetime64[ns]",
    stop_time="datetime64[ns]",
    start_station_id="str",
    start_station_name="str",
    start_station_latitude="float64",
    start_station_longitude="float64",
    end_station_id="str",
    end_station_name="str",
    end_station_latitude="float64",
    end_station_longitude="float64",
    member_casual="str",
)


In [None]:
def _ride_arrow_schema():
    """Return the PyArrow schema written to every ride Parquet file."""
    return pa.schema([
        pa.field('ride_id', pa.string()),
        pa.field('rideable_type', pa.string()),
        pa.field('start_time', pa.timestamp('us')),
        pa.field('stop_time', pa.timestamp('us')),
        pa.field('start_station_id', pa.string()),
        pa.field('start_station_name', pa.string()),
        pa.field('start_station_latitude', pa.float64()),
        pa.field('start_station_longitude', pa.float64()),
        pa.field('end_station_id', pa.string()),
        pa.field('end_station_name', pa.string()),
        pa.field('end_station_latitude', pa.float64()),
        pa.field('end_station_longitude', pa.float64()),
        pa.field('member_casual', pa.string()),
    ])


def _coerce_to_standard_schema(df):
    """Cast *df* in place to the dtypes in ``standard_schema`` and return it."""
    for column, dtype in standard_schema.items():
        if column not in df.columns:
            print(f"Column '{column}' not found in CSV file.")
            df[column] = None
            continue
        try:
            converted = df[column].astype(dtype)
            if dtype == "str":
                # astype("str") turns missing values into the literal strings
                # "nan"/"None"; blank those positions again so they are
                # written as nulls rather than as text.
                converted = converted.where(df[column].notna())
            df[column] = converted
        except Exception as e:
            # Deliberately best-effort: report the column and keep going.
            print(f"Error converting column '{column}': {e}")

    for column in df.columns:
        if column not in standard_schema:
            print(f"Column '{column}' found in CSV file but not in standard schema.")

    # Ensure both time columns are datetimes even if the cast above failed.
    df['start_time'] = pd.to_datetime(df['start_time'])
    df['stop_time'] = pd.to_datetime(df['stop_time'])
    return df


def convert_to_parquet(folder_path, gcs_folder):
    """Convert every CSV in *folder_path* to Parquet and upload it to GCS.

    Files are processed in sorted name order. Each CSV is read as strings,
    its columns are normalized and cast to the standard ride schema, and the
    result is written next to the CSV as ``<name>.parquet``, uploaded to
    ``<gcs_folder>/parquet/<name>.parquet`` in the bucket named by the
    module-level ``gcs_bucket_name``, and then deleted locally. Columns that
    are not part of the standard schema are reported but not written.

    Args:
        folder_path: Local directory containing the ``.csv`` files.
        gcs_folder: Destination folder inside the bucket.
    """
    # The schema is identical for every file, so build it once.
    schema = _ride_arrow_schema()

    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith(".csv"):
            continue
        local_file = os.path.join(folder_path, file_name)

        # Read everything as text; types are applied explicitly afterwards.
        df = pd.read_csv(local_file, low_memory=False, dtype='str')
        df = normalize_columns(df)

        # Treat the textual null markers "NULL" and "\N" as missing values.
        df.replace(r"NULL", None, inplace=True)
        df.replace(r"\N", None, inplace=True)
        # Strip a trailing ".0" from every value (presumably ids that were
        # exported as floats -- TODO confirm).
        df.replace(r"\.0$", r"", regex=True, inplace=True)

        df = _coerce_to_standard_schema(df)

        table = pa.Table.from_pandas(df, schema=schema)
        # Swap only the extension; str.replace would also rewrite any ".csv"
        # occurring elsewhere in the path.
        parquet_file = os.path.splitext(local_file)[0] + ".parquet"
        try:
            pq.write_table(table, parquet_file)
            gcs_destination = f"{gcs_folder}/parquet/{os.path.basename(parquet_file)}"
            upload_to_gcs(parquet_file, gcs_bucket_name, gcs_destination)
        finally:
            # Remove the local Parquet file whether or not the upload worked.
            if os.path.exists(parquet_file):
                os.remove(parquet_file)


# 2021 -- A different (new) trip schema starts with the February 2021 files

In [None]:
# Download the 2021 yearly archive and unpack it in the working directory.
!curl https://s3.amazonaws.com/tripdata/2021-citibike-tripdata.zip -o 2021-citibike-tripdata.zip
!unzip 2021-citibike-tripdata.zip

In [None]:
# Flatten the archive: move everything found one subfolder down up into the
# year folder itself.
!mv 2021-citibike-tripdata/*/* 2021-citibike-tripdata/

# Remove any __MACOSX* entries from the working directory and the zip that
# is no longer needed.
!rm -rf __MACOSX*
!rm 2021-citibike-tripdata.zip

In [None]:
# Drop the files whose names start with 202101 (presumably January 2021,
# which still uses the older schema -- confirm).
!rm 2021-citibike-tripdata/202101*

In [None]:
# Convert every remaining 2021 CSV to Parquet and upload it to GCS.
folder_path = "2021-citibike-tripdata"

convert_to_parquet(folder_path, gcs_folder)


# Starting in Feb 2021, the files use the new schema

In [None]:
# Download the 2022 yearly archive and unpack it in the working directory.
!curl https://s3.amazonaws.com/tripdata/2022-citibike-tripdata.zip -o 2022-citibike-tripdata.zip
!unzip 2022-citibike-tripdata.zip

In [None]:
# Flatten the archive: move everything found one subfolder down up into the
# year folder itself.
!mv 2022-citibike-tripdata/*/* 2022-citibike-tripdata/

# Remove any __MACOSX* entries from the working directory and the zip that
# is no longer needed.
!rm -rf __MACOSX*
!rm 2022-citibike-tripdata.zip

In [None]:
# Convert every 2022 CSV to Parquet and upload it to GCS.
folder_path = "2022-citibike-tripdata"

convert_to_parquet(folder_path, gcs_folder)


In [None]:
# Download the 2023 yearly archive and unpack it in the working directory.
!curl https://s3.amazonaws.com/tripdata/2023-citibike-tripdata.zip -o 2023-citibike-tripdata.zip
!unzip 2023-citibike-tripdata.zip

In [None]:
# Flatten the archive: move everything found one subfolder down up into the
# year folder itself.
!mv 2023-citibike-tripdata/*/* 2023-citibike-tripdata/

# Remove any __MACOSX* entries from the working directory and the zip that
# is no longer needed.
!rm -rf __MACOSX*
!rm 2023-citibike-tripdata.zip

In [None]:
# Convert every 2023 CSV to Parquet and upload it to GCS.
folder_path = "2023-citibike-tripdata"

convert_to_parquet(folder_path, gcs_folder)


In [None]:
# Start clean: remove any leftover __MACOSX directory and 2024 data folder.
!rm -rf __MACOSX/ 2024-citibike-tripdata

In [None]:
# 2024 data is fetched one monthly zip at a time into a single folder. Each
# cell below downloads a zip, unpacks it inside the folder, deletes the zip,
# and removes the __MACOSX directory from the folder.
!mkdir -p 2024-citibike-tripdata
!curl https://s3.amazonaws.com/tripdata/202401-citibike-tripdata.csv.zip -o 2024-citibike-tripdata/202401-citibike-tripdata.csv.zip
!cd 2024-citibike-tripdata && unzip 202401-citibike-tripdata.csv.zip && rm 202401-citibike-tripdata.csv.zip
!rm -rf 2024-citibike-tripdata/__MACOSX/

In [None]:
# February 2024.
!curl https://s3.amazonaws.com/tripdata/202402-citibike-tripdata.csv.zip -o 2024-citibike-tripdata/202402-citibike-tripdata.csv.zip
!cd 2024-citibike-tripdata && unzip 202402-citibike-tripdata.csv.zip && rm 202402-citibike-tripdata.csv.zip
!rm -rf 2024-citibike-tripdata/__MACOSX/

In [None]:
# March 2024.
!curl https://s3.amazonaws.com/tripdata/202403-citibike-tripdata.csv.zip -o 2024-citibike-tripdata/202403-citibike-tripdata.csv.zip
!cd 2024-citibike-tripdata && unzip 202403-citibike-tripdata.csv.zip && rm 202403-citibike-tripdata.csv.zip
!rm -rf 2024-citibike-tripdata/__MACOSX/

In [None]:
# April 2024.
# NOTE(review): the remote name here ends in "-tripdata.zip" (no ".csv"),
# unlike the earlier months; it is saved locally under the ".csv.zip" name.
!curl https://s3.amazonaws.com/tripdata/202404-citibike-tripdata.zip -o 2024-citibike-tripdata/202404-citibike-tripdata.csv.zip
!cd 2024-citibike-tripdata && unzip 202404-citibike-tripdata.csv.zip && rm 202404-citibike-tripdata.csv.zip
!rm -rf 2024-citibike-tripdata/__MACOSX/

In [None]:
# May 2024.
# NOTE(review): this file name contains "-test-"; confirm it is the
# intended archive for the month.
!curl https://s3.amazonaws.com/tripdata/202405-citibike-test-tripdata.csv.zip -o 2024-citibike-tripdata/202405-citibike-test-tripdata.csv.zip
!cd 2024-citibike-tripdata && unzip 202405-citibike-test-tripdata.csv.zip && rm 202405-citibike-test-tripdata.csv.zip
!rm -rf 2024-citibike-tripdata/__MACOSX/

In [None]:
# Convert every downloaded 2024 CSV to Parquet and upload it to GCS.
folder_path = "2024-citibike-tripdata"

convert_to_parquet(folder_path, gcs_folder)
