<a href="https://colab.research.google.com/github/ipeirotis-org/datasets/blob/main/Citibike/Copy_Citibike_Trips.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import auth

# Authenticate the current Colab user. Presumably this is what lets the
# Google Cloud clients created in later cells pick up credentials -- confirm.
auth.authenticate_user()

In [None]:
import os
import requests
from google.cloud import storage

# Destination in Google Cloud Storage; replace with your own bucket/folder.
# `gcs_folder` is the prefix passed to convert_to_parquet() in later cells.
gcs_bucket_name = 'citibike-archive'
gcs_folder = 'tripdata'

# Create the Cloud Storage client and a handle to the destination bucket.
# Both names are module-level so the helper functions below can use them.
storage_client = storage.Client()
bucket = storage_client.bucket(gcs_bucket_name)

# Function to download a file from a URL
def download_file(url, local_filename):
    """Stream the content at *url* into *local_filename* and return that path.

    The response is requested with ``stream=True`` and written to disk in
    8192-byte chunks. ``raise_for_status()`` runs before the output file is
    opened, so an HTTP error status raises before anything is written.
    """
    response = requests.get(url, stream=True)
    with response:
        response.raise_for_status()
        with open(local_filename, 'wb') as out_file:
            out_file.writelines(response.iter_content(chunk_size=8192))
    return local_filename

# Function to upload a file to Google Cloud Storage
def upload_to_gcs(local_file, bucket_name, destination_blob_name):
    """Upload a local file to a Google Cloud Storage bucket.

    Args:
        local_file: Path of the file on local disk.
        bucket_name: Name of the destination bucket.
        destination_blob_name: Object name (including any folder prefix)
            to create inside the bucket.
    """
    # Resolve the bucket from the argument. Previously `bucket_name` was
    # ignored and the module-level `bucket` was always used, so passing a
    # different bucket silently uploaded to the wrong place.
    target_bucket = storage_client.bucket(bucket_name)
    blob = target_bucket.blob(destination_blob_name)
    blob.upload_from_filename(local_file)
    print(f"File {local_file} uploaded to {destination_blob_name}.")

# Function to normalize column names
def normalize_columns(df):
    """Normalize the column names of *df* in place and return *df*.

    Each name is stripped of surrounding whitespace, lower-cased, has its
    spaces replaced by underscores, and has parentheses removed, e.g.
    ``" Start Station (ID)"`` becomes ``"start_station_id"``.
    """
    # regex=False is explicit: '(' and ')' are not valid regular expressions
    # on their own, and the default of `regex` differs between pandas
    # versions, so literal matching must not depend on the installed version.
    df.columns = (
        df.columns.str.strip()
        .str.lower()
        .str.replace(' ', '_', regex=False)
        .str.replace('(', '', regex=False)
        .str.replace(')', '', regex=False)
    )
    return df

In [None]:

from google.cloud import bigquery

def load_csv_from_gcs(project_id, dataset_id, table_id, gcs_path):
    """Load CSV data from Cloud Storage into a BigQuery table and wait for it.

    Args:
        project_id: Project used to create the BigQuery client.
        dataset_id: Dataset that contains (or will contain) the table.
        table_id: Destination table name.
        gcs_path: Source URI passed to ``load_table_from_uri``.

    The call blocks until the load job finishes; ``load_job.result()``
    raises if the job failed.
    """
    client = bigquery.Client(project=project_id)

    # Client.dataset() is deprecated; build the reference explicitly.
    # client.project keeps the old behaviour when project_id is None.
    table_ref = bigquery.DatasetReference(client.project, dataset_id).table(table_id)

    job_config = bigquery.LoadJobConfig(
        source_format=bigquery.SourceFormat.CSV,
        skip_leading_rows=1,  # Skips one header row; adjust if the CSV has none
        autodetect=True,
    )

    load_job = client.load_table_from_uri(
        gcs_path, table_ref, job_config=job_config
    )

    print(f"Starting job {load_job.job_id}")

    load_job.result()  # Waits for the job to complete.

    print(f"Job finished. Loaded {load_job.output_rows} rows into {dataset_id}:{table_id}")


In [None]:
import pandas as pd

import os

# Target column -> dtype mapping applied to every trip file.
# Key order matters: convert_to_parquet() iterates this mapping in order
# when casting columns and when adding columns a CSV does not have.
standard_schema = dict(
    # Trip timing
    tripduration="int64",
    starttime="datetime64[ns]",
    stoptime="datetime64[ns]",
    # Start station
    start_station_id="str",
    start_station_name="str",
    start_station_latitude="float64",
    start_station_longitude="float64",
    # End station
    end_station_id="str",
    end_station_name="str",
    end_station_latitude="float64",
    end_station_longitude="float64",
    # Bike and rider
    bikeid="str",
    usertype="str",
    birth_year="int64",
    gender="int64",
)


# 2013

In [None]:
def convert_to_parquet(folder_path, bucket_name, gcs_folder):
    """Convert each CSV in *folder_path* to Parquet and upload it to GCS.

    For every ``*.csv`` file directly inside *folder_path* this reads all
    columns as strings, normalizes the header with ``normalize_columns``,
    turns the "NULL" and backslash-N placeholders into missing values,
    strips a trailing ".0" from every value, and casts the columns listed
    in ``standard_schema``. Columns the CSV lacks are added as all-None.
    The Parquet file is written next to the CSV, uploaded, then deleted;
    the source CSV is left in place.

    Args:
        folder_path: Local directory containing the CSV files.
        bucket_name: Destination bucket, forwarded to ``upload_to_gcs``.
        gcs_folder: Object-name prefix for the uploaded Parquet files.
    """
    for file_name in os.listdir(folder_path):
        if not file_name.endswith(".csv"):
            continue

        local_file = os.path.join(folder_path, file_name)

        # Read everything as strings so cleaning happens before any casting.
        df = pd.read_csv(local_file, low_memory=False, dtype='str')
        df = normalize_columns(df)

        # Placeholder values used for missing data, then drop the ".0"
        # suffix so integer-like values such as "1985.0" can be cast.
        df.replace(r"NULL", pd.NA, inplace=True)
        df.replace(r"\N", pd.NA, inplace=True)
        df.replace(r"\.0$", r"", regex=True, inplace=True)

        for column, dtype in standard_schema.items():
            if column not in df.columns:
                print(f"Column '{column}' not found in CSV file.")
                df[column] = None
                continue
            try:
                df[column] = df[column].astype(dtype)
            except (ValueError, TypeError) as exc:
                # Same outcome as the former errors='ignore' (the column
                # keeps its string values), but the failure is now reported.
                print(
                    f"Could not convert column '{column}' to {dtype} "
                    f"in {file_name}: {exc}"
                )

        # Swap only the extension; str.replace(".csv", ...) would also
        # rewrite any ".csv" occurring elsewhere in the path.
        parquet_file = os.path.splitext(local_file)[0] + ".parquet"
        df.to_parquet(parquet_file, index=False)

        gcs_destination = f"{gcs_folder}/{os.path.basename(parquet_file)}"
        try:
            upload_to_gcs(parquet_file, bucket_name, gcs_destination)
        finally:
            # Remove the local Parquet file even when the upload fails.
            os.remove(parquet_file)


In [None]:
# Function to create a BigQuery table on top of Parquet files in GCS
def create_bigquery_table_from_gcs_parquet(project_id, dataset_id, table_id, gcs_folder, bucket_name=None):
    """Create a BigQuery external table over the Parquet files in a GCS folder.

    Args:
        project_id: Project used to create the BigQuery client.
        dataset_id: Dataset in which the table is created.
        table_id: Name of the table to create.
        gcs_folder: Folder (object prefix) holding the ``*.parquet`` files.
        bucket_name: Bucket holding the files. When omitted, the
            notebook-level ``bucket_name`` variable is used, which is what
            this function always did before the parameter existed.

    Raises:
        ValueError: If no bucket name is passed and none is defined globally.
    """
    if bucket_name is None:
        # Backward compatibility: the function used to read the global
        # `bucket_name` implicitly; keep that as the fallback.
        bucket_name = globals().get("bucket_name")
        if bucket_name is None:
            raise ValueError(
                "bucket_name was not passed and no notebook-level "
                "`bucket_name` variable is defined."
            )

    client = bigquery.Client(project=project_id)

    # Client.dataset() is deprecated; build the reference explicitly.
    # client.project keeps the old behaviour when project_id is None.
    table_ref = bigquery.DatasetReference(client.project, dataset_id).table(table_id)

    external_config = bigquery.ExternalConfig("PARQUET")
    external_config.source_uris = [f"gs://{bucket_name}/{gcs_folder}/*.parquet"]
    table = bigquery.Table(table_ref)
    table.external_data_configuration = external_config

    table = client.create_table(table)
    print(f"Table {table_id} created in dataset {dataset_id}.")



In [None]:
# Download and unpack the 2013 archive, then drop the macOS metadata
# folder and the zip itself.
!curl https://s3.amazonaws.com/tripdata/2013-citibike-tripdata.zip -o 2013-citibike-tripdata.zip
!unzip 2013-citibike-tripdata.zip
!rm -rf __MACOSX*
!rm 2013-citibike-tripdata.zip

# Delete every entry in the year folder whose name contains an underscore.
# Per the original note this removes the per-month folder structure and
# keeps the root-level files, which seem to be quoted and one per month.
!rm -rf 2013-citibike-tripdata/*_*

In [None]:
# Define the bucket name and dataset/table details.
# `bucket_name` is not passed in the call below:
# create_bigquery_table_from_gcs_parquet picks it up from the notebook
# namespace, so it must be assigned before the call.
bucket_name = "citibike-archive"
dataset_id = "citibike"
table_id = "trips"
# Create the external table over the "tripdata" folder in project "nyu-datasets".
create_bigquery_table_from_gcs_parquet("nyu-datasets", dataset_id, table_id, "tripdata")


In [None]:

# Folder containing the 2013 CSV files left after the cleanup cell
folder_path = "2013-citibike-tripdata"

# Convert each CSV to Parquet and upload it under `gcs_folder` in the bucket.
convert_to_parquet(folder_path, bucket_name, gcs_folder)



In [None]:
# Download and unpack the 2014 archive.
!curl https://s3.amazonaws.com/tripdata/2014-citibike-tripdata.zip -o 2014-citibike-tripdata.zip
!unzip 2014-citibike-tripdata.zip

# Move each month's `_1` CSV out of its month sub-folder into the year
# folder, dropping the `_1` suffix. The new names contain no underscore,
# so they survive the `*_*` cleanup at the end of this cell.
# NOTE(review): a file already at the target path would be overwritten.
!mv 2014-citibike-tripdata/1_January/201401-citibike-tripdata_1.csv 2014-citibike-tripdata/201401-citibike-tripdata.csv
!mv 2014-citibike-tripdata/2_February/201402-citibike-tripdata_1.csv 2014-citibike-tripdata/201402-citibike-tripdata.csv
!mv 2014-citibike-tripdata/3_March/201403-citibike-tripdata_1.csv 2014-citibike-tripdata/201403-citibike-tripdata.csv
!mv 2014-citibike-tripdata/4_April/201404-citibike-tripdata_1.csv 2014-citibike-tripdata/201404-citibike-tripdata.csv
!mv 2014-citibike-tripdata/5_May/201405-citibike-tripdata_1.csv 2014-citibike-tripdata/201405-citibike-tripdata.csv
!mv 2014-citibike-tripdata/6_June/201406-citibike-tripdata_1.csv 2014-citibike-tripdata/201406-citibike-tripdata.csv
!mv 2014-citibike-tripdata/7_July/201407-citibike-tripdata_1.csv 2014-citibike-tripdata/201407-citibike-tripdata.csv
!mv 2014-citibike-tripdata/8_August/201408-citibike-tripdata_1.csv 2014-citibike-tripdata/201408-citibike-tripdata.csv
!mv 2014-citibike-tripdata/9_September/201409-citibike-tripdata_1.csv 2014-citibike-tripdata/201409-citibike-tripdata.csv
!mv 2014-citibike-tripdata/10_October/201410-citibike-tripdata_1.csv 2014-citibike-tripdata/201410-citibike-tripdata.csv
!mv 2014-citibike-tripdata/11_November/201411-citibike-tripdata_1.csv 2014-citibike-tripdata/201411-citibike-tripdata.csv
!mv 2014-citibike-tripdata/12_December/201412-citibike-tripdata_1.csv 2014-citibike-tripdata/201412-citibike-tripdata.csv


# Drop the macOS metadata folder and the downloaded zip.
!rm -rf __MACOSX*
!rm 2014-citibike-tripdata.zip

# Delete every remaining entry in the year folder whose name contains an
# underscore: the month sub-folders and anything still carrying a suffix.
# Unlike 2013, the files kept here are the ones moved out of the month folders.
!rm -rf 2014-citibike-tripdata/*_*

In [None]:
# Folder containing the 2014 CSV files prepared by the previous cell
folder_path = "2014-citibike-tripdata"

# Convert each CSV to Parquet and upload it under `gcs_folder` in the bucket.
convert_to_parquet(folder_path, bucket_name, gcs_folder)


In [None]:
# Download and unpack the 2015 archive. The zip and the macOS metadata
# folder are removed in the last cell of the notebook.
!curl https://s3.amazonaws.com/tripdata/2015-citibike-tripdata.zip -o 2015-citibike-tripdata.zip
!unzip 2015-citibike-tripdata.zip


In [None]:

# Flatten the 2015 layout: move everything from each month sub-folder into
# the year folder, keeping the original file names. convert_to_parquet()
# only looks at files directly inside the folder it is given.
!mv 2015-citibike-tripdata/1_January/* 2015-citibike-tripdata/
!mv 2015-citibike-tripdata/2_February/* 2015-citibike-tripdata/
!mv 2015-citibike-tripdata/3_March/* 2015-citibike-tripdata/
!mv 2015-citibike-tripdata/4_April/* 2015-citibike-tripdata/
!mv 2015-citibike-tripdata/5_May/* 2015-citibike-tripdata/
!mv 2015-citibike-tripdata/6_June/* 2015-citibike-tripdata/
!mv 2015-citibike-tripdata/7_July/* 2015-citibike-tripdata/
!mv 2015-citibike-tripdata/8_August/* 2015-citibike-tripdata/
!mv 2015-citibike-tripdata/9_September/* 2015-citibike-tripdata/
!mv 2015-citibike-tripdata/10_October/* 2015-citibike-tripdata/
!mv 2015-citibike-tripdata/11_November/* 2015-citibike-tripdata/
!mv 2015-citibike-tripdata/12_December/* 2015-citibike-tripdata/




In [None]:
# Folder containing the flattened 2015 CSV files
folder_path = "2015-citibike-tripdata"

# Convert each CSV to Parquet and upload it under `gcs_folder` in the bucket.
convert_to_parquet(folder_path, bucket_name, gcs_folder)


In [None]:
# Debugging aid: list the distinct values of the 14th comma-separated field
# across the 2015 CSVs (presumably the birth-year column -- confirm).
# `cut` splits on every comma, so quoted fields containing commas shift columns.
!cut -d',' -f14 2015-citibike-tripdata/*.csv | sort | uniq

In [None]:
# Debugging aid: show the distinct birth_year values of a DataFrame.
# NOTE(review): no notebook-level `df` is assigned in the visible cells (the
# `df` inside convert_to_parquet is local), so this raises NameError unless
# `df` was created in a cell that is not part of this notebook -- confirm.
df.birth_year.unique()

In [None]:

# Clean up after the 2015 run: drop the macOS metadata folder and the zip.
!rm -rf __MACOSX*
!rm 2015-citibike-tripdata.zip

