In [15]:
# install google-cloud-bigquery package
!pip install google-cloud-bigquery



In [16]:
# function to upload CSV file to Google Cloud Storage (GCS)

from google.cloud import storage

def upload_csv_to_gcs(bucket_name, source_file_path, destination_blob_name):
    """Copy a local file into a Google Cloud Storage (GCS) bucket.

    Args:
        bucket_name: Name of the bucket that receives the file.
        source_file_path: Path of the local file to upload.
        destination_blob_name: Name of the object (blob) to write in the bucket.

    Returns:
        None. A confirmation line with the gs:// location is printed.
    """
    # Resolve client -> bucket -> blob handle in one chain; the blob handle is
    # then used to push the local file's contents to GCS.
    target_blob = storage.Client().bucket(bucket_name).blob(destination_blob_name)
    target_blob.upload_from_filename(source_file_path)

    print(f"Uploaded {source_file_path} to gs://{bucket_name}/{destination_blob_name}")


In [None]:
# upload sellers csv to GCS

import os

# Directory that holds the Olist CSV files. Set the OLIST_DATA_DIR environment
# variable to run this notebook on another machine; the default keeps the
# original location so existing behavior is unchanged.
OLIST_DATA_DIR = os.environ.get("OLIST_DATA_DIR", "/Users/taysk/sctp/Project/Olist")

CSV_FILE_PATH = os.path.join(OLIST_DATA_DIR, "olist_sellers_dataset.csv")

upload_csv_to_gcs(
    bucket_name="sctp-grp5-olist",
    source_file_path=CSV_FILE_PATH,
    destination_blob_name="olist_sellers_dataset.csv",
)

Uploaded /Users/taysk/sctp/Project/Olist/olist_sellers_dataset.csv to gs://sctp-grp5-olist/olist_sellers_dataset.csv


In [18]:
from google.cloud import bigquery

In [19]:
# function to load CSV file from GCS into BigQuery

def load_csv_from_gcs(project_id, dataset_id, table_id, gcs_uri):
    """Load a CSV object from GCS into a BigQuery table, overwriting the table.

    Args:
        project_id: Google Cloud project used for the client and the table.
        dataset_id: BigQuery dataset that contains the destination table.
        table_id: Name of the destination table.
        gcs_uri: gs:// URI of the CSV file to load.

    Returns:
        None. The job id and the number of loaded rows are printed.
    """
    bq_client = bigquery.Client(project=project_id)

    # Fully qualified destination: <project>.<dataset>.<table>
    destination = f"{project_id}.{dataset_id}.{table_id}"

    # CSV input: skip the header row, let BigQuery infer the schema, and
    # replace whatever the destination table currently holds.
    csv_load_config = bigquery.LoadJobConfig(
        source_format=bigquery.SourceFormat.CSV,
        skip_leading_rows=1,
        autodetect=True,
        write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
    )

    job = bq_client.load_table_from_uri(gcs_uri, destination, job_config=csv_load_config)
    print("Starting job:", job.job_id)

    # Wait here until the load job has finished before reporting row counts.
    job.result()
    print(f"Loaded {job.output_rows} rows into {destination}.")
    
    

In [20]:
# Sellers: load the sellers CSV from the GCS bucket into the
# sctp_olist.Sellers BigQuery table (source first, then destination).
load_csv_from_gcs(
    gcs_uri="gs://sctp-grp5-olist/olist_sellers_dataset.csv",
    project_id="swift-habitat-460607-h9",
    dataset_id="sctp_olist",
    table_id="Sellers",
)


Starting job: 0b81092b-cab0-421e-b4f6-e22cc4a586cf
Loaded 3095 rows into swift-habitat-460607-h9.sctp_olist.Sellers.


In [None]:
# Products: load the products CSV from the GCS bucket into the
# sctp_olist.Products BigQuery table (source first, then destination).
# NOTE(review): no upload cell for this CSV is visible in this notebook;
# the object is assumed to already exist in the bucket - confirm.
load_csv_from_gcs(
    gcs_uri="gs://sctp-grp5-olist/olist_products_dataset.csv",
    project_id="swift-habitat-460607-h9",
    dataset_id="sctp_olist",
    table_id="Products",
)


Starting job: d6691ac5-3590-44c0-a779-dc78114b4fdb
Loaded 32951 rows into swift-habitat-460607-h9.sctp_olist.Products.


In [None]:
# Geolocation: load the geolocation CSV from the GCS bucket into the
# sctp_olist.Geolocation BigQuery table (source first, then destination).
# NOTE(review): no upload cell for this CSV is visible in this notebook;
# the object is assumed to already exist in the bucket - confirm.
load_csv_from_gcs(
    gcs_uri="gs://sctp-grp5-olist/olist_geolocation_dataset.csv",
    project_id="swift-habitat-460607-h9",
    dataset_id="sctp_olist",
    table_id="Geolocation",
)

Starting job: 5d6045c4-61d3-4bab-afc3-1f540adcf622
Loaded 1000163 rows into swift-habitat-460607-h9.sctp_olist.Geolocation.
